Total coverage: 144920 (8%)of 1835957
26 26 108 108 108 108 108 108 108 150 148 23 150 108 149 18 13 15 15 15 15 15 15 8 13 10 6 108 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 // SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_est.c: simple rate estimator for IPVS * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> * Network name space (netns) aware. * Global data moved to netns i.e struct netns_ipvs * Affected data: est_list and est_lock. * estimation_timer() runs with timer per netns. * get_stats()) do the per cpu summing. */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/types.h> #include <linux/interrupt.h> #include <linux/sysctl.h> #include <linux/list.h> #include <linux/rcupdate_wait.h> #include <net/ip_vs.h> /* This code is to estimate rate in a shorter interval (such as 8 seconds) for virtual services and real servers. For measure rate in a long interval, it is easy to implement a user level daemon which periodically reads those statistical counters and measure rate. We measure rate during the last 8 seconds every 2 seconds: avgrate = avgrate*(1-W) + rate*W where W = 2^(-2) NOTES. * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10. * Netlink users can see 64-bit values but sockopt users are restricted to 32-bit values for conns, packets, bps, cps and pps. * A lot of code is taken from net/core/gen_estimator.c KEY POINTS: - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled - kthreads read the cpustats to update the estimators (svcs, dests, total) - the states of estimators can be read (get stats) or modified (zero stats) from processes KTHREADS: - estimators are added initially to est_temp_list and later kthread 0 distributes them to one or many kthreads for estimation - kthread contexts are created and attached to array - the kthread tasks are started when first service is added, before that the total stats are not estimated - when configuration (cpulist/nice) is changed, the tasks are restarted by work (est_reload_work) - kthread tasks are stopped while the cpulist is empty - the kthread context holds lists with estimators (chains) which are processed every 2 seconds - as estimators can be added dynamically and in bursts, we try to spread them to multiple chains which are estimated at different time - on start, kthread 0 enters calculation phase to determine the chain limits and the limit of estimators per kthread - est_add_ktid: ktid where to add new ests, can point to empty slot where we should add kt data */ static struct lock_class_key __ipvs_est_key; static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs); static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs); static void ip_vs_chain_estimation(struct hlist_head *chain) { struct ip_vs_estimator *e; struct ip_vs_cpu_stats *c; struct ip_vs_stats *s; u64 rate; hlist_for_each_entry_rcu(e, chain, list) { u64 conns, inpkts, outpkts, inbytes, outbytes; u64 kconns = 0, kinpkts = 0, koutpkts = 0; u64 kinbytes = 0, koutbytes = 0; unsigned int start; int i; if (kthread_should_stop()) break; s = container_of(e, struct ip_vs_stats, est); for_each_possible_cpu(i) { c = per_cpu_ptr(s->cpustats, i); do { start = u64_stats_fetch_begin(&c->syncp); conns = u64_stats_read(&c->cnt.conns); inpkts = u64_stats_read(&c->cnt.inpkts); outpkts = u64_stats_read(&c->cnt.outpkts); inbytes = u64_stats_read(&c->cnt.inbytes); outbytes = u64_stats_read(&c->cnt.outbytes); } while (u64_stats_fetch_retry(&c->syncp, start)); kconns += conns; kinpkts += inpkts; koutpkts += outpkts; kinbytes += inbytes; koutbytes += outbytes; } spin_lock(&s->lock); s->kstats.conns = kconns; s->kstats.inpkts = kinpkts; s->kstats.outpkts = koutpkts; s->kstats.inbytes = kinbytes; s->kstats.outbytes = koutbytes; /* scaled by 2^10, but divided 2 seconds */ rate = (s->kstats.conns - e->last_conns) << 9; e->last_conns = s->kstats.conns; e->cps += ((s64)rate - (s64)e->cps) >> 2; rate = (s->kstats.inpkts - e->last_inpkts) << 9; e->last_inpkts = s->kstats.inpkts; e->inpps += ((s64)rate - (s64)e->inpps) >> 2; rate = (s->kstats.outpkts - e->last_outpkts) << 9; e->last_outpkts = s->kstats.outpkts; e->outpps += ((s64)rate - (s64)e->outpps) >> 2; /* scaled by 2^5, but divided 2 seconds */ rate = (s->kstats.inbytes - e->last_inbytes) << 4; e->last_inbytes = s->kstats.inbytes; e->inbps += ((s64)rate - (s64)e->inbps) >> 2; rate = (s->kstats.outbytes - e->last_outbytes) << 4; e->last_outbytes = s->kstats.outbytes; e->outbps += ((s64)rate - (s64)e->outbps) >> 2; spin_unlock(&s->lock); } } static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row) { struct ip_vs_est_tick_data *td; int cid; rcu_read_lock(); td = rcu_dereference(kd->ticks[row]); if (!td) goto out; for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) { if (kthread_should_stop()) break; ip_vs_chain_estimation(&td->chains[cid]); cond_resched_rcu(); td = rcu_dereference(kd->ticks[row]); if (!td) break; } out: rcu_read_unlock(); } static int ip_vs_estimation_kthread(void *data) { struct ip_vs_est_kt_data *kd = data; struct netns_ipvs *ipvs = kd->ipvs; int row = kd->est_row; unsigned long now; int id = kd->id; long gap; if (id > 0) { if (!ipvs->est_chain_max) return 0; } else { if (!ipvs->est_chain_max) { ipvs->est_calc_phase = 1; /* commit est_calc_phase before reading est_genid */ smp_mb(); } /* kthread 0 will handle the calc phase */ if (ipvs->est_calc_phase) ip_vs_est_calc_phase(ipvs); } while (1) { if (!id && !hlist_empty(&ipvs->est_temp_list)) ip_vs_est_drain_temp_list(ipvs); set_current_state(TASK_IDLE); if (kthread_should_stop()) break; /* before estimation, check if we should sleep */ now = jiffies; gap = kd->est_timer - now; if (gap > 0) { if (gap > IPVS_EST_TICK) { kd->est_timer = now - IPVS_EST_TICK; gap = IPVS_EST_TICK; } schedule_timeout(gap); } else { __set_current_state(TASK_RUNNING); if (gap < -8 * IPVS_EST_TICK) kd->est_timer = now; } if (kd->tick_len[row]) ip_vs_tick_estimation(kd, row); row++; if (row >= IPVS_EST_NTICKS) row = 0; WRITE_ONCE(kd->est_row, row); kd->est_timer += IPVS_EST_TICK; } __set_current_state(TASK_RUNNING); return 0; } /* Schedule stop/start for kthread tasks */ void ip_vs_est_reload_start(struct netns_ipvs *ipvs) { /* Ignore reloads before first service is added */ if (!ipvs->enable) return; ip_vs_est_stopped_recalc(ipvs); /* Bump the kthread configuration genid */ atomic_inc(&ipvs->est_genid); queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0); } /* Start kthread task with current configuration */ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, struct ip_vs_est_kt_data *kd) { unsigned long now; int ret = 0; long gap; lockdep_assert_held(&ipvs->est_mutex); if (kd->task) goto out; now = jiffies; gap = kd->est_timer - now; /* Sync est_timer if task is starting later */ if (abs(gap) > 4 * IPVS_EST_TICK) kd->est_timer = now; kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d", ipvs->gen, kd->id); if (IS_ERR(kd->task)) { ret = PTR_ERR(kd->task); kd->task = NULL; goto out; } set_user_nice(kd->task, sysctl_est_nice(ipvs)); set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs)); pr_info("starting estimator thread %d...\n", kd->id); wake_up_process(kd->task); out: return ret; } void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd) { if (kd->task) { pr_info("stopping estimator thread %d...\n", kd->id); kthread_stop(kd->task); kd->task = NULL; } } /* Apply parameters to kthread */ static void ip_vs_est_set_params(struct netns_ipvs *ipvs, struct ip_vs_est_kt_data *kd) { kd->chain_max = ipvs->est_chain_max; /* We are using single chain on RCU preemption */ if (IPVS_EST_TICK_CHAINS == 1) kd->chain_max *= IPVS_EST_CHAIN_FACTOR; kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max; kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max; } /* Create and start estimation kthread in a free or new array slot */ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) { struct ip_vs_est_kt_data *kd = NULL; int id = ipvs->est_kt_count; int ret = -ENOMEM; void *arr = NULL; int i; if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && ipvs->enable && ipvs->est_max_threads) return -EINVAL; mutex_lock(&ipvs->est_mutex); for (i = 0; i < id; i++) { if (!ipvs->est_kt_arr[i]) break; } if (i >= id) { arr = krealloc_array(ipvs->est_kt_arr, id + 1, sizeof(struct ip_vs_est_kt_data *), GFP_KERNEL); if (!arr) goto out; ipvs->est_kt_arr = arr; } else { id = i; } kd = kzalloc(sizeof(*kd), GFP_KERNEL); if (!kd) goto out; kd->ipvs = ipvs; bitmap_fill(kd->avail, IPVS_EST_NTICKS); kd->est_timer = jiffies; kd->id = id; ip_vs_est_set_params(ipvs, kd); /* Pre-allocate stats used in calc phase */ if (!id && !kd->calc_stats) { kd->calc_stats = ip_vs_stats_alloc(); if (!kd->calc_stats) goto out; } /* Start kthread tasks only when services are present */ if (ipvs->enable && !ip_vs_est_stopped(ipvs)) { ret = ip_vs_est_kthread_start(ipvs, kd); if (ret < 0) goto out; } if (arr) ipvs->est_kt_count++; ipvs->est_kt_arr[id] = kd; kd = NULL; /* Use most recent kthread for new ests */ ipvs->est_add_ktid = id; ret = 0; out: mutex_unlock(&ipvs->est_mutex); if (kd) { ip_vs_stats_free(kd->calc_stats); kfree(kd); } return ret; } /* Select ktid where to add new ests: available, unused or new slot */ static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs) { int ktid, best = ipvs->est_kt_count; struct ip_vs_est_kt_data *kd; for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) { kd = ipvs->est_kt_arr[ktid]; if (kd) { if (kd->est_count < kd->est_max_count) { best = ktid; break; } } else if (ktid < best) { best = ktid; } } ipvs->est_add_ktid = best; } /* Add estimator to current kthread (est_add_ktid) */ static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs, struct ip_vs_estimator *est) { struct ip_vs_est_kt_data *kd = NULL; struct ip_vs_est_tick_data *td; int ktid, row, crow, cid, ret; int delay = est->ktrow; BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127, "Too many chains for ktcid"); if (ipvs->est_add_ktid < ipvs->est_kt_count) { kd = ipvs->est_kt_arr[ipvs->est_add_ktid]; if (kd) goto add_est; } ret = ip_vs_est_add_kthread(ipvs); if (ret < 0) goto out; kd = ipvs->est_kt_arr[ipvs->est_add_ktid]; add_est: ktid = kd->id; /* For small number of estimators prefer to use few ticks, * otherwise try to add into the last estimated row. * est_row and add_row point after the row we should use */ if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1) crow = READ_ONCE(kd->est_row); else crow = kd->add_row; crow += delay; if (crow >= IPVS_EST_NTICKS) crow -= IPVS_EST_NTICKS; /* Assume initial delay ? */ if (delay >= IPVS_EST_NTICKS - 1) { /* Preserve initial delay or decrease it if no space in tick */ row = crow; if (crow < IPVS_EST_NTICKS - 1) { crow++; row = find_last_bit(kd->avail, crow); } if (row >= crow) row = find_last_bit(kd->avail, IPVS_EST_NTICKS); } else { /* Preserve delay or increase it if no space in tick */ row = IPVS_EST_NTICKS; if (crow > 0) row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow); if (row >= IPVS_EST_NTICKS) row = find_first_bit(kd->avail, IPVS_EST_NTICKS); } td = rcu_dereference_protected(kd->ticks[row], 1); if (!td) { td = kzalloc(sizeof(*td), GFP_KERNEL); if (!td) { ret = -ENOMEM; goto out; } rcu_assign_pointer(kd->ticks[row], td); } cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS); kd->est_count++; kd->tick_len[row]++; if (!td->chain_len[cid]) __set_bit(cid, td->present); td->chain_len[cid]++; est->ktid = ktid; est->ktrow = row; est->ktcid = cid; hlist_add_head_rcu(&est->list, &td->chains[cid]); if (td->chain_len[cid] >= kd->chain_max) { __set_bit(cid, td->full); if (kd->tick_len[row] >= kd->tick_max) __clear_bit(row, kd->avail); } /* Update est_add_ktid to point to first available/empty kt slot */ if (kd->est_count == kd->est_max_count) ip_vs_est_update_ktid(ipvs); ret = 0; out: return ret; } /* Start estimation for stats */ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; int ret; if (!ipvs->est_max_threads && ipvs->enable) ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); est->ktid = -1; est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */ /* We prefer this code to be short, kthread 0 will requeue the * estimator to available chain. If tasks are disabled, we * will not allocate much memory, just for kt 0. */ ret = 0; if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0]) ret = ip_vs_est_add_kthread(ipvs); if (ret >= 0) hlist_add_head(&est->list, &ipvs->est_temp_list); else INIT_HLIST_NODE(&est->list); return ret; } static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd) { if (kd) { if (kd->task) { pr_info("stop unused estimator thread %d...\n", kd->id); kthread_stop(kd->task); } ip_vs_stats_free(kd->calc_stats); kfree(kd); } } /* Unlink estimator from chain */ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; struct ip_vs_est_tick_data *td; struct ip_vs_est_kt_data *kd; int ktid = est->ktid; int row = est->ktrow; int cid = est->ktcid; /* Failed to add to chain ? */ if (hlist_unhashed(&est->list)) return; /* On return, estimator can be freed, dequeue it now */ /* In est_temp_list ? */ if (ktid < 0) { hlist_del(&est->list); goto end_kt0; } hlist_del_rcu(&est->list); kd = ipvs->est_kt_arr[ktid]; td = rcu_dereference_protected(kd->ticks[row], 1); __clear_bit(cid, td->full); td->chain_len[cid]--; if (!td->chain_len[cid]) __clear_bit(cid, td->present); kd->tick_len[row]--; __set_bit(row, kd->avail); if (!kd->tick_len[row]) { RCU_INIT_POINTER(kd->ticks[row], NULL); kfree_rcu(td, rcu_head); } kd->est_count--; if (kd->est_count) { /* This kt slot can become available just now, prefer it */ if (ktid < ipvs->est_add_ktid) ipvs->est_add_ktid = ktid; return; } if (ktid > 0) { mutex_lock(&ipvs->est_mutex); ip_vs_est_kthread_destroy(kd); ipvs->est_kt_arr[ktid] = NULL; if (ktid == ipvs->est_kt_count - 1) { ipvs->est_kt_count--; while (ipvs->est_kt_count > 1 && !ipvs->est_kt_arr[ipvs->est_kt_count - 1]) ipvs->est_kt_count--; } mutex_unlock(&ipvs->est_mutex); /* This slot is now empty, prefer another available kt slot */ if (ktid == ipvs->est_add_ktid) ip_vs_est_update_ktid(ipvs); } end_kt0: /* kt 0 is freed after all other kthreads and chains are empty */ if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) { kd = ipvs->est_kt_arr[0]; if (!kd || !kd->est_count) { mutex_lock(&ipvs->est_mutex); if (kd) { ip_vs_est_kthread_destroy(kd); ipvs->est_kt_arr[0] = NULL; } ipvs->est_kt_count--; mutex_unlock(&ipvs->est_mutex); ipvs->est_add_ktid = 0; } } } /* Register all ests from est_temp_list to kthreads */ static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs) { struct ip_vs_estimator *est; while (1) { int max = 16; mutex_lock(&__ip_vs_mutex); while (max-- > 0) { est = hlist_entry_safe(ipvs->est_temp_list.first, struct ip_vs_estimator, list); if (est) { if (kthread_should_stop()) goto unlock; hlist_del_init(&est->list); if (ip_vs_enqueue_estimator(ipvs, est) >= 0) continue; est->ktid = -1; hlist_add_head(&est->list, &ipvs->est_temp_list); /* Abort, some entries will not be estimated * until next attempt */ } goto unlock; } mutex_unlock(&__ip_vs_mutex); cond_resched(); } unlock: mutex_unlock(&__ip_vs_mutex); } /* Calculate limits for all kthreads */ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct ip_vs_est_kt_data *kd; struct hlist_head chain; struct ip_vs_stats *s; int cache_factor = 4; int i, loops, ntest; s32 min_est = 0; ktime_t t1, t2; int max = 8; int ret = 1; s64 diff; u64 val; INIT_HLIST_HEAD(&chain); mutex_lock(&__ip_vs_mutex); kd = ipvs->est_kt_arr[0]; mutex_unlock(&__ip_vs_mutex); s = kd ? kd->calc_stats : NULL; if (!s) goto out; hlist_add_head(&s->est.list, &chain); loops = 1; /* Get best result from many tests */ for (ntest = 0; ntest < 12; ntest++) { if (!(ntest & 3)) { /* Wait for cpufreq frequency transition */ wait_event_idle_timeout(wq, kthread_should_stop(), HZ / 50); if (!ipvs->enable || kthread_should_stop()) goto stop; } local_bh_disable(); rcu_read_lock(); /* Put stats in cache */ ip_vs_chain_estimation(&chain); t1 = ktime_get(); for (i = loops * cache_factor; i > 0; i--) ip_vs_chain_estimation(&chain); t2 = ktime_get(); rcu_read_unlock(); local_bh_enable(); if (!ipvs->enable || kthread_should_stop()) goto stop; cond_resched(); diff = ktime_to_ns(ktime_sub(t2, t1)); if (diff <= 1 * NSEC_PER_USEC) { /* Do more loops on low time resolution */ loops *= 2; continue; } if (diff >= NSEC_PER_SEC) continue; val = diff; do_div(val, loops); if (!min_est || val < min_est) { min_est = val; /* goal: 95usec per chain */ val = 95 * NSEC_PER_USEC; if (val >= min_est) { do_div(val, min_est); max = (int)val; } else { max = 1; } } } out: if (s) hlist_del_init(&s->est.list); *chain_max = max; return ret; stop: ret = 0; goto out; } /* Calculate the parameters and apply them in context of kt #0 * ECP: est_calc_phase * ECM: est_chain_max * ECP ECM Insert Chain enable Description * --------------------------------------------------------------------------- * 0 0 est_temp_list 0 create kt #0 context * 0 0 est_temp_list 0->1 service added, start kthread #0 task * 0->1 0 est_temp_list 1 kt task #0 started, enters calc phase * 1 0 est_temp_list 1 kt #0: determine est_chain_max, * stop tasks, move ests to est_temp_list * and free kd for kthreads 1..last * 1->0 0->N kt chains 1 ests can go to kthreads * 0 N kt chains 1 drain est_temp_list, create new kthread * contexts, start tasks, estimate */ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) { int genid = atomic_read(&ipvs->est_genid); struct ip_vs_est_tick_data *td; struct ip_vs_est_kt_data *kd; struct ip_vs_estimator *est; struct ip_vs_stats *stats; int id, row, cid, delay; bool last, last_td; int chain_max; int step; if (!ip_vs_est_calc_limits(ipvs, &chain_max)) return; mutex_lock(&__ip_vs_mutex); /* Stop all other tasks, so that we can immediately move the * estimators to est_temp_list without RCU grace period */ mutex_lock(&ipvs->est_mutex); for (id = 1; id < ipvs->est_kt_count; id++) { /* netns clean up started, abort */ if (!ipvs->enable) goto unlock2; kd = ipvs->est_kt_arr[id]; if (!kd) continue; ip_vs_est_kthread_stop(kd); } mutex_unlock(&ipvs->est_mutex); /* Move all estimators to est_temp_list but carefully, * all estimators and kthread data can be released while * we reschedule. Even for kthread 0. */ step = 0; /* Order entries in est_temp_list in ascending delay, so now * walk delay(desc), id(desc), cid(asc) */ delay = IPVS_EST_NTICKS; next_delay: delay--; if (delay < 0) goto end_dequeue; last_kt: /* Destroy contexts backwards */ id = ipvs->est_kt_count; next_kt: if (!ipvs->enable || kthread_should_stop()) goto unlock; id--; if (id < 0) goto next_delay; kd = ipvs->est_kt_arr[id]; if (!kd) goto next_kt; /* kt 0 can exist with empty chains */ if (!id && kd->est_count <= 1) goto next_delay; row = kd->est_row + delay; if (row >= IPVS_EST_NTICKS) row -= IPVS_EST_NTICKS; td = rcu_dereference_protected(kd->ticks[row], 1); if (!td) goto next_kt; cid = 0; walk_chain: if (kthread_should_stop()) goto unlock; step++; if (!(step & 63)) { /* Give chance estimators to be added (to est_temp_list) * and deleted (releasing kthread contexts) */ mutex_unlock(&__ip_vs_mutex); cond_resched(); mutex_lock(&__ip_vs_mutex); /* Current kt released ? */ if (id >= ipvs->est_kt_count) goto last_kt; if (kd != ipvs->est_kt_arr[id]) goto next_kt; /* Current td released ? */ if (td != rcu_dereference_protected(kd->ticks[row], 1)) goto next_kt; /* No fatal changes on the current kd and td */ } est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator, list); if (!est) { cid++; if (cid >= IPVS_EST_TICK_CHAINS) goto next_kt; goto walk_chain; } /* We can cheat and increase est_count to protect kt 0 context * from release but we prefer to keep the last estimator */ last = kd->est_count <= 1; /* Do not free kt #0 data */ if (!id && last) goto next_delay; last_td = kd->tick_len[row] <= 1; stats = container_of(est, struct ip_vs_stats, est); ip_vs_stop_estimator(ipvs, stats); /* Tasks are stopped, move without RCU grace period */ est->ktid = -1; est->ktrow = row - kd->est_row; if (est->ktrow < 0) est->ktrow += IPVS_EST_NTICKS; hlist_add_head(&est->list, &ipvs->est_temp_list); /* kd freed ? */ if (last) goto next_kt; /* td freed ? */ if (last_td) goto next_kt; goto walk_chain; end_dequeue: /* All estimators removed while calculating ? */ if (!ipvs->est_kt_count) goto unlock; kd = ipvs->est_kt_arr[0]; if (!kd) goto unlock; kd->add_row = kd->est_row; ipvs->est_chain_max = chain_max; ip_vs_est_set_params(ipvs, kd); pr_info("using max %d ests per chain, %d per kthread\n", kd->chain_max, kd->est_max_count); /* Try to keep tot_stats in kt0, enqueue it early */ if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) && ipvs->tot_stats->s.est.ktid == -1) { hlist_del(&ipvs->tot_stats->s.est.list); hlist_add_head(&ipvs->tot_stats->s.est.list, &ipvs->est_temp_list); } mutex_lock(&ipvs->est_mutex); /* We completed the calc phase, new calc phase not requested */ if (genid == atomic_read(&ipvs->est_genid)) ipvs->est_calc_phase = 0; unlock2: mutex_unlock(&ipvs->est_mutex); unlock: mutex_unlock(&__ip_vs_mutex); } void ip_vs_zero_estimator(struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; struct ip_vs_kstats *k = &stats->kstats; /* reset counters, caller must hold the stats->lock lock */ est->last_inbytes = k->inbytes; est->last_outbytes = k->outbytes; est->last_conns = k->conns; est->last_inpkts = k->inpkts; est->last_outpkts = k->outpkts; est->cps = 0; est->inpps = 0; est->outpps = 0; est->inbps = 0; est->outbps = 0; } /* Get decoded rates */ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats) { struct ip_vs_estimator *e = &stats->est; dst->cps = (e->cps + 0x1FF) >> 10; dst->inpps = (e->inpps + 0x1FF) >> 10; dst->outpps = (e->outpps + 0x1FF) >> 10; dst->inbps = (e->inbps + 0xF) >> 5; dst->outbps = (e->outbps + 0xF) >> 5; } int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs) { INIT_HLIST_HEAD(&ipvs->est_temp_list); ipvs->est_kt_arr = NULL; ipvs->est_max_threads = 0; ipvs->est_calc_phase = 0; ipvs->est_chain_max = 0; ipvs->est_kt_count = 0; ipvs->est_add_ktid = 0; atomic_set(&ipvs->est_genid, 0); atomic_set(&ipvs->est_genid_done, 0); __mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key); return 0; } void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs) { int i; for (i = 0; i < ipvs->est_kt_count; i++) ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]); kfree(ipvs->est_kt_arr); mutex_destroy(&ipvs->est_mutex); }
2 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_TABLES_IPV6_H_ #define _NF_TABLES_IPV6_H_ #include <linux/netfilter_ipv6/ip6_tables.h> #include <net/ipv6.h> #include <net/netfilter/nf_tables.h> static inline void nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt) { unsigned int flags = IP6_FH_F_AUTH; int protohdr, thoff = 0; unsigned short frag_off; protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0 || thoff > U16_MAX) { nft_set_pktinfo_unspec(pkt); return; } pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = protohdr; pkt->thoff = thoff; pkt->fragoff = frag_off; } static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) { #if IS_ENABLED(CONFIG_IPV6) unsigned int flags = IP6_FH_F_AUTH; struct ipv6hdr *ip6h, _ip6h; unsigned int thoff = 0; unsigned short frag_off; u32 pkt_len, skb_len; int protohdr; ip6h = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), sizeof(*ip6h), &_ip6h); if (!ip6h) return -1; if (ip6h->version != 6) return -1; pkt_len = ntohs(ip6h->payload_len); skb_len = pkt->skb->len - skb_network_offset(pkt->skb); if (pkt_len + sizeof(*ip6h) > skb_len) return -1; protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0 || thoff > U16_MAX) return -1; pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = protohdr; pkt->thoff = thoff; pkt->fragoff = frag_off; return 0; #else return -1; #endif } static inline void nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) { if (__nft_set_pktinfo_ipv6_validate(pkt) < 0) nft_set_pktinfo_unspec(pkt); } static inline int nft_set_pktinfo_ipv6_ingress(struct nft_pktinfo *pkt) { #if IS_ENABLED(CONFIG_IPV6) unsigned int flags = IP6_FH_F_AUTH; unsigned short frag_off; unsigned int thoff = 0; struct inet6_dev *idev; struct ipv6hdr *ip6h; int protohdr; u32 pkt_len; if (!pskb_may_pull(pkt->skb, sizeof(*ip6h))) return -1; ip6h = ipv6_hdr(pkt->skb); if (ip6h->version != 6) goto inhdr_error; pkt_len = ntohs(ip6h->payload_len); if (pkt_len + sizeof(*ip6h) > pkt->skb->len) { idev = __in6_dev_get(nft_in(pkt)); __IP6_INC_STATS(nft_net(pkt), idev, IPSTATS_MIB_INTRUNCATEDPKTS); return -1; } protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); if (protohdr < 0 || thoff > U16_MAX) goto inhdr_error; pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = protohdr; pkt->thoff = thoff; pkt->fragoff = frag_off; return 0; inhdr_error: idev = __in6_dev_get(nft_in(pkt)); __IP6_INC_STATS(nft_net(pkt), idev, IPSTATS_MIB_INHDRERRORS); return -1; #else return -1; #endif } #endif
15 13 13 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _CRYPTO_INTERNAL_CHACHA_H #define _CRYPTO_INTERNAL_CHACHA_H #include <crypto/chacha.h> #include <crypto/internal/skcipher.h> #include <linux/crypto.h> struct chacha_ctx { u32 key[8]; int nrounds; }; static inline int chacha_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keysize, int nrounds) { struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); int i; if (keysize != CHACHA_KEY_SIZE) return -EINVAL; for (i = 0; i < ARRAY_SIZE(ctx->key); i++) ctx->key[i] = get_unaligned_le32(key + i * sizeof(u32)); ctx->nrounds = nrounds; return 0; } static inline int chacha20_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keysize) { return chacha_setkey(tfm, key, keysize, 20); } static inline int chacha12_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keysize) { return chacha_setkey(tfm, key, keysize, 12); } #endif /* _CRYPTO_CHACHA_H */
8 8 8 8 8 8 1 13 1 15 15 14 1 1 1 1 1 1 1 1 9 12 23 1 5 27 15 1 16 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 // SPDX-License-Identifier: GPL-2.0 #include <linux/anon_inodes.h> #include <linux/exportfs.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/cgroup.h> #include <linux/magic.h> #include <linux/mount.h> #include <linux/pid.h> #include <linux/pidfs.h> #include <linux/pid_namespace.h> #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/proc_ns.h> #include <linux/pseudo_fs.h> #include <linux/ptrace.h> #include <linux/seq_file.h> #include <uapi/linux/pidfd.h> #include <linux/ipc_namespace.h> #include <linux/time_namespace.h> #include <linux/utsname.h> #include <net/net_namespace.h> #include "internal.h" #include "mount.h" static struct rb_root pidfs_ino_tree = RB_ROOT; #if BITS_PER_LONG == 32 static inline unsigned long pidfs_ino(u64 ino) { return lower_32_bits(ino); } /* On 32 bit the generation number are the upper 32 bits. */ static inline u32 pidfs_gen(u64 ino) { return upper_32_bits(ino); } #else /* On 64 bit simply return ino. */ static inline unsigned long pidfs_ino(u64 ino) { return ino; } /* On 64 bit the generation number is 0. */ static inline u32 pidfs_gen(u64 ino) { return 0; } #endif static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b) { struct pid *pid_a = rb_entry(a, struct pid, pidfs_node); struct pid *pid_b = rb_entry(b, struct pid, pidfs_node); u64 pid_ino_a = pid_a->ino; u64 pid_ino_b = pid_b->ino; if (pid_ino_a < pid_ino_b) return -1; if (pid_ino_a > pid_ino_b) return 1; return 0; } void pidfs_add_pid(struct pid *pid) { static u64 pidfs_ino_nr = 2; /* * On 64 bit nothing special happens. The 64bit number assigned * to struct pid is the inode number. * * On 32 bit the 64 bit number assigned to struct pid is split * into two 32 bit numbers. The lower 32 bits are used as the * inode number and the upper 32 bits are used as the inode * generation number. * * On 32 bit pidfs_ino() will return the lower 32 bit. When * pidfs_ino() returns zero a wrap around happened. When a * wraparound happens the 64 bit number will be incremented by 2 * so inode numbering starts at 2 again. * * On 64 bit comparing two pidfds is as simple as comparing * inode numbers. * * When a wraparound happens on 32 bit multiple pidfds with the * same inode number are likely to exist (This isn't a problem * since before pidfs pidfds used the anonymous inode meaning * all pidfds had the same inode number.). Userspace can * reconstruct the 64 bit identifier by retrieving both the * inode number and the inode generation number to compare or * use file handles. */ if (pidfs_ino(pidfs_ino_nr) == 0) pidfs_ino_nr += 2; pid->ino = pidfs_ino_nr; pid->stashed = NULL; pidfs_ino_nr++; write_seqcount_begin(&pidmap_lock_seq); rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp); write_seqcount_end(&pidmap_lock_seq); } void pidfs_remove_pid(struct pid *pid) { write_seqcount_begin(&pidmap_lock_seq); rb_erase(&pid->pidfs_node, &pidfs_ino_tree); write_seqcount_end(&pidmap_lock_seq); } #ifdef CONFIG_PROC_FS /** * pidfd_show_fdinfo - print information about a pidfd * @m: proc fdinfo file * @f: file referencing a pidfd * * Pid: * This function will print the pid that a given pidfd refers to in the * pid namespace of the procfs instance. * If the pid namespace of the process is not a descendant of the pid * namespace of the procfs instance 0 will be shown as its pid. This is * similar to calling getppid() on a process whose parent is outside of * its pid namespace. * * NSpid: * If pid namespaces are supported then this function will also print * the pid of a given pidfd refers to for all descendant pid namespaces * starting from the current pid namespace of the instance, i.e. the * Pid field and the first entry in the NSpid field will be identical. * If the pid namespace of the process is not a descendant of the pid * namespace of the procfs instance 0 will be shown as its first NSpid * entry and no others will be shown. * Note that this differs from the Pid and NSpid fields in * /proc/<pid>/status where Pid and NSpid are always shown relative to * the pid namespace of the procfs instance. The difference becomes * obvious when sending around a pidfd between pid namespaces from a * different branch of the tree, i.e. where no ancestral relation is * present between the pid namespaces: * - create two new pid namespaces ns1 and ns2 in the initial pid * namespace (also take care to create new mount namespaces in the * new pid namespace and mount procfs) * - create a process with a pidfd in ns1 * - send pidfd from ns1 to ns2 * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid * have exactly one entry, which is 0 */ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) { struct pid *pid = pidfd_pid(f); struct pid_namespace *ns; pid_t nr = -1; if (likely(pid_has_task(pid, PIDTYPE_PID))) { ns = proc_pid_ns(file_inode(m->file)->i_sb); nr = pid_nr_ns(pid, ns); } seq_put_decimal_ll(m, "Pid:\t", nr); #ifdef CONFIG_PID_NS seq_put_decimal_ll(m, "\nNSpid:\t", nr); if (nr > 0) { int i; /* If nr is non-zero it means that 'pid' is valid and that * ns, i.e. the pid namespace associated with the procfs * instance, is in the pid namespace hierarchy of pid. * Start at one below the already printed level. */ for (i = ns->level + 1; i <= pid->level; i++) seq_put_decimal_ll(m, "\t", pid->numbers[i].nr); } #endif seq_putc(m, '\n'); } #endif /* * Poll support for process exit notification. */ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { struct pid *pid = pidfd_pid(file); bool thread = file->f_flags & PIDFD_THREAD; struct task_struct *task; __poll_t poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); /* * Depending on PIDFD_THREAD, inform pollers when the thread * or the whole thread-group exits. */ guard(rcu)(); task = pid_task(pid, PIDTYPE_PID); if (!task) poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; else if (task->exit_state && (thread || thread_group_empty(task))) poll_flags = EPOLLIN | EPOLLRDNORM; return poll_flags; } static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg) { struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; size_t usize = _IOC_SIZE(cmd); struct pidfd_info kinfo = {}; struct user_namespace *user_ns; const struct cred *c; __u64 mask; #ifdef CONFIG_CGROUPS struct cgroup *cgrp; #endif if (!uinfo) return -EINVAL; if (usize < PIDFD_INFO_SIZE_VER0) return -EINVAL; /* First version, no smaller struct possible */ if (copy_from_user(&mask, &uinfo->mask, sizeof(mask))) return -EFAULT; c = get_task_cred(task); if (!c) return -ESRCH; /* Unconditionally return identifiers and credentials, the rest only on request */ user_ns = current_user_ns(); kinfo.ruid = from_kuid_munged(user_ns, c->uid); kinfo.rgid = from_kgid_munged(user_ns, c->gid); kinfo.euid = from_kuid_munged(user_ns, c->euid); kinfo.egid = from_kgid_munged(user_ns, c->egid); kinfo.suid = from_kuid_munged(user_ns, c->suid); kinfo.sgid = from_kgid_munged(user_ns, c->sgid); kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid); kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid); kinfo.mask |= PIDFD_INFO_CREDS; put_cred(c); #ifdef CONFIG_CGROUPS rcu_read_lock(); cgrp = task_dfl_cgroup(task); kinfo.cgroupid = cgroup_id(cgrp); kinfo.mask |= PIDFD_INFO_CGROUPID; rcu_read_unlock(); #endif /* * Copy pid/tgid last, to reduce the chances the information might be * stale. Note that it is not possible to ensure it will be valid as the * task might return as soon as the copy_to_user finishes, but that's ok * and userspace expects that might happen and can act accordingly, so * this is just best-effort. What we can do however is checking that all * the fields are set correctly, or return ESRCH to avoid providing * incomplete information. */ kinfo.ppid = task_ppid_nr_ns(task, NULL); kinfo.tgid = task_tgid_vnr(task); kinfo.pid = task_pid_vnr(task); kinfo.mask |= PIDFD_INFO_PID; if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1)) return -ESRCH; /* * If userspace and the kernel have the same struct size it can just * be copied. If userspace provides an older struct, only the bits that * userspace knows about will be copied. If userspace provides a new * struct, only the bits that the kernel knows about will be copied. */ if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo)))) return -EFAULT; return 0; } static bool pidfs_ioctl_valid(unsigned int cmd) { switch (cmd) { case FS_IOC_GETVERSION: case PIDFD_GET_CGROUP_NAMESPACE: case PIDFD_GET_IPC_NAMESPACE: case PIDFD_GET_MNT_NAMESPACE: case PIDFD_GET_NET_NAMESPACE: case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: case PIDFD_GET_TIME_NAMESPACE: case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: case PIDFD_GET_UTS_NAMESPACE: case PIDFD_GET_USER_NAMESPACE: case PIDFD_GET_PID_NAMESPACE: return true; } /* Extensible ioctls require some more careful checks. */ switch (_IOC_NR(cmd)) { case _IOC_NR(PIDFD_GET_INFO): /* * Try to prevent performing a pidfd ioctl when someone * erronously mistook the file descriptor for a pidfd. * This is not perfect but will catch most cases. */ return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO)); } return false; } static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct task_struct *task __free(put_task) = NULL; struct nsproxy *nsp __free(put_nsproxy) = NULL; struct pid *pid = pidfd_pid(file); struct ns_common *ns_common = NULL; struct pid_namespace *pid_ns; if (!pidfs_ioctl_valid(cmd)) return -ENOIOCTLCMD; if (cmd == FS_IOC_GETVERSION) { if (!arg) return -EINVAL; __u32 __user *argp = (__u32 __user *)arg; return put_user(file_inode(file)->i_generation, argp); } task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; /* Extensible IOCTL that does not open namespace FDs, take a shortcut */ if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO)) return pidfd_info(task, cmd, arg); if (arg) return -EINVAL; scoped_guard(task_lock, task) { nsp = task->nsproxy; if (nsp) get_nsproxy(nsp); } if (!nsp) return -ESRCH; /* just pretend it didn't exist */ /* * We're trying to open a file descriptor to the namespace so perform a * filesystem cred ptrace check. Also, we mirror nsfs behavior. */ if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) return -EACCES; switch (cmd) { /* Namespaces that hang of nsproxy. */ case PIDFD_GET_CGROUP_NAMESPACE: if (IS_ENABLED(CONFIG_CGROUPS)) { get_cgroup_ns(nsp->cgroup_ns); ns_common = to_ns_common(nsp->cgroup_ns); } break; case PIDFD_GET_IPC_NAMESPACE: if (IS_ENABLED(CONFIG_IPC_NS)) { get_ipc_ns(nsp->ipc_ns); ns_common = to_ns_common(nsp->ipc_ns); } break; case PIDFD_GET_MNT_NAMESPACE: get_mnt_ns(nsp->mnt_ns); ns_common = to_ns_common(nsp->mnt_ns); break; case PIDFD_GET_NET_NAMESPACE: if (IS_ENABLED(CONFIG_NET_NS)) { ns_common = to_ns_common(nsp->net_ns); get_net_ns(ns_common); } break; case PIDFD_GET_PID_FOR_CHILDREN_NAMESPACE: if (IS_ENABLED(CONFIG_PID_NS)) { get_pid_ns(nsp->pid_ns_for_children); ns_common = to_ns_common(nsp->pid_ns_for_children); } break; case PIDFD_GET_TIME_NAMESPACE: if (IS_ENABLED(CONFIG_TIME_NS)) { get_time_ns(nsp->time_ns); ns_common = to_ns_common(nsp->time_ns); } break; case PIDFD_GET_TIME_FOR_CHILDREN_NAMESPACE: if (IS_ENABLED(CONFIG_TIME_NS)) { get_time_ns(nsp->time_ns_for_children); ns_common = to_ns_common(nsp->time_ns_for_children); } break; case PIDFD_GET_UTS_NAMESPACE: if (IS_ENABLED(CONFIG_UTS_NS)) { get_uts_ns(nsp->uts_ns); ns_common = to_ns_common(nsp->uts_ns); } break; /* Namespaces that don't hang of nsproxy. */ case PIDFD_GET_USER_NAMESPACE: if (IS_ENABLED(CONFIG_USER_NS)) { rcu_read_lock(); ns_common = to_ns_common(get_user_ns(task_cred_xxx(task, user_ns))); rcu_read_unlock(); } break; case PIDFD_GET_PID_NAMESPACE: if (IS_ENABLED(CONFIG_PID_NS)) { rcu_read_lock(); pid_ns = task_active_pid_ns(task); if (pid_ns) ns_common = to_ns_common(get_pid_ns(pid_ns)); rcu_read_unlock(); } break; default: return -ENOIOCTLCMD; } if (!ns_common) return -EOPNOTSUPP; /* open_namespace() unconditionally consumes the reference */ return open_namespace(ns_common); } static const struct file_operations pidfs_file_operations = { .poll = pidfd_poll, #ifdef CONFIG_PROC_FS .show_fdinfo = pidfd_show_fdinfo, #endif .unlocked_ioctl = pidfd_ioctl, .compat_ioctl = compat_ptr_ioctl, }; struct pid *pidfd_pid(const struct file *file) { if (file->f_op != &pidfs_file_operations) return ERR_PTR(-EBADF); return file_inode(file)->i_private; } static struct vfsmount *pidfs_mnt __ro_after_init; /* * The vfs falls back to simple_setattr() if i_op->setattr() isn't * implemented. Let's reject it completely until we have a clean * permission concept for pidfds. */ static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { return -EOPNOTSUPP; } /* * User space expects pidfs inodes to have no file type in st_mode. * * In particular, 'lsof' has this legacy logic: * * type = s->st_mode & S_IFMT; * switch (type) { * ... * case 0: * if (!strcmp(p, "anon_inode")) * Lf->ntype = Ntype = N_ANON_INODE; * * to detect our old anon_inode logic. * * Rather than mess with our internal sane inode data, just fix it * up here in getattr() by masking off the format bits. */ static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->mode &= ~S_IFMT; return 0; } static const struct inode_operations pidfs_inode_operations = { .getattr = pidfs_getattr, .setattr = pidfs_setattr, }; static void pidfs_evict_inode(struct inode *inode) { struct pid *pid = inode->i_private; clear_inode(inode); put_pid(pid); } static const struct super_operations pidfs_sops = { .drop_inode = generic_delete_inode, .evict_inode = pidfs_evict_inode, .statfs = simple_statfs, }; /* * 'lsof' has knowledge of out historical anon_inode use, and expects * the pidfs dentry name to start with 'anon_inode'. */ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(buffer, buflen, "anon_inode:[pidfd]"); } const struct dentry_operations pidfs_dentry_operations = { .d_delete = always_delete_dentry, .d_dname = pidfs_dname, .d_prune = stashed_dentry_prune, }; static int pidfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, struct inode *parent) { const struct pid *pid = inode->i_private; if (*max_len < 2) { *max_len = 2; return FILEID_INVALID; } *max_len = 2; *(u64 *)fh = pid->ino; return FILEID_KERNFS; } static int pidfs_ino_find(const void *key, const struct rb_node *node) { const u64 pid_ino = *(u64 *)key; const struct pid *pid = rb_entry(node, struct pid, pidfs_node); if (pid_ino < pid->ino) return -1; if (pid_ino > pid->ino) return 1; return 0; } /* Find a struct pid based on the inode number. */ static struct pid *pidfs_ino_get_pid(u64 ino) { struct pid *pid; struct rb_node *node; unsigned int seq; guard(rcu)(); do { seq = read_seqcount_begin(&pidmap_lock_seq); node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find); if (node) break; } while (read_seqcount_retry(&pidmap_lock_seq, seq)); if (!node) return NULL; pid = rb_entry(node, struct pid, pidfs_node); /* Within our pid namespace hierarchy? */ if (pid_vnr(pid) == 0) return NULL; return get_pid(pid); } static struct dentry *pidfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { int ret; u64 pid_ino; struct path path; struct pid *pid; if (fh_len < 2) return NULL; switch (fh_type) { case FILEID_KERNFS: pid_ino = *(u64 *)fid; break; default: return NULL; } pid = pidfs_ino_get_pid(pid_ino); if (!pid) return NULL; ret = path_from_stashed(&pid->stashed, pidfs_mnt, pid, &path); if (ret < 0) return ERR_PTR(ret); mntput(path.mnt); return path.dentry; } /* * Make sure that we reject any nonsensical flags that users pass via * open_by_handle_at(). Note that PIDFD_THREAD is defined as O_EXCL, and * PIDFD_NONBLOCK as O_NONBLOCK. */ #define VALID_FILE_HANDLE_OPEN_FLAGS \ (O_RDONLY | O_WRONLY | O_RDWR | O_NONBLOCK | O_CLOEXEC | O_EXCL) static int pidfs_export_permission(struct handle_to_path_ctx *ctx, unsigned int oflags) { if (oflags & ~(VALID_FILE_HANDLE_OPEN_FLAGS | O_LARGEFILE)) return -EINVAL; /* * pidfd_ino_get_pid() will verify that the struct pid is part * of the caller's pid namespace hierarchy. No further * permission checks are needed. */ return 0; } static struct file *pidfs_export_open(struct path *path, unsigned int oflags) { /* * Clear O_LARGEFILE as open_by_handle_at() forces it and raise * O_RDWR as pidfds always are. */ oflags &= ~O_LARGEFILE; return dentry_open(path, oflags | O_RDWR, current_cred()); } static const struct export_operations pidfs_export_operations = { .encode_fh = pidfs_encode_fh, .fh_to_dentry = pidfs_fh_to_dentry, .open = pidfs_export_open, .permission = pidfs_export_permission, }; static int pidfs_init_inode(struct inode *inode, void *data) { const struct pid *pid = data; inode->i_private = data; inode->i_flags |= S_PRIVATE; inode->i_mode |= S_IRWXU; inode->i_op = &pidfs_inode_operations; inode->i_fop = &pidfs_file_operations; inode->i_ino = pidfs_ino(pid->ino); inode->i_generation = pidfs_gen(pid->ino); return 0; } static void pidfs_put_data(void *data) { struct pid *pid = data; put_pid(pid); } static const struct stashed_operations pidfs_stashed_ops = { .init_inode = pidfs_init_inode, .put_data = pidfs_put_data, }; static int pidfs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx; ctx = init_pseudo(fc, PID_FS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &pidfs_sops; ctx->eops = &pidfs_export_operations; ctx->dops = &pidfs_dentry_operations; fc->s_fs_info = (void *)&pidfs_stashed_ops; return 0; } static struct file_system_type pidfs_type = { .name = "pidfs", .init_fs_context = pidfs_init_fs_context, .kill_sb = kill_anon_super, }; struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) { struct file *pidfd_file; struct path path; int ret; ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); if (ret < 0) return ERR_PTR(ret); pidfd_file = dentry_open(&path, flags, current_cred()); path_put(&path); return pidfd_file; } void __init pidfs_init(void) { pidfs_mnt = kern_mount(&pidfs_type); if (IS_ERR(pidfs_mnt)) panic("Failed to mount pidfs pseudo filesystem"); }
1 1 8 1 5 1 3 1 1 1 5 2 1 1 1 2 11 8 11 11 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netfilter.h> #include <linux/rhashtable.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/netdevice.h> #include <linux/if_ether.h> #include <net/gso.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/neighbour.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_conntrack_acct.h> /* For layer 4 checksum field offset. */ #include <linux/tcp.h> #include <linux/udp.h> static int nf_flow_state_check(struct flow_offload *flow, int proto, struct sk_buff *skb, unsigned int thoff) { struct tcphdr *tcph; if (proto != IPPROTO_TCP) return 0; tcph = (void *)(skb_network_header(skb) + thoff); if (tcph->syn && test_bit(NF_FLOW_CLOSING, &flow->flags)) { flow_offload_teardown(flow); return -1; } if ((tcph->fin || tcph->rst) && !test_bit(NF_FLOW_CLOSING, &flow->flags)) set_bit(NF_FLOW_CLOSING, &flow->flags); return 0; } static void nf_flow_nat_ip_tcp(struct sk_buff *skb, unsigned int thoff, __be32 addr, __be32 new_addr) { struct tcphdr *tcph; tcph = (void *)(skb_network_header(skb) + thoff); inet_proto_csum_replace4(&tcph->check, skb, addr, new_addr, true); } static void nf_flow_nat_ip_udp(struct sk_buff *skb, unsigned int thoff, __be32 addr, __be32 new_addr) { struct udphdr *udph; udph = (void *)(skb_network_header(skb) + thoff); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace4(&udph->check, skb, addr, new_addr, true); if (!udph->check) udph->check = CSUM_MANGLED_0; } } static void nf_flow_nat_ip_l4proto(struct sk_buff *skb, struct iphdr *iph, unsigned int thoff, __be32 addr, __be32 new_addr) { switch (iph->protocol) { case IPPROTO_TCP: nf_flow_nat_ip_tcp(skb, thoff, addr, new_addr); break; case IPPROTO_UDP: nf_flow_nat_ip_udp(skb, thoff, addr, new_addr); break; } } static void nf_flow_snat_ip(const struct flow_offload *flow, struct sk_buff *skb, struct iphdr *iph, unsigned int thoff, enum flow_offload_tuple_dir dir) { __be32 addr, new_addr; switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: addr = iph->saddr; new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v4.s_addr; iph->saddr = new_addr; break; case FLOW_OFFLOAD_DIR_REPLY: addr = iph->daddr; new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v4.s_addr; iph->daddr = new_addr; break; } csum_replace4(&iph->check, addr, new_addr); nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); } static void nf_flow_dnat_ip(const struct flow_offload *flow, struct sk_buff *skb, struct iphdr *iph, unsigned int thoff, enum flow_offload_tuple_dir dir) { __be32 addr, new_addr; switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: addr = iph->daddr; new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v4.s_addr; iph->daddr = new_addr; break; case FLOW_OFFLOAD_DIR_REPLY: addr = iph->saddr; new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v4.s_addr; iph->saddr = new_addr; break; } csum_replace4(&iph->check, addr, new_addr); nf_flow_nat_ip_l4proto(skb, iph, thoff, addr, new_addr); } static void nf_flow_nat_ip(const struct flow_offload *flow, struct sk_buff *skb, unsigned int thoff, enum flow_offload_tuple_dir dir, struct iphdr *iph) { if (test_bit(NF_FLOW_SNAT, &flow->flags)) { nf_flow_snat_port(flow, skb, thoff, iph->protocol, dir); nf_flow_snat_ip(flow, skb, iph, thoff, dir); } if (test_bit(NF_FLOW_DNAT, &flow->flags)) { nf_flow_dnat_port(flow, skb, thoff, iph->protocol, dir); nf_flow_dnat_ip(flow, skb, iph, thoff, dir); } } static bool ip_has_options(unsigned int thoff) { return thoff != sizeof(struct iphdr); } static void nf_flow_tuple_encap(struct sk_buff *skb, struct flow_offload_tuple *tuple) { struct vlan_ethhdr *veth; struct pppoe_hdr *phdr; int i = 0; if (skb_vlan_tag_present(skb)) { tuple->encap[i].id = skb_vlan_tag_get(skb); tuple->encap[i].proto = skb->vlan_proto; i++; } switch (skb->protocol) { case htons(ETH_P_8021Q): veth = (struct vlan_ethhdr *)skb_mac_header(skb); tuple->encap[i].id = ntohs(veth->h_vlan_TCI); tuple->encap[i].proto = skb->protocol; break; case htons(ETH_P_PPP_SES): phdr = (struct pppoe_hdr *)skb_network_header(skb); tuple->encap[i].id = ntohs(phdr->sid); tuple->encap[i].proto = skb->protocol; break; } } struct nf_flowtable_ctx { const struct net_device *in; u32 offset; u32 hdrsize; }; static int nf_flow_tuple_ip(struct nf_flowtable_ctx *ctx, struct sk_buff *skb, struct flow_offload_tuple *tuple) { struct flow_ports *ports; unsigned int thoff; struct iphdr *iph; u8 ipproto; if (!pskb_may_pull(skb, sizeof(*iph) + ctx->offset)) return -1; iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); thoff = (iph->ihl * 4); if (ip_is_fragment(iph) || unlikely(ip_has_options(thoff))) return -1; thoff += ctx->offset; ipproto = iph->protocol; switch (ipproto) { case IPPROTO_TCP: ctx->hdrsize = sizeof(struct tcphdr); break; case IPPROTO_UDP: ctx->hdrsize = sizeof(struct udphdr); break; #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: ctx->hdrsize = sizeof(struct gre_base_hdr); break; #endif default: return -1; } if (iph->ttl <= 1) return -1; if (!pskb_may_pull(skb, thoff + ctx->hdrsize)) return -1; switch (ipproto) { case IPPROTO_TCP: case IPPROTO_UDP: ports = (struct flow_ports *)(skb_network_header(skb) + thoff); tuple->src_port = ports->source; tuple->dst_port = ports->dest; break; case IPPROTO_GRE: { struct gre_base_hdr *greh; greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff); if ((greh->flags & GRE_VERSION) != GRE_VERSION_0) return -1; break; } } iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); tuple->src_v4.s_addr = iph->saddr; tuple->dst_v4.s_addr = iph->daddr; tuple->l3proto = AF_INET; tuple->l4proto = ipproto; tuple->iifidx = ctx->in->ifindex; nf_flow_tuple_encap(skb, tuple); return 0; } /* Based on ip_exceeds_mtu(). */ static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) { if (skb->len <= mtu) return false; if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) return false; return true; } static inline bool nf_flow_dst_check(struct flow_offload_tuple *tuple) { if (tuple->xmit_type != FLOW_OFFLOAD_XMIT_NEIGH && tuple->xmit_type != FLOW_OFFLOAD_XMIT_XFRM) return true; return dst_check(tuple->dst_cache, tuple->dst_cookie); } static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, const struct nf_hook_state *state, struct dst_entry *dst) { skb_orphan(skb); skb_dst_set_noref(skb, dst); dst_output(state->net, state->sk, skb); return NF_STOLEN; } static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { struct vlan_ethhdr *veth; __be16 inner_proto; switch (skb->protocol) { case htons(ETH_P_8021Q): if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth))) return false; veth = (struct vlan_ethhdr *)skb_mac_header(skb); if (veth->h_vlan_encapsulated_proto == proto) { *offset += VLAN_HLEN; return true; } break; case htons(ETH_P_PPP_SES): if (nf_flow_pppoe_proto(skb, &inner_proto) && inner_proto == proto) { *offset += PPPOE_SES_HLEN; return true; } break; } return false; } static void nf_flow_encap_pop(struct sk_buff *skb, struct flow_offload_tuple_rhash *tuplehash) { struct vlan_hdr *vlan_hdr; int i; for (i = 0; i < tuplehash->tuple.encap_num; i++) { if (skb_vlan_tag_present(skb)) { __vlan_hwaccel_clear_tag(skb); continue; } switch (skb->protocol) { case htons(ETH_P_8021Q): vlan_hdr = (struct vlan_hdr *)skb->data; __skb_pull(skb, VLAN_HLEN); vlan_set_encap_proto(skb, vlan_hdr); skb_reset_network_header(skb); break; case htons(ETH_P_PPP_SES): skb->protocol = __nf_flow_pppoe_proto(skb); skb_pull(skb, PPPOE_SES_HLEN); skb_reset_network_header(skb); break; } } } static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, const struct flow_offload_tuple_rhash *tuplehash, unsigned short type) { struct net_device *outdev; outdev = dev_get_by_index_rcu(net, tuplehash->tuple.out.ifidx); if (!outdev) return NF_DROP; skb->dev = outdev; dev_hard_header(skb, skb->dev, type, tuplehash->tuple.out.h_dest, tuplehash->tuple.out.h_source, skb->len); dev_queue_xmit(skb); return NF_STOLEN; } static struct flow_offload_tuple_rhash * nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx, struct nf_flowtable *flow_table, struct sk_buff *skb) { struct flow_offload_tuple tuple = {}; if (skb->protocol != htons(ETH_P_IP) && !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) return NULL; if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0) return NULL; return flow_offload_lookup(flow_table, &tuple); } static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, struct nf_flowtable *flow_table, struct flow_offload_tuple_rhash *tuplehash, struct sk_buff *skb) { enum flow_offload_tuple_dir dir; struct flow_offload *flow; unsigned int thoff, mtu; struct iphdr *iph; dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) return 0; iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); thoff = (iph->ihl * 4) + ctx->offset; if (nf_flow_state_check(flow, iph->protocol, skb, thoff)) return 0; if (!nf_flow_dst_check(&tuplehash->tuple)) { flow_offload_teardown(flow); return 0; } if (skb_try_make_writable(skb, thoff + ctx->hdrsize)) return -1; flow_offload_refresh(flow_table, flow, false); nf_flow_encap_pop(skb, tuplehash); thoff -= ctx->offset; iph = ip_hdr(skb); nf_flow_nat_ip(flow, skb, thoff, dir, iph); ip_decrease_ttl(iph); skb_clear_tstamp(skb); if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); return 1; } unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct flow_offload_tuple_rhash *tuplehash; struct nf_flowtable *flow_table = priv; enum flow_offload_tuple_dir dir; struct nf_flowtable_ctx ctx = { .in = state->in, }; struct flow_offload *flow; struct net_device *outdev; struct rtable *rt; __be32 nexthop; int ret; tuplehash = nf_flow_offload_lookup(&ctx, flow_table, skb); if (!tuplehash) return NF_ACCEPT; ret = nf_flow_offload_forward(&ctx, flow_table, tuplehash, skb); if (ret < 0) return NF_DROP; else if (ret == 0) return NF_ACCEPT; if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { rt = dst_rtable(tuplehash->tuple.dst_cache); memset(skb->cb, 0, sizeof(struct inet_skb_parm)); IPCB(skb)->iif = skb->dev->ifindex; IPCB(skb)->flags = IPSKB_FORWARDED; return nf_flow_xmit_xfrm(skb, state, &rt->dst); } dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = dst_rtable(tuplehash->tuple.dst_cache); outdev = rt->dst.dev; skb->dev = outdev; nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); skb_dst_set_noref(skb, &rt->dst); neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); ret = NF_STOLEN; break; case FLOW_OFFLOAD_XMIT_DIRECT: ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IP); if (ret == NF_DROP) flow_offload_teardown(flow); break; default: WARN_ON_ONCE(1); ret = NF_DROP; break; } return ret; } EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook); static void nf_flow_nat_ipv6_tcp(struct sk_buff *skb, unsigned int thoff, struct in6_addr *addr, struct in6_addr *new_addr, struct ipv6hdr *ip6h) { struct tcphdr *tcph; tcph = (void *)(skb_network_header(skb) + thoff); inet_proto_csum_replace16(&tcph->check, skb, addr->s6_addr32, new_addr->s6_addr32, true); } static void nf_flow_nat_ipv6_udp(struct sk_buff *skb, unsigned int thoff, struct in6_addr *addr, struct in6_addr *new_addr) { struct udphdr *udph; udph = (void *)(skb_network_header(skb) + thoff); if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace16(&udph->check, skb, addr->s6_addr32, new_addr->s6_addr32, true); if (!udph->check) udph->check = CSUM_MANGLED_0; } } static void nf_flow_nat_ipv6_l4proto(struct sk_buff *skb, struct ipv6hdr *ip6h, unsigned int thoff, struct in6_addr *addr, struct in6_addr *new_addr) { switch (ip6h->nexthdr) { case IPPROTO_TCP: nf_flow_nat_ipv6_tcp(skb, thoff, addr, new_addr, ip6h); break; case IPPROTO_UDP: nf_flow_nat_ipv6_udp(skb, thoff, addr, new_addr); break; } } static void nf_flow_snat_ipv6(const struct flow_offload *flow, struct sk_buff *skb, struct ipv6hdr *ip6h, unsigned int thoff, enum flow_offload_tuple_dir dir) { struct in6_addr addr, new_addr; switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: addr = ip6h->saddr; new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_v6; ip6h->saddr = new_addr; break; case FLOW_OFFLOAD_DIR_REPLY: addr = ip6h->daddr; new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_v6; ip6h->daddr = new_addr; break; } nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); } static void nf_flow_dnat_ipv6(const struct flow_offload *flow, struct sk_buff *skb, struct ipv6hdr *ip6h, unsigned int thoff, enum flow_offload_tuple_dir dir) { struct in6_addr addr, new_addr; switch (dir) { case FLOW_OFFLOAD_DIR_ORIGINAL: addr = ip6h->daddr; new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_v6; ip6h->daddr = new_addr; break; case FLOW_OFFLOAD_DIR_REPLY: addr = ip6h->saddr; new_addr = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_v6; ip6h->saddr = new_addr; break; } nf_flow_nat_ipv6_l4proto(skb, ip6h, thoff, &addr, &new_addr); } static void nf_flow_nat_ipv6(const struct flow_offload *flow, struct sk_buff *skb, enum flow_offload_tuple_dir dir, struct ipv6hdr *ip6h) { unsigned int thoff = sizeof(*ip6h); if (test_bit(NF_FLOW_SNAT, &flow->flags)) { nf_flow_snat_port(flow, skb, thoff, ip6h->nexthdr, dir); nf_flow_snat_ipv6(flow, skb, ip6h, thoff, dir); } if (test_bit(NF_FLOW_DNAT, &flow->flags)) { nf_flow_dnat_port(flow, skb, thoff, ip6h->nexthdr, dir); nf_flow_dnat_ipv6(flow, skb, ip6h, thoff, dir); } } static int nf_flow_tuple_ipv6(struct nf_flowtable_ctx *ctx, struct sk_buff *skb, struct flow_offload_tuple *tuple) { struct flow_ports *ports; struct ipv6hdr *ip6h; unsigned int thoff; u8 nexthdr; thoff = sizeof(*ip6h) + ctx->offset; if (!pskb_may_pull(skb, thoff)) return -1; ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); nexthdr = ip6h->nexthdr; switch (nexthdr) { case IPPROTO_TCP: ctx->hdrsize = sizeof(struct tcphdr); break; case IPPROTO_UDP: ctx->hdrsize = sizeof(struct udphdr); break; #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: ctx->hdrsize = sizeof(struct gre_base_hdr); break; #endif default: return -1; } if (ip6h->hop_limit <= 1) return -1; if (!pskb_may_pull(skb, thoff + ctx->hdrsize)) return -1; switch (nexthdr) { case IPPROTO_TCP: case IPPROTO_UDP: ports = (struct flow_ports *)(skb_network_header(skb) + thoff); tuple->src_port = ports->source; tuple->dst_port = ports->dest; break; case IPPROTO_GRE: { struct gre_base_hdr *greh; greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff); if ((greh->flags & GRE_VERSION) != GRE_VERSION_0) return -1; break; } } ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); tuple->src_v6 = ip6h->saddr; tuple->dst_v6 = ip6h->daddr; tuple->l3proto = AF_INET6; tuple->l4proto = nexthdr; tuple->iifidx = ctx->in->ifindex; nf_flow_tuple_encap(skb, tuple); return 0; } static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx, struct nf_flowtable *flow_table, struct flow_offload_tuple_rhash *tuplehash, struct sk_buff *skb) { enum flow_offload_tuple_dir dir; struct flow_offload *flow; unsigned int thoff, mtu; struct ipv6hdr *ip6h; dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) return 0; ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset); thoff = sizeof(*ip6h) + ctx->offset; if (nf_flow_state_check(flow, ip6h->nexthdr, skb, thoff)) return 0; if (!nf_flow_dst_check(&tuplehash->tuple)) { flow_offload_teardown(flow); return 0; } if (skb_try_make_writable(skb, thoff + ctx->hdrsize)) return -1; flow_offload_refresh(flow_table, flow, false); nf_flow_encap_pop(skb, tuplehash); ip6h = ipv6_hdr(skb); nf_flow_nat_ipv6(flow, skb, dir, ip6h); ip6h->hop_limit--; skb_clear_tstamp(skb); if (flow_table->flags & NF_FLOWTABLE_COUNTER) nf_ct_acct_update(flow->ct, tuplehash->tuple.dir, skb->len); return 1; } static struct flow_offload_tuple_rhash * nf_flow_offload_ipv6_lookup(struct nf_flowtable_ctx *ctx, struct nf_flowtable *flow_table, struct sk_buff *skb) { struct flow_offload_tuple tuple = {}; if (skb->protocol != htons(ETH_P_IPV6) && !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &ctx->offset)) return NULL; if (nf_flow_tuple_ipv6(ctx, skb, &tuple) < 0) return NULL; return flow_offload_lookup(flow_table, &tuple); } unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct flow_offload_tuple_rhash *tuplehash; struct nf_flowtable *flow_table = priv; enum flow_offload_tuple_dir dir; struct nf_flowtable_ctx ctx = { .in = state->in, }; const struct in6_addr *nexthop; struct flow_offload *flow; struct net_device *outdev; struct rt6_info *rt; int ret; tuplehash = nf_flow_offload_ipv6_lookup(&ctx, flow_table, skb); if (tuplehash == NULL) return NF_ACCEPT; ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb); if (ret < 0) return NF_DROP; else if (ret == 0) return NF_ACCEPT; if (unlikely(tuplehash->tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)) { rt = dst_rt6_info(tuplehash->tuple.dst_cache); memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); IP6CB(skb)->iif = skb->dev->ifindex; IP6CB(skb)->flags = IP6SKB_FORWARDED; return nf_flow_xmit_xfrm(skb, state, &rt->dst); } dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = dst_rt6_info(tuplehash->tuple.dst_cache); outdev = rt->dst.dev; skb->dev = outdev; nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); skb_dst_set_noref(skb, &rt->dst); neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); ret = NF_STOLEN; break; case FLOW_OFFLOAD_XMIT_DIRECT: ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IPV6); if (ret == NF_DROP) flow_offload_teardown(flow); break; default: WARN_ON_ONCE(1); ret = NF_DROP; break; } return ret; } EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook);
780 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_KSM_H #define __LINUX_KSM_H /* * Memory merging support. * * This code enables dynamic sharing of identical pages found in different * memory areas, even if they are not shared by fork(). */ #include <linux/bitops.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/rmap.h> #include <linux/sched.h> #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); void ksm_add_vma(struct vm_area_struct *vma); int ksm_enable_merge_any(struct mm_struct *mm); int ksm_disable_merge_any(struct mm_struct *mm); int ksm_disable(struct mm_struct *mm); int __ksm_enter(struct mm_struct *mm); void __ksm_exit(struct mm_struct *mm); /* * To identify zeropages that were mapped by KSM, we reuse the dirty bit * in the PTE. If the PTE is dirty, the zeropage was mapped by KSM when * deduplicating memory. */ #define is_ksm_zero_pte(pte) (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte)) extern atomic_long_t ksm_zero_pages; static inline void ksm_map_zero_page(struct mm_struct *mm) { atomic_long_inc(&ksm_zero_pages); atomic_long_inc(&mm->ksm_zero_pages); } static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { if (is_ksm_zero_pte(pte)) { atomic_long_dec(&ksm_zero_pages); atomic_long_dec(&mm->ksm_zero_pages); } } static inline long mm_ksm_zero_pages(struct mm_struct *mm) { return atomic_long_read(&mm->ksm_zero_pages); } static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { /* Adding mm to ksm is best effort on fork. */ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) __ksm_enter(mm); } static inline int ksm_execve(struct mm_struct *mm) { if (test_bit(MMF_VM_MERGE_ANY, &mm->flags)) return __ksm_enter(mm); return 0; } static inline void ksm_exit(struct mm_struct *mm) { if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) __ksm_exit(mm); } /* * When do_swap_page() first faults in from swap what used to be a KSM page, * no problem, it will be assigned to this vma's anon_vma; but thereafter, * it might be faulted into a different anon_vma (or perhaps to a different * offset in the same anon_vma). do_swap_page() cannot do all the locking * needed to reconstitute a cross-anon_vma KSM page: for now it has to make * a copy, and leave remerging the pages to a later pass of ksmd. * * We'd like to make this conditional on vma->vm_flags & VM_MERGEABLE, * but what if the vma was unmerged while the page was swapped out? */ struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr); void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early); long ksm_process_profit(struct mm_struct *); bool ksm_process_mergeable(struct mm_struct *mm); #else /* !CONFIG_KSM */ static inline void ksm_add_vma(struct vm_area_struct *vma) { } static inline int ksm_disable(struct mm_struct *mm) { return 0; } static inline void ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { } static inline int ksm_execve(struct mm_struct *mm) { return 0; } static inline void ksm_exit(struct mm_struct *mm) { } static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { } static inline void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early) { } #ifdef CONFIG_MMU static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags) { return 0; } static inline struct folio *ksm_might_need_to_copy(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { return folio; } static inline void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { } static inline void folio_migrate_ksm(struct folio *newfolio, struct folio *old) { } #endif /* CONFIG_MMU */ #endif /* !CONFIG_KSM */ #endif /* __LINUX_KSM_H */
3 1 1 1 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 // SPDX-License-Identifier: GPL-2.0-only /* Masquerade. Simple mapping which alters range to a local IP address (depending on route). */ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_masquerade.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("Xtables: automatic-address SNAT"); /* FIXME: Multiple targets. --RR */ static int masquerade_tg_check(const struct xt_tgchk_param *par) { const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; if (mr->range[0].flags & NF_NAT_RANGE_MAP_IPS) { pr_debug("bad MAP_IPS.\n"); return -EINVAL; } if (mr->rangesize != 1) { pr_debug("bad rangesize %u\n", mr->rangesize); return -EINVAL; } return nf_ct_netns_get(par->net, par->family); } static unsigned int masquerade_tg(struct sk_buff *skb, const struct xt_action_param *par) { struct nf_nat_range2 range; const struct nf_nat_ipv4_multi_range_compat *mr; mr = par->targinfo; range.flags = mr->range[0].flags; range.min_proto = mr->range[0].min; range.max_proto = mr->range[0].max; return nf_nat_masquerade_ipv4(skb, xt_hooknum(par), &range, xt_out(par)); } static void masquerade_tg_destroy(const struct xt_tgdtor_param *par) { nf_ct_netns_put(par->net, par->family); } #if IS_ENABLED(CONFIG_IPV6) static unsigned int masquerade_tg6(struct sk_buff *skb, const struct xt_action_param *par) { return nf_nat_masquerade_ipv6(skb, par->targinfo, xt_out(par)); } static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par) { const struct nf_nat_range2 *range = par->targinfo; if (range->flags & NF_NAT_RANGE_MAP_IPS) return -EINVAL; return nf_ct_netns_get(par->net, par->family); } #endif static struct xt_target masquerade_tg_reg[] __read_mostly = { { #if IS_ENABLED(CONFIG_IPV6) .name = "MASQUERADE", .family = NFPROTO_IPV6, .target = masquerade_tg6, .targetsize = sizeof(struct nf_nat_range), .table = "nat", .hooks = 1 << NF_INET_POST_ROUTING, .checkentry = masquerade_tg6_checkentry, .destroy = masquerade_tg_destroy, .me = THIS_MODULE, }, { #endif .name = "MASQUERADE", .family = NFPROTO_IPV4, .target = masquerade_tg, .targetsize = sizeof(struct nf_nat_ipv4_multi_range_compat), .table = "nat", .hooks = 1 << NF_INET_POST_ROUTING, .checkentry = masquerade_tg_check, .destroy = masquerade_tg_destroy, .me = THIS_MODULE, } }; static int __init masquerade_tg_init(void) { int ret; ret = xt_register_targets(masquerade_tg_reg, ARRAY_SIZE(masquerade_tg_reg)); if (ret) return ret; ret = nf_nat_masquerade_inet_register_notifiers(); if (ret) { xt_unregister_targets(masquerade_tg_reg, ARRAY_SIZE(masquerade_tg_reg)); return ret; } return ret; } static void __exit masquerade_tg_exit(void) { xt_unregister_targets(masquerade_tg_reg, ARRAY_SIZE(masquerade_tg_reg)); nf_nat_masquerade_inet_unregister_notifiers(); } module_init(masquerade_tg_init); module_exit(masquerade_tg_exit); #if IS_ENABLED(CONFIG_IPV6) MODULE_ALIAS("ip6t_MASQUERADE"); #endif MODULE_ALIAS("ipt_MASQUERADE");
46 27 27 27 23 27 2 8 21 21 21 21 11 7 4 62 1 2 4 4 21 2 16 5 6 10 8 1 34 4 8 11 20 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 /* * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <net/sock.h> #include <linux/in.h> #include <linux/ipv6.h> #include <linux/if_arp.h> #include <linux/jhash.h> #include <linux/ratelimit.h> #include "rds.h" static struct rhashtable bind_hash_table; static const struct rhashtable_params ht_parms = { .nelem_hint = 768, .key_len = RDS_BOUND_KEY_LEN, .key_offset = offsetof(struct rds_sock, rs_bound_key), .head_offset = offsetof(struct rds_sock, rs_bound_node), .max_size = 16384, .min_size = 1024, }; /* Create a key for the bind hash table manipulation. Port is in network byte * order. */ static inline void __rds_create_bind_key(u8 *key, const struct in6_addr *addr, __be16 port, __u32 scope_id) { memcpy(key, addr, sizeof(*addr)); key += sizeof(*addr); memcpy(key, &port, sizeof(port)); key += sizeof(port); memcpy(key, &scope_id, sizeof(scope_id)); } /* * Return the rds_sock bound at the given local address. * * The rx path can race with rds_release. We notice if rds_release() has * marked this socket and don't return a rs ref to the rx path. */ struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port, __u32 scope_id) { u8 key[RDS_BOUND_KEY_LEN]; struct rds_sock *rs; __rds_create_bind_key(key, addr, port, scope_id); rcu_read_lock(); rs = rhashtable_lookup(&bind_hash_table, key, ht_parms); if (rs && (sock_flag(rds_rs_to_sk(rs), SOCK_DEAD) || !refcount_inc_not_zero(&rds_rs_to_sk(rs)->sk_refcnt))) rs = NULL; rcu_read_unlock(); rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr, ntohs(port)); return rs; } /* returns -ve errno or +ve port */ static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr, __be16 *port, __u32 scope_id) { int ret = -EADDRINUSE; u16 rover, last; u8 key[RDS_BOUND_KEY_LEN]; if (*port != 0) { rover = be16_to_cpu(*port); if (rover == RDS_FLAG_PROBE_PORT) return -EINVAL; last = rover; } else { rover = max_t(u16, get_random_u16(), 2); last = rover - 1; } do { if (rover == 0) rover++; if (rover == RDS_FLAG_PROBE_PORT) continue; __rds_create_bind_key(key, addr, cpu_to_be16(rover), scope_id); if (rhashtable_lookup_fast(&bind_hash_table, key, ht_parms)) continue; memcpy(rs->rs_bound_key, key, sizeof(rs->rs_bound_key)); rs->rs_bound_addr = *addr; net_get_random_once(&rs->rs_hash_initval, sizeof(rs->rs_hash_initval)); rs->rs_bound_port = cpu_to_be16(rover); rs->rs_bound_node.next = NULL; rds_sock_addref(rs); if (!rhashtable_insert_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms)) { *port = rs->rs_bound_port; rs->rs_bound_scope_id = scope_id; ret = 0; rdsdebug("rs %p binding to %pI6c:%d\n", rs, addr, (int)ntohs(*port)); break; } else { rs->rs_bound_addr = in6addr_any; rds_sock_put(rs); ret = -ENOMEM; break; } } while (rover++ != last); return ret; } void rds_remove_bound(struct rds_sock *rs) { if (ipv6_addr_any(&rs->rs_bound_addr)) return; rdsdebug("rs %p unbinding from %pI6c:%d\n", rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port)); rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms); rds_sock_put(rs); rs->rs_bound_addr = in6addr_any; } int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); struct in6_addr v6addr, *binding_addr; struct rds_transport *trans; __u32 scope_id = 0; int ret = 0; __be16 port; /* We allow an RDS socket to be bound to either IPv4 or IPv6 * address. */ if (addr_len < offsetofend(struct sockaddr, sa_family)) return -EINVAL; if (uaddr->sa_family == AF_INET) { struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; if (addr_len < sizeof(struct sockaddr_in) || sin->sin_addr.s_addr == htonl(INADDR_ANY) || sin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || ipv4_is_multicast(sin->sin_addr.s_addr)) return -EINVAL; ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr); binding_addr = &v6addr; port = sin->sin_port; #if IS_ENABLED(CONFIG_IPV6) } else if (uaddr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr; int addr_type; if (addr_len < sizeof(struct sockaddr_in6)) return -EINVAL; addr_type = ipv6_addr_type(&sin6->sin6_addr); if (!(addr_type & IPV6_ADDR_UNICAST)) { __be32 addr4; if (!(addr_type & IPV6_ADDR_MAPPED)) return -EINVAL; /* It is a mapped address. Need to do some sanity * checks. */ addr4 = sin6->sin6_addr.s6_addr32[3]; if (addr4 == htonl(INADDR_ANY) || addr4 == htonl(INADDR_BROADCAST) || ipv4_is_multicast(addr4)) return -EINVAL; } /* The scope ID must be specified for link local address. */ if (addr_type & IPV6_ADDR_LINKLOCAL) { if (sin6->sin6_scope_id == 0) return -EINVAL; scope_id = sin6->sin6_scope_id; } binding_addr = &sin6->sin6_addr; port = sin6->sin6_port; #endif } else { return -EINVAL; } lock_sock(sk); /* RDS socket does not allow re-binding. */ if (!ipv6_addr_any(&rs->rs_bound_addr)) { ret = -EINVAL; goto out; } /* Socket is connected. The binding address should have the same * scope ID as the connected address, except the case when one is * non-link local address (scope_id is 0). */ if (!ipv6_addr_any(&rs->rs_conn_addr) && scope_id && rs->rs_bound_scope_id && scope_id != rs->rs_bound_scope_id) { ret = -EINVAL; goto out; } /* The transport can be set using SO_RDS_TRANSPORT option before the * socket is bound. */ if (rs->rs_transport) { trans = rs->rs_transport; if (!trans->laddr_check || trans->laddr_check(sock_net(sock->sk), binding_addr, scope_id) != 0) { ret = -ENOPROTOOPT; goto out; } } else { trans = rds_trans_get_preferred(sock_net(sock->sk), binding_addr, scope_id); if (!trans) { ret = -EADDRNOTAVAIL; pr_info_ratelimited("RDS: %s could not find a transport for %pI6c, load rds_tcp or rds_rdma?\n", __func__, binding_addr); goto out; } rs->rs_transport = trans; } sock_set_flag(sk, SOCK_RCU_FREE); ret = rds_add_bound(rs, binding_addr, &port, scope_id); if (ret) rs->rs_transport = NULL; out: release_sock(sk); return ret; } void rds_bind_lock_destroy(void) { rhashtable_destroy(&bind_hash_table); } int rds_bind_lock_init(void) { return rhashtable_init(&bind_hash_table, &ht_parms); }
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 /* SPDX-License-Identifier: GPL-2.0 */ /* include/net/dsfield.h - Manipulation of the Differentiated Services field */ /* Written 1998-2000 by Werner Almesberger, EPFL ICA */ #ifndef __NET_DSFIELD_H #define __NET_DSFIELD_H #include <linux/types.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <asm/byteorder.h> static inline __u8 ipv4_get_dsfield(const struct iphdr *iph) { return iph->tos; } static inline __u8 ipv6_get_dsfield(const struct ipv6hdr *ipv6h) { return ntohs(*(__force const __be16 *)ipv6h) >> 4; } static inline void ipv4_change_dsfield(struct iphdr *iph,__u8 mask, __u8 value) { __u32 check = ntohs((__force __be16)iph->check); __u8 dsfield; dsfield = (iph->tos & mask) | value; check += iph->tos; if ((check+1) >> 16) check = (check+1) & 0xffff; check -= dsfield; check += check >> 16; /* adjust carry */ iph->check = (__force __sum16)htons(check); iph->tos = dsfield; } static inline void ipv6_change_dsfield(struct ipv6hdr *ipv6h,__u8 mask, __u8 value) { __be16 *p = (__force __be16 *)ipv6h; *p = (*p & htons((((u16)mask << 4) | 0xf00f))) | htons((u16)value << 4); } #endif
240 28 10 240 77 78 78 77 78 50 199 199 240 240 230 231 313 13 9 5 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 // SPDX-License-Identifier: GPL-2.0-only /* File: fs/xattr.c Extended attribute handling. Copyright (C) 2001 by Andreas Gruenbacher <a.gruenbacher@computer.org> Copyright (C) 2001 SGI - Silicon Graphics, Inc <linux-xfs@oss.sgi.com> Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #include <linux/fs.h> #include <linux/filelock.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/xattr.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/export.h> #include <linux/fsnotify.h> #include <linux/audit.h> #include <linux/vmalloc.h> #include <linux/posix_acl_xattr.h> #include <linux/uaccess.h> #include "internal.h" static const char * strcmp_prefix(const char *a, const char *a_prefix) { while (*a_prefix && *a == *a_prefix) { a++; a_prefix++; } return *a_prefix ? NULL : a; } /* * In order to implement different sets of xattr operations for each xattr * prefix, a filesystem should create a null-terminated array of struct * xattr_handler (one for each prefix) and hang a pointer to it off of the * s_xattr field of the superblock. */ #define for_each_xattr_handler(handlers, handler) \ if (handlers) \ for ((handler) = *(handlers)++; \ (handler) != NULL; \ (handler) = *(handlers)++) /* * Find the xattr_handler with the matching prefix. */ static const struct xattr_handler * xattr_resolve_name(struct inode *inode, const char **name) { const struct xattr_handler * const *handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; if (!(inode->i_opflags & IOP_XATTR)) { if (unlikely(is_bad_inode(inode))) return ERR_PTR(-EIO); return ERR_PTR(-EOPNOTSUPP); } for_each_xattr_handler(handlers, handler) { const char *n; n = strcmp_prefix(*name, xattr_prefix(handler)); if (n) { if (!handler->prefix ^ !*n) { if (*n) continue; return ERR_PTR(-EINVAL); } *name = n; return handler; } } return ERR_PTR(-EOPNOTSUPP); } /** * may_write_xattr - check whether inode allows writing xattr * @idmap: idmap of the mount the inode was found from * @inode: the inode on which to set an xattr * * Check whether the inode allows writing xattrs. Specifically, we can never * set or remove an extended attribute on a read-only filesystem or on an * immutable / append-only inode. * * We also need to ensure that the inode has a mapping in the mount to * not risk writing back invalid i_{g,u}id values. * * Return: On success zero is returned. On error a negative errno is returned. */ int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode) { if (IS_IMMUTABLE(inode)) return -EPERM; if (IS_APPEND(inode)) return -EPERM; if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; return 0; } /* * Check permissions for extended attribute access. This is a bit complicated * because different namespaces have very different rules. */ static int xattr_permission(struct mnt_idmap *idmap, struct inode *inode, const char *name, int mask) { if (mask & MAY_WRITE) { int ret; ret = may_write_xattr(idmap, inode); if (ret) return ret; } /* * No restriction for security.* and system.* from the VFS. Decision * on these is left to the underlying filesystem / security module. */ if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return 0; /* * The trusted.* namespace can only be accessed by privileged users. */ if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { if (!capable(CAP_SYS_ADMIN)) return (mask & MAY_WRITE) ? -EPERM : -ENODATA; return 0; } /* * In the user.* namespace, only regular files and directories can have * extended attributes. For sticky directories, only the owner and * privileged users can write attributes. */ if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) { if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) return (mask & MAY_WRITE) ? -EPERM : -ENODATA; if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && (mask & MAY_WRITE) && !inode_owner_or_capable(idmap, inode)) return -EPERM; } return inode_permission(idmap, inode, mask); } /* * Look for any handler that deals with the specified namespace. */ int xattr_supports_user_prefix(struct inode *inode) { const struct xattr_handler * const *handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; if (!(inode->i_opflags & IOP_XATTR)) { if (unlikely(is_bad_inode(inode))) return -EIO; return -EOPNOTSUPP; } for_each_xattr_handler(handlers, handler) { if (!strncmp(xattr_prefix(handler), XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) return 0; } return -EOPNOTSUPP; } EXPORT_SYMBOL(xattr_supports_user_prefix); int __vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) { const struct xattr_handler *handler; if (is_posix_acl_xattr(name)) return -EOPNOTSUPP; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; if (size == 0) value = ""; /* empty EA, do not remove */ return handler->set(handler, idmap, dentry, inode, name, value, size, flags); } EXPORT_SYMBOL(__vfs_setxattr); /** * __vfs_setxattr_noperm - perform setxattr operation without performing * permission checks. * * @idmap: idmap of the mount the inode was found from * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to * @size: size of @value * @flags: flags to pass into filesystem operations * * returns the result of the internal setxattr or setsecurity operations. * * This function requires the caller to lock the inode's i_mutex before it * is executed. It also assumes that the caller will make the appropriate * permission checks. */ int __vfs_setxattr_noperm(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; int error = -EAGAIN; int issec = !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); if (issec) inode->i_flags &= ~S_NOSEC; if (inode->i_opflags & IOP_XATTR) { error = __vfs_setxattr(idmap, dentry, inode, name, value, size, flags); if (!error) { fsnotify_xattr(dentry); security_inode_post_setxattr(dentry, name, value, size, flags); } } else { if (unlikely(is_bad_inode(inode))) return -EIO; } if (error == -EAGAIN) { error = -EOPNOTSUPP; if (issec) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; error = security_inode_setsecurity(inode, suffix, value, size, flags); if (!error) fsnotify_xattr(dentry); } } return error; } /** * __vfs_setxattr_locked - set an extended attribute while holding the inode * lock * * @idmap: idmap of the mount of the target inode * @dentry: object to perform setxattr on * @name: xattr name to set * @value: value to set @name to * @size: size of @value * @flags: flags to pass into filesystem operations * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int __vfs_setxattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(idmap, inode, name, MAY_WRITE); if (error) return error; error = security_inode_setxattr(idmap, dentry, name, value, size, flags); if (error) goto out; error = try_break_deleg(inode, delegated_inode); if (error) goto out; error = __vfs_setxattr_noperm(idmap, dentry, name, value, size, flags); out: return error; } EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); int vfs_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; const void *orig_value = value; int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { error = cap_convert_nscap(idmap, dentry, &value, size); if (error < 0) return error; size = error; } retry_deleg: inode_lock(inode); error = __vfs_setxattr_locked(idmap, dentry, name, value, size, flags, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } if (value != orig_value) kfree(value); return error; } EXPORT_SYMBOL_GPL(vfs_setxattr); static ssize_t xattr_getsecurity(struct mnt_idmap *idmap, struct inode *inode, const char *name, void *value, size_t size) { void *buffer = NULL; ssize_t len; if (!value || !size) { len = security_inode_getsecurity(idmap, inode, name, &buffer, false); goto out_noalloc; } len = security_inode_getsecurity(idmap, inode, name, &buffer, true); if (len < 0) return len; if (size < len) { len = -ERANGE; goto out; } memcpy(value, buffer, len); out: kfree(buffer); out_noalloc: return len; } /* * vfs_getxattr_alloc - allocate memory, if necessary, before calling getxattr * * Allocate memory, if not already allocated, or re-allocate correct size, * before retrieving the extended attribute. The xattr value buffer should * always be freed by the caller, even on error. * * Returns the result of alloc, if failed, or the getxattr operation. */ int vfs_getxattr_alloc(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, char **xattr_value, size_t xattr_size, gfp_t flags) { const struct xattr_handler *handler; struct inode *inode = dentry->d_inode; char *value = *xattr_value; int error; error = xattr_permission(idmap, inode, name, MAY_READ); if (error) return error; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->get) return -EOPNOTSUPP; error = handler->get(handler, dentry, inode, name, NULL, 0); if (error < 0) return error; if (!value || (error > xattr_size)) { value = krealloc(*xattr_value, error + 1, flags); if (!value) return -ENOMEM; memset(value, 0, error + 1); } error = handler->get(handler, dentry, inode, name, value, error); *xattr_value = value; return error; } ssize_t __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { const struct xattr_handler *handler; if (is_posix_acl_xattr(name)) return -EOPNOTSUPP; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->get) return -EOPNOTSUPP; return handler->get(handler, dentry, inode, name, value, size); } EXPORT_SYMBOL(__vfs_getxattr); ssize_t vfs_getxattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, void *value, size_t size) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(idmap, inode, name, MAY_READ); if (error) return error; error = security_inode_getxattr(dentry, name); if (error) return error; if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; int ret = xattr_getsecurity(idmap, inode, suffix, value, size); /* * Only overwrite the return value if a security module * is actually active. */ if (ret == -EOPNOTSUPP) goto nolsm; return ret; } nolsm: return __vfs_getxattr(dentry, inode, name, value, size); } EXPORT_SYMBOL_GPL(vfs_getxattr); /** * vfs_listxattr - retrieve \0 separated list of xattr names * @dentry: the dentry from whose inode the xattr names are retrieved * @list: buffer to store xattr names into * @size: size of the buffer * * This function returns the names of all xattrs associated with the * inode of @dentry. * * Note, for legacy reasons the vfs_listxattr() function lists POSIX * ACLs as well. Since POSIX ACLs are decoupled from IOP_XATTR the * vfs_listxattr() function doesn't check for this flag since a * filesystem could implement POSIX ACLs without implementing any other * xattrs. * * However, since all codepaths that remove IOP_XATTR also assign of * inode operations that either don't implement or implement a stub * ->listxattr() operation. * * Return: On success, the size of the buffer that was used. On error a * negative error code. */ ssize_t vfs_listxattr(struct dentry *dentry, char *list, size_t size) { struct inode *inode = d_inode(dentry); ssize_t error; error = security_inode_listxattr(dentry); if (error) return error; if (inode->i_op->listxattr) { error = inode->i_op->listxattr(dentry, list, size); } else { error = security_inode_listsecurity(inode, list, size); if (size && error > size) error = -ERANGE; } return error; } EXPORT_SYMBOL_GPL(vfs_listxattr); int __vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = d_inode(dentry); const struct xattr_handler *handler; if (is_posix_acl_xattr(name)) return -EOPNOTSUPP; handler = xattr_resolve_name(inode, &name); if (IS_ERR(handler)) return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; return handler->set(handler, idmap, dentry, inode, name, NULL, 0, XATTR_REPLACE); } EXPORT_SYMBOL(__vfs_removexattr); /** * __vfs_removexattr_locked - set an extended attribute while holding the inode * lock * * @idmap: idmap of the mount of the target inode * @dentry: object to perform setxattr on * @name: name of xattr to remove * @delegated_inode: on return, will contain an inode pointer that * a delegation was broken on, NULL if none. */ int __vfs_removexattr_locked(struct mnt_idmap *idmap, struct dentry *dentry, const char *name, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; error = xattr_permission(idmap, inode, name, MAY_WRITE); if (error) return error; error = security_inode_removexattr(idmap, dentry, name); if (error) goto out; error = try_break_deleg(inode, delegated_inode); if (error) goto out; error = __vfs_removexattr(idmap, dentry, name); if (error) return error; fsnotify_xattr(dentry); security_inode_post_removexattr(dentry, name); out: return error; } EXPORT_SYMBOL_GPL(__vfs_removexattr_locked); int vfs_removexattr(struct mnt_idmap *idmap, struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; int error; retry_deleg: inode_lock(inode); error = __vfs_removexattr_locked(idmap, dentry, name, &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); int import_xattr_name(struct xattr_name *kname, const char __user *name) { int error = strncpy_from_user(kname->name, name, sizeof(kname->name)); if (error == 0 || error == sizeof(kname->name)) return -ERANGE; if (error < 0) return error; return 0; } /* * Extended attribute SET operations */ int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx) { int error; if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE)) return -EINVAL; error = import_xattr_name(ctx->kname, name); if (error) return error; if (ctx->size) { if (ctx->size > XATTR_SIZE_MAX) return -E2BIG; ctx->kvalue = vmemdup_user(ctx->cvalue, ctx->size); if (IS_ERR(ctx->kvalue)) { error = PTR_ERR(ctx->kvalue); ctx->kvalue = NULL; } } return error; } static int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry, struct kernel_xattr_ctx *ctx) { if (is_posix_acl_xattr(ctx->kname->name)) return do_set_acl(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size); return vfs_setxattr(idmap, dentry, ctx->kname->name, ctx->kvalue, ctx->size, ctx->flags); } int file_setxattr(struct file *f, struct kernel_xattr_ctx *ctx) { int error = mnt_want_write_file(f); if (!error) { audit_file(f); error = do_setxattr(file_mnt_idmap(f), f->f_path.dentry, ctx); mnt_drop_write_file(f); } return error; } /* unconditionally consumes filename */ int filename_setxattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct kernel_xattr_ctx *ctx) { struct path path; int error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; error = mnt_want_write(path.mnt); if (!error) { error = do_setxattr(mnt_idmap(path.mnt), path.dentry, ctx); mnt_drop_write(path.mnt); } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: putname(filename); return error; } static int path_setxattrat(int dfd, const char __user *pathname, unsigned int at_flags, const char __user *name, const void __user *value, size_t size, int flags) { struct xattr_name kname; struct kernel_xattr_ctx ctx = { .cvalue = value, .kvalue = NULL, .size = size, .kname = &kname, .flags = flags, }; struct filename *filename; unsigned int lookup_flags = 0; int error; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; if (!(at_flags & AT_SYMLINK_NOFOLLOW)) lookup_flags = LOOKUP_FOLLOW; error = setxattr_copy(name, &ctx); if (error) return error; filename = getname_maybe_null(pathname, at_flags); if (!filename) { CLASS(fd, f)(dfd); if (fd_empty(f)) error = -EBADF; else error = file_setxattr(fd_file(f), &ctx); } else { error = filename_setxattr(dfd, filename, lookup_flags, &ctx); } kvfree(ctx.kvalue); return error; } SYSCALL_DEFINE6(setxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, const char __user *, name, const struct xattr_args __user *, uargs, size_t, usize) { struct xattr_args args = {}; int error; BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0); BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST); if (unlikely(usize < XATTR_ARGS_SIZE_VER0)) return -EINVAL; if (usize > PAGE_SIZE) return -E2BIG; error = copy_struct_from_user(&args, sizeof(args), uargs, usize); if (error) return error; return path_setxattrat(dfd, pathname, at_flags, name, u64_to_user_ptr(args.value), args.size, args.flags); } SYSCALL_DEFINE5(setxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { return path_setxattrat(AT_FDCWD, pathname, 0, name, value, size, flags); } SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { return path_setxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name, value, size, flags); } SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, const void __user *,value, size_t, size, int, flags) { return path_setxattrat(fd, NULL, AT_EMPTY_PATH, name, value, size, flags); } /* * Extended attribute GET operations */ static ssize_t do_getxattr(struct mnt_idmap *idmap, struct dentry *d, struct kernel_xattr_ctx *ctx) { ssize_t error; char *kname = ctx->kname->name; void *kvalue = NULL; if (ctx->size) { if (ctx->size > XATTR_SIZE_MAX) ctx->size = XATTR_SIZE_MAX; kvalue = kvzalloc(ctx->size, GFP_KERNEL); if (!kvalue) return -ENOMEM; } if (is_posix_acl_xattr(kname)) error = do_get_acl(idmap, d, kname, kvalue, ctx->size); else error = vfs_getxattr(idmap, d, kname, kvalue, ctx->size); if (error > 0) { if (ctx->size && copy_to_user(ctx->value, kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) { /* The file system tried to returned a value bigger than XATTR_SIZE_MAX bytes. Not possible. */ error = -E2BIG; } kvfree(kvalue); return error; } ssize_t file_getxattr(struct file *f, struct kernel_xattr_ctx *ctx) { audit_file(f); return do_getxattr(file_mnt_idmap(f), f->f_path.dentry, ctx); } /* unconditionally consumes filename */ ssize_t filename_getxattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct kernel_xattr_ctx *ctx) { struct path path; ssize_t error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; error = do_getxattr(mnt_idmap(path.mnt), path.dentry, ctx); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: putname(filename); return error; } static ssize_t path_getxattrat(int dfd, const char __user *pathname, unsigned int at_flags, const char __user *name, void __user *value, size_t size) { struct xattr_name kname; struct kernel_xattr_ctx ctx = { .value = value, .size = size, .kname = &kname, .flags = 0, }; struct filename *filename; ssize_t error; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; error = import_xattr_name(&kname, name); if (error) return error; filename = getname_maybe_null(pathname, at_flags); if (!filename) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; return file_getxattr(fd_file(f), &ctx); } else { int lookup_flags = 0; if (!(at_flags & AT_SYMLINK_NOFOLLOW)) lookup_flags = LOOKUP_FOLLOW; return filename_getxattr(dfd, filename, lookup_flags, &ctx); } } SYSCALL_DEFINE6(getxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, const char __user *, name, struct xattr_args __user *, uargs, size_t, usize) { struct xattr_args args = {}; int error; BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0); BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST); if (unlikely(usize < XATTR_ARGS_SIZE_VER0)) return -EINVAL; if (usize > PAGE_SIZE) return -E2BIG; error = copy_struct_from_user(&args, sizeof(args), uargs, usize); if (error) return error; if (args.flags != 0) return -EINVAL; return path_getxattrat(dfd, pathname, at_flags, name, u64_to_user_ptr(args.value), args.size); } SYSCALL_DEFINE4(getxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { return path_getxattrat(AT_FDCWD, pathname, 0, name, value, size); } SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { return path_getxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name, value, size); } SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, void __user *, value, size_t, size) { return path_getxattrat(fd, NULL, AT_EMPTY_PATH, name, value, size); } /* * Extended attribute LIST operations */ static ssize_t listxattr(struct dentry *d, char __user *list, size_t size) { ssize_t error; char *klist = NULL; if (size) { if (size > XATTR_LIST_MAX) size = XATTR_LIST_MAX; klist = kvmalloc(size, GFP_KERNEL); if (!klist) return -ENOMEM; } error = vfs_listxattr(d, klist, size); if (error > 0) { if (size && copy_to_user(list, klist, error)) error = -EFAULT; } else if (error == -ERANGE && size >= XATTR_LIST_MAX) { /* The file system tried to returned a list bigger than XATTR_LIST_MAX bytes. Not possible. */ error = -E2BIG; } kvfree(klist); return error; } static ssize_t file_listxattr(struct file *f, char __user *list, size_t size) { audit_file(f); return listxattr(f->f_path.dentry, list, size); } /* unconditionally consumes filename */ static ssize_t filename_listxattr(int dfd, struct filename *filename, unsigned int lookup_flags, char __user *list, size_t size) { struct path path; ssize_t error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; error = listxattr(path.dentry, list, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: putname(filename); return error; } static ssize_t path_listxattrat(int dfd, const char __user *pathname, unsigned int at_flags, char __user *list, size_t size) { struct filename *filename; int lookup_flags; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; filename = getname_maybe_null(pathname, at_flags); if (!filename) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; return file_listxattr(fd_file(f), list, size); } lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; return filename_listxattr(dfd, filename, lookup_flags, list, size); } SYSCALL_DEFINE5(listxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, char __user *, list, size_t, size) { return path_listxattrat(dfd, pathname, at_flags, list, size); } SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list, size_t, size) { return path_listxattrat(AT_FDCWD, pathname, 0, list, size); } SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, size_t, size) { return path_listxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, list, size); } SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) { return path_listxattrat(fd, NULL, AT_EMPTY_PATH, list, size); } /* * Extended attribute REMOVE operations */ static long removexattr(struct mnt_idmap *idmap, struct dentry *d, const char *name) { if (is_posix_acl_xattr(name)) return vfs_remove_acl(idmap, d, name); return vfs_removexattr(idmap, d, name); } static int file_removexattr(struct file *f, struct xattr_name *kname) { int error = mnt_want_write_file(f); if (!error) { audit_file(f); error = removexattr(file_mnt_idmap(f), f->f_path.dentry, kname->name); mnt_drop_write_file(f); } return error; } /* unconditionally consumes filename */ static int filename_removexattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct xattr_name *kname) { struct path path; int error; retry: error = filename_lookup(dfd, filename, lookup_flags, &path, NULL); if (error) goto out; error = mnt_want_write(path.mnt); if (!error) { error = removexattr(mnt_idmap(path.mnt), path.dentry, kname->name); mnt_drop_write(path.mnt); } path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out: putname(filename); return error; } static int path_removexattrat(int dfd, const char __user *pathname, unsigned int at_flags, const char __user *name) { struct xattr_name kname; struct filename *filename; unsigned int lookup_flags; int error; if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) return -EINVAL; error = import_xattr_name(&kname, name); if (error) return error; filename = getname_maybe_null(pathname, at_flags); if (!filename) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; return file_removexattr(fd_file(f), &kname); } lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; return filename_removexattr(dfd, filename, lookup_flags, &kname); } SYSCALL_DEFINE4(removexattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags, const char __user *, name) { return path_removexattrat(dfd, pathname, at_flags, name); } SYSCALL_DEFINE2(removexattr, const char __user *, pathname, const char __user *, name) { return path_removexattrat(AT_FDCWD, pathname, 0, name); } SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, const char __user *, name) { return path_removexattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name); } SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) { return path_removexattrat(fd, NULL, AT_EMPTY_PATH, name); } int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name) { size_t len; len = strlen(name) + 1; if (*buffer) { if (*remaining_size < len) return -ERANGE; memcpy(*buffer, name, len); *buffer += len; } *remaining_size -= len; return 0; } /** * generic_listxattr - run through a dentry's xattr list() operations * @dentry: dentry to list the xattrs * @buffer: result buffer * @buffer_size: size of @buffer * * Combine the results of the list() operation from every xattr_handler in the * xattr_handler stack. * * Note that this will not include the entries for POSIX ACLs. */ ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { const struct xattr_handler *handler, * const *handlers = dentry->d_sb->s_xattr; ssize_t remaining_size = buffer_size; for_each_xattr_handler(handlers, handler) { int err; if (!handler->name || (handler->list && !handler->list(dentry))) continue; err = xattr_list_one(&buffer, &remaining_size, handler->name); if (err) return err; } return buffer_size - remaining_size; } EXPORT_SYMBOL(generic_listxattr); /** * xattr_full_name - Compute full attribute name from suffix * * @handler: handler of the xattr_handler operation * @name: name passed to the xattr_handler operation * * The get and set xattr handler operations are called with the remainder of * the attribute name after skipping the handler's prefix: for example, "foo" * is passed to the get operation of a handler with prefix "user." to get * attribute "user.foo". The full name is still "there" in the name though. * * Note: the list xattr handler operation when called from the vfs is passed a * NULL name; some file systems use this operation internally, with varying * semantics. */ const char *xattr_full_name(const struct xattr_handler *handler, const char *name) { size_t prefix_len = strlen(xattr_prefix(handler)); return name - prefix_len; } EXPORT_SYMBOL(xattr_full_name); /** * simple_xattr_space - estimate the memory used by a simple xattr * @name: the full name of the xattr * @size: the size of its value * * This takes no account of how much larger the two slab objects actually are: * that would depend on the slab implementation, when what is required is a * deterministic number, which grows with name length and size and quantity. * * Return: The approximate number of bytes of memory used by such an xattr. */ size_t simple_xattr_space(const char *name, size_t size) { /* * Use "40" instead of sizeof(struct simple_xattr), to return the * same result on 32-bit and 64-bit, and even if simple_xattr grows. */ return 40 + size + strlen(name); } /** * simple_xattr_free - free an xattr object * @xattr: the xattr object * * Free the xattr object. Can handle @xattr being NULL. */ void simple_xattr_free(struct simple_xattr *xattr) { if (xattr) kfree(xattr->name); kvfree(xattr); } /** * simple_xattr_alloc - allocate new xattr object * @value: value of the xattr object * @size: size of @value * * Allocate a new xattr object and initialize respective members. The caller is * responsible for handling the name of the xattr. * * Return: On success a new xattr object is returned. On failure NULL is * returned. */ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) { struct simple_xattr *new_xattr; size_t len; /* wrap around? */ len = sizeof(*new_xattr) + size; if (len < sizeof(*new_xattr)) return NULL; new_xattr = kvmalloc(len, GFP_KERNEL_ACCOUNT); if (!new_xattr) return NULL; new_xattr->size = size; memcpy(new_xattr->value, value, size); return new_xattr; } /** * rbtree_simple_xattr_cmp - compare xattr name with current rbtree xattr entry * @key: xattr name * @node: current node * * Compare the xattr name with the xattr name attached to @node in the rbtree. * * Return: Negative value if continuing left, positive if continuing right, 0 * if the xattr attached to @node matches @key. */ static int rbtree_simple_xattr_cmp(const void *key, const struct rb_node *node) { const char *xattr_name = key; const struct simple_xattr *xattr; xattr = rb_entry(node, struct simple_xattr, rb_node); return strcmp(xattr->name, xattr_name); } /** * rbtree_simple_xattr_node_cmp - compare two xattr rbtree nodes * @new_node: new node * @node: current node * * Compare the xattr attached to @new_node with the xattr attached to @node. * * Return: Negative value if continuing left, positive if continuing right, 0 * if the xattr attached to @new_node matches the xattr attached to @node. */ static int rbtree_simple_xattr_node_cmp(struct rb_node *new_node, const struct rb_node *node) { struct simple_xattr *xattr; xattr = rb_entry(new_node, struct simple_xattr, rb_node); return rbtree_simple_xattr_cmp(xattr->name, node); } /** * simple_xattr_get - get an xattr object * @xattrs: the header of the xattr object * @name: the name of the xattr to retrieve * @buffer: the buffer to store the value into * @size: the size of @buffer * * Try to find and retrieve the xattr object associated with @name. * If @buffer is provided store the value of @xattr in @buffer * otherwise just return the length. The size of @buffer is limited * to XATTR_SIZE_MAX which currently is 65536. * * Return: On success the length of the xattr value is returned. On error a * negative error code is returned. */ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, void *buffer, size_t size) { struct simple_xattr *xattr = NULL; struct rb_node *rbp; int ret = -ENODATA; read_lock(&xattrs->lock); rbp = rb_find(name, &xattrs->rb_root, rbtree_simple_xattr_cmp); if (rbp) { xattr = rb_entry(rbp, struct simple_xattr, rb_node); ret = xattr->size; if (buffer) { if (size < xattr->size) ret = -ERANGE; else memcpy(buffer, xattr->value, xattr->size); } } read_unlock(&xattrs->lock); return ret; } /** * simple_xattr_set - set an xattr object * @xattrs: the header of the xattr object * @name: the name of the xattr to retrieve * @value: the value to store along the xattr * @size: the size of @value * @flags: the flags determining how to set the xattr * * Set a new xattr object. * If @value is passed a new xattr object will be allocated. If XATTR_REPLACE * is specified in @flags a matching xattr object for @name must already exist. * If it does it will be replaced with the new xattr object. If it doesn't we * fail. If XATTR_CREATE is specified and a matching xattr does already exist * we fail. If it doesn't we create a new xattr. If @flags is zero we simply * insert the new xattr replacing any existing one. * * If @value is empty and a matching xattr object is found we delete it if * XATTR_REPLACE is specified in @flags or @flags is zero. * * If @value is empty and no matching xattr object for @name is found we do * nothing if XATTR_CREATE is specified in @flags or @flags is zero. For * XATTR_REPLACE we fail as mentioned above. * * Return: On success, the removed or replaced xattr is returned, to be freed * by the caller; or NULL if none. On failure a negative error code is returned. */ struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, const char *name, const void *value, size_t size, int flags) { struct simple_xattr *old_xattr = NULL, *new_xattr = NULL; struct rb_node *parent = NULL, **rbp; int err = 0, ret; /* value == NULL means remove */ if (value) { new_xattr = simple_xattr_alloc(value, size); if (!new_xattr) return ERR_PTR(-ENOMEM); new_xattr->name = kstrdup(name, GFP_KERNEL_ACCOUNT); if (!new_xattr->name) { simple_xattr_free(new_xattr); return ERR_PTR(-ENOMEM); } } write_lock(&xattrs->lock); rbp = &xattrs->rb_root.rb_node; while (*rbp) { parent = *rbp; ret = rbtree_simple_xattr_cmp(name, *rbp); if (ret < 0) rbp = &(*rbp)->rb_left; else if (ret > 0) rbp = &(*rbp)->rb_right; else old_xattr = rb_entry(*rbp, struct simple_xattr, rb_node); if (old_xattr) break; } if (old_xattr) { /* Fail if XATTR_CREATE is requested and the xattr exists. */ if (flags & XATTR_CREATE) { err = -EEXIST; goto out_unlock; } if (new_xattr) rb_replace_node(&old_xattr->rb_node, &new_xattr->rb_node, &xattrs->rb_root); else rb_erase(&old_xattr->rb_node, &xattrs->rb_root); } else { /* Fail if XATTR_REPLACE is requested but no xattr is found. */ if (flags & XATTR_REPLACE) { err = -ENODATA; goto out_unlock; } /* * If XATTR_CREATE or no flags are specified together with a * new value simply insert it. */ if (new_xattr) { rb_link_node(&new_xattr->rb_node, parent, rbp); rb_insert_color(&new_xattr->rb_node, &xattrs->rb_root); } /* * If XATTR_CREATE or no flags are specified and neither an * old or new xattr exist then we don't need to do anything. */ } out_unlock: write_unlock(&xattrs->lock); if (!err) return old_xattr; simple_xattr_free(new_xattr); return ERR_PTR(err); } static bool xattr_is_trusted(const char *name) { return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } /** * simple_xattr_list - list all xattr objects * @inode: inode from which to get the xattrs * @xattrs: the header of the xattr object * @buffer: the buffer to store all xattrs into * @size: the size of @buffer * * List all xattrs associated with @inode. If @buffer is NULL we returned * the required size of the buffer. If @buffer is provided we store the * xattrs value into it provided it is big enough. * * Note, the number of xattr names that can be listed with listxattr(2) is * limited to XATTR_LIST_MAX aka 65536 bytes. If a larger buffer is passed * then vfs_listxattr() caps it to XATTR_LIST_MAX and if more xattr names * are found it will return -E2BIG. * * Return: On success the required size or the size of the copied xattrs is * returned. On error a negative error code is returned. */ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer, size_t size) { bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); struct simple_xattr *xattr; struct rb_node *rbp; ssize_t remaining_size = size; int err = 0; err = posix_acl_listxattr(inode, &buffer, &remaining_size); if (err) return err; read_lock(&xattrs->lock); for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) { xattr = rb_entry(rbp, struct simple_xattr, rb_node); /* skip "trusted." attributes for unprivileged callers */ if (!trusted && xattr_is_trusted(xattr->name)) continue; err = xattr_list_one(&buffer, &remaining_size, xattr->name); if (err) break; } read_unlock(&xattrs->lock); return err ? err : size - remaining_size; } /** * rbtree_simple_xattr_less - compare two xattr rbtree nodes * @new_node: new node * @node: current node * * Compare the xattr attached to @new_node with the xattr attached to @node. * Note that this function technically tolerates duplicate entries. * * Return: True if insertion point in the rbtree is found. */ static bool rbtree_simple_xattr_less(struct rb_node *new_node, const struct rb_node *node) { return rbtree_simple_xattr_node_cmp(new_node, node) < 0; } /** * simple_xattr_add - add xattr objects * @xattrs: the header of the xattr object * @new_xattr: the xattr object to add * * Add an xattr object to @xattrs. This assumes no replacement or removal * of matching xattrs is wanted. Should only be called during inode * initialization when a few distinct initial xattrs are supposed to be set. */ void simple_xattr_add(struct simple_xattrs *xattrs, struct simple_xattr *new_xattr) { write_lock(&xattrs->lock); rb_add(&new_xattr->rb_node, &xattrs->rb_root, rbtree_simple_xattr_less); write_unlock(&xattrs->lock); } /** * simple_xattrs_init - initialize new xattr header * @xattrs: header to initialize * * Initialize relevant fields of a an xattr header. */ void simple_xattrs_init(struct simple_xattrs *xattrs) { xattrs->rb_root = RB_ROOT; rwlock_init(&xattrs->lock); } /** * simple_xattrs_free - free xattrs * @xattrs: xattr header whose xattrs to destroy * @freed_space: approximate number of bytes of memory freed from @xattrs * * Destroy all xattrs in @xattr. When this is called no one can hold a * reference to any of the xattrs anymore. */ void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space) { struct rb_node *rbp; if (freed_space) *freed_space = 0; rbp = rb_first(&xattrs->rb_root); while (rbp) { struct simple_xattr *xattr; struct rb_node *rbp_next; rbp_next = rb_next(rbp); xattr = rb_entry(rbp, struct simple_xattr, rb_node); rb_erase(&xattr->rb_node, &xattrs->rb_root); if (freed_space) *freed_space += simple_xattr_space(xattr->name, xattr->size); simple_xattr_free(xattr); rbp = rbp_next; } }
6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 // SPDX-License-Identifier: GPL-2.0 #include <linux/highmem.h> #include <linux/module.h> #include <linux/security.h> #include <linux/slab.h> #include <linux/types.h> #include "sysfs.h" /* * sysfs support for firmware loader */ void __fw_load_abort(struct fw_priv *fw_priv) { /* * There is a small window in which user can write to 'loading' * between loading done/aborted and disappearance of 'loading' */ if (fw_state_is_aborted(fw_priv) || fw_state_is_done(fw_priv)) return; fw_state_aborted(fw_priv); } #ifdef CONFIG_FW_LOADER_USER_HELPER static ssize_t timeout_show(const struct class *class, const struct class_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", __firmware_loading_timeout()); } /** * timeout_store() - set number of seconds to wait for firmware * @class: device class pointer * @attr: device attribute pointer * @buf: buffer to scan for timeout value * @count: number of bytes in @buf * * Sets the number of seconds to wait for the firmware. Once * this expires an error will be returned to the driver and no * firmware will be provided. * * Note: zero means 'wait forever'. **/ static ssize_t timeout_store(const struct class *class, const struct class_attribute *attr, const char *buf, size_t count) { int tmp_loading_timeout = simple_strtol(buf, NULL, 10); if (tmp_loading_timeout < 0) tmp_loading_timeout = 0; __fw_fallback_set_timeout(tmp_loading_timeout); return count; } static CLASS_ATTR_RW(timeout); static struct attribute *firmware_class_attrs[] = { &class_attr_timeout.attr, NULL, }; ATTRIBUTE_GROUPS(firmware_class); static int do_firmware_uevent(const struct fw_sysfs *fw_sysfs, struct kobj_uevent_env *env) { if (add_uevent_var(env, "FIRMWARE=%s", fw_sysfs->fw_priv->fw_name)) return -ENOMEM; if (add_uevent_var(env, "TIMEOUT=%i", __firmware_loading_timeout())) return -ENOMEM; if (add_uevent_var(env, "ASYNC=%d", fw_sysfs->nowait)) return -ENOMEM; return 0; } static int firmware_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct fw_sysfs *fw_sysfs = to_fw_sysfs(dev); int err = 0; mutex_lock(&fw_lock); if (fw_sysfs->fw_priv) err = do_firmware_uevent(fw_sysfs, env); mutex_unlock(&fw_lock); return err; } #endif /* CONFIG_FW_LOADER_USER_HELPER */ static void fw_dev_release(struct device *dev) { struct fw_sysfs *fw_sysfs = to_fw_sysfs(dev); if (fw_sysfs->fw_upload_priv) fw_upload_free(fw_sysfs); kfree(fw_sysfs); } static struct class firmware_class = { .name = "firmware", #ifdef CONFIG_FW_LOADER_USER_HELPER .class_groups = firmware_class_groups, .dev_uevent = firmware_uevent, #endif .dev_release = fw_dev_release, }; int register_sysfs_loader(void) { int ret = class_register(&firmware_class); if (ret != 0) return ret; return register_firmware_config_sysctl(); } void unregister_sysfs_loader(void) { unregister_firmware_config_sysctl(); class_unregister(&firmware_class); } static ssize_t firmware_loading_show(struct device *dev, struct device_attribute *attr, char *buf) { struct fw_sysfs *fw_sysfs = to_fw_sysfs(dev); int loading = 0; mutex_lock(&fw_lock); if (fw_sysfs->fw_priv) loading = fw_state_is_loading(fw_sysfs->fw_priv); mutex_unlock(&fw_lock); return sysfs_emit(buf, "%d\n", loading); } /** * firmware_loading_store() - set value in the 'loading' control file * @dev: device pointer * @attr: device attribute pointer * @buf: buffer to scan for loading control value * @count: number of bytes in @buf * * The relevant values are: * * 1: Start a load, discarding any previous partial load. * 0: Conclude the load and hand the data to the driver code. * -1: Conclude the load with an error and discard any written data. **/ static ssize_t firmware_loading_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct fw_sysfs *fw_sysfs = to_fw_sysfs(dev); struct fw_priv *fw_priv; ssize_t written = count; int loading = simple_strtol(buf, NULL, 10); mutex_lock(&fw_lock); fw_priv = fw_sysfs->fw_priv; if (fw_state_is_aborted(fw_priv) || fw_state_is_done(fw_priv)) goto out; switch (loading) { case 1: /* discarding any previous partial load */ fw_free_paged_buf(fw_priv); fw_state_start(fw_priv); break; case 0: if (fw_state_is_loading(fw_priv)) { int rc; /* * Several loading requests may be pending on * one same firmware buf, so let all requests * see the mapped 'buf->data' once the loading * is completed. */ rc = fw_map_paged_buf(fw_priv); if (rc) dev_err(dev, "%s: map pages failed\n", __func__); else rc = security_kernel_post_load_data(fw_priv->data, fw_priv->size, LOADING_FIRMWARE, "blob"); /* * Same logic as fw_load_abort, only the DONE bit * is ignored and we set ABORT only on failure. */ if (rc) { fw_state_aborted(fw_priv); written = rc; } else { fw_state_done(fw_priv); /* * If this is a user-initiated firmware upload * then start the upload in a worker thread now. */ rc = fw_upload_start(fw_sysfs); if (rc) written = rc; } break; } fallthrough; default: dev_err(dev, "%s: unexpected value (%d)\n", __func__, loading); fallthrough; case -1: fw_load_abort(fw_sysfs); if (fw_sysfs->fw_upload_priv) fw_state_init(fw_sysfs->fw_priv); break; } out: mutex_unlock(&fw_lock); return written; } DEVICE_ATTR(loading, 0644, firmware_loading_show, firmware_loading_store); static void firmware_rw_data(struct fw_priv *fw_priv, char *buffer, loff_t offset, size_t count, bool read) { if (read) memcpy(buffer, fw_priv->data + offset, count); else memcpy(fw_priv->data + offset, buffer, count); } static void firmware_rw(struct fw_priv *fw_priv, char *buffer, loff_t offset, size_t count, bool read) { while (count) { int page_nr = offset >> PAGE_SHIFT; int page_ofs = offset & (PAGE_SIZE - 1); int page_cnt = min_t(size_t, PAGE_SIZE - page_ofs, count); if (read) memcpy_from_page(buffer, fw_priv->pages[page_nr], page_ofs, page_cnt); else memcpy_to_page(fw_priv->pages[page_nr], page_ofs, buffer, page_cnt); buffer += page_cnt; offset += page_cnt; count -= page_cnt; } } static ssize_t firmware_data_read(struct file *filp, struct kobject *kobj, const struct bin_attribute *bin_attr, char *buffer, loff_t offset, size_t count) { struct device *dev = kobj_to_dev(kobj); struct fw_sysfs *fw_sysfs = to_fw_sysfs(dev); struct fw_priv *fw_priv; ssize_t ret_count; mutex_lock(&fw_lock); fw_priv = fw_sysfs->fw_priv; if (!fw_priv || fw_state_is_done(fw_priv)) { ret_count = -ENODEV; goto out; } if (offset > fw_priv->size) { ret_count = 0; goto out; } if (count > fw_priv->size - offset) count = fw_priv->size - offset; ret_count = count; if (fw_priv->data) firmware_rw_data(fw_priv, buffer, offset, count, true); else firmware_rw(fw_priv, buffer, offset, count, true); out: mutex_unlock(&fw_lock); return ret_count; } static int fw_realloc_pages(struct fw_sysfs *fw_sysfs, int min_size) { int err; err = fw_grow_paged_buf(fw_sysfs->fw_priv, PAGE_ALIGN(min_size) >> PAGE_SHIFT); if (err) fw_load_abort(fw_sysfs); return err; } /** * firmware_data_write() - write method for firmware * @filp: open sysfs file * @kobj: kobject for the device * @bin_attr: bin_attr structure * @buffer: buffer being written * @offset: buffer offset for write in total data store area * @count: buffer size * * Data written to the 'data' attribute will be later handed to * the driver as a firmware image. **/ static ssize_t firmware_data_write(struct file *filp, struct kobject *kobj, const struct bin_attribute *bin_attr, char *buffer, loff_t offset, size_t count) { struct device *dev = kobj_to_dev(kobj); struct fw_sysfs *fw_sysfs = to_fw_sysfs(dev); struct fw_priv *fw_priv; ssize_t retval; if (!capable(CAP_SYS_RAWIO)) return -EPERM; mutex_lock(&fw_lock); fw_priv = fw_sysfs->fw_priv; if (!fw_priv || fw_state_is_done(fw_priv)) { retval = -ENODEV; goto out; } if (fw_priv->data) { if (offset + count > fw_priv->allocated_size) { retval = -ENOMEM; goto out; } firmware_rw_data(fw_priv, buffer, offset, count, false); retval = count; } else { retval = fw_realloc_pages(fw_sysfs, offset + count); if (retval) goto out; retval = count; firmware_rw(fw_priv, buffer, offset, count, false); } fw_priv->size = max_t(size_t, offset + count, fw_priv->size); out: mutex_unlock(&fw_lock); return retval; } static const struct bin_attribute firmware_attr_data = { .attr = { .name = "data", .mode = 0644 }, .size = 0, .read_new = firmware_data_read, .write_new = firmware_data_write, }; static struct attribute *fw_dev_attrs[] = { &dev_attr_loading.attr, #ifdef CONFIG_FW_UPLOAD &dev_attr_cancel.attr, &dev_attr_status.attr, &dev_attr_error.attr, &dev_attr_remaining_size.attr, #endif NULL }; static const struct bin_attribute *const fw_dev_bin_attrs[] = { &firmware_attr_data, NULL }; static const struct attribute_group fw_dev_attr_group = { .attrs = fw_dev_attrs, .bin_attrs_new = fw_dev_bin_attrs, #ifdef CONFIG_FW_UPLOAD .is_visible = fw_upload_is_visible, #endif }; static const struct attribute_group *fw_dev_attr_groups[] = { &fw_dev_attr_group, NULL }; struct fw_sysfs * fw_create_instance(struct firmware *firmware, const char *fw_name, struct device *device, u32 opt_flags) { struct fw_sysfs *fw_sysfs; struct device *f_dev; fw_sysfs = kzalloc(sizeof(*fw_sysfs), GFP_KERNEL); if (!fw_sysfs) { fw_sysfs = ERR_PTR(-ENOMEM); goto exit; } fw_sysfs->nowait = !!(opt_flags & FW_OPT_NOWAIT); fw_sysfs->fw = firmware; f_dev = &fw_sysfs->dev; device_initialize(f_dev); dev_set_name(f_dev, "%s", fw_name); f_dev->parent = device; f_dev->class = &firmware_class; f_dev->groups = fw_dev_attr_groups; exit: return fw_sysfs; }
2 1 1 1 1 1 1 1 1 1 1 7 4 5 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/module.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/ip6_fib.h> #include <net/ip6_checksum.h> #include <net/netfilter/ipv6/nf_reject.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> static bool nf_reject_v6_csum_ok(struct sk_buff *skb, int hook) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); int thoff; __be16 fo; u8 proto = ip6h->nexthdr; if (skb_csum_unnecessary(skb)) return true; if (ip6h->payload_len && pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h))) return false; ip6h = ipv6_hdr(skb); thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo); if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0) return false; if (!nf_reject_verify_csum(skb, thoff, proto)) return true; return nf_ip6_checksum(skb, hook, thoff, proto) == 0; } static int nf_reject_ip6hdr_validate(struct sk_buff *skb) { struct ipv6hdr *hdr; u32 pkt_len; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) return 0; hdr = ipv6_hdr(skb); if (hdr->version != 6) return 0; pkt_len = ntohs(hdr->payload_len); if (pkt_len + sizeof(struct ipv6hdr) > skb->len) return 0; return 1; } struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, int hook) { struct sk_buff *nskb; const struct tcphdr *oth; struct tcphdr _oth; unsigned int otcplen; struct ipv6hdr *nip6h; if (!nf_reject_ip6hdr_validate(oldskb)) return NULL; oth = nf_reject_ip6_tcphdr_get(oldskb, &_oth, &otcplen, hook); if (!oth) return NULL; nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct tcphdr) + LL_MAX_HEADER, GFP_ATOMIC); if (!nskb) return NULL; nskb->dev = (struct net_device *)dev; skb_reserve(nskb, LL_MAX_HEADER); nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, READ_ONCE(net->ipv6.devconf_all->hop_limit)); nf_reject_ip6_tcphdr_put(nskb, oldskb, oth, otcplen); nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr)); return nskb; } EXPORT_SYMBOL_GPL(nf_reject_skb_v6_tcp_reset); struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, int hook, u8 code) { struct sk_buff *nskb; struct ipv6hdr *nip6h; struct icmp6hdr *icmp6h; unsigned int len; if (!nf_reject_ip6hdr_validate(oldskb)) return NULL; /* Include "As much of invoking packet as possible without the ICMPv6 * packet exceeding the minimum IPv6 MTU" in the ICMP payload. */ len = min_t(unsigned int, 1220, oldskb->len); if (!pskb_may_pull(oldskb, len)) return NULL; if (!nf_reject_v6_csum_ok(oldskb, hook)) return NULL; nskb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(struct icmp6hdr) + LL_MAX_HEADER + len, GFP_ATOMIC); if (!nskb) return NULL; nskb->dev = (struct net_device *)dev; skb_reserve(nskb, LL_MAX_HEADER); nip6h = nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_ICMPV6, READ_ONCE(net->ipv6.devconf_all->hop_limit)); skb_reset_transport_header(nskb); icmp6h = skb_put_zero(nskb, sizeof(struct icmp6hdr)); icmp6h->icmp6_type = ICMPV6_DEST_UNREACH; icmp6h->icmp6_code = code; skb_put_data(nskb, skb_network_header(oldskb), len); nip6h->payload_len = htons(nskb->len - sizeof(struct ipv6hdr)); icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr, &nip6h->daddr, nskb->len - sizeof(struct ipv6hdr), IPPROTO_ICMPV6, csum_partial(icmp6h, nskb->len - sizeof(struct ipv6hdr), 0)); return nskb; } EXPORT_SYMBOL_GPL(nf_reject_skb_v6_unreach); const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, struct tcphdr *otcph, unsigned int *otcplen, int hook) { const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); u8 proto; __be16 frag_off; int tcphoff; proto = oip6h->nexthdr; tcphoff = ipv6_skip_exthdr(oldskb, ((u8 *)(oip6h + 1) - oldskb->data), &proto, &frag_off); if ((tcphoff < 0) || (tcphoff > oldskb->len)) { pr_debug("Cannot get TCP header.\n"); return NULL; } *otcplen = oldskb->len - tcphoff; /* IP header checks: fragment, too short. */ if (proto != IPPROTO_TCP || *otcplen < sizeof(struct tcphdr)) { pr_debug("proto(%d) != IPPROTO_TCP or too short (len = %d)\n", proto, *otcplen); return NULL; } otcph = skb_header_pointer(oldskb, tcphoff, sizeof(struct tcphdr), otcph); if (otcph == NULL) return NULL; /* No RST for RST. */ if (otcph->rst) { pr_debug("RST is set\n"); return NULL; } /* Check checksum. */ if (nf_ip6_checksum(oldskb, hook, tcphoff, IPPROTO_TCP)) { pr_debug("TCP checksum is invalid\n"); return NULL; } return otcph; } EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_get); struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, __u8 protocol, int hoplimit) { struct ipv6hdr *ip6h; const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); #define DEFAULT_TOS_VALUE 0x0U const __u8 tclass = DEFAULT_TOS_VALUE; skb_put(nskb, sizeof(struct ipv6hdr)); skb_reset_network_header(nskb); ip6h = ipv6_hdr(nskb); ip6_flow_hdr(ip6h, tclass, 0); ip6h->hop_limit = hoplimit; ip6h->nexthdr = protocol; ip6h->saddr = oip6h->daddr; ip6h->daddr = oip6h->saddr; nskb->protocol = htons(ETH_P_IPV6); return ip6h; } EXPORT_SYMBOL_GPL(nf_reject_ip6hdr_put); void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, const struct tcphdr *oth, unsigned int otcplen) { struct tcphdr *tcph; skb_reset_transport_header(nskb); tcph = skb_put_zero(nskb, sizeof(struct tcphdr)); /* Truncate to length (no data) */ tcph->doff = sizeof(struct tcphdr)/4; tcph->source = oth->dest; tcph->dest = oth->source; if (oth->ack) { tcph->seq = oth->ack_seq; } else { tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + otcplen - (oth->doff<<2)); tcph->ack = 1; } tcph->rst = 1; /* Adjust TCP checksum */ tcph->check = csum_ipv6_magic(&ipv6_hdr(nskb)->saddr, &ipv6_hdr(nskb)->daddr, sizeof(struct tcphdr), IPPROTO_TCP, csum_partial(tcph, sizeof(struct tcphdr), 0)); } EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put); static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in) { struct dst_entry *dst = NULL; struct flowi fl; memset(&fl, 0, sizeof(struct flowi)); fl.u.ip6.daddr = ipv6_hdr(skb_in)->saddr; nf_ip6_route(dev_net(skb_in->dev), &dst, &fl, false); if (!dst) return -1; skb_dst_set(skb_in, dst); return 0; } void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, int hook) { const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); struct dst_entry *dst = NULL; const struct tcphdr *otcph; struct sk_buff *nskb; struct tcphdr _otcph; unsigned int otcplen; struct flowi6 fl6; if ((!(ipv6_addr_type(&oip6h->saddr) & IPV6_ADDR_UNICAST)) || (!(ipv6_addr_type(&oip6h->daddr) & IPV6_ADDR_UNICAST))) { pr_debug("addr is not unicast.\n"); return; } otcph = nf_reject_ip6_tcphdr_get(oldskb, &_otcph, &otcplen, hook); if (!otcph) return; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_TCP; fl6.saddr = oip6h->daddr; fl6.daddr = oip6h->saddr; fl6.fl6_sport = otcph->dest; fl6.fl6_dport = otcph->source; if (hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) { nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); if (!dst) return; skb_dst_set(oldskb, dst); } fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev); fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark); security_skb_classify_flow(oldskb, flowi6_to_flowi_common(&fl6)); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); return; } dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); if (IS_ERR(dst)) return; nskb = alloc_skb(LL_MAX_HEADER + sizeof(struct ipv6hdr) + sizeof(struct tcphdr) + dst->trailer_len, GFP_ATOMIC); if (!nskb) { net_dbg_ratelimited("cannot alloc skb\n"); dst_release(dst); return; } skb_dst_set(nskb, dst); nskb->mark = fl6.flowi6_mark; skb_reserve(nskb, LL_MAX_HEADER); nf_reject_ip6hdr_put(nskb, oldskb, IPPROTO_TCP, ip6_dst_hoplimit(dst)); nf_reject_ip6_tcphdr_put(nskb, oldskb, otcph, otcplen); nf_ct_attach(nskb, oldskb); nf_ct_set_closing(skb_nfct(oldskb)); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* If we use ip6_local_out for bridged traffic, the MAC source on * the RST will be ours, instead of the destination's. This confuses * some routers/firewalls, and they drop the packet. So we need to * build the eth header using the original destination's MAC as the * source, and send the RST packet directly. */ if (nf_bridge_info_exists(oldskb)) { struct ethhdr *oeth = eth_hdr(oldskb); struct ipv6hdr *ip6h = ipv6_hdr(nskb); struct net_device *br_indev; br_indev = nf_bridge_get_physindev(oldskb, net); if (!br_indev) { kfree_skb(nskb); return; } nskb->dev = br_indev; nskb->protocol = htons(ETH_P_IPV6); ip6h->payload_len = htons(sizeof(struct tcphdr)); if (dev_hard_header(nskb, nskb->dev, ntohs(nskb->protocol), oeth->h_source, oeth->h_dest, nskb->len) < 0) { kfree_skb(nskb); return; } dev_queue_xmit(nskb); } else #endif ip6_local_out(net, sk, nskb); } EXPORT_SYMBOL_GPL(nf_send_reset6); static bool reject6_csum_ok(struct sk_buff *skb, int hook) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); int thoff; __be16 fo; u8 proto; if (skb_csum_unnecessary(skb)) return true; proto = ip6h->nexthdr; thoff = ipv6_skip_exthdr(skb, ((u8 *)(ip6h + 1) - skb->data), &proto, &fo); if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0) return false; if (!nf_reject_verify_csum(skb, thoff, proto)) return true; return nf_ip6_checksum(skb, hook, thoff, proto) == 0; } void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, unsigned char code, unsigned int hooknum) { if (!reject6_csum_ok(skb_in, hooknum)) return; if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL) skb_in->dev = net->loopback_dev; if ((hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_INGRESS) && nf_reject6_fill_skb_dst(skb_in) < 0) return; icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0); } EXPORT_SYMBOL_GPL(nf_send_unreach6); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IPv6 packet rejection core");
496 7667 177 539 52 1256 50 102 1253 315 2 2 8 8 27 2 30 32 2 4 171 178 170 174 143 2 142 144 329 184 308 307 4 3864 12 74 117 33 79 6 17 45 4 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 /* SPDX-License-Identifier: GPL-2.0 */ /* * Linux Socket Filter Data Structures */ #ifndef __LINUX_FILTER_H__ #define __LINUX_FILTER_H__ #include <linux/atomic.h> #include <linux/bpf.h> #include <linux/refcount.h> #include <linux/compat.h> #include <linux/skbuff.h> #include <linux/linkage.h> #include <linux/printk.h> #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/capability.h> #include <linux/set_memory.h> #include <linux/kallsyms.h> #include <linux/if_vlan.h> #include <linux/vmalloc.h> #include <linux/sockptr.h> #include <crypto/sha1.h> #include <linux/u64_stats_sync.h> #include <net/sch_generic.h> #include <asm/byteorder.h> #include <uapi/linux/filter.h> struct sk_buff; struct sock; struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; struct xdp_buff; struct sock_reuseport; struct ctl_table; struct ctl_table_header; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function * calls in BPF_CALL instruction. */ #define BPF_REG_ARG1 BPF_REG_1 #define BPF_REG_ARG2 BPF_REG_2 #define BPF_REG_ARG3 BPF_REG_3 #define BPF_REG_ARG4 BPF_REG_4 #define BPF_REG_ARG5 BPF_REG_5 #define BPF_REG_CTX BPF_REG_6 #define BPF_REG_FP BPF_REG_10 /* Additional register mappings for converted user programs. */ #define BPF_REG_A BPF_REG_0 #define BPF_REG_X BPF_REG_7 #define BPF_REG_TMP BPF_REG_2 /* scratch reg */ #define BPF_REG_D BPF_REG_8 /* data, callee-saved */ #define BPF_REG_H BPF_REG_9 /* hlen, callee-saved */ /* Kernel hidden auxiliary/helper register. */ #define BPF_REG_AX MAX_BPF_REG #define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) #define MAX_BPF_JIT_REG MAX_BPF_EXT_REG /* unused opcode to mark special call to bpf_tail_call() helper */ #define BPF_TAIL_CALL 0xf0 /* unused opcode to mark special load instruction. Same as BPF_ABS */ #define BPF_PROBE_MEM 0x20 /* unused opcode to mark special ldsx instruction. Same as BPF_IND */ #define BPF_PROBE_MEMSX 0x40 /* unused opcode to mark special load instruction. Same as BPF_MSH */ #define BPF_PROBE_MEM32 0xa0 /* unused opcode to mark special atomic instruction */ #define BPF_PROBE_ATOMIC 0xe0 /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 /* unused opcode to mark speculation barrier for mitigating * Speculative Store Bypass */ #define BPF_NOSPEC 0xc0 /* As per nm, we expose JITed images as text (code) section for * kallsyms. That way, tools like perf can find it to match * addresses. */ #define BPF_SYM_ELF_TYPE 't' /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 /* Helper macros for filter block array initializers. */ /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ #define BPF_ALU64_REG_OFF(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) #define BPF_ALU64_REG(OP, DST, SRC) \ BPF_ALU64_REG_OFF(OP, DST, SRC, 0) #define BPF_ALU32_REG_OFF(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) #define BPF_ALU32_REG(OP, DST, SRC) \ BPF_ALU32_REG_OFF(OP, DST, SRC, 0) /* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ #define BPF_ALU64_IMM_OFF(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) #define BPF_ALU64_IMM(OP, DST, IMM) \ BPF_ALU64_IMM_OFF(OP, DST, IMM, 0) #define BPF_ALU32_IMM_OFF(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) #define BPF_ALU32_IMM(OP, DST, IMM) \ BPF_ALU32_IMM_OFF(OP, DST, IMM, 0) /* Endianess conversion, cpu_to_{l,b}e(), {l,b}e_to_cpu() */ #define BPF_ENDIAN(TYPE, DST, LEN) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_END | BPF_SRC(TYPE), \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = LEN }) /* Byte Swap, bswap16/32/64 */ #define BPF_BSWAP(DST, LEN) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_END | BPF_SRC(BPF_TO_LE), \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = LEN }) /* Short form of mov, dst_reg = src_reg */ #define BPF_MOV64_REG(DST, SRC) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = 0 }) #define BPF_MOV32_REG(DST, SRC) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = 0 }) /* Special (internal-only) form of mov, used to resolve per-CPU addrs: * dst_reg = src_reg + <percpu_base_off> * BPF_ADDR_PERCPU is used as a special insn->off value. */ #define BPF_ADDR_PERCPU (-1) #define BPF_MOV64_PERCPU_REG(DST, SRC) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = BPF_ADDR_PERCPU, \ .imm = 0 }) static inline bool insn_is_mov_percpu_addr(const struct bpf_insn *insn) { return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU; } /* Short form of mov, dst_reg = imm32 */ #define BPF_MOV64_IMM(DST, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) #define BPF_MOV32_IMM(DST, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) /* Short form of movsx, dst_reg = (s8,s16,s32)src_reg */ #define BPF_MOVSX64_REG(DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) #define BPF_MOVSX32_REG(DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Special form of mov32, used for doing explicit zero extension on dst. */ #define BPF_ZEXT_REG(DST) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = DST, \ .off = 0, \ .imm = 1 }) static inline bool insn_is_zext(const struct bpf_insn *insn) { return insn->code == (BPF_ALU | BPF_MOV | BPF_X) && insn->imm == 1; } /* addr_space_cast from as(0) to as(1) is for converting bpf arena pointers * to pointers in user vma. */ static inline bool insn_is_cast_user(const struct bpf_insn *insn) { return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_SPACE_CAST && insn->imm == 1U << 16; } /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ #define BPF_LD_IMM64(DST, IMM) \ BPF_LD_IMM64_RAW(DST, 0, IMM) #define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_LD | BPF_DW | BPF_IMM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = (__u32) (IMM) }), \ ((struct bpf_insn) { \ .code = 0, /* zero is reserved opcode */ \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = ((__u64) (IMM)) >> 32 }) /* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ #define BPF_LD_MAP_FD(DST, MAP_FD) \ BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_SRC(TYPE), \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = IMM }) #define BPF_MOV32_RAW(TYPE, DST, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_SRC(TYPE), \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = IMM }) /* Direct packet access, R0 = *(uint *) (skb->data + imm32) */ #define BPF_LD_ABS(SIZE, IMM) \ ((struct bpf_insn) { \ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) /* Indirect packet access, R0 = *(uint *) (skb->data + src_reg + imm32) */ #define BPF_LD_IND(SIZE, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_IND, \ .dst_reg = 0, \ .src_reg = SRC, \ .off = 0, \ .imm = IMM }) /* Memory load, dst_reg = *(uint *) (src_reg + off16) */ #define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Memory load, dst_reg = *(signed size *) (src_reg + off16) */ #define BPF_LDX_MEMSX(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEMSX, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Memory store, *(uint *) (dst_reg + off16) = src_reg */ #define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg * BPF_AND *(uint *) (dst_reg + off16) &= src_reg * BPF_OR *(uint *) (dst_reg + off16) |= src_reg * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg); * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg); * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = OP }) /* Legacy alias */ #define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF) /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ #define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ ((struct bpf_insn) { \ .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) /* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ #define BPF_JMP_REG(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ #define BPF_JMP_IMM(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) /* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */ #define BPF_JMP32_REG(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. */ #define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) /* Unconditional jumps, goto pc + off16 */ #define BPF_JMP_A(OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_JA, \ .dst_reg = 0, \ .src_reg = 0, \ .off = OFF, \ .imm = 0 }) /* Unconditional jumps, gotol pc + imm32 */ #define BPF_JMP32_A(IMM) \ ((struct bpf_insn) { \ .code = BPF_JMP32 | BPF_JA, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) /* Relative call */ #define BPF_CALL_REL(TGT) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_CALL, \ .dst_reg = 0, \ .src_reg = BPF_PSEUDO_CALL, \ .off = 0, \ .imm = TGT }) /* Convert function address to BPF immediate */ #define BPF_CALL_IMM(x) ((void *)(x) - (void *)__bpf_call_base) #define BPF_EMIT_CALL(FUNC) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_CALL, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = BPF_CALL_IMM(FUNC) }) /* Raw code statement block */ #define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ ((struct bpf_insn) { \ .code = CODE, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = IMM }) /* Program exit */ #define BPF_EXIT_INSN() \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_EXIT, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = 0 }) /* Speculation barrier */ #define BPF_ST_NOSPEC() \ ((struct bpf_insn) { \ .code = BPF_ST | BPF_NOSPEC, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = 0 }) /* Internal classic blocks for direct assignment */ #define __BPF_STMT(CODE, K) \ ((struct sock_filter) BPF_STMT(CODE, K)) #define __BPF_JUMP(CODE, K, JT, JF) \ ((struct sock_filter) BPF_JUMP(CODE, K, JT, JF)) #define bytes_to_bpf_size(bytes) \ ({ \ int bpf_size = -EINVAL; \ \ if (bytes == sizeof(u8)) \ bpf_size = BPF_B; \ else if (bytes == sizeof(u16)) \ bpf_size = BPF_H; \ else if (bytes == sizeof(u32)) \ bpf_size = BPF_W; \ else if (bytes == sizeof(u64)) \ bpf_size = BPF_DW; \ \ bpf_size; \ }) #define bpf_size_to_bytes(bpf_size) \ ({ \ int bytes = -EINVAL; \ \ if (bpf_size == BPF_B) \ bytes = sizeof(u8); \ else if (bpf_size == BPF_H) \ bytes = sizeof(u16); \ else if (bpf_size == BPF_W) \ bytes = sizeof(u32); \ else if (bpf_size == BPF_DW) \ bytes = sizeof(u64); \ \ bytes; \ }) #define BPF_SIZEOF(type) \ ({ \ const int __size = bytes_to_bpf_size(sizeof(type)); \ BUILD_BUG_ON(__size < 0); \ __size; \ }) #define BPF_FIELD_SIZEOF(type, field) \ ({ \ const int __size = bytes_to_bpf_size(sizeof_field(type, field)); \ BUILD_BUG_ON(__size < 0); \ __size; \ }) #define BPF_LDST_BYTES(insn) \ ({ \ const int __size = bpf_size_to_bytes(BPF_SIZE((insn)->code)); \ WARN_ON(__size < 0); \ __size; \ }) #define __BPF_MAP_0(m, v, ...) v #define __BPF_MAP_1(m, v, t, a, ...) m(t, a) #define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__) #define __BPF_MAP_3(m, v, t, a, ...) m(t, a), __BPF_MAP_2(m, v, __VA_ARGS__) #define __BPF_MAP_4(m, v, t, a, ...) m(t, a), __BPF_MAP_3(m, v, __VA_ARGS__) #define __BPF_MAP_5(m, v, t, a, ...) m(t, a), __BPF_MAP_4(m, v, __VA_ARGS__) #define __BPF_REG_0(...) __BPF_PAD(5) #define __BPF_REG_1(...) __BPF_MAP(1, __VA_ARGS__), __BPF_PAD(4) #define __BPF_REG_2(...) __BPF_MAP(2, __VA_ARGS__), __BPF_PAD(3) #define __BPF_REG_3(...) __BPF_MAP(3, __VA_ARGS__), __BPF_PAD(2) #define __BPF_REG_4(...) __BPF_MAP(4, __VA_ARGS__), __BPF_PAD(1) #define __BPF_REG_5(...) __BPF_MAP(5, __VA_ARGS__) #define __BPF_MAP(n, ...) __BPF_MAP_##n(__VA_ARGS__) #define __BPF_REG(n, ...) __BPF_REG_##n(__VA_ARGS__) #define __BPF_CAST(t, a) \ (__force t) \ (__force \ typeof(__builtin_choose_expr(sizeof(t) == sizeof(unsigned long), \ (unsigned long)0, (t)0))) a #define __BPF_V void #define __BPF_N #define __BPF_DECL_ARGS(t, a) t a #define __BPF_DECL_REGS(t, a) u64 a #define __BPF_PAD(n) \ __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2, \ u64, __ur_3, u64, __ur_4, u64, __ur_5) #define BPF_CALL_x(x, attr, name, ...) \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ attr u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ { \ return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ } \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) #define __NOATTR #define BPF_CALL_0(name, ...) BPF_CALL_x(0, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_1(name, ...) BPF_CALL_x(1, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_2(name, ...) BPF_CALL_x(2, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_3(name, ...) BPF_CALL_x(3, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_4(name, ...) BPF_CALL_x(4, __NOATTR, name, __VA_ARGS__) #define BPF_CALL_5(name, ...) BPF_CALL_x(5, __NOATTR, name, __VA_ARGS__) #define NOTRACE_BPF_CALL_1(name, ...) BPF_CALL_x(1, notrace, name, __VA_ARGS__) #define bpf_ctx_range(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 #define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2) \ offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1 #if BITS_PER_LONG == 64 # define bpf_ctx_range_ptr(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 #else # define bpf_ctx_range_ptr(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ... offsetof(TYPE, MEMBER) + 8 - 1 #endif /* BITS_PER_LONG == 64 */ #define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE) \ ({ \ BUILD_BUG_ON(sizeof_field(TYPE, MEMBER) != (SIZE)); \ *(PTR_SIZE) = (SIZE); \ offsetof(TYPE, MEMBER); \ }) /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { u16 len; compat_uptr_t filter; /* struct sock_filter * */ }; struct sock_fprog_kern { u16 len; struct sock_filter *filter; }; /* Some arches need doubleword alignment for their instructions and/or data */ #define BPF_IMAGE_ALIGNMENT 8 struct bpf_binary_header { u32 size; u8 image[] __aligned(BPF_IMAGE_ALIGNMENT); }; struct bpf_prog_stats { u64_stats_t cnt; u64_stats_t nsecs; u64_stats_t misses; struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); struct sk_filter { refcount_t refcnt; struct rcu_head rcu; struct bpf_prog *prog; }; DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); extern struct mutex nf_conn_btf_access_lock; extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size); typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, const struct bpf_insn *insnsi, unsigned int (*bpf_func)(const void *, const struct bpf_insn *)); static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, const void *ctx, bpf_dispatcher_fn dfunc) { u32 ret; cant_migrate(); if (static_branch_unlikely(&bpf_stats_enabled_key)) { struct bpf_prog_stats *stats; u64 duration, start = sched_clock(); unsigned long flags; ret = dfunc(ctx, prog->insnsi, prog->bpf_func); duration = sched_clock() - start; stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->cnt); u64_stats_add(&stats->nsecs, duration); u64_stats_update_end_irqrestore(&stats->syncp, flags); } else { ret = dfunc(ctx, prog->insnsi, prog->bpf_func); } return ret; } static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void *ctx) { return __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func); } /* * Use in preemptible and therefore migratable context to make sure that * the execution of the BPF program runs on one CPU. * * This uses migrate_disable/enable() explicitly to document that the * invocation of a BPF program does not require reentrancy protection * against a BPF program which is invoked from a preempting task. */ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog, const void *ctx) { u32 ret; migrate_disable(); ret = bpf_prog_run(prog, ctx); migrate_enable(); return ret; } #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN struct bpf_skb_data_end { struct qdisc_skb_cb qdisc_cb; void *data_meta; void *data_end; }; struct bpf_nh_params { u32 nh_family; union { u32 ipv4_nh; struct in6_addr ipv6_nh; }; }; /* flags for bpf_redirect_info kern_flags */ #define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ #define BPF_RI_F_RI_INIT BIT(1) #define BPF_RI_F_CPU_MAP_INIT BIT(2) #define BPF_RI_F_DEV_MAP_INIT BIT(3) #define BPF_RI_F_XSK_MAP_INIT BIT(4) struct bpf_redirect_info { u64 tgt_index; void *tgt_value; struct bpf_map *map; u32 flags; u32 map_id; enum bpf_map_type map_type; struct bpf_nh_params nh; u32 kern_flags; }; struct bpf_net_context { struct bpf_redirect_info ri; struct list_head cpu_map_flush_list; struct list_head dev_map_flush_list; struct list_head xskmap_map_flush_list; }; static inline struct bpf_net_context *bpf_net_ctx_set(struct bpf_net_context *bpf_net_ctx) { struct task_struct *tsk = current; if (tsk->bpf_net_context != NULL) return NULL; bpf_net_ctx->ri.kern_flags = 0; tsk->bpf_net_context = bpf_net_ctx; return bpf_net_ctx; } static inline void bpf_net_ctx_clear(struct bpf_net_context *bpf_net_ctx) { if (bpf_net_ctx) current->bpf_net_context = NULL; } static inline struct bpf_net_context *bpf_net_ctx_get(void) { return current->bpf_net_context; } static inline struct bpf_redirect_info *bpf_net_ctx_get_ri(void) { struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_RI_INIT)) { memset(&bpf_net_ctx->ri, 0, offsetof(struct bpf_net_context, ri.nh)); bpf_net_ctx->ri.kern_flags |= BPF_RI_F_RI_INIT; } return &bpf_net_ctx->ri; } static inline struct list_head *bpf_net_ctx_get_cpu_map_flush_list(void) { struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_CPU_MAP_INIT)) { INIT_LIST_HEAD(&bpf_net_ctx->cpu_map_flush_list); bpf_net_ctx->ri.kern_flags |= BPF_RI_F_CPU_MAP_INIT; } return &bpf_net_ctx->cpu_map_flush_list; } static inline struct list_head *bpf_net_ctx_get_dev_flush_list(void) { struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_DEV_MAP_INIT)) { INIT_LIST_HEAD(&bpf_net_ctx->dev_map_flush_list); bpf_net_ctx->ri.kern_flags |= BPF_RI_F_DEV_MAP_INIT; } return &bpf_net_ctx->dev_map_flush_list; } static inline struct list_head *bpf_net_ctx_get_xskmap_flush_list(void) { struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); if (!(bpf_net_ctx->ri.kern_flags & BPF_RI_F_XSK_MAP_INIT)) { INIT_LIST_HEAD(&bpf_net_ctx->xskmap_map_flush_list); bpf_net_ctx->ri.kern_flags |= BPF_RI_F_XSK_MAP_INIT; } return &bpf_net_ctx->xskmap_map_flush_list; } static inline void bpf_net_ctx_get_all_used_flush_lists(struct list_head **lh_map, struct list_head **lh_dev, struct list_head **lh_xsk) { struct bpf_net_context *bpf_net_ctx = bpf_net_ctx_get(); u32 kern_flags = bpf_net_ctx->ri.kern_flags; struct list_head *lh; *lh_map = *lh_dev = *lh_xsk = NULL; if (!IS_ENABLED(CONFIG_BPF_SYSCALL)) return; lh = &bpf_net_ctx->dev_map_flush_list; if (kern_flags & BPF_RI_F_DEV_MAP_INIT && !list_empty(lh)) *lh_dev = lh; lh = &bpf_net_ctx->cpu_map_flush_list; if (kern_flags & BPF_RI_F_CPU_MAP_INIT && !list_empty(lh)) *lh_map = lh; lh = &bpf_net_ctx->xskmap_map_flush_list; if (IS_ENABLED(CONFIG_XDP_SOCKETS) && kern_flags & BPF_RI_F_XSK_MAP_INIT && !list_empty(lh)) *lh_xsk = lh; } /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) * ensure that cb[] area can be written to when BPF program is * invoked (otherwise cb[] save/restore is necessary). */ static inline void bpf_compute_data_pointers(struct sk_buff *skb) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); cb->data_meta = skb->data - skb_metadata_len(skb); cb->data_end = skb->data + skb_headlen(skb); } /* Similar to bpf_compute_data_pointers(), except that save orginal * data in cb->data and cb->meta_data for restore. */ static inline void bpf_compute_and_save_data_end( struct sk_buff *skb, void **saved_data_end) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; *saved_data_end = cb->data_end; cb->data_end = skb->data + skb_headlen(skb); } /* Restore data saved by bpf_compute_and_save_data_end(). */ static inline void bpf_restore_data_end( struct sk_buff *skb, void *saved_data_end) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; cb->data_end = saved_data_end; } static inline u8 *bpf_skb_cb(const struct sk_buff *skb) { /* eBPF programs may read/write skb->cb[] area to transfer meta * data between tail calls. Since this also needs to work with * tc, that scratch memory is mapped to qdisc_skb_cb's data area. * * In some socket filter cases, the cb unfortunately needs to be * saved/restored so that protocol specific skb->cb[] data won't * be lost. In any case, due to unpriviledged eBPF programs * attached to sockets, we need to clear the bpf_skb_cb() area * to not leak previous contents to user space. */ BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != BPF_SKB_CB_LEN); BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != sizeof_field(struct qdisc_skb_cb, data)); return qdisc_skb_cb(skb)->data; } /* Must be invoked with migration disabled */ static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog, const void *ctx) { const struct sk_buff *skb = ctx; u8 *cb_data = bpf_skb_cb(skb); u8 cb_saved[BPF_SKB_CB_LEN]; u32 res; if (unlikely(prog->cb_access)) { memcpy(cb_saved, cb_data, sizeof(cb_saved)); memset(cb_data, 0, sizeof(cb_saved)); } res = bpf_prog_run(prog, skb); if (unlikely(prog->cb_access)) memcpy(cb_data, cb_saved, sizeof(cb_saved)); return res; } static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog, struct sk_buff *skb) { u32 res; migrate_disable(); res = __bpf_prog_run_save_cb(prog, skb); migrate_enable(); return res; } static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, struct sk_buff *skb) { u8 *cb_data = bpf_skb_cb(skb); u32 res; if (unlikely(prog->cb_access)) memset(cb_data, 0, BPF_SKB_CB_LEN); res = bpf_prog_run_pin_on_cpu(prog, skb); return res; } DECLARE_BPF_DISPATCHER(xdp) DECLARE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); u32 xdp_master_redirect(struct xdp_buff *xdp); void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog); static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) { return prog->len * sizeof(struct bpf_insn); } static inline u32 bpf_prog_tag_scratch_size(const struct bpf_prog *prog) { return round_up(bpf_prog_insn_size(prog) + sizeof(__be64) + 1, SHA1_BLOCK_SIZE); } static inline unsigned int bpf_prog_size(unsigned int proglen) { return max(sizeof(struct bpf_prog), offsetof(struct bpf_prog, insns[proglen])); } static inline bool bpf_prog_was_classic(const struct bpf_prog *prog) { /* When classic BPF programs have been loaded and the arch * does not have a classic BPF JIT (anymore), they have been * converted via bpf_migrate_filter() to eBPF and thus always * have an unspec program type. */ return prog->type == BPF_PROG_TYPE_UNSPEC; } static inline u32 bpf_ctx_off_adjust_machine(u32 size) { const u32 size_machine = sizeof(unsigned long); if (size > size_machine && size % size_machine == 0) size = size_machine; return size; } static inline bool bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) { return size <= size_default && (size & (size - 1)) == 0; } static inline u8 bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default) { u8 access_off = off & (size_default - 1); #ifdef __LITTLE_ENDIAN return access_off; #else return size_default - (access_off + size); #endif } #define bpf_ctx_wide_access_ok(off, size, type, field) \ (size == sizeof(__u64) && \ off >= offsetof(type, field) && \ off + sizeof(__u64) <= offsetofend(type, field) && \ off % sizeof(__u64) == 0) #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) static inline int __must_check bpf_prog_lock_ro(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON if (!fp->jited) { set_vm_flush_reset_perms(fp); return set_memory_ro((unsigned long)fp, fp->pages); } #endif return 0; } static inline int __must_check bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { set_vm_flush_reset_perms(hdr); return set_memory_rox((unsigned long)hdr, hdr->size >> PAGE_SHIFT); } int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); static inline int sk_filter(struct sock *sk, struct sk_buff *skb) { return sk_filter_trim_cap(sk, skb, 1); } struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); void bpf_prog_free(struct bpf_prog *fp); bool bpf_opcode_in_insntable(u8 code); void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, const u32 *insn_to_jit_off); int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog); void bpf_prog_jit_attempt_done(struct bpf_prog *prog); struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); void __bpf_prog_free(struct bpf_prog *fp); static inline void bpf_prog_unlock_free(struct bpf_prog *fp) { __bpf_prog_free(fp); } typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter, unsigned int flen); int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig); void bpf_prog_destroy(struct bpf_prog *fp); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); void sk_reuseport_prog_free(struct bpf_prog *prog); int sk_detach_filter(struct sock *sk); int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len); bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #define __bpf_call_base_args \ ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ (void *)__bpf_call_base) struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); bool bpf_jit_inlines_helper_call(s32 imm); bool bpf_jit_supports_subprog_tailcalls(void); bool bpf_jit_supports_percpu_insn(void); bool bpf_jit_supports_kfunc_call(void); bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); bool bpf_jit_supports_ptr_xchg(void); bool bpf_jit_supports_arena(void); bool bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena); bool bpf_jit_supports_private_stack(void); u64 bpf_arch_uaddress_limit(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); bool bpf_helper_changes_pkt_data(enum bpf_func_id func_id); static inline bool bpf_dump_raw_ok(const struct cred *cred) { /* Reconstruction of call-sites is dependent on kallsyms, * thus make dump the same restriction. */ return kallsyms_show_value(cred); } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt); static inline bool xdp_return_frame_no_direct(void) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; } static inline void xdp_set_return_frame_no_direct(void) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; } static inline void xdp_clear_return_frame_no_direct(void) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; } static inline int xdp_ok_fwd_dev(const struct net_device *fwd, unsigned int pktlen) { unsigned int len; if (unlikely(!(fwd->flags & IFF_UP))) return -ENETDOWN; len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; if (pktlen > len) return -EMSGSIZE; return 0; } /* The pair of xdp_do_redirect and xdp_do_flush MUST be called in the * same cpu context. Further for best results no more than a single map * for the do_redirect/do_flush pair should be used. This limitation is * because we only track one map and force a flush when the map changes. * This does not appear to be a real limitation for existing software. */ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, const struct bpf_prog *prog); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, const struct bpf_prog *prog); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, struct xdp_frame *xdpf, const struct bpf_prog *prog); void xdp_do_flush(void); void bpf_warn_invalid_xdp_action(const struct net_device *dev, const struct bpf_prog *prog, u32 act); #ifdef CONFIG_INET struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, struct sock *migrating_sk, u32 hash); #else static inline struct sock * bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, struct sock *migrating_sk, u32 hash) { return NULL; } #endif #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; extern int bpf_jit_harden; extern int bpf_jit_kallsyms; extern long bpf_jit_limit; extern long bpf_jit_limit_max; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); void bpf_jit_fill_hole_with_zero(void *area, unsigned int size); struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, bpf_jit_fill_hole_t bpf_fill_ill_insns); void bpf_jit_binary_free(struct bpf_binary_header *hdr); u64 bpf_jit_alloc_exec_limit(void); void *bpf_jit_alloc_exec(unsigned long size); void bpf_jit_free_exec(void *addr); void bpf_jit_free(struct bpf_prog *fp); struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns); void bpf_prog_pack_free(void *ptr, u32 size); static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { return list_empty(&fp->aux->ksym.lnode) || fp->aux->ksym.lnode.prev == LIST_POISON2; } struct bpf_binary_header * bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image, unsigned int alignment, struct bpf_binary_header **rw_hdr, u8 **rw_image, bpf_jit_fill_hole_t bpf_fill_ill_insns); int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header); void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header); int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke); int bpf_jit_get_func_addr(const struct bpf_prog *prog, const struct bpf_insn *insn, bool extra_pass, u64 *func_addr, bool *func_addr_fixed); struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp); void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, u32 pass, void *image) { pr_err("flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n", flen, proglen, pass, image, current->comm, task_pid_nr(current)); if (image) print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET, 16, 1, image, proglen, false); } static inline bool bpf_jit_is_ebpf(void) { # ifdef CONFIG_HAVE_EBPF_JIT return true; # else return false; # endif } static inline bool ebpf_jit_enabled(void) { return bpf_jit_enable && bpf_jit_is_ebpf(); } static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) { return fp->jited && bpf_jit_is_ebpf(); } static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) { /* These are the prerequisites, should someone ever have the * idea to call blinding outside of them, we make sure to * bail out. */ if (!bpf_jit_is_ebpf()) return false; if (!prog->jit_requested) return false; if (!bpf_jit_harden) return false; if (bpf_jit_harden == 1 && bpf_token_capable(prog->aux->token, CAP_BPF)) return false; return true; } static inline bool bpf_jit_kallsyms_enabled(void) { /* There are a couple of corner cases where kallsyms should * not be enabled f.e. on hardening. */ if (bpf_jit_harden) return false; if (!bpf_jit_kallsyms) return false; if (bpf_jit_kallsyms == 1) return true; return false; } int __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym); bool is_bpf_text_address(unsigned long addr); int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym); struct bpf_prog *bpf_prog_ksym_find(unsigned long addr); static inline int bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { int ret = __bpf_address_lookup(addr, size, off, sym); if (ret && modname) *modname = NULL; return ret; } void bpf_prog_kallsyms_add(struct bpf_prog *fp); void bpf_prog_kallsyms_del(struct bpf_prog *fp); #else /* CONFIG_BPF_JIT */ static inline bool ebpf_jit_enabled(void) { return false; } static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) { return false; } static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) { return false; } static inline int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke) { return -ENOTSUPP; } static inline void bpf_jit_free(struct bpf_prog *fp) { bpf_prog_unlock_free(fp); } static inline bool bpf_jit_kallsyms_enabled(void) { return false; } static inline int __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { return 0; } static inline bool is_bpf_text_address(unsigned long addr) { return false; } static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { return -ERANGE; } static inline struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) { return NULL; } static inline int bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { return 0; } static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp) { } static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp) { } #endif /* CONFIG_BPF_JIT */ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp); #define BPF_ANC BIT(15) static inline bool bpf_needs_clear_a(const struct sock_filter *first) { switch (first->code) { case BPF_RET | BPF_K: case BPF_LD | BPF_W | BPF_LEN: return false; case BPF_LD | BPF_W | BPF_ABS: case BPF_LD | BPF_H | BPF_ABS: case BPF_LD | BPF_B | BPF_ABS: if (first->k == SKF_AD_OFF + SKF_AD_ALU_XOR_X) return true; return false; default: return true; } } static inline u16 bpf_anc_helper(const struct sock_filter *ftest) { BUG_ON(ftest->code & BPF_ANC); switch (ftest->code) { case BPF_LD | BPF_W | BPF_ABS: case BPF_LD | BPF_H | BPF_ABS: case BPF_LD | BPF_B | BPF_ABS: #define BPF_ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \ return BPF_ANC | SKF_AD_##CODE switch (ftest->k) { BPF_ANCILLARY(PROTOCOL); BPF_ANCILLARY(PKTTYPE); BPF_ANCILLARY(IFINDEX); BPF_ANCILLARY(NLATTR); BPF_ANCILLARY(NLATTR_NEST); BPF_ANCILLARY(MARK); BPF_ANCILLARY(QUEUE); BPF_ANCILLARY(HATYPE); BPF_ANCILLARY(RXHASH); BPF_ANCILLARY(CPU); BPF_ANCILLARY(ALU_XOR_X); BPF_ANCILLARY(VLAN_TAG); BPF_ANCILLARY(VLAN_TAG_PRESENT); BPF_ANCILLARY(PAY_OFFSET); BPF_ANCILLARY(RANDOM); BPF_ANCILLARY(VLAN_TPID); } fallthrough; default: return ftest->code; } } void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size); static inline int bpf_tell_extensions(void) { return SKF_AD_MAX; } struct bpf_sock_addr_kern { struct sock *sk; struct sockaddr *uaddr; /* Temporary "register" to make indirect stores to nested structures * defined above. We need three registers to make such a store, but * only two (src and dst) are available at convert_ctx_access time */ u64 tmp_reg; void *t_ctx; /* Attach type specific context. */ u32 uaddrlen; }; struct bpf_sock_ops_kern { struct sock *sk; union { u32 args[4]; u32 reply; u32 replylong[4]; }; struct sk_buff *syn_skb; struct sk_buff *skb; void *skb_data_end; u8 op; u8 is_fullsock; u8 remaining_opt_len; u64 temp; /* temp and everything after is not * initialized to 0 before calling * the BPF program. New fields that * should be initialized to 0 should * be inserted before temp. * temp is scratch storage used by * sock_ops_convert_ctx_access * as temporary storage of a register. */ }; struct bpf_sysctl_kern { struct ctl_table_header *head; const struct ctl_table *table; void *cur_val; size_t cur_len; void *new_val; size_t new_len; int new_updated; int write; loff_t *ppos; /* Temporary "register" for indirect stores to ppos. */ u64 tmp_reg; }; #define BPF_SOCKOPT_KERN_BUF_SIZE 32 struct bpf_sockopt_buf { u8 data[BPF_SOCKOPT_KERN_BUF_SIZE]; }; struct bpf_sockopt_kern { struct sock *sk; u8 *optval; u8 *optval_end; s32 level; s32 optname; s32 optlen; /* for retval in struct bpf_cg_run_ctx */ struct task_struct *current_task; /* Temporary "register" for indirect stores to ppos. */ u64 tmp_reg; }; int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len); struct bpf_sk_lookup_kern { u16 family; u16 protocol; __be16 sport; u16 dport; struct { __be32 saddr; __be32 daddr; } v4; struct { const struct in6_addr *saddr; const struct in6_addr *daddr; } v6; struct sock *selected_sk; u32 ingress_ifindex; bool no_reuseport; }; extern struct static_key_false bpf_sk_lookup_enabled; /* Runners for BPF_SK_LOOKUP programs to invoke on socket lookup. * * Allowed return values for a BPF SK_LOOKUP program are SK_PASS and * SK_DROP. Their meaning is as follows: * * SK_PASS && ctx.selected_sk != NULL: use selected_sk as lookup result * SK_PASS && ctx.selected_sk == NULL: continue to htable-based socket lookup * SK_DROP : terminate lookup with -ECONNREFUSED * * This macro aggregates return values and selected sockets from * multiple BPF programs according to following rules in order: * * 1. If any program returned SK_PASS and a non-NULL ctx.selected_sk, * macro result is SK_PASS and last ctx.selected_sk is used. * 2. If any program returned SK_DROP return value, * macro result is SK_DROP. * 3. Otherwise result is SK_PASS and ctx.selected_sk is NULL. * * Caller must ensure that the prog array is non-NULL, and that the * array as well as the programs it contains remain valid. */ #define BPF_PROG_SK_LOOKUP_RUN_ARRAY(array, ctx, func) \ ({ \ struct bpf_sk_lookup_kern *_ctx = &(ctx); \ struct bpf_prog_array_item *_item; \ struct sock *_selected_sk = NULL; \ bool _no_reuseport = false; \ struct bpf_prog *_prog; \ bool _all_pass = true; \ u32 _ret; \ \ migrate_disable(); \ _item = &(array)->items[0]; \ while ((_prog = READ_ONCE(_item->prog))) { \ /* restore most recent selection */ \ _ctx->selected_sk = _selected_sk; \ _ctx->no_reuseport = _no_reuseport; \ \ _ret = func(_prog, _ctx); \ if (_ret == SK_PASS && _ctx->selected_sk) { \ /* remember last non-NULL socket */ \ _selected_sk = _ctx->selected_sk; \ _no_reuseport = _ctx->no_reuseport; \ } else if (_ret == SK_DROP && _all_pass) { \ _all_pass = false; \ } \ _item++; \ } \ _ctx->selected_sk = _selected_sk; \ _ctx->no_reuseport = _no_reuseport; \ migrate_enable(); \ _all_pass || _selected_sk ? SK_PASS : SK_DROP; \ }) static inline bool bpf_sk_lookup_run_v4(const struct net *net, int protocol, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 dport, const int ifindex, struct sock **psk) { struct bpf_prog_array *run_array; struct sock *selected_sk = NULL; bool no_reuseport = false; rcu_read_lock(); run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]); if (run_array) { struct bpf_sk_lookup_kern ctx = { .family = AF_INET, .protocol = protocol, .v4.saddr = saddr, .v4.daddr = daddr, .sport = sport, .dport = dport, .ingress_ifindex = ifindex, }; u32 act; act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run); if (act == SK_PASS) { selected_sk = ctx.selected_sk; no_reuseport = ctx.no_reuseport; } else { selected_sk = ERR_PTR(-ECONNREFUSED); } } rcu_read_unlock(); *psk = selected_sk; return no_reuseport; } #if IS_ENABLED(CONFIG_IPV6) static inline bool bpf_sk_lookup_run_v6(const struct net *net, int protocol, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 dport, const int ifindex, struct sock **psk) { struct bpf_prog_array *run_array; struct sock *selected_sk = NULL; bool no_reuseport = false; rcu_read_lock(); run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]); if (run_array) { struct bpf_sk_lookup_kern ctx = { .family = AF_INET6, .protocol = protocol, .v6.saddr = saddr, .v6.daddr = daddr, .sport = sport, .dport = dport, .ingress_ifindex = ifindex, }; u32 act; act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run); if (act == SK_PASS) { selected_sk = ctx.selected_sk; no_reuseport = ctx.no_reuseport; } else { selected_sk = ERR_PTR(-ECONNREFUSED); } } rcu_read_unlock(); *psk = selected_sk; return no_reuseport; } #endif /* IS_ENABLED(CONFIG_IPV6) */ static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 index, u64 flags, const u64 flag_mask, void *lookup_elem(struct bpf_map *map, u32 key)) { struct bpf_redirect_info *ri = bpf_net_ctx_get_ri(); const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX; /* Lower bits of the flags are used as return code on lookup failure */ if (unlikely(flags & ~(action_mask | flag_mask))) return XDP_ABORTED; ri->tgt_value = lookup_elem(map, index); if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) { /* If the lookup fails we want to clear out the state in the * redirect_info struct completely, so that if an eBPF program * performs multiple lookups, the last one always takes * precedence. */ ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; return flags & action_mask; } ri->tgt_index = index; ri->map_id = map->id; ri->map_type = map->map_type; if (flags & BPF_F_BROADCAST) { WRITE_ONCE(ri->map, map); ri->flags = flags; } else { WRITE_ONCE(ri->map, NULL); ri->flags = 0; } return XDP_REDIRECT; } #ifdef CONFIG_NET int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len); int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags); int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len); void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush); #else /* CONFIG_NET */ static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) { return -EOPNOTSUPP; } static inline int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) { return -EOPNOTSUPP; } static inline int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return -EOPNOTSUPP; } static inline int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return -EOPNOTSUPP; } static inline void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) { return NULL; } static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush) { } #endif /* CONFIG_NET */ #endif /* __LINUX_FILTER_H__ */
26 26 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 #ifndef __LINUX_ERSPAN_H #define __LINUX_ERSPAN_H /* * GRE header for ERSPAN type I encapsulation (4 octets [34:37]) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |0|0|0|0|0|00000|000000000|00000| Protocol Type for ERSPAN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * The Type I ERSPAN frame format is based on the barebones IP + GRE * encapsulation (as described above) on top of the raw mirrored frame. * There is no extra ERSPAN header. * * * GRE header for ERSPAN type II and II encapsulation (8 octets [34:41]) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |0|0|0|1|0|00000|000000000|00000| Protocol Type for ERSPAN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Sequence Number (increments per packet per session) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Note that in the above GRE header [RFC1701] out of the C, R, K, S, * s, Recur, Flags, Version fields only S (bit 03) is set to 1. The * other fields are set to zero, so only a sequence number follows. * * ERSPAN Version 1 (Type II) header (8 octets [42:49]) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Ver | VLAN | COS | En|T| Session ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Reserved | Index | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * * ERSPAN Version 2 (Type III) header (12 octets [42:49]) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Ver | VLAN | COS |BSO|T| Session ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Timestamp | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | SGT |P| FT | Hw ID |D|Gra|O| * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Platform Specific SubHeader (8 octets, optional) * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Platf ID | Platform Specific Info | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Platform Specific Info | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * GRE proto ERSPAN type I/II = 0x88BE, type III = 0x22EB */ #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/skbuff.h> #include <uapi/linux/erspan.h> #define ERSPAN_VERSION 0x1 /* ERSPAN type II */ #define VER_MASK 0xf000 #define VLAN_MASK 0x0fff #define COS_MASK 0xe000 #define EN_MASK 0x1800 #define T_MASK 0x0400 #define ID_MASK 0x03ff #define INDEX_MASK 0xfffff #define ERSPAN_VERSION2 0x2 /* ERSPAN type III*/ #define BSO_MASK EN_MASK #define SGT_MASK 0xffff0000 #define P_MASK 0x8000 #define FT_MASK 0x7c00 #define HWID_MASK 0x03f0 #define DIR_MASK 0x0008 #define GRA_MASK 0x0006 #define O_MASK 0x0001 #define HWID_OFFSET 4 #define DIR_OFFSET 3 enum erspan_encap_type { ERSPAN_ENCAP_NOVLAN = 0x0, /* originally without VLAN tag */ ERSPAN_ENCAP_ISL = 0x1, /* originally ISL encapsulated */ ERSPAN_ENCAP_8021Q = 0x2, /* originally 802.1Q encapsulated */ ERSPAN_ENCAP_INFRAME = 0x3, /* VLAN tag preserved in frame */ }; #define ERSPAN_V1_MDSIZE 4 #define ERSPAN_V2_MDSIZE 8 struct erspan_base_hdr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 vlan_upper:4, ver:4; __u8 vlan:8; __u8 session_id_upper:2, t:1, en:2, cos:3; __u8 session_id:8; #elif defined(__BIG_ENDIAN_BITFIELD) __u8 ver: 4, vlan_upper:4; __u8 vlan:8; __u8 cos:3, en:2, t:1, session_id_upper:2; __u8 session_id:8; #else #error "Please fix <asm/byteorder.h>" #endif }; static inline void set_session_id(struct erspan_base_hdr *ershdr, u16 id) { ershdr->session_id = id & 0xff; ershdr->session_id_upper = (id >> 8) & 0x3; } static inline u16 get_session_id(const struct erspan_base_hdr *ershdr) { return (ershdr->session_id_upper << 8) + ershdr->session_id; } static inline void set_vlan(struct erspan_base_hdr *ershdr, u16 vlan) { ershdr->vlan = vlan & 0xff; ershdr->vlan_upper = (vlan >> 8) & 0xf; } static inline u16 get_vlan(const struct erspan_base_hdr *ershdr) { return (ershdr->vlan_upper << 8) + ershdr->vlan; } static inline void set_hwid(struct erspan_md2 *md2, u8 hwid) { md2->hwid = hwid & 0xf; md2->hwid_upper = (hwid >> 4) & 0x3; } static inline u8 get_hwid(const struct erspan_md2 *md2) { return (md2->hwid_upper << 4) + md2->hwid; } static inline int erspan_hdr_len(int version) { if (version == 0) return 0; return sizeof(struct erspan_base_hdr) + (version == 1 ? ERSPAN_V1_MDSIZE : ERSPAN_V2_MDSIZE); } static inline u8 tos_to_cos(u8 tos) { u8 dscp, cos; dscp = tos >> 2; cos = dscp >> 3; return cos; } static inline void erspan_build_header(struct sk_buff *skb, u32 id, u32 index, bool truncate, bool is_ipv4) { struct ethhdr *eth = (struct ethhdr *)skb->data; enum erspan_encap_type enc_type; struct erspan_base_hdr *ershdr; struct qtag_prefix { __be16 eth_type; __be16 tci; } *qp; u16 vlan_tci = 0; u8 tos; __be32 *idx; tos = is_ipv4 ? ip_hdr(skb)->tos : (ipv6_hdr(skb)->priority << 4) + (ipv6_hdr(skb)->flow_lbl[0] >> 4); enc_type = ERSPAN_ENCAP_NOVLAN; /* If mirrored packet has vlan tag, extract tci and * preserve vlan header in the mirrored frame. */ if (eth->h_proto == htons(ETH_P_8021Q)) { qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); vlan_tci = ntohs(qp->tci); enc_type = ERSPAN_ENCAP_INFRAME; } skb_push(skb, sizeof(*ershdr) + ERSPAN_V1_MDSIZE); ershdr = (struct erspan_base_hdr *)skb->data; memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V1_MDSIZE); /* Build base header */ ershdr->ver = ERSPAN_VERSION; ershdr->cos = tos_to_cos(tos); ershdr->en = enc_type; ershdr->t = truncate; set_vlan(ershdr, vlan_tci); set_session_id(ershdr, id); /* Build metadata */ idx = (__be32 *)(ershdr + 1); *idx = htonl(index & INDEX_MASK); } /* ERSPAN GRA: timestamp granularity * 00b --> granularity = 100 microseconds * 01b --> granularity = 100 nanoseconds * 10b --> granularity = IEEE 1588 * Here we only support 100 microseconds. */ static inline __be32 erspan_get_timestamp(void) { u64 h_usecs; ktime_t kt; kt = ktime_get_real(); h_usecs = ktime_divns(kt, 100 * NSEC_PER_USEC); /* ERSPAN base header only has 32-bit, * so it wraps around 4 days. */ return htonl((u32)h_usecs); } /* ERSPAN BSO (Bad/Short/Oversized), see RFC1757 * 00b --> Good frame with no error, or unknown integrity * 01b --> Payload is a Short Frame * 10b --> Payload is an Oversized Frame * 11b --> Payload is a Bad Frame with CRC or Alignment Error */ enum erspan_bso { BSO_NOERROR = 0x0, BSO_SHORT = 0x1, BSO_OVERSIZED = 0x2, BSO_BAD = 0x3, }; static inline u8 erspan_detect_bso(struct sk_buff *skb) { /* BSO_BAD is not handled because the frame CRC * or alignment error information is in FCS. */ if (skb->len < ETH_ZLEN) return BSO_SHORT; if (skb->len > ETH_FRAME_LEN) return BSO_OVERSIZED; return BSO_NOERROR; } static inline void erspan_build_header_v2(struct sk_buff *skb, u32 id, u8 direction, u16 hwid, bool truncate, bool is_ipv4) { struct ethhdr *eth = (struct ethhdr *)skb->data; struct erspan_base_hdr *ershdr; struct erspan_md2 *md2; struct qtag_prefix { __be16 eth_type; __be16 tci; } *qp; u16 vlan_tci = 0; u8 gra = 0; /* 100 usec */ u8 bso = 0; /* Bad/Short/Oversized */ u8 sgt = 0; u8 tos; tos = is_ipv4 ? ip_hdr(skb)->tos : (ipv6_hdr(skb)->priority << 4) + (ipv6_hdr(skb)->flow_lbl[0] >> 4); /* Unlike v1, v2 does not have En field, * so only extract vlan tci field. */ if (eth->h_proto == htons(ETH_P_8021Q)) { qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); vlan_tci = ntohs(qp->tci); } bso = erspan_detect_bso(skb); skb_push(skb, sizeof(*ershdr) + ERSPAN_V2_MDSIZE); ershdr = (struct erspan_base_hdr *)skb->data; memset(ershdr, 0, sizeof(*ershdr) + ERSPAN_V2_MDSIZE); /* Build base header */ ershdr->ver = ERSPAN_VERSION2; ershdr->cos = tos_to_cos(tos); ershdr->en = bso; ershdr->t = truncate; set_vlan(ershdr, vlan_tci); set_session_id(ershdr, id); /* Build metadata */ md2 = (struct erspan_md2 *)(ershdr + 1); md2->timestamp = erspan_get_timestamp(); md2->sgt = htons(sgt); md2->p = 1; md2->ft = 0; md2->dir = direction; md2->gra = gra; md2->o = 0; set_hwid(md2, hwid); } #endif
4 6 3 1 3 6 6 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 // SPDX-License-Identifier: GPL-2.0-or-later /* * authencesn.c - AEAD wrapper for IPsec with extended sequence numbers, * derived from authenc.c * * Copyright (C) 2010 secunet Security Networks AG * Copyright (C) 2010 Steffen Klassert <steffen.klassert@secunet.com> * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/aead.h> #include <crypto/internal/hash.h> #include <crypto/internal/skcipher.h> #include <crypto/authenc.h> #include <crypto/null.h> #include <crypto/scatterwalk.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/spinlock.h> struct authenc_esn_instance_ctx { struct crypto_ahash_spawn auth; struct crypto_skcipher_spawn enc; }; struct crypto_authenc_esn_ctx { unsigned int reqoff; struct crypto_ahash *auth; struct crypto_skcipher *enc; struct crypto_sync_skcipher *null; }; struct authenc_esn_request_ctx { struct scatterlist src[2]; struct scatterlist dst[2]; char tail[]; }; static void authenc_esn_request_complete(struct aead_request *req, int err) { if (err != -EINPROGRESS) aead_request_complete(req, err); } static int crypto_authenc_esn_setauthsize(struct crypto_aead *authenc_esn, unsigned int authsize) { if (authsize > 0 && authsize < 4) return -EINVAL; return 0; } static int crypto_authenc_esn_setkey(struct crypto_aead *authenc_esn, const u8 *key, unsigned int keylen) { struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct crypto_ahash *auth = ctx->auth; struct crypto_skcipher *enc = ctx->enc; struct crypto_authenc_keys keys; int err = -EINVAL; if (crypto_authenc_extractkeys(&keys, key, keylen) != 0) goto out; crypto_ahash_clear_flags(auth, CRYPTO_TFM_REQ_MASK); crypto_ahash_set_flags(auth, crypto_aead_get_flags(authenc_esn) & CRYPTO_TFM_REQ_MASK); err = crypto_ahash_setkey(auth, keys.authkey, keys.authkeylen); if (err) goto out; crypto_skcipher_clear_flags(enc, CRYPTO_TFM_REQ_MASK); crypto_skcipher_set_flags(enc, crypto_aead_get_flags(authenc_esn) & CRYPTO_TFM_REQ_MASK); err = crypto_skcipher_setkey(enc, keys.enckey, keys.enckeylen); out: memzero_explicit(&keys, sizeof(keys)); return err; } static int crypto_authenc_esn_genicv_tail(struct aead_request *req, unsigned int flags) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); u8 *hash = areq_ctx->tail; unsigned int authsize = crypto_aead_authsize(authenc_esn); unsigned int assoclen = req->assoclen; unsigned int cryptlen = req->cryptlen; struct scatterlist *dst = req->dst; u32 tmp[2]; /* Move high-order bits of sequence number back. */ scatterwalk_map_and_copy(tmp, dst, 4, 4, 0); scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 0); scatterwalk_map_and_copy(tmp, dst, 0, 8, 1); scatterwalk_map_and_copy(hash, dst, assoclen + cryptlen, authsize, 1); return 0; } static void authenc_esn_geniv_ahash_done(void *data, int err) { struct aead_request *req = data; err = err ?: crypto_authenc_esn_genicv_tail(req, 0); aead_request_complete(req, err); } static int crypto_authenc_esn_genicv(struct aead_request *req, unsigned int flags) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct crypto_ahash *auth = ctx->auth; u8 *hash = areq_ctx->tail; struct ahash_request *ahreq = (void *)(areq_ctx->tail + ctx->reqoff); unsigned int authsize = crypto_aead_authsize(authenc_esn); unsigned int assoclen = req->assoclen; unsigned int cryptlen = req->cryptlen; struct scatterlist *dst = req->dst; u32 tmp[2]; if (!authsize) return 0; /* Move high-order bits of sequence number to the end. */ scatterwalk_map_and_copy(tmp, dst, 0, 8, 0); scatterwalk_map_and_copy(tmp, dst, 4, 4, 1); scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 1); sg_init_table(areq_ctx->dst, 2); dst = scatterwalk_ffwd(areq_ctx->dst, dst, 4); ahash_request_set_tfm(ahreq, auth); ahash_request_set_crypt(ahreq, dst, hash, assoclen + cryptlen); ahash_request_set_callback(ahreq, flags, authenc_esn_geniv_ahash_done, req); return crypto_ahash_digest(ahreq) ?: crypto_authenc_esn_genicv_tail(req, aead_request_flags(req)); } static void crypto_authenc_esn_encrypt_done(void *data, int err) { struct aead_request *areq = data; if (!err) err = crypto_authenc_esn_genicv(areq, 0); authenc_esn_request_complete(areq, err); } static int crypto_authenc_esn_copy(struct aead_request *req, unsigned int len) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); SYNC_SKCIPHER_REQUEST_ON_STACK(skreq, ctx->null); skcipher_request_set_sync_tfm(skreq, ctx->null); skcipher_request_set_callback(skreq, aead_request_flags(req), NULL, NULL); skcipher_request_set_crypt(skreq, req->src, req->dst, len, NULL); return crypto_skcipher_encrypt(skreq); } static int crypto_authenc_esn_encrypt(struct aead_request *req) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct skcipher_request *skreq = (void *)(areq_ctx->tail + ctx->reqoff); struct crypto_skcipher *enc = ctx->enc; unsigned int assoclen = req->assoclen; unsigned int cryptlen = req->cryptlen; struct scatterlist *src, *dst; int err; sg_init_table(areq_ctx->src, 2); src = scatterwalk_ffwd(areq_ctx->src, req->src, assoclen); dst = src; if (req->src != req->dst) { err = crypto_authenc_esn_copy(req, assoclen); if (err) return err; sg_init_table(areq_ctx->dst, 2); dst = scatterwalk_ffwd(areq_ctx->dst, req->dst, assoclen); } skcipher_request_set_tfm(skreq, enc); skcipher_request_set_callback(skreq, aead_request_flags(req), crypto_authenc_esn_encrypt_done, req); skcipher_request_set_crypt(skreq, src, dst, cryptlen, req->iv); err = crypto_skcipher_encrypt(skreq); if (err) return err; return crypto_authenc_esn_genicv(req, aead_request_flags(req)); } static int crypto_authenc_esn_decrypt_tail(struct aead_request *req, unsigned int flags) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); unsigned int authsize = crypto_aead_authsize(authenc_esn); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct skcipher_request *skreq = (void *)(areq_ctx->tail + ctx->reqoff); struct crypto_ahash *auth = ctx->auth; u8 *ohash = areq_ctx->tail; unsigned int cryptlen = req->cryptlen - authsize; unsigned int assoclen = req->assoclen; struct scatterlist *dst = req->dst; u8 *ihash = ohash + crypto_ahash_digestsize(auth); u32 tmp[2]; if (!authsize) goto decrypt; /* Move high-order bits of sequence number back. */ scatterwalk_map_and_copy(tmp, dst, 4, 4, 0); scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 0); scatterwalk_map_and_copy(tmp, dst, 0, 8, 1); if (crypto_memneq(ihash, ohash, authsize)) return -EBADMSG; decrypt: sg_init_table(areq_ctx->dst, 2); dst = scatterwalk_ffwd(areq_ctx->dst, dst, assoclen); skcipher_request_set_tfm(skreq, ctx->enc); skcipher_request_set_callback(skreq, flags, req->base.complete, req->base.data); skcipher_request_set_crypt(skreq, dst, dst, cryptlen, req->iv); return crypto_skcipher_decrypt(skreq); } static void authenc_esn_verify_ahash_done(void *data, int err) { struct aead_request *req = data; err = err ?: crypto_authenc_esn_decrypt_tail(req, 0); authenc_esn_request_complete(req, err); } static int crypto_authenc_esn_decrypt(struct aead_request *req) { struct crypto_aead *authenc_esn = crypto_aead_reqtfm(req); struct authenc_esn_request_ctx *areq_ctx = aead_request_ctx(req); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(authenc_esn); struct ahash_request *ahreq = (void *)(areq_ctx->tail + ctx->reqoff); unsigned int authsize = crypto_aead_authsize(authenc_esn); struct crypto_ahash *auth = ctx->auth; u8 *ohash = areq_ctx->tail; unsigned int assoclen = req->assoclen; unsigned int cryptlen = req->cryptlen; u8 *ihash = ohash + crypto_ahash_digestsize(auth); struct scatterlist *dst = req->dst; u32 tmp[2]; int err; cryptlen -= authsize; if (req->src != dst) { err = crypto_authenc_esn_copy(req, assoclen + cryptlen); if (err) return err; } scatterwalk_map_and_copy(ihash, req->src, assoclen + cryptlen, authsize, 0); if (!authsize) goto tail; /* Move high-order bits of sequence number to the end. */ scatterwalk_map_and_copy(tmp, dst, 0, 8, 0); scatterwalk_map_and_copy(tmp, dst, 4, 4, 1); scatterwalk_map_and_copy(tmp + 1, dst, assoclen + cryptlen, 4, 1); sg_init_table(areq_ctx->dst, 2); dst = scatterwalk_ffwd(areq_ctx->dst, dst, 4); ahash_request_set_tfm(ahreq, auth); ahash_request_set_crypt(ahreq, dst, ohash, assoclen + cryptlen); ahash_request_set_callback(ahreq, aead_request_flags(req), authenc_esn_verify_ahash_done, req); err = crypto_ahash_digest(ahreq); if (err) return err; tail: return crypto_authenc_esn_decrypt_tail(req, aead_request_flags(req)); } static int crypto_authenc_esn_init_tfm(struct crypto_aead *tfm) { struct aead_instance *inst = aead_alg_instance(tfm); struct authenc_esn_instance_ctx *ictx = aead_instance_ctx(inst); struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(tfm); struct crypto_ahash *auth; struct crypto_skcipher *enc; struct crypto_sync_skcipher *null; int err; auth = crypto_spawn_ahash(&ictx->auth); if (IS_ERR(auth)) return PTR_ERR(auth); enc = crypto_spawn_skcipher(&ictx->enc); err = PTR_ERR(enc); if (IS_ERR(enc)) goto err_free_ahash; null = crypto_get_default_null_skcipher(); err = PTR_ERR(null); if (IS_ERR(null)) goto err_free_skcipher; ctx->auth = auth; ctx->enc = enc; ctx->null = null; ctx->reqoff = 2 * crypto_ahash_digestsize(auth); crypto_aead_set_reqsize( tfm, sizeof(struct authenc_esn_request_ctx) + ctx->reqoff + max_t(unsigned int, crypto_ahash_reqsize(auth) + sizeof(struct ahash_request), sizeof(struct skcipher_request) + crypto_skcipher_reqsize(enc))); return 0; err_free_skcipher: crypto_free_skcipher(enc); err_free_ahash: crypto_free_ahash(auth); return err; } static void crypto_authenc_esn_exit_tfm(struct crypto_aead *tfm) { struct crypto_authenc_esn_ctx *ctx = crypto_aead_ctx(tfm); crypto_free_ahash(ctx->auth); crypto_free_skcipher(ctx->enc); crypto_put_default_null_skcipher(); } static void crypto_authenc_esn_free(struct aead_instance *inst) { struct authenc_esn_instance_ctx *ctx = aead_instance_ctx(inst); crypto_drop_skcipher(&ctx->enc); crypto_drop_ahash(&ctx->auth); kfree(inst); } static int crypto_authenc_esn_create(struct crypto_template *tmpl, struct rtattr **tb) { u32 mask; struct aead_instance *inst; struct authenc_esn_instance_ctx *ctx; struct skcipher_alg_common *enc; struct hash_alg_common *auth; struct crypto_alg *auth_base; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_AEAD, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*ctx), GFP_KERNEL); if (!inst) return -ENOMEM; ctx = aead_instance_ctx(inst); err = crypto_grab_ahash(&ctx->auth, aead_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; auth = crypto_spawn_ahash_alg(&ctx->auth); auth_base = &auth->base; err = crypto_grab_skcipher(&ctx->enc, aead_crypto_instance(inst), crypto_attr_alg_name(tb[2]), 0, mask); if (err) goto err_free_inst; enc = crypto_spawn_skcipher_alg_common(&ctx->enc); err = -ENAMETOOLONG; if (snprintf(inst->alg.base.cra_name, CRYPTO_MAX_ALG_NAME, "authencesn(%s,%s)", auth_base->cra_name, enc->base.cra_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; if (snprintf(inst->alg.base.cra_driver_name, CRYPTO_MAX_ALG_NAME, "authencesn(%s,%s)", auth_base->cra_driver_name, enc->base.cra_driver_name) >= CRYPTO_MAX_ALG_NAME) goto err_free_inst; inst->alg.base.cra_priority = enc->base.cra_priority * 10 + auth_base->cra_priority; inst->alg.base.cra_blocksize = enc->base.cra_blocksize; inst->alg.base.cra_alignmask = enc->base.cra_alignmask; inst->alg.base.cra_ctxsize = sizeof(struct crypto_authenc_esn_ctx); inst->alg.ivsize = enc->ivsize; inst->alg.chunksize = enc->chunksize; inst->alg.maxauthsize = auth->digestsize; inst->alg.init = crypto_authenc_esn_init_tfm; inst->alg.exit = crypto_authenc_esn_exit_tfm; inst->alg.setkey = crypto_authenc_esn_setkey; inst->alg.setauthsize = crypto_authenc_esn_setauthsize; inst->alg.encrypt = crypto_authenc_esn_encrypt; inst->alg.decrypt = crypto_authenc_esn_decrypt; inst->free = crypto_authenc_esn_free; err = aead_register_instance(tmpl, inst); if (err) { err_free_inst: crypto_authenc_esn_free(inst); } return err; } static struct crypto_template crypto_authenc_esn_tmpl = { .name = "authencesn", .create = crypto_authenc_esn_create, .module = THIS_MODULE, }; static int __init crypto_authenc_esn_module_init(void) { return crypto_register_template(&crypto_authenc_esn_tmpl); } static void __exit crypto_authenc_esn_module_exit(void) { crypto_unregister_template(&crypto_authenc_esn_tmpl); } subsys_initcall(crypto_authenc_esn_module_init); module_exit(crypto_authenc_esn_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); MODULE_DESCRIPTION("AEAD wrapper for IPsec with extended sequence numbers"); MODULE_ALIAS_CRYPTO("authencesn");
574 632 16 77 77 78 77 3 77 77 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2008 IBM Corporation * * Authors: * Mimi Zohar <zohar@us.ibm.com> * * File: ima_iint.c * - implements the IMA hook: ima_inode_free * - cache integrity information in the inode security blob */ #include <linux/slab.h> #include "ima.h" static struct kmem_cache *ima_iint_cache __ro_after_init; /** * ima_iint_find - Return the iint associated with an inode * @inode: Pointer to the inode * * Return the IMA integrity information (iint) associated with an inode, if the * inode was processed by IMA. * * Return: Found iint or NULL. */ struct ima_iint_cache *ima_iint_find(struct inode *inode) { if (!IS_IMA(inode)) return NULL; return ima_inode_get_iint(inode); } #define IMA_MAX_NESTING (FILESYSTEM_MAX_STACK_DEPTH + 1) /* * It is not clear that IMA should be nested at all, but as long is it measures * files both on overlayfs and on underlying fs, we need to annotate the iint * mutex to avoid lockdep false positives related to IMA + overlayfs. * See ovl_lockdep_annotate_inode_mutex_key() for more details. */ static inline void ima_iint_lockdep_annotate(struct ima_iint_cache *iint, struct inode *inode) { #ifdef CONFIG_LOCKDEP static struct lock_class_key ima_iint_mutex_key[IMA_MAX_NESTING]; int depth = inode->i_sb->s_stack_depth; if (WARN_ON_ONCE(depth < 0 || depth >= IMA_MAX_NESTING)) depth = 0; lockdep_set_class(&iint->mutex, &ima_iint_mutex_key[depth]); #endif } static void ima_iint_init_always(struct ima_iint_cache *iint, struct inode *inode) { iint->ima_hash = NULL; iint->real_inode.version = 0; iint->flags = 0UL; iint->atomic_flags = 0UL; iint->ima_file_status = INTEGRITY_UNKNOWN; iint->ima_mmap_status = INTEGRITY_UNKNOWN; iint->ima_bprm_status = INTEGRITY_UNKNOWN; iint->ima_read_status = INTEGRITY_UNKNOWN; iint->ima_creds_status = INTEGRITY_UNKNOWN; iint->measured_pcrs = 0; mutex_init(&iint->mutex); ima_iint_lockdep_annotate(iint, inode); } static void ima_iint_free(struct ima_iint_cache *iint) { kfree(iint->ima_hash); mutex_destroy(&iint->mutex); kmem_cache_free(ima_iint_cache, iint); } /** * ima_inode_get - Find or allocate an iint associated with an inode * @inode: Pointer to the inode * * Find an iint associated with an inode, and allocate a new one if not found. * Caller must lock i_mutex. * * Return: An iint on success, NULL on error. */ struct ima_iint_cache *ima_inode_get(struct inode *inode) { struct ima_iint_cache *iint; iint = ima_iint_find(inode); if (iint) return iint; iint = kmem_cache_alloc(ima_iint_cache, GFP_NOFS); if (!iint) return NULL; ima_iint_init_always(iint, inode); inode->i_flags |= S_IMA; ima_inode_set_iint(inode, iint); return iint; } /** * ima_inode_free_rcu - Called to free an inode via a RCU callback * @inode_security: The inode->i_security pointer * * Free the IMA data associated with an inode. */ void ima_inode_free_rcu(void *inode_security) { struct ima_iint_cache **iint_p = inode_security + ima_blob_sizes.lbs_inode; /* *iint_p should be NULL if !IS_IMA(inode) */ if (*iint_p) ima_iint_free(*iint_p); } static void ima_iint_init_once(void *foo) { struct ima_iint_cache *iint = (struct ima_iint_cache *)foo; memset(iint, 0, sizeof(*iint)); } void __init ima_iintcache_init(void) { ima_iint_cache = kmem_cache_create("ima_iint_cache", sizeof(struct ima_iint_cache), 0, SLAB_PANIC, ima_iint_init_once); }
15 1 15 15 15 15 15 15 15 8 13 1 1 13 13 15 13 8 11 12 12 12 12 11 12 12 12 12 11 2 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 // SPDX-License-Identifier: GPL-2.0-or-later #include <crypto/hash.h> #include <linux/cpu.h> #include <linux/kref.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/workqueue.h> #include <net/tcp.h> static size_t __scratch_size; struct sigpool_scratch { local_lock_t bh_lock; void __rcu *pad; }; static DEFINE_PER_CPU(struct sigpool_scratch, sigpool_scratch) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; struct sigpool_entry { struct crypto_ahash *hash; const char *alg; struct kref kref; uint16_t needs_key:1, reserved:15; }; #define CPOOL_SIZE (PAGE_SIZE / sizeof(struct sigpool_entry)) static struct sigpool_entry cpool[CPOOL_SIZE]; static unsigned int cpool_populated; static DEFINE_MUTEX(cpool_mutex); /* Slow-path */ struct scratches_to_free { struct rcu_head rcu; unsigned int cnt; void *scratches[]; }; static void free_old_scratches(struct rcu_head *head) { struct scratches_to_free *stf; stf = container_of(head, struct scratches_to_free, rcu); while (stf->cnt--) kfree(stf->scratches[stf->cnt]); kfree(stf); } /** * sigpool_reserve_scratch - re-allocates scratch buffer, slow-path * @size: request size for the scratch/temp buffer */ static int sigpool_reserve_scratch(size_t size) { struct scratches_to_free *stf; size_t stf_sz = struct_size(stf, scratches, num_possible_cpus()); int cpu, err = 0; lockdep_assert_held(&cpool_mutex); if (__scratch_size >= size) return 0; stf = kmalloc(stf_sz, GFP_KERNEL); if (!stf) return -ENOMEM; stf->cnt = 0; size = max(size, __scratch_size); cpus_read_lock(); for_each_possible_cpu(cpu) { void *scratch, *old_scratch; scratch = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); if (!scratch) { err = -ENOMEM; break; } old_scratch = rcu_replace_pointer(per_cpu(sigpool_scratch.pad, cpu), scratch, lockdep_is_held(&cpool_mutex)); if (!cpu_online(cpu) || !old_scratch) { kfree(old_scratch); continue; } stf->scratches[stf->cnt++] = old_scratch; } cpus_read_unlock(); if (!err) __scratch_size = size; call_rcu(&stf->rcu, free_old_scratches); return err; } static void sigpool_scratch_free(void) { int cpu; for_each_possible_cpu(cpu) kfree(rcu_replace_pointer(per_cpu(sigpool_scratch.pad, cpu), NULL, lockdep_is_held(&cpool_mutex))); __scratch_size = 0; } static int __cpool_try_clone(struct crypto_ahash *hash) { struct crypto_ahash *tmp; tmp = crypto_clone_ahash(hash); if (IS_ERR(tmp)) return PTR_ERR(tmp); crypto_free_ahash(tmp); return 0; } static int __cpool_alloc_ahash(struct sigpool_entry *e, const char *alg) { struct crypto_ahash *cpu0_hash; int ret; e->alg = kstrdup(alg, GFP_KERNEL); if (!e->alg) return -ENOMEM; cpu0_hash = crypto_alloc_ahash(alg, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(cpu0_hash)) { ret = PTR_ERR(cpu0_hash); goto out_free_alg; } e->needs_key = crypto_ahash_get_flags(cpu0_hash) & CRYPTO_TFM_NEED_KEY; ret = __cpool_try_clone(cpu0_hash); if (ret) goto out_free_cpu0_hash; e->hash = cpu0_hash; kref_init(&e->kref); return 0; out_free_cpu0_hash: crypto_free_ahash(cpu0_hash); out_free_alg: kfree(e->alg); e->alg = NULL; return ret; } /** * tcp_sigpool_alloc_ahash - allocates pool for ahash requests * @alg: name of async hash algorithm * @scratch_size: reserve a tcp_sigpool::scratch buffer of this size */ int tcp_sigpool_alloc_ahash(const char *alg, size_t scratch_size) { int i, ret; /* slow-path */ mutex_lock(&cpool_mutex); ret = sigpool_reserve_scratch(scratch_size); if (ret) goto out; for (i = 0; i < cpool_populated; i++) { if (!cpool[i].alg) continue; if (strcmp(cpool[i].alg, alg)) continue; /* pairs with tcp_sigpool_release() */ if (!kref_get_unless_zero(&cpool[i].kref)) kref_init(&cpool[i].kref); ret = i; goto out; } for (i = 0; i < cpool_populated; i++) { if (!cpool[i].alg) break; } if (i >= CPOOL_SIZE) { ret = -ENOSPC; goto out; } ret = __cpool_alloc_ahash(&cpool[i], alg); if (!ret) { ret = i; if (i == cpool_populated) cpool_populated++; } out: mutex_unlock(&cpool_mutex); return ret; } EXPORT_SYMBOL_GPL(tcp_sigpool_alloc_ahash); static void __cpool_free_entry(struct sigpool_entry *e) { crypto_free_ahash(e->hash); kfree(e->alg); memset(e, 0, sizeof(*e)); } static void cpool_cleanup_work_cb(struct work_struct *work) { bool free_scratch = true; unsigned int i; mutex_lock(&cpool_mutex); for (i = 0; i < cpool_populated; i++) { if (kref_read(&cpool[i].kref) > 0) { free_scratch = false; continue; } if (!cpool[i].alg) continue; __cpool_free_entry(&cpool[i]); } if (free_scratch) sigpool_scratch_free(); mutex_unlock(&cpool_mutex); } static DECLARE_WORK(cpool_cleanup_work, cpool_cleanup_work_cb); static void cpool_schedule_cleanup(struct kref *kref) { schedule_work(&cpool_cleanup_work); } /** * tcp_sigpool_release - decreases number of users for a pool. If it was * the last user of the pool, releases any memory that was consumed. * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash() */ void tcp_sigpool_release(unsigned int id) { if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg)) return; /* slow-path */ kref_put(&cpool[id].kref, cpool_schedule_cleanup); } EXPORT_SYMBOL_GPL(tcp_sigpool_release); /** * tcp_sigpool_get - increases number of users (refcounter) for a pool * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash() */ void tcp_sigpool_get(unsigned int id) { if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg)) return; kref_get(&cpool[id].kref); } EXPORT_SYMBOL_GPL(tcp_sigpool_get); int tcp_sigpool_start(unsigned int id, struct tcp_sigpool *c) __cond_acquires(RCU_BH) { struct crypto_ahash *hash; rcu_read_lock_bh(); if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg)) { rcu_read_unlock_bh(); return -EINVAL; } hash = crypto_clone_ahash(cpool[id].hash); if (IS_ERR(hash)) { rcu_read_unlock_bh(); return PTR_ERR(hash); } c->req = ahash_request_alloc(hash, GFP_ATOMIC); if (!c->req) { crypto_free_ahash(hash); rcu_read_unlock_bh(); return -ENOMEM; } ahash_request_set_callback(c->req, 0, NULL, NULL); /* Pairs with tcp_sigpool_reserve_scratch(), scratch area is * valid (allocated) until tcp_sigpool_end(). */ local_lock_nested_bh(&sigpool_scratch.bh_lock); c->scratch = rcu_dereference_bh(*this_cpu_ptr(&sigpool_scratch.pad)); return 0; } EXPORT_SYMBOL_GPL(tcp_sigpool_start); void tcp_sigpool_end(struct tcp_sigpool *c) __releases(RCU_BH) { struct crypto_ahash *hash = crypto_ahash_reqtfm(c->req); local_unlock_nested_bh(&sigpool_scratch.bh_lock); rcu_read_unlock_bh(); ahash_request_free(c->req); crypto_free_ahash(hash); } EXPORT_SYMBOL_GPL(tcp_sigpool_end); /** * tcp_sigpool_algo - return algorithm of tcp_sigpool * @id: tcp_sigpool that was previously allocated by tcp_sigpool_alloc_ahash() * @buf: buffer to return name of algorithm * @buf_len: size of @buf */ size_t tcp_sigpool_algo(unsigned int id, char *buf, size_t buf_len) { if (WARN_ON_ONCE(id >= cpool_populated || !cpool[id].alg)) return -EINVAL; return strscpy(buf, cpool[id].alg, buf_len); } EXPORT_SYMBOL_GPL(tcp_sigpool_algo); /** * tcp_sigpool_hash_skb_data - hash data in skb with initialized tcp_sigpool * @hp: tcp_sigpool pointer * @skb: buffer to add sign for * @header_len: TCP header length for this segment */ int tcp_sigpool_hash_skb_data(struct tcp_sigpool *hp, const struct sk_buff *skb, unsigned int header_len) { const unsigned int head_data_len = skb_headlen(skb) > header_len ? skb_headlen(skb) - header_len : 0; const struct skb_shared_info *shi = skb_shinfo(skb); const struct tcphdr *tp = tcp_hdr(skb); struct ahash_request *req = hp->req; struct sk_buff *frag_iter; struct scatterlist sg; unsigned int i; sg_init_table(&sg, 1); sg_set_buf(&sg, ((u8 *)tp) + header_len, head_data_len); ahash_request_set_crypt(req, &sg, NULL, head_data_len); if (crypto_ahash_update(req)) return 1; for (i = 0; i < shi->nr_frags; ++i) { const skb_frag_t *f = &shi->frags[i]; unsigned int offset = skb_frag_off(f); struct page *page; page = skb_frag_page(f) + (offset >> PAGE_SHIFT); sg_set_page(&sg, page, skb_frag_size(f), offset_in_page(offset)); ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f)); if (crypto_ahash_update(req)) return 1; } skb_walk_frags(skb, frag_iter) if (tcp_sigpool_hash_skb_data(hp, frag_iter, 0)) return 1; return 0; } EXPORT_SYMBOL(tcp_sigpool_hash_skb_data); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Per-CPU pool of crypto requests");
1 7 7 5 4 5 5 1 6 2 4 4 2 3 3 5 6 9 9 7 1 4 5 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 // SPDX-License-Identifier: GPL-2.0-or-later /* * Glue Code for assembler optimized version of 3DES * * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> * * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/algapi.h> #include <crypto/des.h> #include <crypto/internal/skcipher.h> #include <linux/crypto.h> #include <linux/init.h> #include <linux/module.h> #include <linux/types.h> struct des3_ede_x86_ctx { struct des3_ede_ctx enc; struct des3_ede_ctx dec; }; /* regular block cipher functions */ asmlinkage void des3_ede_x86_64_crypt_blk(const u32 *expkey, u8 *dst, const u8 *src); /* 3-way parallel cipher functions */ asmlinkage void des3_ede_x86_64_crypt_blk_3way(const u32 *expkey, u8 *dst, const u8 *src); static inline void des3_ede_enc_blk(struct des3_ede_x86_ctx *ctx, u8 *dst, const u8 *src) { u32 *enc_ctx = ctx->enc.expkey; des3_ede_x86_64_crypt_blk(enc_ctx, dst, src); } static inline void des3_ede_dec_blk(struct des3_ede_x86_ctx *ctx, u8 *dst, const u8 *src) { u32 *dec_ctx = ctx->dec.expkey; des3_ede_x86_64_crypt_blk(dec_ctx, dst, src); } static inline void des3_ede_dec_blk_3way(struct des3_ede_x86_ctx *ctx, u8 *dst, const u8 *src) { u32 *dec_ctx = ctx->dec.expkey; des3_ede_x86_64_crypt_blk_3way(dec_ctx, dst, src); } static void des3_ede_x86_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { des3_ede_enc_blk(crypto_tfm_ctx(tfm), dst, src); } static void des3_ede_x86_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { des3_ede_dec_blk(crypto_tfm_ctx(tfm), dst, src); } static int ecb_crypt(struct skcipher_request *req, const u32 *expkey) { const unsigned int bsize = DES3_EDE_BLOCK_SIZE; struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes)) { u8 *wsrc = walk.src.virt.addr; u8 *wdst = walk.dst.virt.addr; /* Process four block batch */ if (nbytes >= bsize * 3) { do { des3_ede_x86_64_crypt_blk_3way(expkey, wdst, wsrc); wsrc += bsize * 3; wdst += bsize * 3; nbytes -= bsize * 3; } while (nbytes >= bsize * 3); if (nbytes < bsize) goto done; } /* Handle leftovers */ do { des3_ede_x86_64_crypt_blk(expkey, wdst, wsrc); wsrc += bsize; wdst += bsize; nbytes -= bsize; } while (nbytes >= bsize); done: err = skcipher_walk_done(&walk, nbytes); } return err; } static int ecb_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); return ecb_crypt(req, ctx->enc.expkey); } static int ecb_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); return ecb_crypt(req, ctx->dec.expkey); } static unsigned int __cbc_encrypt(struct des3_ede_x86_ctx *ctx, struct skcipher_walk *walk) { unsigned int bsize = DES3_EDE_BLOCK_SIZE; unsigned int nbytes = walk->nbytes; u64 *src = (u64 *)walk->src.virt.addr; u64 *dst = (u64 *)walk->dst.virt.addr; u64 *iv = (u64 *)walk->iv; do { *dst = *src ^ *iv; des3_ede_enc_blk(ctx, (u8 *)dst, (u8 *)dst); iv = dst; src += 1; dst += 1; nbytes -= bsize; } while (nbytes >= bsize); *(u64 *)walk->iv = *iv; return nbytes; } static int cbc_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while (walk.nbytes) { nbytes = __cbc_encrypt(ctx, &walk); err = skcipher_walk_done(&walk, nbytes); } return err; } static unsigned int __cbc_decrypt(struct des3_ede_x86_ctx *ctx, struct skcipher_walk *walk) { unsigned int bsize = DES3_EDE_BLOCK_SIZE; unsigned int nbytes = walk->nbytes; u64 *src = (u64 *)walk->src.virt.addr; u64 *dst = (u64 *)walk->dst.virt.addr; u64 ivs[3 - 1]; u64 last_iv; /* Start of the last block. */ src += nbytes / bsize - 1; dst += nbytes / bsize - 1; last_iv = *src; /* Process four block batch */ if (nbytes >= bsize * 3) { do { nbytes -= bsize * 3 - bsize; src -= 3 - 1; dst -= 3 - 1; ivs[0] = src[0]; ivs[1] = src[1]; des3_ede_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src); dst[1] ^= ivs[0]; dst[2] ^= ivs[1]; nbytes -= bsize; if (nbytes < bsize) goto done; *dst ^= *(src - 1); src -= 1; dst -= 1; } while (nbytes >= bsize * 3); } /* Handle leftovers */ for (;;) { des3_ede_dec_blk(ctx, (u8 *)dst, (u8 *)src); nbytes -= bsize; if (nbytes < bsize) break; *dst ^= *(src - 1); src -= 1; dst -= 1; } done: *dst ^= *(u64 *)walk->iv; *(u64 *)walk->iv = last_iv; return nbytes; } static int cbc_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct des3_ede_x86_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while (walk.nbytes) { nbytes = __cbc_decrypt(ctx, &walk); err = skcipher_walk_done(&walk, nbytes); } return err; } static int des3_ede_x86_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { struct des3_ede_x86_ctx *ctx = crypto_tfm_ctx(tfm); u32 i, j, tmp; int err; err = des3_ede_expand_key(&ctx->enc, key, keylen); if (err == -ENOKEY) { if (crypto_tfm_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS) err = -EINVAL; else err = 0; } if (err) { memset(ctx, 0, sizeof(*ctx)); return err; } /* Fix encryption context for this implementation and form decryption * context. */ j = DES3_EDE_EXPKEY_WORDS - 2; for (i = 0; i < DES3_EDE_EXPKEY_WORDS; i += 2, j -= 2) { tmp = ror32(ctx->enc.expkey[i + 1], 4); ctx->enc.expkey[i + 1] = tmp; ctx->dec.expkey[j + 0] = ctx->enc.expkey[i + 0]; ctx->dec.expkey[j + 1] = tmp; } return 0; } static int des3_ede_x86_setkey_skcipher(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { return des3_ede_x86_setkey(&tfm->base, key, keylen); } static struct crypto_alg des3_ede_cipher = { .cra_name = "des3_ede", .cra_driver_name = "des3_ede-asm", .cra_priority = 200, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = DES3_EDE_BLOCK_SIZE, .cra_ctxsize = sizeof(struct des3_ede_x86_ctx), .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = DES3_EDE_KEY_SIZE, .cia_max_keysize = DES3_EDE_KEY_SIZE, .cia_setkey = des3_ede_x86_setkey, .cia_encrypt = des3_ede_x86_encrypt, .cia_decrypt = des3_ede_x86_decrypt, } } }; static struct skcipher_alg des3_ede_skciphers[] = { { .base.cra_name = "ecb(des3_ede)", .base.cra_driver_name = "ecb-des3_ede-asm", .base.cra_priority = 300, .base.cra_blocksize = DES3_EDE_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx), .base.cra_module = THIS_MODULE, .min_keysize = DES3_EDE_KEY_SIZE, .max_keysize = DES3_EDE_KEY_SIZE, .setkey = des3_ede_x86_setkey_skcipher, .encrypt = ecb_encrypt, .decrypt = ecb_decrypt, }, { .base.cra_name = "cbc(des3_ede)", .base.cra_driver_name = "cbc-des3_ede-asm", .base.cra_priority = 300, .base.cra_blocksize = DES3_EDE_BLOCK_SIZE, .base.cra_ctxsize = sizeof(struct des3_ede_x86_ctx), .base.cra_module = THIS_MODULE, .min_keysize = DES3_EDE_KEY_SIZE, .max_keysize = DES3_EDE_KEY_SIZE, .ivsize = DES3_EDE_BLOCK_SIZE, .setkey = des3_ede_x86_setkey_skcipher, .encrypt = cbc_encrypt, .decrypt = cbc_decrypt, } }; static bool is_blacklisted_cpu(void) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return false; if (boot_cpu_data.x86 == 0x0f) { /* * On Pentium 4, des3_ede-x86_64 is slower than generic C * implementation because use of 64bit rotates (which are really * slow on P4). Therefore blacklist P4s. */ return true; } return false; } static int force; module_param(force, int, 0); MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); static int __init des3_ede_x86_init(void) { int err; if (!force && is_blacklisted_cpu()) { pr_info("des3_ede-x86_64: performance on this CPU would be suboptimal: disabling des3_ede-x86_64.\n"); return -ENODEV; } err = crypto_register_alg(&des3_ede_cipher); if (err) return err; err = crypto_register_skciphers(des3_ede_skciphers, ARRAY_SIZE(des3_ede_skciphers)); if (err) crypto_unregister_alg(&des3_ede_cipher); return err; } static void __exit des3_ede_x86_fini(void) { crypto_unregister_alg(&des3_ede_cipher); crypto_unregister_skciphers(des3_ede_skciphers, ARRAY_SIZE(des3_ede_skciphers)); } module_init(des3_ede_x86_init); module_exit(des3_ede_x86_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Triple DES EDE Cipher Algorithm, asm optimized"); MODULE_ALIAS_CRYPTO("des3_ede"); MODULE_ALIAS_CRYPTO("des3_ede-asm"); MODULE_AUTHOR("Jussi Kivilinna <jussi.kivilinna@iki.fi>");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Landlock LSM - Credential hooks * * Copyright © 2019-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2019-2020 ANSSI */ #ifndef _SECURITY_LANDLOCK_CRED_H #define _SECURITY_LANDLOCK_CRED_H #include <linux/cred.h> #include <linux/init.h> #include <linux/rcupdate.h> #include "ruleset.h" #include "setup.h" struct landlock_cred_security { struct landlock_ruleset *domain; }; static inline struct landlock_cred_security * landlock_cred(const struct cred *cred) { return cred->security + landlock_blob_sizes.lbs_cred; } static inline struct landlock_ruleset *landlock_get_current_domain(void) { return landlock_cred(current_cred())->domain; } /* * The call needs to come from an RCU read-side critical section. */ static inline const struct landlock_ruleset * landlock_get_task_domain(const struct task_struct *const task) { return landlock_cred(__task_cred(task))->domain; } static inline bool landlocked(const struct task_struct *const task) { bool has_dom; if (task == current) return !!landlock_get_current_domain(); rcu_read_lock(); has_dom = !!landlock_get_task_domain(task); rcu_read_unlock(); return has_dom; } __init void landlock_add_cred_hooks(void); #endif /* _SECURITY_LANDLOCK_CRED_H */
6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 11 11 11 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 50 50 50 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 // SPDX-License-Identifier: GPL-2.0 /* * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler, * for the blk-mq scheduling framework * * Copyright (C) 2016 Jens Axboe <axboe@kernel.dk> */ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/compiler.h> #include <linux/rbtree.h> #include <linux/sbitmap.h> #include <trace/events/block.h> #include "elevator.h" #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" /* * See Documentation/block/deadline-iosched.rst */ static const int read_expire = HZ / 2; /* max time before a read is submitted. */ static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ /* * Time after which to dispatch lower priority requests even if higher * priority requests are pending. */ static const int prio_aging_expire = 10 * HZ; static const int writes_starved = 2; /* max times reads can starve a write */ static const int fifo_batch = 16; /* # of sequential requests treated as one by the above parameters. For throughput. */ enum dd_data_dir { DD_READ = READ, DD_WRITE = WRITE, }; enum { DD_DIR_COUNT = 2 }; enum dd_prio { DD_RT_PRIO = 0, DD_BE_PRIO = 1, DD_IDLE_PRIO = 2, DD_PRIO_MAX = 2, }; enum { DD_PRIO_COUNT = 3 }; /* * I/O statistics per I/O priority. It is fine if these counters overflow. * What matters is that these counters are at least as wide as * log2(max_outstanding_requests). */ struct io_stats_per_prio { uint32_t inserted; uint32_t merged; uint32_t dispatched; atomic_t completed; }; /* * Deadline scheduler data per I/O priority (enum dd_prio). Requests are * present on both sort_list[] and fifo_list[]. */ struct dd_per_prio { struct list_head dispatch; struct rb_root sort_list[DD_DIR_COUNT]; struct list_head fifo_list[DD_DIR_COUNT]; /* Position of the most recently dispatched request. */ sector_t latest_pos[DD_DIR_COUNT]; struct io_stats_per_prio stats; }; struct deadline_data { /* * run time data */ struct dd_per_prio per_prio[DD_PRIO_COUNT]; /* Data direction of latest dispatched request. */ enum dd_data_dir last_dir; unsigned int batching; /* number of sequential requests made */ unsigned int starved; /* times reads have starved writes */ /* * settings that change how the i/o scheduler behaves */ int fifo_expire[DD_DIR_COUNT]; int fifo_batch; int writes_starved; int front_merges; u32 async_depth; int prio_aging_expire; spinlock_t lock; }; /* Maps an I/O priority class to a deadline scheduler priority. */ static const enum dd_prio ioprio_class_to_prio[] = { [IOPRIO_CLASS_NONE] = DD_BE_PRIO, [IOPRIO_CLASS_RT] = DD_RT_PRIO, [IOPRIO_CLASS_BE] = DD_BE_PRIO, [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO, }; static inline struct rb_root * deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq) { return &per_prio->sort_list[rq_data_dir(rq)]; } /* * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a * request. */ static u8 dd_rq_ioclass(struct request *rq) { return IOPRIO_PRIO_CLASS(req_get_ioprio(rq)); } /* * Return the first request for which blk_rq_pos() >= @pos. */ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, enum dd_data_dir data_dir, sector_t pos) { struct rb_node *node = per_prio->sort_list[data_dir].rb_node; struct request *rq, *res = NULL; if (!node) return NULL; rq = rb_entry_rq(node); while (node) { rq = rb_entry_rq(node); if (blk_rq_pos(rq) >= pos) { res = rq; node = node->rb_left; } else { node = node->rb_right; } } return res; } static void deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { struct rb_root *root = deadline_rb_root(per_prio, rq); elv_rb_add(root, rq); } static inline void deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { elv_rb_del(deadline_rb_root(per_prio, rq), rq); } /* * remove rq from rbtree and fifo. */ static void deadline_remove_request(struct request_queue *q, struct dd_per_prio *per_prio, struct request *rq) { list_del_init(&rq->queuelist); /* * We might not be on the rbtree, if we are doing an insert merge */ if (!RB_EMPTY_NODE(&rq->rb_node)) deadline_del_rq_rb(per_prio, rq); elv_rqhash_del(q, rq); if (q->last_merge == rq) q->last_merge = NULL; } static void dd_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = dd_rq_ioclass(req); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; struct dd_per_prio *per_prio = &dd->per_prio[prio]; /* * if the merge was a front merge, we need to reposition request */ if (type == ELEVATOR_FRONT_MERGE) { elv_rb_del(deadline_rb_root(per_prio, req), req); deadline_add_rq_rb(per_prio, req); } } /* * Callback function that is invoked after @next has been merged into @req. */ static void dd_merged_requests(struct request_queue *q, struct request *req, struct request *next) { struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = dd_rq_ioclass(next); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; lockdep_assert_held(&dd->lock); dd->per_prio[prio].stats.merged++; /* * if next expires before rq, assign its expire time to rq * and move into next position (next will be deleted) in fifo */ if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { if (time_before((unsigned long)next->fifo_time, (unsigned long)req->fifo_time)) { list_move(&req->queuelist, &next->queuelist); req->fifo_time = next->fifo_time; } } /* * kill knowledge of next, this one is a goner */ deadline_remove_request(q, &dd->per_prio[prio], next); } /* * move an entry to dispatch queue */ static void deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio, struct request *rq) { /* * take it off the sort and fifo list */ deadline_remove_request(rq->q, per_prio, rq); } /* Number of requests queued for a given priority level. */ static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio) { const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats; lockdep_assert_held(&dd->lock); return stats->inserted - atomic_read(&stats->completed); } /* * deadline_check_fifo returns true if and only if there are expired requests * in the FIFO list. Requires !list_empty(&dd->fifo_list[data_dir]). */ static inline bool deadline_check_fifo(struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); return time_is_before_eq_jiffies((unsigned long)rq->fifo_time); } /* * For the specified data direction, return the next request to * dispatch using arrival ordered lists. */ static struct request * deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { if (list_empty(&per_prio->fifo_list[data_dir])) return NULL; return rq_entry_fifo(per_prio->fifo_list[data_dir].next); } /* * For the specified data direction, return the next request to * dispatch using sector position sorted lists. */ static struct request * deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { return deadline_from_pos(per_prio, data_dir, per_prio->latest_pos[data_dir]); } /* * Returns true if and only if @rq started after @latest_start where * @latest_start is in jiffies. */ static bool started_after(struct deadline_data *dd, struct request *rq, unsigned long latest_start) { unsigned long start_time = (unsigned long)rq->fifo_time; start_time -= dd->fifo_expire[rq_data_dir(rq)]; return time_after(start_time, latest_start); } /* * deadline_dispatch_requests selects the best request according to * read/write expire, fifo_batch, etc and with a start time <= @latest_start. */ static struct request *__dd_dispatch_request(struct deadline_data *dd, struct dd_per_prio *per_prio, unsigned long latest_start) { struct request *rq, *next_rq; enum dd_data_dir data_dir; enum dd_prio prio; u8 ioprio_class; lockdep_assert_held(&dd->lock); if (!list_empty(&per_prio->dispatch)) { rq = list_first_entry(&per_prio->dispatch, struct request, queuelist); if (started_after(dd, rq, latest_start)) return NULL; list_del_init(&rq->queuelist); data_dir = rq_data_dir(rq); goto done; } /* * batches are currently reads XOR writes */ rq = deadline_next_request(dd, per_prio, dd->last_dir); if (rq && dd->batching < dd->fifo_batch) { /* we have a next request and are still entitled to batch */ data_dir = rq_data_dir(rq); goto dispatch_request; } /* * at this point we are not running a batch. select the appropriate * data direction (read / write) */ if (!list_empty(&per_prio->fifo_list[DD_READ])) { BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ])); if (deadline_fifo_request(dd, per_prio, DD_WRITE) && (dd->starved++ >= dd->writes_starved)) goto dispatch_writes; data_dir = DD_READ; goto dispatch_find_request; } /* * there are either no reads or writes have been starved */ if (!list_empty(&per_prio->fifo_list[DD_WRITE])) { dispatch_writes: BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE])); dd->starved = 0; data_dir = DD_WRITE; goto dispatch_find_request; } return NULL; dispatch_find_request: /* * we are not running a batch, find best request for selected data_dir */ next_rq = deadline_next_request(dd, per_prio, data_dir); if (deadline_check_fifo(per_prio, data_dir) || !next_rq) { /* * A deadline has expired, the last request was in the other * direction, or we have run out of higher-sectored requests. * Start again from the request with the earliest expiry time. */ rq = deadline_fifo_request(dd, per_prio, data_dir); } else { /* * The last req was the same dir and we have a next request in * sort order. No expired requests so continue on from here. */ rq = next_rq; } if (!rq) return NULL; dd->last_dir = data_dir; dd->batching = 0; dispatch_request: if (started_after(dd, rq, latest_start)) return NULL; /* * rq is the selected appropriate request. */ dd->batching++; deadline_move_request(dd, per_prio, rq); done: ioprio_class = dd_rq_ioclass(rq); prio = ioprio_class_to_prio[ioprio_class]; dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq); dd->per_prio[prio].stats.dispatched++; rq->rq_flags |= RQF_STARTED; return rq; } /* * Check whether there are any requests with priority other than DD_RT_PRIO * that were inserted more than prio_aging_expire jiffies ago. */ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd, unsigned long now) { struct request *rq; enum dd_prio prio; int prio_cnt; lockdep_assert_held(&dd->lock); prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) + !!dd_queued(dd, DD_IDLE_PRIO); if (prio_cnt < 2) return NULL; for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) { rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now - dd->prio_aging_expire); if (rq) return rq; } return NULL; } /* * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). * * One confusing aspect here is that we get called for a specific * hardware queue, but we may return a request that is for a * different hardware queue. This is because mq-deadline has shared * state for all hardware queues, in terms of sorting, FIFOs, etc. */ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; const unsigned long now = jiffies; struct request *rq; enum dd_prio prio; spin_lock(&dd->lock); rq = dd_dispatch_prio_aged_requests(dd, now); if (rq) goto unlock; /* * Next, dispatch requests in priority order. Ignore lower priority * requests if any higher priority requests are pending. */ for (prio = 0; prio <= DD_PRIO_MAX; prio++) { rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now); if (rq || dd_queued(dd, prio)) break; } unlock: spin_unlock(&dd->lock); return rq; } /* * 'depth' is a number in the range 1..INT_MAX representing a number of * requests. Scale it with a factor (1 << bt->sb.shift) / q->nr_requests since * 1..(1 << bt->sb.shift) is the range expected by sbitmap_get_shallow(). * Values larger than q->nr_requests have the same effect as q->nr_requests. */ static int dd_to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth) { struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags; const unsigned int nrr = hctx->queue->nr_requests; return ((qdepth << bt->sb.shift) + nrr - 1) / nrr; } /* * Called by __blk_mq_alloc_request(). The shallow_depth value set by this * function is used by __blk_mq_get_tag(). */ static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { struct deadline_data *dd = data->q->elevator->elevator_data; /* Do not throttle synchronous reads. */ if (op_is_sync(opf) && !op_is_write(opf)) return; /* * Throttle asynchronous requests and writes such that these requests * do not block the allocation of synchronous requests. */ data->shallow_depth = dd_to_word_depth(data->hctx, dd->async_depth); } /* Called by blk_mq_update_nr_requests(). */ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; dd->async_depth = q->nr_requests; sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); } /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */ static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { dd_depth_updated(hctx); return 0; } static void dd_exit_sched(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; enum dd_prio prio; for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; const struct io_stats_per_prio *stats = &per_prio->stats; uint32_t queued; WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ])); WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE])); spin_lock(&dd->lock); queued = dd_queued(dd, prio); spin_unlock(&dd->lock); WARN_ONCE(queued != 0, "statistics for priority %d: i %u m %u d %u c %u\n", prio, stats->inserted, stats->merged, stats->dispatched, atomic_read(&stats->completed)); } kfree(dd); } /* * initialize elevator private data (deadline_data). */ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) { struct deadline_data *dd; struct elevator_queue *eq; enum dd_prio prio; int ret = -ENOMEM; eq = elevator_alloc(q, e); if (!eq) return ret; dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); if (!dd) goto put_eq; eq->elevator_data = dd; for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; INIT_LIST_HEAD(&per_prio->dispatch); INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]); INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]); per_prio->sort_list[DD_READ] = RB_ROOT; per_prio->sort_list[DD_WRITE] = RB_ROOT; } dd->fifo_expire[DD_READ] = read_expire; dd->fifo_expire[DD_WRITE] = write_expire; dd->writes_starved = writes_starved; dd->front_merges = 1; dd->last_dir = DD_WRITE; dd->fifo_batch = fifo_batch; dd->prio_aging_expire = prio_aging_expire; spin_lock_init(&dd->lock); /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); q->elevator = eq; return 0; put_eq: kobject_put(&eq->kobj); return ret; } /* * Try to merge @bio into an existing request. If @bio has been merged into * an existing request, store the pointer to that request into *@rq. */ static int dd_request_merge(struct request_queue *q, struct request **rq, struct bio *bio) { struct deadline_data *dd = q->elevator->elevator_data; const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; struct dd_per_prio *per_prio = &dd->per_prio[prio]; sector_t sector = bio_end_sector(bio); struct request *__rq; if (!dd->front_merges) return ELEVATOR_NO_MERGE; __rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector); if (__rq) { BUG_ON(sector != blk_rq_pos(__rq)); if (elv_bio_merge_ok(__rq, bio)) { *rq = __rq; if (blk_discard_mergable(__rq)) return ELEVATOR_DISCARD_MERGE; return ELEVATOR_FRONT_MERGE; } } return ELEVATOR_NO_MERGE; } /* * Attempt to merge a bio into an existing request. This function is called * before @bio is associated with a request. */ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct deadline_data *dd = q->elevator->elevator_data; struct request *free = NULL; bool ret; spin_lock(&dd->lock); ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); spin_unlock(&dd->lock); if (free) blk_mq_free_request(free); return ret; } /* * add rq to rbtree and fifo */ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_insert_t flags, struct list_head *free) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; const enum dd_data_dir data_dir = rq_data_dir(rq); u16 ioprio = req_get_ioprio(rq); u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio); struct dd_per_prio *per_prio; enum dd_prio prio; lockdep_assert_held(&dd->lock); prio = ioprio_class_to_prio[ioprio_class]; per_prio = &dd->per_prio[prio]; if (!rq->elv.priv[0]) per_prio->stats.inserted++; rq->elv.priv[0] = per_prio; if (blk_mq_sched_try_insert_merge(q, rq, free)) return; trace_block_rq_insert(rq); if (flags & BLK_MQ_INSERT_AT_HEAD) { list_add(&rq->queuelist, &per_prio->dispatch); rq->fifo_time = jiffies; } else { deadline_add_rq_rb(per_prio, rq); if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); if (!q->last_merge) q->last_merge = rq; } /* * set expire time and add to fifo list */ rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]); } } /* * Called from blk_mq_insert_request() or blk_mq_dispatch_plug_list(). */ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list, blk_insert_t flags) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; LIST_HEAD(free); spin_lock(&dd->lock); while (!list_empty(list)) { struct request *rq; rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); dd_insert_request(hctx, rq, flags, &free); } spin_unlock(&dd->lock); blk_mq_free_requests(&free); } /* Callback from inside blk_mq_rq_ctx_init(). */ static void dd_prepare_request(struct request *rq) { rq->elv.priv[0] = NULL; } /* * Callback from inside blk_mq_free_request(). */ static void dd_finish_request(struct request *rq) { struct dd_per_prio *per_prio = rq->elv.priv[0]; /* * The block layer core may call dd_finish_request() without having * called dd_insert_requests(). Skip requests that bypassed I/O * scheduling. See also blk_mq_request_bypass_insert(). */ if (per_prio) atomic_inc(&per_prio->stats.completed); } static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) { return !list_empty_careful(&per_prio->dispatch) || !list_empty_careful(&per_prio->fifo_list[DD_READ]) || !list_empty_careful(&per_prio->fifo_list[DD_WRITE]); } static bool dd_has_work(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; enum dd_prio prio; for (prio = 0; prio <= DD_PRIO_MAX; prio++) if (dd_has_work_for_prio(&dd->per_prio[prio])) return true; return false; } /* * sysfs parts below */ #define SHOW_INT(__FUNC, __VAR) \ static ssize_t __FUNC(struct elevator_queue *e, char *page) \ { \ struct deadline_data *dd = e->elevator_data; \ \ return sysfs_emit(page, "%d\n", __VAR); \ } #define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire); SHOW_INT(deadline_writes_starved_show, dd->writes_starved); SHOW_INT(deadline_front_merges_show, dd->front_merges); SHOW_INT(deadline_async_depth_show, dd->async_depth); SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); #undef SHOW_INT #undef SHOW_JIFFIES #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct deadline_data *dd = e->elevator_data; \ int __data, __ret; \ \ __ret = kstrtoint(page, 0, &__data); \ if (__ret < 0) \ return __ret; \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ __data = (MAX); \ *(__PTR) = __CONV(__data); \ return count; \ } #define STORE_INT(__FUNC, __PTR, MIN, MAX) \ STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, ) #define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX) \ STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX); STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX); STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); #undef STORE_FUNCTION #undef STORE_INT #undef STORE_JIFFIES #define DD_ATTR(name) \ __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) static const struct elv_fs_entry deadline_attrs[] = { DD_ATTR(read_expire), DD_ATTR(write_expire), DD_ATTR(writes_starved), DD_ATTR(front_merges), DD_ATTR(async_depth), DD_ATTR(fifo_batch), DD_ATTR(prio_aging_expire), __ATTR_NULL }; #ifdef CONFIG_BLK_DEBUG_FS #define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) \ static void *deadline_##name##_fifo_start(struct seq_file *m, \ loff_t *pos) \ __acquires(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ spin_lock(&dd->lock); \ return seq_list_start(&per_prio->fifo_list[data_dir], *pos); \ } \ \ static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \ loff_t *pos) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ return seq_list_next(v, &per_prio->fifo_list[data_dir], pos); \ } \ \ static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \ __releases(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ \ spin_unlock(&dd->lock); \ } \ \ static const struct seq_operations deadline_##name##_fifo_seq_ops = { \ .start = deadline_##name##_fifo_start, \ .next = deadline_##name##_fifo_next, \ .stop = deadline_##name##_fifo_stop, \ .show = blk_mq_debugfs_rq_show, \ }; \ \ static int deadline_##name##_next_rq_show(void *data, \ struct seq_file *m) \ { \ struct request_queue *q = data; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ struct request *rq; \ \ rq = deadline_from_pos(per_prio, data_dir, \ per_prio->latest_pos[data_dir]); \ if (rq) \ __blk_mq_debugfs_rq_show(m, rq); \ return 0; \ } DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2); DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2); #undef DEADLINE_DEBUGFS_DDIR_ATTRS static int deadline_batching_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; seq_printf(m, "%u\n", dd->batching); return 0; } static int deadline_starved_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; seq_printf(m, "%u\n", dd->starved); return 0; } static int dd_async_depth_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; seq_printf(m, "%u\n", dd->async_depth); return 0; } static int dd_queued_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; u32 rt, be, idle; spin_lock(&dd->lock); rt = dd_queued(dd, DD_RT_PRIO); be = dd_queued(dd, DD_BE_PRIO); idle = dd_queued(dd, DD_IDLE_PRIO); spin_unlock(&dd->lock); seq_printf(m, "%u %u %u\n", rt, be, idle); return 0; } /* Number of requests owned by the block driver for a given priority. */ static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio) { const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats; lockdep_assert_held(&dd->lock); return stats->dispatched + stats->merged - atomic_read(&stats->completed); } static int dd_owned_by_driver_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; u32 rt, be, idle; spin_lock(&dd->lock); rt = dd_owned_by_driver(dd, DD_RT_PRIO); be = dd_owned_by_driver(dd, DD_BE_PRIO); idle = dd_owned_by_driver(dd, DD_IDLE_PRIO); spin_unlock(&dd->lock); seq_printf(m, "%u %u %u\n", rt, be, idle); return 0; } #define DEADLINE_DISPATCH_ATTR(prio) \ static void *deadline_dispatch##prio##_start(struct seq_file *m, \ loff_t *pos) \ __acquires(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ spin_lock(&dd->lock); \ return seq_list_start(&per_prio->dispatch, *pos); \ } \ \ static void *deadline_dispatch##prio##_next(struct seq_file *m, \ void *v, loff_t *pos) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ return seq_list_next(v, &per_prio->dispatch, pos); \ } \ \ static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v) \ __releases(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ \ spin_unlock(&dd->lock); \ } \ \ static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \ .start = deadline_dispatch##prio##_start, \ .next = deadline_dispatch##prio##_next, \ .stop = deadline_dispatch##prio##_stop, \ .show = blk_mq_debugfs_rq_show, \ } DEADLINE_DISPATCH_ATTR(0); DEADLINE_DISPATCH_ATTR(1); DEADLINE_DISPATCH_ATTR(2); #undef DEADLINE_DISPATCH_ATTR #define DEADLINE_QUEUE_DDIR_ATTRS(name) \ {#name "_fifo_list", 0400, \ .seq_ops = &deadline_##name##_fifo_seq_ops} #define DEADLINE_NEXT_RQ_ATTR(name) \ {#name "_next_rq", 0400, deadline_##name##_next_rq_show} static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { DEADLINE_QUEUE_DDIR_ATTRS(read0), DEADLINE_QUEUE_DDIR_ATTRS(write0), DEADLINE_QUEUE_DDIR_ATTRS(read1), DEADLINE_QUEUE_DDIR_ATTRS(write1), DEADLINE_QUEUE_DDIR_ATTRS(read2), DEADLINE_QUEUE_DDIR_ATTRS(write2), DEADLINE_NEXT_RQ_ATTR(read0), DEADLINE_NEXT_RQ_ATTR(write0), DEADLINE_NEXT_RQ_ATTR(read1), DEADLINE_NEXT_RQ_ATTR(write1), DEADLINE_NEXT_RQ_ATTR(read2), DEADLINE_NEXT_RQ_ATTR(write2), {"batching", 0400, deadline_batching_show}, {"starved", 0400, deadline_starved_show}, {"async_depth", 0400, dd_async_depth_show}, {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops}, {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops}, {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops}, {"owned_by_driver", 0400, dd_owned_by_driver_show}, {"queued", 0400, dd_queued_show}, {}, }; #undef DEADLINE_QUEUE_DDIR_ATTRS #endif static struct elevator_type mq_deadline = { .ops = { .depth_updated = dd_depth_updated, .limit_depth = dd_limit_depth, .insert_requests = dd_insert_requests, .dispatch_request = dd_dispatch_request, .prepare_request = dd_prepare_request, .finish_request = dd_finish_request, .next_request = elv_rb_latter_request, .former_request = elv_rb_former_request, .bio_merge = dd_bio_merge, .request_merge = dd_request_merge, .requests_merged = dd_merged_requests, .request_merged = dd_request_merged, .has_work = dd_has_work, .init_sched = dd_init_sched, .exit_sched = dd_exit_sched, .init_hctx = dd_init_hctx, }, #ifdef CONFIG_BLK_DEBUG_FS .queue_debugfs_attrs = deadline_queue_debugfs_attrs, #endif .elevator_attrs = deadline_attrs, .elevator_name = "mq-deadline", .elevator_alias = "deadline", .elevator_owner = THIS_MODULE, }; MODULE_ALIAS("mq-deadline-iosched"); static int __init deadline_init(void) { return elv_register(&mq_deadline); } static void __exit deadline_exit(void) { elv_unregister(&mq_deadline); } module_init(deadline_init); module_exit(deadline_exit); MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MQ deadline IO scheduler");
12014 12018 17 30 103 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 // SPDX-License-Identifier: GPL-2.0-only #include <linux/ethtool.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <net/net_namespace.h> #include <linux/if_arp.h> #include <net/rtnetlink.h> static netdev_tx_t nlmon_xmit(struct sk_buff *skb, struct net_device *dev) { dev_lstats_add(dev, skb->len); dev_kfree_skb(skb); return NETDEV_TX_OK; } struct nlmon { struct netlink_tap nt; }; static int nlmon_open(struct net_device *dev) { struct nlmon *nlmon = netdev_priv(dev); nlmon->nt.dev = dev; nlmon->nt.module = THIS_MODULE; return netlink_add_tap(&nlmon->nt); } static int nlmon_close(struct net_device *dev) { struct nlmon *nlmon = netdev_priv(dev); return netlink_remove_tap(&nlmon->nt); } static void nlmon_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { dev_lstats_read(dev, &stats->rx_packets, &stats->rx_bytes); } static u32 always_on(struct net_device *dev) { return 1; } static const struct ethtool_ops nlmon_ethtool_ops = { .get_link = always_on, }; static const struct net_device_ops nlmon_ops = { .ndo_open = nlmon_open, .ndo_stop = nlmon_close, .ndo_start_xmit = nlmon_xmit, .ndo_get_stats64 = nlmon_get_stats64, }; static void nlmon_setup(struct net_device *dev) { dev->type = ARPHRD_NETLINK; dev->priv_flags |= IFF_NO_QUEUE; dev->lltx = true; dev->netdev_ops = &nlmon_ops; dev->ethtool_ops = &nlmon_ethtool_ops; dev->needs_free_netdev = true; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA; dev->flags = IFF_NOARP; dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS; /* That's rather a softlimit here, which, of course, * can be altered. Not a real MTU, but what is to be * expected in most cases. */ dev->mtu = NLMSG_GOODSIZE; dev->min_mtu = sizeof(struct nlmsghdr); } static int nlmon_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) return -EINVAL; return 0; } static struct rtnl_link_ops nlmon_link_ops __read_mostly = { .kind = "nlmon", .priv_size = sizeof(struct nlmon), .setup = nlmon_setup, .validate = nlmon_validate, }; static __init int nlmon_register(void) { return rtnl_link_register(&nlmon_link_ops); } static __exit void nlmon_unregister(void) { rtnl_link_unregister(&nlmon_link_ops); } module_init(nlmon_register); module_exit(nlmon_unregister); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); MODULE_AUTHOR("Mathieu Geli <geli@enseirb.fr>"); MODULE_DESCRIPTION("Netlink monitoring device"); MODULE_ALIAS_RTNL_LINK("nlmon");
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 // SPDX-License-Identifier: GPL-2.0-or-later /* * Force feedback support for Linux input subsystem * * Copyright (c) 2006 Anssi Hannula <anssi.hannula@gmail.com> * Copyright (c) 2006 Dmitry Torokhov <dtor@mail.ru> */ /* #define DEBUG */ #include <linux/input.h> #include <linux/limits.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/overflow.h> #include <linux/sched.h> #include <linux/slab.h> /* * Check that the effect_id is a valid effect and whether the user * is the owner */ static int check_effect_access(struct ff_device *ff, int effect_id, struct file *file) { if (effect_id < 0 || effect_id >= ff->max_effects || !ff->effect_owners[effect_id]) return -EINVAL; if (file && ff->effect_owners[effect_id] != file) return -EACCES; return 0; } /* * Checks whether 2 effects can be combined together */ static inline int check_effects_compatible(struct ff_effect *e1, struct ff_effect *e2) { return e1->type == e2->type && (e1->type != FF_PERIODIC || e1->u.periodic.waveform == e2->u.periodic.waveform); } /* * Convert an effect into compatible one */ static int compat_effect(struct ff_device *ff, struct ff_effect *effect) { int magnitude; switch (effect->type) { case FF_RUMBLE: if (!test_bit(FF_PERIODIC, ff->ffbit)) return -EINVAL; /* * calculate magnitude of sine wave as average of rumble's * 2/3 of strong magnitude and 1/3 of weak magnitude */ magnitude = effect->u.rumble.strong_magnitude / 3 + effect->u.rumble.weak_magnitude / 6; effect->type = FF_PERIODIC; effect->u.periodic.waveform = FF_SINE; effect->u.periodic.period = 50; effect->u.periodic.magnitude = magnitude; effect->u.periodic.offset = 0; effect->u.periodic.phase = 0; effect->u.periodic.envelope.attack_length = 0; effect->u.periodic.envelope.attack_level = 0; effect->u.periodic.envelope.fade_length = 0; effect->u.periodic.envelope.fade_level = 0; return 0; default: /* Let driver handle conversion */ return 0; } } /** * input_ff_upload() - upload effect into force-feedback device * @dev: input device * @effect: effect to be uploaded * @file: owner of the effect */ int input_ff_upload(struct input_dev *dev, struct ff_effect *effect, struct file *file) { struct ff_device *ff = dev->ff; struct ff_effect *old; int error; int id; if (!test_bit(EV_FF, dev->evbit)) return -ENOSYS; if (effect->type < FF_EFFECT_MIN || effect->type > FF_EFFECT_MAX || !test_bit(effect->type, dev->ffbit)) { dev_dbg(&dev->dev, "invalid or not supported effect type in upload\n"); return -EINVAL; } if (effect->type == FF_PERIODIC && (effect->u.periodic.waveform < FF_WAVEFORM_MIN || effect->u.periodic.waveform > FF_WAVEFORM_MAX || !test_bit(effect->u.periodic.waveform, dev->ffbit))) { dev_dbg(&dev->dev, "invalid or not supported wave form in upload\n"); return -EINVAL; } if (!test_bit(effect->type, ff->ffbit)) { error = compat_effect(ff, effect); if (error) return error; } guard(mutex)(&ff->mutex); if (effect->id == -1) { for (id = 0; id < ff->max_effects; id++) if (!ff->effect_owners[id]) break; if (id >= ff->max_effects) return -ENOSPC; effect->id = id; old = NULL; } else { id = effect->id; error = check_effect_access(ff, id, file); if (error) return error; old = &ff->effects[id]; if (!check_effects_compatible(effect, old)) return -EINVAL; } error = ff->upload(dev, effect, old); if (error) return error; scoped_guard(spinlock_irq, &dev->event_lock) { ff->effects[id] = *effect; ff->effect_owners[id] = file; } return 0; } EXPORT_SYMBOL_GPL(input_ff_upload); /* * Erases the effect if the requester is also the effect owner. The mutex * should already be locked before calling this function. */ static int erase_effect(struct input_dev *dev, int effect_id, struct file *file) { struct ff_device *ff = dev->ff; int error; error = check_effect_access(ff, effect_id, file); if (error) return error; scoped_guard(spinlock_irq, &dev->event_lock) { ff->playback(dev, effect_id, 0); ff->effect_owners[effect_id] = NULL; } if (ff->erase) { error = ff->erase(dev, effect_id); if (error) { scoped_guard(spinlock_irq, &dev->event_lock) ff->effect_owners[effect_id] = file; return error; } } return 0; } /** * input_ff_erase - erase a force-feedback effect from device * @dev: input device to erase effect from * @effect_id: id of the effect to be erased * @file: purported owner of the request * * This function erases a force-feedback effect from specified device. * The effect will only be erased if it was uploaded through the same * file handle that is requesting erase. */ int input_ff_erase(struct input_dev *dev, int effect_id, struct file *file) { struct ff_device *ff = dev->ff; if (!test_bit(EV_FF, dev->evbit)) return -ENOSYS; guard(mutex)(&ff->mutex); return erase_effect(dev, effect_id, file); } EXPORT_SYMBOL_GPL(input_ff_erase); /* * input_ff_flush - erase all effects owned by a file handle * @dev: input device to erase effect from * @file: purported owner of the effects * * This function erases all force-feedback effects associated with * the given owner from specified device. Note that @file may be %NULL, * in which case all effects will be erased. */ int input_ff_flush(struct input_dev *dev, struct file *file) { struct ff_device *ff = dev->ff; int i; dev_dbg(&dev->dev, "flushing now\n"); guard(mutex)(&ff->mutex); for (i = 0; i < ff->max_effects; i++) erase_effect(dev, i, file); return 0; } EXPORT_SYMBOL_GPL(input_ff_flush); /** * input_ff_event() - generic handler for force-feedback events * @dev: input device to send the effect to * @type: event type (anything but EV_FF is ignored) * @code: event code * @value: event value */ int input_ff_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) { struct ff_device *ff = dev->ff; if (type != EV_FF) return 0; switch (code) { case FF_GAIN: if (!test_bit(FF_GAIN, dev->ffbit) || value > 0xffffU) break; ff->set_gain(dev, value); break; case FF_AUTOCENTER: if (!test_bit(FF_AUTOCENTER, dev->ffbit) || value > 0xffffU) break; ff->set_autocenter(dev, value); break; default: if (check_effect_access(ff, code, NULL) == 0) ff->playback(dev, code, value); break; } return 0; } EXPORT_SYMBOL_GPL(input_ff_event); /** * input_ff_create() - create force-feedback device * @dev: input device supporting force-feedback * @max_effects: maximum number of effects supported by the device * * This function allocates all necessary memory for a force feedback * portion of an input device and installs all default handlers. * @dev->ffbit should be already set up before calling this function. * Once ff device is created you need to setup its upload, erase, * playback and other handlers before registering input device */ int input_ff_create(struct input_dev *dev, unsigned int max_effects) { int i; if (!max_effects) { dev_err(&dev->dev, "cannot allocate device without any effects\n"); return -EINVAL; } if (max_effects > FF_MAX_EFFECTS) { dev_err(&dev->dev, "cannot allocate more than FF_MAX_EFFECTS effects\n"); return -EINVAL; } struct ff_device *ff __free(kfree) = kzalloc(struct_size(ff, effect_owners, max_effects), GFP_KERNEL); if (!ff) return -ENOMEM; ff->effects = kcalloc(max_effects, sizeof(*ff->effects), GFP_KERNEL); if (!ff->effects) return -ENOMEM; ff->max_effects = max_effects; mutex_init(&ff->mutex); dev->flush = input_ff_flush; dev->event = input_ff_event; __set_bit(EV_FF, dev->evbit); /* Copy "true" bits into ff device bitmap */ for_each_set_bit(i, dev->ffbit, FF_CNT) __set_bit(i, ff->ffbit); /* we can emulate RUMBLE with periodic effects */ if (test_bit(FF_PERIODIC, ff->ffbit)) __set_bit(FF_RUMBLE, dev->ffbit); dev->ff = no_free_ptr(ff); return 0; } EXPORT_SYMBOL_GPL(input_ff_create); /** * input_ff_destroy() - frees force feedback portion of input device * @dev: input device supporting force feedback * * This function is only needed in error path as input core will * automatically free force feedback structures when device is * destroyed. */ void input_ff_destroy(struct input_dev *dev) { struct ff_device *ff = dev->ff; __clear_bit(EV_FF, dev->evbit); if (ff) { if (ff->destroy) ff->destroy(ff); kfree(ff->private); kfree(ff->effects); kfree(ff); dev->ff = NULL; } } EXPORT_SYMBOL_GPL(input_ff_destroy);
170 24 157 151 63 20 18 53 2 146 146 48 139 138 139 171 174 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 // SPDX-License-Identifier: GPL-2.0-or-later /* * Create default crypto algorithm instances. * * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/internal/aead.h> #include <linux/completion.h> #include <linux/ctype.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/notifier.h> #include <linux/rtnetlink.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/string.h> #include "internal.h" struct cryptomgr_param { struct rtattr *tb[CRYPTO_MAX_ATTRS + 2]; struct { struct rtattr attr; struct crypto_attr_type data; } type; struct { struct rtattr attr; struct crypto_attr_alg data; } attrs[CRYPTO_MAX_ATTRS]; char template[CRYPTO_MAX_ALG_NAME]; struct crypto_larval *larval; u32 otype; u32 omask; }; struct crypto_test_param { char driver[CRYPTO_MAX_ALG_NAME]; char alg[CRYPTO_MAX_ALG_NAME]; u32 type; }; static int cryptomgr_probe(void *data) { struct cryptomgr_param *param = data; struct crypto_template *tmpl; int err = -ENOENT; tmpl = crypto_lookup_template(param->template); if (!tmpl) goto out; do { err = tmpl->create(tmpl, param->tb); } while (err == -EAGAIN && !signal_pending(current)); crypto_tmpl_put(tmpl); out: param->larval->adult = ERR_PTR(err); param->larval->alg.cra_flags |= CRYPTO_ALG_DEAD; complete_all(&param->larval->completion); crypto_alg_put(&param->larval->alg); kfree(param); module_put_and_kthread_exit(0); } static int cryptomgr_schedule_probe(struct crypto_larval *larval) { struct task_struct *thread; struct cryptomgr_param *param; const char *name = larval->alg.cra_name; const char *p; unsigned int len; int i; if (!try_module_get(THIS_MODULE)) goto err; param = kzalloc(sizeof(*param), GFP_KERNEL); if (!param) goto err_put_module; for (p = name; isalnum(*p) || *p == '-' || *p == '_'; p++) ; len = p - name; if (!len || *p != '(') goto err_free_param; memcpy(param->template, name, len); i = 0; for (;;) { name = ++p; for (; isalnum(*p) || *p == '-' || *p == '_'; p++) ; if (*p == '(') { int recursion = 0; for (;;) { if (!*++p) goto err_free_param; if (*p == '(') recursion++; else if (*p == ')' && !recursion--) break; } p++; } len = p - name; if (!len) goto err_free_param; param->attrs[i].attr.rta_len = sizeof(param->attrs[i]); param->attrs[i].attr.rta_type = CRYPTOA_ALG; memcpy(param->attrs[i].data.name, name, len); param->tb[i + 1] = &param->attrs[i].attr; i++; if (i >= CRYPTO_MAX_ATTRS) goto err_free_param; if (*p == ')') break; if (*p != ',') goto err_free_param; } param->tb[i + 1] = NULL; param->type.attr.rta_len = sizeof(param->type); param->type.attr.rta_type = CRYPTOA_TYPE; param->type.data.type = larval->alg.cra_flags & ~CRYPTO_ALG_TESTED; param->type.data.mask = larval->mask & ~CRYPTO_ALG_TESTED; param->tb[0] = &param->type.attr; param->otype = larval->alg.cra_flags; param->omask = larval->mask; crypto_alg_get(&larval->alg); param->larval = larval; thread = kthread_run(cryptomgr_probe, param, "cryptomgr_probe"); if (IS_ERR(thread)) goto err_put_larval; return NOTIFY_STOP; err_put_larval: crypto_alg_put(&larval->alg); err_free_param: kfree(param); err_put_module: module_put(THIS_MODULE); err: return NOTIFY_OK; } static int cryptomgr_test(void *data) { struct crypto_test_param *param = data; u32 type = param->type; int err; err = alg_test(param->driver, param->alg, type, CRYPTO_ALG_TESTED); crypto_alg_tested(param->driver, err); kfree(param); module_put_and_kthread_exit(0); } static int cryptomgr_schedule_test(struct crypto_alg *alg) { struct task_struct *thread; struct crypto_test_param *param; if (IS_ENABLED(CONFIG_CRYPTO_MANAGER_DISABLE_TESTS)) return NOTIFY_DONE; if (!try_module_get(THIS_MODULE)) goto err; param = kzalloc(sizeof(*param), GFP_KERNEL); if (!param) goto err_put_module; memcpy(param->driver, alg->cra_driver_name, sizeof(param->driver)); memcpy(param->alg, alg->cra_name, sizeof(param->alg)); param->type = alg->cra_flags; thread = kthread_run(cryptomgr_test, param, "cryptomgr_test"); if (IS_ERR(thread)) goto err_free_param; return NOTIFY_STOP; err_free_param: kfree(param); err_put_module: module_put(THIS_MODULE); err: return NOTIFY_OK; } static int cryptomgr_notify(struct notifier_block *this, unsigned long msg, void *data) { switch (msg) { case CRYPTO_MSG_ALG_REQUEST: return cryptomgr_schedule_probe(data); case CRYPTO_MSG_ALG_REGISTER: return cryptomgr_schedule_test(data); case CRYPTO_MSG_ALG_LOADED: break; } return NOTIFY_DONE; } static struct notifier_block cryptomgr_notifier = { .notifier_call = cryptomgr_notify, }; static int __init cryptomgr_init(void) { return crypto_register_notifier(&cryptomgr_notifier); } static void __exit cryptomgr_exit(void) { int err = crypto_unregister_notifier(&cryptomgr_notifier); BUG_ON(err); } /* * This is arch_initcall() so that the crypto self-tests are run on algorithms * registered early by subsys_initcall(). subsys_initcall() is needed for * generic implementations so that they're available for comparison tests when * other implementations are registered later by module_init(). */ arch_initcall(cryptomgr_init); module_exit(cryptomgr_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Crypto Algorithm Manager");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 // SPDX-License-Identifier: GPL-2.0 /* * consolidates trace point definitions * * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com> */ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/string.h> #include <linux/if_arp.h> #include <linux/inetdevice.h> #include <linux/inet.h> #include <linux/interrupt.h> #include <linux/export.h> #include <linux/netpoll.h> #include <linux/sched.h> #include <linux/delay.h> #include <linux/rcupdate.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/netlink.h> #include <linux/net_dropmon.h> #include <linux/slab.h> #include <linux/unaligned.h> #include <asm/bitops.h> #define CREATE_TRACE_POINTS #include <trace/events/skb.h> #include <trace/events/net.h> #include <trace/events/napi.h> #include <trace/events/sock.h> #include <trace/events/udp.h> #include <trace/events/tcp.h> #include <trace/events/fib.h> #include <trace/events/qdisc.h> #if IS_ENABLED(CONFIG_BRIDGE) #include <trace/events/bridge.h> EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_add); EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_external_learn_add); EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete); EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update); EXPORT_TRACEPOINT_SYMBOL_GPL(br_mdb_full); #endif #if IS_ENABLED(CONFIG_PAGE_POOL) #include <trace/events/page_pool.h> #endif #include <trace/events/neigh.h> EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update); EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update_done); EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_timer_handler); EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_event_send_done); EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_event_send_dead); EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_cleanup_and_release); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(napi_poll); EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_send_reset); EXPORT_TRACEPOINT_SYMBOL_GPL(tcp_bad_csum); EXPORT_TRACEPOINT_SYMBOL_GPL(udp_fail_queue_rcv_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(sk_data_ready);
6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 // SPDX-License-Identifier: GPL-2.0 /* * This contains functions for filename crypto management * * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility * * Written by Uday Savagaonkar, 2014. * Modified by Jaegeuk Kim, 2015. * * This has not yet undergone a rigorous security audit. */ #include <linux/namei.h> #include <linux/scatterlist.h> #include <crypto/hash.h> #include <crypto/sha2.h> #include <crypto/skcipher.h> #include "fscrypt_private.h" /* * The minimum message length (input and output length), in bytes, for all * filenames encryption modes. Filenames shorter than this will be zero-padded * before being encrypted. */ #define FSCRYPT_FNAME_MIN_MSG_LEN 16 /* * struct fscrypt_nokey_name - identifier for directory entry when key is absent * * When userspace lists an encrypted directory without access to the key, the * filesystem must present a unique "no-key name" for each filename that allows * it to find the directory entry again if requested. Naively, that would just * mean using the ciphertext filenames. However, since the ciphertext filenames * can contain illegal characters ('\0' and '/'), they must be encoded in some * way. We use base64url. But that can cause names to exceed NAME_MAX (255 * bytes), so we also need to use a strong hash to abbreviate long names. * * The filesystem may also need another kind of hash, the "dirhash", to quickly * find the directory entry. Since filesystems normally compute the dirhash * over the on-disk filename (i.e. the ciphertext), it's not computable from * no-key names that abbreviate the ciphertext using the strong hash to fit in * NAME_MAX. It's also not computable if it's a keyed hash taken over the * plaintext (but it may still be available in the on-disk directory entry); * casefolded directories use this type of dirhash. At least in these cases, * each no-key name must include the name's dirhash too. * * To meet all these requirements, we base64url-encode the following * variable-length structure. It contains the dirhash, or 0's if the filesystem * didn't provide one; up to 149 bytes of the ciphertext name; and for * ciphertexts longer than 149 bytes, also the SHA-256 of the remaining bytes. * * This ensures that each no-key name contains everything needed to find the * directory entry again, contains only legal characters, doesn't exceed * NAME_MAX, is unambiguous unless there's a SHA-256 collision, and that we only * take the performance hit of SHA-256 on very long filenames (which are rare). */ struct fscrypt_nokey_name { u32 dirhash[2]; u8 bytes[149]; u8 sha256[SHA256_DIGEST_SIZE]; }; /* 189 bytes => 252 bytes base64url-encoded, which is <= NAME_MAX (255) */ /* * Decoded size of max-size no-key name, i.e. a name that was abbreviated using * the strong hash and thus includes the 'sha256' field. This isn't simply * sizeof(struct fscrypt_nokey_name), as the padding at the end isn't included. */ #define FSCRYPT_NOKEY_NAME_MAX offsetofend(struct fscrypt_nokey_name, sha256) /* Encoded size of max-size no-key name */ #define FSCRYPT_NOKEY_NAME_MAX_ENCODED \ FSCRYPT_BASE64URL_CHARS(FSCRYPT_NOKEY_NAME_MAX) static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { return is_dot_dotdot(str->name, str->len); } /** * fscrypt_fname_encrypt() - encrypt a filename * @inode: inode of the parent directory (for regular filenames) * or of the symlink (for symlink targets). Key must already be * set up. * @iname: the filename to encrypt * @out: (output) the encrypted filename * @olen: size of the encrypted filename. It must be at least @iname->len. * Any extra space is filled with NUL padding before encryption. * * Return: 0 on success, -errno on failure */ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, u8 *out, unsigned int olen) { struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); const struct fscrypt_inode_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; union fscrypt_iv iv; struct scatterlist sg; int res; /* * Copy the filename to the output buffer for encrypting in-place and * pad it with the needed number of NUL bytes. */ if (WARN_ON_ONCE(olen < iname->len)) return -ENOBUFS; memcpy(out, iname->name, iname->len); memset(out + iname->len, 0, olen - iname->len); /* Initialize the IV */ fscrypt_generate_iv(&iv, 0, ci); /* Set up the encryption request */ req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) return -ENOMEM; skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); sg_init_one(&sg, out, olen); skcipher_request_set_crypt(req, &sg, &sg, olen, &iv); /* Do the encryption */ res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res < 0) { fscrypt_err(inode, "Filename encryption failed: %d", res); return res; } return 0; } EXPORT_SYMBOL_GPL(fscrypt_fname_encrypt); /** * fname_decrypt() - decrypt a filename * @inode: inode of the parent directory (for regular filenames) * or of the symlink (for symlink targets) * @iname: the encrypted filename to decrypt * @oname: (output) the decrypted filename. The caller must have allocated * enough space for this, e.g. using fscrypt_fname_alloc_buffer(). * * Return: 0 on success, -errno on failure */ static int fname_decrypt(const struct inode *inode, const struct fscrypt_str *iname, struct fscrypt_str *oname) { struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; const struct fscrypt_inode_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; union fscrypt_iv iv; int res; /* Allocate request */ req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) return -ENOMEM; skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait); /* Initialize IV */ fscrypt_generate_iv(&iv, 0, ci); /* Create decryption request */ sg_init_one(&src_sg, iname->name, iname->len); sg_init_one(&dst_sg, oname->name, oname->len); skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, &iv); res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait); skcipher_request_free(req); if (res < 0) { fscrypt_err(inode, "Filename decryption failed: %d", res); return res; } oname->len = strnlen(oname->name, iname->len); return 0; } static const char base64url_table[65] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; #define FSCRYPT_BASE64URL_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) /** * fscrypt_base64url_encode() - base64url-encode some binary data * @src: the binary data to encode * @srclen: the length of @src in bytes * @dst: (output) the base64url-encoded string. Not NUL-terminated. * * Encodes data using base64url encoding, i.e. the "Base 64 Encoding with URL * and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't used, * as it's unneeded and not required by the RFC. base64url is used instead of * base64 to avoid the '/' character, which isn't allowed in filenames. * * Return: the length of the resulting base64url-encoded string in bytes. * This will be equal to FSCRYPT_BASE64URL_CHARS(srclen). */ static int fscrypt_base64url_encode(const u8 *src, int srclen, char *dst) { u32 ac = 0; int bits = 0; int i; char *cp = dst; for (i = 0; i < srclen; i++) { ac = (ac << 8) | src[i]; bits += 8; do { bits -= 6; *cp++ = base64url_table[(ac >> bits) & 0x3f]; } while (bits >= 6); } if (bits) *cp++ = base64url_table[(ac << (6 - bits)) & 0x3f]; return cp - dst; } /** * fscrypt_base64url_decode() - base64url-decode a string * @src: the string to decode. Doesn't need to be NUL-terminated. * @srclen: the length of @src in bytes * @dst: (output) the decoded binary data * * Decodes a string using base64url encoding, i.e. the "Base 64 Encoding with * URL and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't * accepted, nor are non-encoding characters such as whitespace. * * This implementation hasn't been optimized for performance. * * Return: the length of the resulting decoded binary data in bytes, * or -1 if the string isn't a valid base64url string. */ static int fscrypt_base64url_decode(const char *src, int srclen, u8 *dst) { u32 ac = 0; int bits = 0; int i; u8 *bp = dst; for (i = 0; i < srclen; i++) { const char *p = strchr(base64url_table, src[i]); if (p == NULL || src[i] == 0) return -1; ac = (ac << 6) | (p - base64url_table); bits += 6; if (bits >= 8) { bits -= 8; *bp++ = (u8)(ac >> bits); } } if (ac & ((1 << bits) - 1)) return -1; return bp - dst; } bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, u32 orig_len, u32 max_len, u32 *encrypted_len_ret) { int padding = 4 << (fscrypt_policy_flags(policy) & FSCRYPT_POLICY_FLAGS_PAD_MASK); u32 encrypted_len; if (orig_len > max_len) return false; encrypted_len = max_t(u32, orig_len, FSCRYPT_FNAME_MIN_MSG_LEN); encrypted_len = round_up(encrypted_len, padding); *encrypted_len_ret = min(encrypted_len, max_len); return true; } /** * fscrypt_fname_encrypted_size() - calculate length of encrypted filename * @inode: parent inode of dentry name being encrypted. Key must * already be set up. * @orig_len: length of the original filename * @max_len: maximum length to return * @encrypted_len_ret: where calculated length should be returned (on success) * * Filenames that are shorter than the maximum length may have their lengths * increased slightly by encryption, due to padding that is applied. * * Return: false if the orig_len is greater than max_len. Otherwise, true and * fill out encrypted_len_ret with the length (up to max_len). */ bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret) { return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy, orig_len, max_len, encrypted_len_ret); } EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size); /** * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames * @max_encrypted_len: maximum length of encrypted filenames the buffer will be * used to present * @crypto_str: (output) buffer to allocate * * Allocate a buffer that is large enough to hold any decrypted or encoded * filename (null-terminated), for the given maximum encrypted filename length. * * Return: 0 on success, -errno on failure */ int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str) { u32 max_presented_len = max_t(u32, FSCRYPT_NOKEY_NAME_MAX_ENCODED, max_encrypted_len); crypto_str->name = kmalloc(max_presented_len + 1, GFP_NOFS); if (!crypto_str->name) return -ENOMEM; crypto_str->len = max_presented_len; return 0; } EXPORT_SYMBOL(fscrypt_fname_alloc_buffer); /** * fscrypt_fname_free_buffer() - free a buffer for presented filenames * @crypto_str: the buffer to free * * Free a buffer that was allocated by fscrypt_fname_alloc_buffer(). */ void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { if (!crypto_str) return; kfree(crypto_str->name); crypto_str->name = NULL; } EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** * fscrypt_fname_disk_to_usr() - convert an encrypted filename to * user-presentable form * @inode: inode of the parent directory (for regular filenames) * or of the symlink (for symlink targets) * @hash: first part of the name's dirhash, if applicable. This only needs to * be provided if the filename is located in an indexed directory whose * encryption key may be unavailable. Not needed for symlink targets. * @minor_hash: second part of the name's dirhash, if applicable * @iname: encrypted filename to convert. May also be "." or "..", which * aren't actually encrypted. * @oname: output buffer for the user-presentable filename. The caller must * have allocated enough space for this, e.g. using * fscrypt_fname_alloc_buffer(). * * If the key is available, we'll decrypt the disk name. Otherwise, we'll * encode it for presentation in fscrypt_nokey_name format. * See struct fscrypt_nokey_name for details. * * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(const struct inode *inode, u32 hash, u32 minor_hash, const struct fscrypt_str *iname, struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); struct fscrypt_nokey_name nokey_name; u32 size; /* size of the unencoded no-key name */ if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; return 0; } if (iname->len < FSCRYPT_FNAME_MIN_MSG_LEN) return -EUCLEAN; if (fscrypt_has_encryption_key(inode)) return fname_decrypt(inode, iname, oname); /* * Sanity check that struct fscrypt_nokey_name doesn't have padding * between fields and that its encoded size never exceeds NAME_MAX. */ BUILD_BUG_ON(offsetofend(struct fscrypt_nokey_name, dirhash) != offsetof(struct fscrypt_nokey_name, bytes)); BUILD_BUG_ON(offsetofend(struct fscrypt_nokey_name, bytes) != offsetof(struct fscrypt_nokey_name, sha256)); BUILD_BUG_ON(FSCRYPT_NOKEY_NAME_MAX_ENCODED > NAME_MAX); nokey_name.dirhash[0] = hash; nokey_name.dirhash[1] = minor_hash; if (iname->len <= sizeof(nokey_name.bytes)) { memcpy(nokey_name.bytes, iname->name, iname->len); size = offsetof(struct fscrypt_nokey_name, bytes[iname->len]); } else { memcpy(nokey_name.bytes, iname->name, sizeof(nokey_name.bytes)); /* Compute strong hash of remaining part of name. */ sha256(&iname->name[sizeof(nokey_name.bytes)], iname->len - sizeof(nokey_name.bytes), nokey_name.sha256); size = FSCRYPT_NOKEY_NAME_MAX; } oname->len = fscrypt_base64url_encode((const u8 *)&nokey_name, size, oname->name); return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); /** * fscrypt_setup_filename() - prepare to search a possibly encrypted directory * @dir: the directory that will be searched * @iname: the user-provided filename being searched for * @lookup: 1 if we're allowed to proceed without the key because it's * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot * proceed without the key because we're going to create the dir_entry. * @fname: the filename information to be filled in * * Given a user-provided filename @iname, this function sets @fname->disk_name * to the name that would be stored in the on-disk directory entry, if possible. * If the directory is unencrypted this is simply @iname. Else, if we have the * directory's encryption key, then @iname is the plaintext, so we encrypt it to * get the disk_name. * * Else, for keyless @lookup operations, @iname should be a no-key name, so we * decode it to get the struct fscrypt_nokey_name. Non-@lookup operations will * be impossible in this case, so we fail them with ENOKEY. * * If successful, fscrypt_free_filename() must be called later to clean up. * * Return: 0 on success, -errno on failure */ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { struct fscrypt_nokey_name *nokey_name; int ret; memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; if (!IS_ENCRYPTED(dir) || fscrypt_is_dot_dotdot(iname)) { fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; } ret = fscrypt_get_encryption_info(dir, lookup); if (ret) return ret; if (fscrypt_has_encryption_key(dir)) { if (!fscrypt_fname_encrypted_size(dir, iname->len, NAME_MAX, &fname->crypto_buf.len)) return -ENAMETOOLONG; fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, GFP_NOFS); if (!fname->crypto_buf.name) return -ENOMEM; ret = fscrypt_fname_encrypt(dir, iname, fname->crypto_buf.name, fname->crypto_buf.len); if (ret) goto errout; fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; return 0; } if (!lookup) return -ENOKEY; fname->is_nokey_name = true; /* * We don't have the key and we are doing a lookup; decode the * user-supplied name */ if (iname->len > FSCRYPT_NOKEY_NAME_MAX_ENCODED) return -ENOENT; fname->crypto_buf.name = kmalloc(FSCRYPT_NOKEY_NAME_MAX, GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; ret = fscrypt_base64url_decode(iname->name, iname->len, fname->crypto_buf.name); if (ret < (int)offsetof(struct fscrypt_nokey_name, bytes[1]) || (ret > offsetof(struct fscrypt_nokey_name, sha256) && ret != FSCRYPT_NOKEY_NAME_MAX)) { ret = -ENOENT; goto errout; } fname->crypto_buf.len = ret; nokey_name = (void *)fname->crypto_buf.name; fname->hash = nokey_name->dirhash[0]; fname->minor_hash = nokey_name->dirhash[1]; if (ret != FSCRYPT_NOKEY_NAME_MAX) { /* The full ciphertext filename is available. */ fname->disk_name.name = nokey_name->bytes; fname->disk_name.len = ret - offsetof(struct fscrypt_nokey_name, bytes); } return 0; errout: kfree(fname->crypto_buf.name); return ret; } EXPORT_SYMBOL(fscrypt_setup_filename); /** * fscrypt_match_name() - test whether the given name matches a directory entry * @fname: the name being searched for * @de_name: the name from the directory entry * @de_name_len: the length of @de_name in bytes * * Normally @fname->disk_name will be set, and in that case we simply compare * that to the name stored in the directory entry. The only exception is that * if we don't have the key for an encrypted directory and the name we're * looking for is very long, then we won't have the full disk_name and instead * we'll need to match against a fscrypt_nokey_name that includes a strong hash. * * Return: %true if the name matches, otherwise %false. */ bool fscrypt_match_name(const struct fscrypt_name *fname, const u8 *de_name, u32 de_name_len) { const struct fscrypt_nokey_name *nokey_name = (const void *)fname->crypto_buf.name; u8 digest[SHA256_DIGEST_SIZE]; if (likely(fname->disk_name.name)) { if (de_name_len != fname->disk_name.len) return false; return !memcmp(de_name, fname->disk_name.name, de_name_len); } if (de_name_len <= sizeof(nokey_name->bytes)) return false; if (memcmp(de_name, nokey_name->bytes, sizeof(nokey_name->bytes))) return false; sha256(&de_name[sizeof(nokey_name->bytes)], de_name_len - sizeof(nokey_name->bytes), digest); return !memcmp(digest, nokey_name->sha256, sizeof(digest)); } EXPORT_SYMBOL_GPL(fscrypt_match_name); /** * fscrypt_fname_siphash() - calculate the SipHash of a filename * @dir: the parent directory * @name: the filename to calculate the SipHash of * * Given a plaintext filename @name and a directory @dir which uses SipHash as * its dirhash method and has had its fscrypt key set up, this function * calculates the SipHash of that name using the directory's secret dirhash key. * * Return: the SipHash of @name using the hash key of @dir */ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { const struct fscrypt_inode_info *ci = dir->i_crypt_info; WARN_ON_ONCE(!ci->ci_dirhash_key_initialized); return siphash(name->name, name->len, &ci->ci_dirhash_key); } EXPORT_SYMBOL_GPL(fscrypt_fname_siphash); /* * Validate dentries in encrypted directories to make sure we aren't potentially * caching stale dentries after a key has been added. */ int fscrypt_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { int err; /* * Plaintext names are always valid, since fscrypt doesn't support * reverting to no-key names without evicting the directory's inode * -- which implies eviction of the dentries in the directory. */ if (!(dentry->d_flags & DCACHE_NOKEY_NAME)) return 1; /* * No-key name; valid if the directory's key is still unavailable. * * Note in RCU mode we have to bail if we get here - * fscrypt_get_encryption_info() may block. */ if (flags & LOOKUP_RCU) return -ECHILD; /* * Pass allow_unsupported=true, so that files with an unsupported * encryption policy can be deleted. */ err = fscrypt_get_encryption_info(dir, true); if (err < 0) return err; return !fscrypt_has_encryption_key(dir); } EXPORT_SYMBOL_GPL(fscrypt_d_revalidate);
2 1 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 // SPDX-License-Identifier: GPL-2.0-or-later /* Kernel cryptographic api. * cast5.c - Cast5 cipher algorithm (rfc2144). * * Derived from GnuPG implementation of cast5. * * Major Changes. * Complete conformance to rfc2144. * Supports key size from 40 to 128 bits. * * Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc. * Copyright (C) 2003 Kartikey Mahendra Bhatt <kartik_me@hotmail.com>. */ #include <linux/unaligned.h> #include <crypto/algapi.h> #include <linux/init.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/types.h> #include <crypto/cast5.h> static const u32 s5[256] = { 0x7ec90c04, 0x2c6e74b9, 0x9b0e66df, 0xa6337911, 0xb86a7fff, 0x1dd358f5, 0x44dd9d44, 0x1731167f, 0x08fbf1fa, 0xe7f511cc, 0xd2051b00, 0x735aba00, 0x2ab722d8, 0x386381cb, 0xacf6243a, 0x69befd7a, 0xe6a2e77f, 0xf0c720cd, 0xc4494816, 0xccf5c180, 0x38851640, 0x15b0a848, 0xe68b18cb, 0x4caadeff, 0x5f480a01, 0x0412b2aa, 0x259814fc, 0x41d0efe2, 0x4e40b48d, 0x248eb6fb, 0x8dba1cfe, 0x41a99b02, 0x1a550a04, 0xba8f65cb, 0x7251f4e7, 0x95a51725, 0xc106ecd7, 0x97a5980a, 0xc539b9aa, 0x4d79fe6a, 0xf2f3f763, 0x68af8040, 0xed0c9e56, 0x11b4958b, 0xe1eb5a88, 0x8709e6b0, 0xd7e07156, 0x4e29fea7, 0x6366e52d, 0x02d1c000, 0xc4ac8e05, 0x9377f571, 0x0c05372a, 0x578535f2, 0x2261be02, 0xd642a0c9, 0xdf13a280, 0x74b55bd2, 0x682199c0, 0xd421e5ec, 0x53fb3ce8, 0xc8adedb3, 0x28a87fc9, 0x3d959981, 0x5c1ff900, 0xfe38d399, 0x0c4eff0b, 0x062407ea, 0xaa2f4fb1, 0x4fb96976, 0x90c79505, 0xb0a8a774, 0xef55a1ff, 0xe59ca2c2, 0xa6b62d27, 0xe66a4263, 0xdf65001f, 0x0ec50966, 0xdfdd55bc, 0x29de0655, 0x911e739a, 0x17af8975, 0x32c7911c, 0x89f89468, 0x0d01e980, 0x524755f4, 0x03b63cc9, 0x0cc844b2, 0xbcf3f0aa, 0x87ac36e9, 0xe53a7426, 0x01b3d82b, 0x1a9e7449, 0x64ee2d7e, 0xcddbb1da, 0x01c94910, 0xb868bf80, 0x0d26f3fd, 0x9342ede7, 0x04a5c284, 0x636737b6, 0x50f5b616, 0xf24766e3, 0x8eca36c1, 0x136e05db, 0xfef18391, 0xfb887a37, 0xd6e7f7d4, 0xc7fb7dc9, 0x3063fcdf, 0xb6f589de, 0xec2941da, 0x26e46695, 0xb7566419, 0xf654efc5, 0xd08d58b7, 0x48925401, 0xc1bacb7f, 0xe5ff550f, 0xb6083049, 0x5bb5d0e8, 0x87d72e5a, 0xab6a6ee1, 0x223a66ce, 0xc62bf3cd, 0x9e0885f9, 0x68cb3e47, 0x086c010f, 0xa21de820, 0xd18b69de, 0xf3f65777, 0xfa02c3f6, 0x407edac3, 0xcbb3d550, 0x1793084d, 0xb0d70eba, 0x0ab378d5, 0xd951fb0c, 0xded7da56, 0x4124bbe4, 0x94ca0b56, 0x0f5755d1, 0xe0e1e56e, 0x6184b5be, 0x580a249f, 0x94f74bc0, 0xe327888e, 0x9f7b5561, 0xc3dc0280, 0x05687715, 0x646c6bd7, 0x44904db3, 0x66b4f0a3, 0xc0f1648a, 0x697ed5af, 0x49e92ff6, 0x309e374f, 0x2cb6356a, 0x85808573, 0x4991f840, 0x76f0ae02, 0x083be84d, 0x28421c9a, 0x44489406, 0x736e4cb8, 0xc1092910, 0x8bc95fc6, 0x7d869cf4, 0x134f616f, 0x2e77118d, 0xb31b2be1, 0xaa90b472, 0x3ca5d717, 0x7d161bba, 0x9cad9010, 0xaf462ba2, 0x9fe459d2, 0x45d34559, 0xd9f2da13, 0xdbc65487, 0xf3e4f94e, 0x176d486f, 0x097c13ea, 0x631da5c7, 0x445f7382, 0x175683f4, 0xcdc66a97, 0x70be0288, 0xb3cdcf72, 0x6e5dd2f3, 0x20936079, 0x459b80a5, 0xbe60e2db, 0xa9c23101, 0xeba5315c, 0x224e42f2, 0x1c5c1572, 0xf6721b2c, 0x1ad2fff3, 0x8c25404e, 0x324ed72f, 0x4067b7fd, 0x0523138e, 0x5ca3bc78, 0xdc0fd66e, 0x75922283, 0x784d6b17, 0x58ebb16e, 0x44094f85, 0x3f481d87, 0xfcfeae7b, 0x77b5ff76, 0x8c2302bf, 0xaaf47556, 0x5f46b02a, 0x2b092801, 0x3d38f5f7, 0x0ca81f36, 0x52af4a8a, 0x66d5e7c0, 0xdf3b0874, 0x95055110, 0x1b5ad7a8, 0xf61ed5ad, 0x6cf6e479, 0x20758184, 0xd0cefa65, 0x88f7be58, 0x4a046826, 0x0ff6f8f3, 0xa09c7f70, 0x5346aba0, 0x5ce96c28, 0xe176eda3, 0x6bac307f, 0x376829d2, 0x85360fa9, 0x17e3fe2a, 0x24b79767, 0xf5a96b20, 0xd6cd2595, 0x68ff1ebf, 0x7555442c, 0xf19f06be, 0xf9e0659a, 0xeeb9491d, 0x34010718, 0xbb30cab8, 0xe822fe15, 0x88570983, 0x750e6249, 0xda627e55, 0x5e76ffa8, 0xb1534546, 0x6d47de08, 0xefe9e7d4 }; static const u32 s6[256] = { 0xf6fa8f9d, 0x2cac6ce1, 0x4ca34867, 0xe2337f7c, 0x95db08e7, 0x016843b4, 0xeced5cbc, 0x325553ac, 0xbf9f0960, 0xdfa1e2ed, 0x83f0579d, 0x63ed86b9, 0x1ab6a6b8, 0xde5ebe39, 0xf38ff732, 0x8989b138, 0x33f14961, 0xc01937bd, 0xf506c6da, 0xe4625e7e, 0xa308ea99, 0x4e23e33c, 0x79cbd7cc, 0x48a14367, 0xa3149619, 0xfec94bd5, 0xa114174a, 0xeaa01866, 0xa084db2d, 0x09a8486f, 0xa888614a, 0x2900af98, 0x01665991, 0xe1992863, 0xc8f30c60, 0x2e78ef3c, 0xd0d51932, 0xcf0fec14, 0xf7ca07d2, 0xd0a82072, 0xfd41197e, 0x9305a6b0, 0xe86be3da, 0x74bed3cd, 0x372da53c, 0x4c7f4448, 0xdab5d440, 0x6dba0ec3, 0x083919a7, 0x9fbaeed9, 0x49dbcfb0, 0x4e670c53, 0x5c3d9c01, 0x64bdb941, 0x2c0e636a, 0xba7dd9cd, 0xea6f7388, 0xe70bc762, 0x35f29adb, 0x5c4cdd8d, 0xf0d48d8c, 0xb88153e2, 0x08a19866, 0x1ae2eac8, 0x284caf89, 0xaa928223, 0x9334be53, 0x3b3a21bf, 0x16434be3, 0x9aea3906, 0xefe8c36e, 0xf890cdd9, 0x80226dae, 0xc340a4a3, 0xdf7e9c09, 0xa694a807, 0x5b7c5ecc, 0x221db3a6, 0x9a69a02f, 0x68818a54, 0xceb2296f, 0x53c0843a, 0xfe893655, 0x25bfe68a, 0xb4628abc, 0xcf222ebf, 0x25ac6f48, 0xa9a99387, 0x53bddb65, 0xe76ffbe7, 0xe967fd78, 0x0ba93563, 0x8e342bc1, 0xe8a11be9, 0x4980740d, 0xc8087dfc, 0x8de4bf99, 0xa11101a0, 0x7fd37975, 0xda5a26c0, 0xe81f994f, 0x9528cd89, 0xfd339fed, 0xb87834bf, 0x5f04456d, 0x22258698, 0xc9c4c83b, 0x2dc156be, 0x4f628daa, 0x57f55ec5, 0xe2220abe, 0xd2916ebf, 0x4ec75b95, 0x24f2c3c0, 0x42d15d99, 0xcd0d7fa0, 0x7b6e27ff, 0xa8dc8af0, 0x7345c106, 0xf41e232f, 0x35162386, 0xe6ea8926, 0x3333b094, 0x157ec6f2, 0x372b74af, 0x692573e4, 0xe9a9d848, 0xf3160289, 0x3a62ef1d, 0xa787e238, 0xf3a5f676, 0x74364853, 0x20951063, 0x4576698d, 0xb6fad407, 0x592af950, 0x36f73523, 0x4cfb6e87, 0x7da4cec0, 0x6c152daa, 0xcb0396a8, 0xc50dfe5d, 0xfcd707ab, 0x0921c42f, 0x89dff0bb, 0x5fe2be78, 0x448f4f33, 0x754613c9, 0x2b05d08d, 0x48b9d585, 0xdc049441, 0xc8098f9b, 0x7dede786, 0xc39a3373, 0x42410005, 0x6a091751, 0x0ef3c8a6, 0x890072d6, 0x28207682, 0xa9a9f7be, 0xbf32679d, 0xd45b5b75, 0xb353fd00, 0xcbb0e358, 0x830f220a, 0x1f8fb214, 0xd372cf08, 0xcc3c4a13, 0x8cf63166, 0x061c87be, 0x88c98f88, 0x6062e397, 0x47cf8e7a, 0xb6c85283, 0x3cc2acfb, 0x3fc06976, 0x4e8f0252, 0x64d8314d, 0xda3870e3, 0x1e665459, 0xc10908f0, 0x513021a5, 0x6c5b68b7, 0x822f8aa0, 0x3007cd3e, 0x74719eef, 0xdc872681, 0x073340d4, 0x7e432fd9, 0x0c5ec241, 0x8809286c, 0xf592d891, 0x08a930f6, 0x957ef305, 0xb7fbffbd, 0xc266e96f, 0x6fe4ac98, 0xb173ecc0, 0xbc60b42a, 0x953498da, 0xfba1ae12, 0x2d4bd736, 0x0f25faab, 0xa4f3fceb, 0xe2969123, 0x257f0c3d, 0x9348af49, 0x361400bc, 0xe8816f4a, 0x3814f200, 0xa3f94043, 0x9c7a54c2, 0xbc704f57, 0xda41e7f9, 0xc25ad33a, 0x54f4a084, 0xb17f5505, 0x59357cbe, 0xedbd15c8, 0x7f97c5ab, 0xba5ac7b5, 0xb6f6deaf, 0x3a479c3a, 0x5302da25, 0x653d7e6a, 0x54268d49, 0x51a477ea, 0x5017d55b, 0xd7d25d88, 0x44136c76, 0x0404a8c8, 0xb8e5a121, 0xb81a928a, 0x60ed5869, 0x97c55b96, 0xeaec991b, 0x29935913, 0x01fdb7f1, 0x088e8dfa, 0x9ab6f6f5, 0x3b4cbf9f, 0x4a5de3ab, 0xe6051d35, 0xa0e1d855, 0xd36b4cf1, 0xf544edeb, 0xb0e93524, 0xbebb8fbd, 0xa2d762cf, 0x49c92f54, 0x38b5f331, 0x7128a454, 0x48392905, 0xa65b1db8, 0x851c97bd, 0xd675cf2f }; static const u32 s7[256] = { 0x85e04019, 0x332bf567, 0x662dbfff, 0xcfc65693, 0x2a8d7f6f, 0xab9bc912, 0xde6008a1, 0x2028da1f, 0x0227bce7, 0x4d642916, 0x18fac300, 0x50f18b82, 0x2cb2cb11, 0xb232e75c, 0x4b3695f2, 0xb28707de, 0xa05fbcf6, 0xcd4181e9, 0xe150210c, 0xe24ef1bd, 0xb168c381, 0xfde4e789, 0x5c79b0d8, 0x1e8bfd43, 0x4d495001, 0x38be4341, 0x913cee1d, 0x92a79c3f, 0x089766be, 0xbaeeadf4, 0x1286becf, 0xb6eacb19, 0x2660c200, 0x7565bde4, 0x64241f7a, 0x8248dca9, 0xc3b3ad66, 0x28136086, 0x0bd8dfa8, 0x356d1cf2, 0x107789be, 0xb3b2e9ce, 0x0502aa8f, 0x0bc0351e, 0x166bf52a, 0xeb12ff82, 0xe3486911, 0xd34d7516, 0x4e7b3aff, 0x5f43671b, 0x9cf6e037, 0x4981ac83, 0x334266ce, 0x8c9341b7, 0xd0d854c0, 0xcb3a6c88, 0x47bc2829, 0x4725ba37, 0xa66ad22b, 0x7ad61f1e, 0x0c5cbafa, 0x4437f107, 0xb6e79962, 0x42d2d816, 0x0a961288, 0xe1a5c06e, 0x13749e67, 0x72fc081a, 0xb1d139f7, 0xf9583745, 0xcf19df58, 0xbec3f756, 0xc06eba30, 0x07211b24, 0x45c28829, 0xc95e317f, 0xbc8ec511, 0x38bc46e9, 0xc6e6fa14, 0xbae8584a, 0xad4ebc46, 0x468f508b, 0x7829435f, 0xf124183b, 0x821dba9f, 0xaff60ff4, 0xea2c4e6d, 0x16e39264, 0x92544a8b, 0x009b4fc3, 0xaba68ced, 0x9ac96f78, 0x06a5b79a, 0xb2856e6e, 0x1aec3ca9, 0xbe838688, 0x0e0804e9, 0x55f1be56, 0xe7e5363b, 0xb3a1f25d, 0xf7debb85, 0x61fe033c, 0x16746233, 0x3c034c28, 0xda6d0c74, 0x79aac56c, 0x3ce4e1ad, 0x51f0c802, 0x98f8f35a, 0x1626a49f, 0xeed82b29, 0x1d382fe3, 0x0c4fb99a, 0xbb325778, 0x3ec6d97b, 0x6e77a6a9, 0xcb658b5c, 0xd45230c7, 0x2bd1408b, 0x60c03eb7, 0xb9068d78, 0xa33754f4, 0xf430c87d, 0xc8a71302, 0xb96d8c32, 0xebd4e7be, 0xbe8b9d2d, 0x7979fb06, 0xe7225308, 0x8b75cf77, 0x11ef8da4, 0xe083c858, 0x8d6b786f, 0x5a6317a6, 0xfa5cf7a0, 0x5dda0033, 0xf28ebfb0, 0xf5b9c310, 0xa0eac280, 0x08b9767a, 0xa3d9d2b0, 0x79d34217, 0x021a718d, 0x9ac6336a, 0x2711fd60, 0x438050e3, 0x069908a8, 0x3d7fedc4, 0x826d2bef, 0x4eeb8476, 0x488dcf25, 0x36c9d566, 0x28e74e41, 0xc2610aca, 0x3d49a9cf, 0xbae3b9df, 0xb65f8de6, 0x92aeaf64, 0x3ac7d5e6, 0x9ea80509, 0xf22b017d, 0xa4173f70, 0xdd1e16c3, 0x15e0d7f9, 0x50b1b887, 0x2b9f4fd5, 0x625aba82, 0x6a017962, 0x2ec01b9c, 0x15488aa9, 0xd716e740, 0x40055a2c, 0x93d29a22, 0xe32dbf9a, 0x058745b9, 0x3453dc1e, 0xd699296e, 0x496cff6f, 0x1c9f4986, 0xdfe2ed07, 0xb87242d1, 0x19de7eae, 0x053e561a, 0x15ad6f8c, 0x66626c1c, 0x7154c24c, 0xea082b2a, 0x93eb2939, 0x17dcb0f0, 0x58d4f2ae, 0x9ea294fb, 0x52cf564c, 0x9883fe66, 0x2ec40581, 0x763953c3, 0x01d6692e, 0xd3a0c108, 0xa1e7160e, 0xe4f2dfa6, 0x693ed285, 0x74904698, 0x4c2b0edd, 0x4f757656, 0x5d393378, 0xa132234f, 0x3d321c5d, 0xc3f5e194, 0x4b269301, 0xc79f022f, 0x3c997e7e, 0x5e4f9504, 0x3ffafbbd, 0x76f7ad0e, 0x296693f4, 0x3d1fce6f, 0xc61e45be, 0xd3b5ab34, 0xf72bf9b7, 0x1b0434c0, 0x4e72b567, 0x5592a33d, 0xb5229301, 0xcfd2a87f, 0x60aeb767, 0x1814386b, 0x30bcc33d, 0x38a0c07d, 0xfd1606f2, 0xc363519b, 0x589dd390, 0x5479f8e6, 0x1cb8d647, 0x97fd61a9, 0xea7759f4, 0x2d57539d, 0x569a58cf, 0xe84e63ad, 0x462e1b78, 0x6580f87e, 0xf3817914, 0x91da55f4, 0x40a230f3, 0xd1988f35, 0xb6e318d2, 0x3ffa50bc, 0x3d40f021, 0xc3c0bdae, 0x4958c24c, 0x518f36b2, 0x84b1d370, 0x0fedce83, 0x878ddada, 0xf2a279c7, 0x94e01be8, 0x90716f4b, 0x954b8aa3 }; static const u32 sb8[256] = { 0xe216300d, 0xbbddfffc, 0xa7ebdabd, 0x35648095, 0x7789f8b7, 0xe6c1121b, 0x0e241600, 0x052ce8b5, 0x11a9cfb0, 0xe5952f11, 0xece7990a, 0x9386d174, 0x2a42931c, 0x76e38111, 0xb12def3a, 0x37ddddfc, 0xde9adeb1, 0x0a0cc32c, 0xbe197029, 0x84a00940, 0xbb243a0f, 0xb4d137cf, 0xb44e79f0, 0x049eedfd, 0x0b15a15d, 0x480d3168, 0x8bbbde5a, 0x669ded42, 0xc7ece831, 0x3f8f95e7, 0x72df191b, 0x7580330d, 0x94074251, 0x5c7dcdfa, 0xabbe6d63, 0xaa402164, 0xb301d40a, 0x02e7d1ca, 0x53571dae, 0x7a3182a2, 0x12a8ddec, 0xfdaa335d, 0x176f43e8, 0x71fb46d4, 0x38129022, 0xce949ad4, 0xb84769ad, 0x965bd862, 0x82f3d055, 0x66fb9767, 0x15b80b4e, 0x1d5b47a0, 0x4cfde06f, 0xc28ec4b8, 0x57e8726e, 0x647a78fc, 0x99865d44, 0x608bd593, 0x6c200e03, 0x39dc5ff6, 0x5d0b00a3, 0xae63aff2, 0x7e8bd632, 0x70108c0c, 0xbbd35049, 0x2998df04, 0x980cf42a, 0x9b6df491, 0x9e7edd53, 0x06918548, 0x58cb7e07, 0x3b74ef2e, 0x522fffb1, 0xd24708cc, 0x1c7e27cd, 0xa4eb215b, 0x3cf1d2e2, 0x19b47a38, 0x424f7618, 0x35856039, 0x9d17dee7, 0x27eb35e6, 0xc9aff67b, 0x36baf5b8, 0x09c467cd, 0xc18910b1, 0xe11dbf7b, 0x06cd1af8, 0x7170c608, 0x2d5e3354, 0xd4de495a, 0x64c6d006, 0xbcc0c62c, 0x3dd00db3, 0x708f8f34, 0x77d51b42, 0x264f620f, 0x24b8d2bf, 0x15c1b79e, 0x46a52564, 0xf8d7e54e, 0x3e378160, 0x7895cda5, 0x859c15a5, 0xe6459788, 0xc37bc75f, 0xdb07ba0c, 0x0676a3ab, 0x7f229b1e, 0x31842e7b, 0x24259fd7, 0xf8bef472, 0x835ffcb8, 0x6df4c1f2, 0x96f5b195, 0xfd0af0fc, 0xb0fe134c, 0xe2506d3d, 0x4f9b12ea, 0xf215f225, 0xa223736f, 0x9fb4c428, 0x25d04979, 0x34c713f8, 0xc4618187, 0xea7a6e98, 0x7cd16efc, 0x1436876c, 0xf1544107, 0xbedeee14, 0x56e9af27, 0xa04aa441, 0x3cf7c899, 0x92ecbae6, 0xdd67016d, 0x151682eb, 0xa842eedf, 0xfdba60b4, 0xf1907b75, 0x20e3030f, 0x24d8c29e, 0xe139673b, 0xefa63fb8, 0x71873054, 0xb6f2cf3b, 0x9f326442, 0xcb15a4cc, 0xb01a4504, 0xf1e47d8d, 0x844a1be5, 0xbae7dfdc, 0x42cbda70, 0xcd7dae0a, 0x57e85b7a, 0xd53f5af6, 0x20cf4d8c, 0xcea4d428, 0x79d130a4, 0x3486ebfb, 0x33d3cddc, 0x77853b53, 0x37effcb5, 0xc5068778, 0xe580b3e6, 0x4e68b8f4, 0xc5c8b37e, 0x0d809ea2, 0x398feb7c, 0x132a4f94, 0x43b7950e, 0x2fee7d1c, 0x223613bd, 0xdd06caa2, 0x37df932b, 0xc4248289, 0xacf3ebc3, 0x5715f6b7, 0xef3478dd, 0xf267616f, 0xc148cbe4, 0x9052815e, 0x5e410fab, 0xb48a2465, 0x2eda7fa4, 0xe87b40e4, 0xe98ea084, 0x5889e9e1, 0xefd390fc, 0xdd07d35b, 0xdb485694, 0x38d7e5b2, 0x57720101, 0x730edebc, 0x5b643113, 0x94917e4f, 0x503c2fba, 0x646f1282, 0x7523d24a, 0xe0779695, 0xf9c17a8f, 0x7a5b2121, 0xd187b896, 0x29263a4d, 0xba510cdf, 0x81f47c9f, 0xad1163ed, 0xea7b5965, 0x1a00726e, 0x11403092, 0x00da6d77, 0x4a0cdd61, 0xad1f4603, 0x605bdfb0, 0x9eedc364, 0x22ebe6a8, 0xcee7d28a, 0xa0e736a0, 0x5564a6b9, 0x10853209, 0xc7eb8f37, 0x2de705ca, 0x8951570f, 0xdf09822b, 0xbd691a6c, 0xaa12e4f2, 0x87451c0f, 0xe0f6a27a, 0x3ada4819, 0x4cf1764f, 0x0d771c2b, 0x67cdb156, 0x350d8384, 0x5938fa0f, 0x42399ef3, 0x36997b07, 0x0e84093d, 0x4aa93e61, 0x8360d87b, 0x1fa98b0c, 0x1149382c, 0xe97625a5, 0x0614d1b7, 0x0e25244b, 0x0c768347, 0x589e8d82, 0x0d2059d1, 0xa466bb1e, 0xf8da0a82, 0x04f19130, 0xba6e4ec0, 0x99265164, 0x1ee7230d, 0x50b2ad80, 0xeaee6801, 0x8db2a283, 0xea8bf59e }; #define s1 cast_s1 #define s2 cast_s2 #define s3 cast_s3 #define s4 cast_s4 #define F1(D, m, r) ((I = ((m) + (D))), (I = rol32(I, (r))), \ (((s1[I >> 24] ^ s2[(I>>16)&0xff]) - s3[(I>>8)&0xff]) + s4[I&0xff])) #define F2(D, m, r) ((I = ((m) ^ (D))), (I = rol32(I, (r))), \ (((s1[I >> 24] - s2[(I>>16)&0xff]) + s3[(I>>8)&0xff]) ^ s4[I&0xff])) #define F3(D, m, r) ((I = ((m) - (D))), (I = rol32(I, (r))), \ (((s1[I >> 24] + s2[(I>>16)&0xff]) ^ s3[(I>>8)&0xff]) - s4[I&0xff])) void __cast5_encrypt(struct cast5_ctx *c, u8 *outbuf, const u8 *inbuf) { u32 l, r, t; u32 I; /* used by the Fx macros */ u32 *Km; u8 *Kr; Km = c->Km; Kr = c->Kr; /* (L0,R0) <-- (m1...m64). (Split the plaintext into left and * right 32-bit halves L0 = m1...m32 and R0 = m33...m64.) */ l = get_unaligned_be32(inbuf); r = get_unaligned_be32(inbuf + 4); /* (16 rounds) for i from 1 to 16, compute Li and Ri as follows: * Li = Ri-1; * Ri = Li-1 ^ f(Ri-1,Kmi,Kri), where f is defined in Section 2.2 * Rounds 1, 4, 7, 10, 13, and 16 use f function Type 1. * Rounds 2, 5, 8, 11, and 14 use f function Type 2. * Rounds 3, 6, 9, 12, and 15 use f function Type 3. */ t = l; l = r; r = t ^ F1(r, Km[0], Kr[0]); t = l; l = r; r = t ^ F2(r, Km[1], Kr[1]); t = l; l = r; r = t ^ F3(r, Km[2], Kr[2]); t = l; l = r; r = t ^ F1(r, Km[3], Kr[3]); t = l; l = r; r = t ^ F2(r, Km[4], Kr[4]); t = l; l = r; r = t ^ F3(r, Km[5], Kr[5]); t = l; l = r; r = t ^ F1(r, Km[6], Kr[6]); t = l; l = r; r = t ^ F2(r, Km[7], Kr[7]); t = l; l = r; r = t ^ F3(r, Km[8], Kr[8]); t = l; l = r; r = t ^ F1(r, Km[9], Kr[9]); t = l; l = r; r = t ^ F2(r, Km[10], Kr[10]); t = l; l = r; r = t ^ F3(r, Km[11], Kr[11]); if (!(c->rr)) { t = l; l = r; r = t ^ F1(r, Km[12], Kr[12]); t = l; l = r; r = t ^ F2(r, Km[13], Kr[13]); t = l; l = r; r = t ^ F3(r, Km[14], Kr[14]); t = l; l = r; r = t ^ F1(r, Km[15], Kr[15]); } /* c1...c64 <-- (R16,L16). (Exchange final blocks L16, R16 and * concatenate to form the ciphertext.) */ put_unaligned_be32(r, outbuf); put_unaligned_be32(l, outbuf + 4); } EXPORT_SYMBOL_GPL(__cast5_encrypt); static void cast5_encrypt(struct crypto_tfm *tfm, u8 *outbuf, const u8 *inbuf) { __cast5_encrypt(crypto_tfm_ctx(tfm), outbuf, inbuf); } void __cast5_decrypt(struct cast5_ctx *c, u8 *outbuf, const u8 *inbuf) { u32 l, r, t; u32 I; u32 *Km; u8 *Kr; Km = c->Km; Kr = c->Kr; l = get_unaligned_be32(inbuf); r = get_unaligned_be32(inbuf + 4); if (!(c->rr)) { t = l; l = r; r = t ^ F1(r, Km[15], Kr[15]); t = l; l = r; r = t ^ F3(r, Km[14], Kr[14]); t = l; l = r; r = t ^ F2(r, Km[13], Kr[13]); t = l; l = r; r = t ^ F1(r, Km[12], Kr[12]); } t = l; l = r; r = t ^ F3(r, Km[11], Kr[11]); t = l; l = r; r = t ^ F2(r, Km[10], Kr[10]); t = l; l = r; r = t ^ F1(r, Km[9], Kr[9]); t = l; l = r; r = t ^ F3(r, Km[8], Kr[8]); t = l; l = r; r = t ^ F2(r, Km[7], Kr[7]); t = l; l = r; r = t ^ F1(r, Km[6], Kr[6]); t = l; l = r; r = t ^ F3(r, Km[5], Kr[5]); t = l; l = r; r = t ^ F2(r, Km[4], Kr[4]); t = l; l = r; r = t ^ F1(r, Km[3], Kr[3]); t = l; l = r; r = t ^ F3(r, Km[2], Kr[2]); t = l; l = r; r = t ^ F2(r, Km[1], Kr[1]); t = l; l = r; r = t ^ F1(r, Km[0], Kr[0]); put_unaligned_be32(r, outbuf); put_unaligned_be32(l, outbuf + 4); } EXPORT_SYMBOL_GPL(__cast5_decrypt); static void cast5_decrypt(struct crypto_tfm *tfm, u8 *outbuf, const u8 *inbuf) { __cast5_decrypt(crypto_tfm_ctx(tfm), outbuf, inbuf); } static void key_schedule(u32 *x, u32 *z, u32 *k) { #define xi(i) ((x[(i)/4] >> (8*(3-((i)%4)))) & 0xff) #define zi(i) ((z[(i)/4] >> (8*(3-((i)%4)))) & 0xff) z[0] = x[0] ^ s5[xi(13)] ^ s6[xi(15)] ^ s7[xi(12)] ^ sb8[xi(14)] ^ s7[xi(8)]; z[1] = x[2] ^ s5[zi(0)] ^ s6[zi(2)] ^ s7[zi(1)] ^ sb8[zi(3)] ^ sb8[xi(10)]; z[2] = x[3] ^ s5[zi(7)] ^ s6[zi(6)] ^ s7[zi(5)] ^ sb8[zi(4)] ^ s5[xi(9)]; z[3] = x[1] ^ s5[zi(10)] ^ s6[zi(9)] ^ s7[zi(11)] ^ sb8[zi(8)] ^ s6[xi(11)]; k[0] = s5[zi(8)] ^ s6[zi(9)] ^ s7[zi(7)] ^ sb8[zi(6)] ^ s5[zi(2)]; k[1] = s5[zi(10)] ^ s6[zi(11)] ^ s7[zi(5)] ^ sb8[zi(4)] ^ s6[zi(6)]; k[2] = s5[zi(12)] ^ s6[zi(13)] ^ s7[zi(3)] ^ sb8[zi(2)] ^ s7[zi(9)]; k[3] = s5[zi(14)] ^ s6[zi(15)] ^ s7[zi(1)] ^ sb8[zi(0)] ^ sb8[zi(12)]; x[0] = z[2] ^ s5[zi(5)] ^ s6[zi(7)] ^ s7[zi(4)] ^ sb8[zi(6)] ^ s7[zi(0)]; x[1] = z[0] ^ s5[xi(0)] ^ s6[xi(2)] ^ s7[xi(1)] ^ sb8[xi(3)] ^ sb8[zi(2)]; x[2] = z[1] ^ s5[xi(7)] ^ s6[xi(6)] ^ s7[xi(5)] ^ sb8[xi(4)] ^ s5[zi(1)]; x[3] = z[3] ^ s5[xi(10)] ^ s6[xi(9)] ^ s7[xi(11)] ^ sb8[xi(8)] ^ s6[zi(3)]; k[4] = s5[xi(3)] ^ s6[xi(2)] ^ s7[xi(12)] ^ sb8[xi(13)] ^ s5[xi(8)]; k[5] = s5[xi(1)] ^ s6[xi(0)] ^ s7[xi(14)] ^ sb8[xi(15)] ^ s6[xi(13)]; k[6] = s5[xi(7)] ^ s6[xi(6)] ^ s7[xi(8)] ^ sb8[xi(9)] ^ s7[xi(3)]; k[7] = s5[xi(5)] ^ s6[xi(4)] ^ s7[xi(10)] ^ sb8[xi(11)] ^ sb8[xi(7)]; z[0] = x[0] ^ s5[xi(13)] ^ s6[xi(15)] ^ s7[xi(12)] ^ sb8[xi(14)] ^ s7[xi(8)]; z[1] = x[2] ^ s5[zi(0)] ^ s6[zi(2)] ^ s7[zi(1)] ^ sb8[zi(3)] ^ sb8[xi(10)]; z[2] = x[3] ^ s5[zi(7)] ^ s6[zi(6)] ^ s7[zi(5)] ^ sb8[zi(4)] ^ s5[xi(9)]; z[3] = x[1] ^ s5[zi(10)] ^ s6[zi(9)] ^ s7[zi(11)] ^ sb8[zi(8)] ^ s6[xi(11)]; k[8] = s5[zi(3)] ^ s6[zi(2)] ^ s7[zi(12)] ^ sb8[zi(13)] ^ s5[zi(9)]; k[9] = s5[zi(1)] ^ s6[zi(0)] ^ s7[zi(14)] ^ sb8[zi(15)] ^ s6[zi(12)]; k[10] = s5[zi(7)] ^ s6[zi(6)] ^ s7[zi(8)] ^ sb8[zi(9)] ^ s7[zi(2)]; k[11] = s5[zi(5)] ^ s6[zi(4)] ^ s7[zi(10)] ^ sb8[zi(11)] ^ sb8[zi(6)]; x[0] = z[2] ^ s5[zi(5)] ^ s6[zi(7)] ^ s7[zi(4)] ^ sb8[zi(6)] ^ s7[zi(0)]; x[1] = z[0] ^ s5[xi(0)] ^ s6[xi(2)] ^ s7[xi(1)] ^ sb8[xi(3)] ^ sb8[zi(2)]; x[2] = z[1] ^ s5[xi(7)] ^ s6[xi(6)] ^ s7[xi(5)] ^ sb8[xi(4)] ^ s5[zi(1)]; x[3] = z[3] ^ s5[xi(10)] ^ s6[xi(9)] ^ s7[xi(11)] ^ sb8[xi(8)] ^ s6[zi(3)]; k[12] = s5[xi(8)] ^ s6[xi(9)] ^ s7[xi(7)] ^ sb8[xi(6)] ^ s5[xi(3)]; k[13] = s5[xi(10)] ^ s6[xi(11)] ^ s7[xi(5)] ^ sb8[xi(4)] ^ s6[xi(7)]; k[14] = s5[xi(12)] ^ s6[xi(13)] ^ s7[xi(3)] ^ sb8[xi(2)] ^ s7[xi(8)]; k[15] = s5[xi(14)] ^ s6[xi(15)] ^ s7[xi(1)] ^ sb8[xi(0)] ^ sb8[xi(13)]; #undef xi #undef zi } int cast5_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int key_len) { struct cast5_ctx *c = crypto_tfm_ctx(tfm); int i; u32 x[4]; u32 z[4]; u32 k[16]; __be32 p_key[4]; c->rr = key_len <= 10 ? 1 : 0; memset(p_key, 0, 16); memcpy(p_key, key, key_len); x[0] = be32_to_cpu(p_key[0]); x[1] = be32_to_cpu(p_key[1]); x[2] = be32_to_cpu(p_key[2]); x[3] = be32_to_cpu(p_key[3]); key_schedule(x, z, k); for (i = 0; i < 16; i++) c->Km[i] = k[i]; key_schedule(x, z, k); for (i = 0; i < 16; i++) c->Kr[i] = k[i] & 0x1f; return 0; } EXPORT_SYMBOL_GPL(cast5_setkey); static struct crypto_alg alg = { .cra_name = "cast5", .cra_driver_name = "cast5-generic", .cra_priority = 100, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = CAST5_BLOCK_SIZE, .cra_ctxsize = sizeof(struct cast5_ctx), .cra_module = THIS_MODULE, .cra_u = { .cipher = { .cia_min_keysize = CAST5_MIN_KEY_SIZE, .cia_max_keysize = CAST5_MAX_KEY_SIZE, .cia_setkey = cast5_setkey, .cia_encrypt = cast5_encrypt, .cia_decrypt = cast5_decrypt } } }; static int __init cast5_mod_init(void) { return crypto_register_alg(&alg); } static void __exit cast5_mod_fini(void) { crypto_unregister_alg(&alg); } subsys_initcall(cast5_mod_init); module_exit(cast5_mod_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Cast5 Cipher Algorithm"); MODULE_ALIAS_CRYPTO("cast5"); MODULE_ALIAS_CRYPTO("cast5-generic");
4195 165 1691 1693 1693 208 230 423 647 753 754 39 225 187 187 2221 1349 642 239 399 73 445 516 3 187 187 187 185 170 4141 2474 941 566 187 273 1419 880 23 859 15 5 10 895 3 3 1317 418 901 2 6 6 1063 1059 42 1064 876 798 878 102 878 798 877 15 7 1 217 356 356 234 234 233 11 213 877 795 96 710 356 876 2 879 871 873 100 4 878 876 878 159 877 877 877 868 356 245 213 875 875 873 148 879 877 859 224 879 877 238 238 44 215 222 49 877 29 3 877 876 224 859 856 859 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io */ #include <uapi/linux/btf.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/math64.h> #include <linux/string.h> #define verbose(env, fmt, args...) bpf_verifier_log_write(env, fmt, ##args) static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) { /* ubuf and len_total should both be specified (or not) together */ if (!!log->ubuf != !!log->len_total) return false; /* log buf without log_level is meaningless */ if (log->ubuf && log->level == 0) return false; if (log->level & ~BPF_LOG_MASK) return false; if (log->len_total > UINT_MAX >> 2) return false; return true; } int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, char __user *log_buf, u32 log_size) { log->level = log_level; log->ubuf = log_buf; log->len_total = log_size; /* log attributes have to be sane */ if (!bpf_verifier_log_attr_valid(log)) return -EINVAL; return 0; } static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len) { /* add_len includes terminal \0, so no need for +1. */ u64 len = log->end_pos + add_len; /* log->len_max could be larger than our current len due to * bpf_vlog_reset() calls, so we maintain the max of any length at any * previous point */ if (len > UINT_MAX) log->len_max = UINT_MAX; else if (len > log->len_max) log->len_max = len; } void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args) { u64 cur_pos; u32 new_n, n; n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); if (log->level == BPF_LOG_KERNEL) { bool newline = n > 0 && log->kbuf[n - 1] == '\n'; pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n"); return; } n += 1; /* include terminating zero */ bpf_vlog_update_len_max(log, n); if (log->level & BPF_LOG_FIXED) { /* check if we have at least something to put into user buf */ new_n = 0; if (log->end_pos < log->len_total) { new_n = min_t(u32, log->len_total - log->end_pos, n); log->kbuf[new_n - 1] = '\0'; } cur_pos = log->end_pos; log->end_pos += n - 1; /* don't count terminating '\0' */ if (log->ubuf && new_n && copy_to_user(log->ubuf + cur_pos, log->kbuf, new_n)) goto fail; } else { u64 new_end, new_start; u32 buf_start, buf_end; new_end = log->end_pos + n; if (new_end - log->start_pos >= log->len_total) new_start = new_end - log->len_total; else new_start = log->start_pos; log->start_pos = new_start; log->end_pos = new_end - 1; /* don't count terminating '\0' */ if (!log->ubuf) return; new_n = min(n, log->len_total); cur_pos = new_end - new_n; div_u64_rem(cur_pos, log->len_total, &buf_start); div_u64_rem(new_end, log->len_total, &buf_end); /* new_end and buf_end are exclusive indices, so if buf_end is * exactly zero, then it actually points right to the end of * ubuf and there is no wrap around */ if (buf_end == 0) buf_end = log->len_total; /* if buf_start > buf_end, we wrapped around; * if buf_start == buf_end, then we fill ubuf completely; we * can't have buf_start == buf_end to mean that there is * nothing to write, because we always write at least * something, even if terminal '\0' */ if (buf_start < buf_end) { /* message fits within contiguous chunk of ubuf */ if (copy_to_user(log->ubuf + buf_start, log->kbuf + n - new_n, buf_end - buf_start)) goto fail; } else { /* message wraps around the end of ubuf, copy in two chunks */ if (copy_to_user(log->ubuf + buf_start, log->kbuf + n - new_n, log->len_total - buf_start)) goto fail; if (copy_to_user(log->ubuf, log->kbuf + n - buf_end, buf_end)) goto fail; } } return; fail: log->ubuf = NULL; } void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos) { char zero = 0; u32 pos; if (WARN_ON_ONCE(new_pos > log->end_pos)) return; if (!bpf_verifier_log_needed(log) || log->level == BPF_LOG_KERNEL) return; /* if position to which we reset is beyond current log window, * then we didn't preserve any useful content and should adjust * start_pos to end up with an empty log (start_pos == end_pos) */ log->end_pos = new_pos; if (log->end_pos < log->start_pos) log->start_pos = log->end_pos; if (!log->ubuf) return; if (log->level & BPF_LOG_FIXED) pos = log->end_pos + 1; else div_u64_rem(new_pos, log->len_total, &pos); if (pos < log->len_total && put_user(zero, log->ubuf + pos)) log->ubuf = NULL; } static void bpf_vlog_reverse_kbuf(char *buf, int len) { int i, j; for (i = 0, j = len - 1; i < j; i++, j--) swap(buf[i], buf[j]); } static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int end) { /* we split log->kbuf into two equal parts for both ends of array */ int n = sizeof(log->kbuf) / 2, nn; char *lbuf = log->kbuf, *rbuf = log->kbuf + n; /* Read ubuf's section [start, end) two chunks at a time, from left * and right side; within each chunk, swap all the bytes; after that * reverse the order of lbuf and rbuf and write result back to ubuf. * This way we'll end up with swapped contents of specified * [start, end) ubuf segment. */ while (end - start > 1) { nn = min(n, (end - start ) / 2); if (copy_from_user(lbuf, log->ubuf + start, nn)) return -EFAULT; if (copy_from_user(rbuf, log->ubuf + end - nn, nn)) return -EFAULT; bpf_vlog_reverse_kbuf(lbuf, nn); bpf_vlog_reverse_kbuf(rbuf, nn); /* we write lbuf to the right end of ubuf, while rbuf to the * left one to end up with properly reversed overall ubuf */ if (copy_to_user(log->ubuf + start, rbuf, nn)) return -EFAULT; if (copy_to_user(log->ubuf + end - nn, lbuf, nn)) return -EFAULT; start += nn; end -= nn; } return 0; } int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual) { u32 sublen; int err; *log_size_actual = 0; if (!log || log->level == 0 || log->level == BPF_LOG_KERNEL) return 0; if (!log->ubuf) goto skip_log_rotate; /* If we never truncated log, there is nothing to move around. */ if (log->start_pos == 0) goto skip_log_rotate; /* Otherwise we need to rotate log contents to make it start from the * buffer beginning and be a continuous zero-terminated string. Note * that if log->start_pos != 0 then we definitely filled up entire log * buffer with no gaps, and we just need to shift buffer contents to * the left by (log->start_pos % log->len_total) bytes. * * Unfortunately, user buffer could be huge and we don't want to * allocate temporary kernel memory of the same size just to shift * contents in a straightforward fashion. Instead, we'll be clever and * do in-place array rotation. This is a leetcode-style problem, which * could be solved by three rotations. * * Let's say we have log buffer that has to be shifted left by 7 bytes * (spaces and vertical bar is just for demonstrative purposes): * E F G H I J K | A B C D * * First, we reverse entire array: * D C B A | K J I H G F E * * Then we rotate first 4 bytes (DCBA) and separately last 7 bytes * (KJIHGFE), resulting in a properly rotated array: * A B C D | E F G H I J K * * We'll utilize log->kbuf to read user memory chunk by chunk, swap * bytes, and write them back. Doing it byte-by-byte would be * unnecessarily inefficient. Altogether we are going to read and * write each byte twice, for total 4 memory copies between kernel and * user space. */ /* length of the chopped off part that will be the beginning; * len(ABCD) in the example above */ div_u64_rem(log->start_pos, log->len_total, &sublen); sublen = log->len_total - sublen; err = bpf_vlog_reverse_ubuf(log, 0, log->len_total); err = err ?: bpf_vlog_reverse_ubuf(log, 0, sublen); err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total); if (err) log->ubuf = NULL; skip_log_rotate: *log_size_actual = log->len_max; /* properly initialized log has either both ubuf!=NULL and len_total>0 * or ubuf==NULL and len_total==0, so if this condition doesn't hold, * we got a fault somewhere along the way, so report it back */ if (!!log->ubuf != !!log->len_total) return -EFAULT; /* did truncation actually happen? */ if (log->ubuf && log->len_max > log->len_total) return -ENOSPC; return 0; } /* log_level controls verbosity level of eBPF verifier. * bpf_verifier_log_write() is used to dump the verification trace to the log, * so the user can figure out what's wrong with the program */ __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, const char *fmt, ...) { va_list args; if (!bpf_verifier_log_needed(&env->log)) return; va_start(args, fmt); bpf_verifier_vlog(&env->log, fmt, args); va_end(args); } EXPORT_SYMBOL_GPL(bpf_verifier_log_write); __printf(2, 3) void bpf_log(struct bpf_verifier_log *log, const char *fmt, ...) { va_list args; if (!bpf_verifier_log_needed(log)) return; va_start(args, fmt); bpf_verifier_vlog(log, fmt, args); va_end(args); } EXPORT_SYMBOL_GPL(bpf_log); static const struct bpf_line_info * find_linfo(const struct bpf_verifier_env *env, u32 insn_off) { const struct bpf_line_info *linfo; const struct bpf_prog *prog; u32 nr_linfo; int l, r, m; prog = env->prog; nr_linfo = prog->aux->nr_linfo; if (!nr_linfo || insn_off >= prog->len) return NULL; linfo = prog->aux->linfo; /* Loop invariant: linfo[l].insn_off <= insns_off. * linfo[0].insn_off == 0 which always satisfies above condition. * Binary search is searching for rightmost linfo entry that satisfies * the above invariant, giving us the desired record that covers given * instruction offset. */ l = 0; r = nr_linfo - 1; while (l < r) { /* (r - l + 1) / 2 means we break a tie to the right, so if: * l=1, r=2, linfo[l].insn_off <= insn_off, linfo[r].insn_off > insn_off, * then m=2, we see that linfo[m].insn_off > insn_off, and so * r becomes 1 and we exit the loop with correct l==1. * If the tie was broken to the left, m=1 would end us up in * an endless loop where l and m stay at 1 and r stays at 2. */ m = l + (r - l + 1) / 2; if (linfo[m].insn_off <= insn_off) l = m; else r = m - 1; } return &linfo[l]; } static const char *ltrim(const char *s) { while (isspace(*s)) s++; return s; } __printf(3, 4) void verbose_linfo(struct bpf_verifier_env *env, u32 insn_off, const char *prefix_fmt, ...) { const struct bpf_line_info *linfo, *prev_linfo; const struct btf *btf; const char *s, *fname; if (!bpf_verifier_log_needed(&env->log)) return; prev_linfo = env->prev_linfo; linfo = find_linfo(env, insn_off); if (!linfo || linfo == prev_linfo) return; /* It often happens that two separate linfo records point to the same * source code line, but have differing column numbers. Given verifier * log doesn't emit column information, from user perspective we just * end up emitting the same source code line twice unnecessarily. * So instead check that previous and current linfo record point to * the same file (file_name_offs match) and the same line number, and * avoid emitting duplicated source code line in such case. */ if (prev_linfo && linfo->file_name_off == prev_linfo->file_name_off && BPF_LINE_INFO_LINE_NUM(linfo->line_col) == BPF_LINE_INFO_LINE_NUM(prev_linfo->line_col)) return; if (prefix_fmt) { va_list args; va_start(args, prefix_fmt); bpf_verifier_vlog(&env->log, prefix_fmt, args); va_end(args); } btf = env->prog->aux->btf; s = ltrim(btf_name_by_offset(btf, linfo->line_off)); verbose(env, "%s", s); /* source code line */ s = btf_name_by_offset(btf, linfo->file_name_off); /* leave only file name */ fname = strrchr(s, '/'); fname = fname ? fname + 1 : s; verbose(env, " @ %s:%u\n", fname, BPF_LINE_INFO_LINE_NUM(linfo->line_col)); env->prev_linfo = linfo; } static const char *btf_type_name(const struct btf *btf, u32 id) { return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); } /* string representation of 'enum bpf_reg_type' * * Note that reg_type_str() can not appear more than once in a single verbose() * statement. */ const char *reg_type_str(struct bpf_verifier_env *env, enum bpf_reg_type type) { char postfix[16] = {0}, prefix[64] = {0}; static const char * const str[] = { [NOT_INIT] = "?", [SCALAR_VALUE] = "scalar", [PTR_TO_CTX] = "ctx", [CONST_PTR_TO_MAP] = "map_ptr", [PTR_TO_MAP_VALUE] = "map_value", [PTR_TO_STACK] = "fp", [PTR_TO_PACKET] = "pkt", [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", [PTR_TO_FLOW_KEYS] = "flow_keys", [PTR_TO_SOCKET] = "sock", [PTR_TO_SOCK_COMMON] = "sock_common", [PTR_TO_TCP_SOCK] = "tcp_sock", [PTR_TO_TP_BUFFER] = "tp_buffer", [PTR_TO_XDP_SOCK] = "xdp_sock", [PTR_TO_BTF_ID] = "ptr_", [PTR_TO_MEM] = "mem", [PTR_TO_ARENA] = "arena", [PTR_TO_BUF] = "buf", [PTR_TO_FUNC] = "func", [PTR_TO_MAP_KEY] = "map_key", [CONST_PTR_TO_DYNPTR] = "dynptr_ptr", }; if (type & PTR_MAYBE_NULL) { if (base_type(type) == PTR_TO_BTF_ID) strscpy(postfix, "or_null_"); else strscpy(postfix, "_or_null"); } snprintf(prefix, sizeof(prefix), "%s%s%s%s%s%s%s", type & MEM_RDONLY ? "rdonly_" : "", type & MEM_RINGBUF ? "ringbuf_" : "", type & MEM_USER ? "user_" : "", type & MEM_PERCPU ? "percpu_" : "", type & MEM_RCU ? "rcu_" : "", type & PTR_UNTRUSTED ? "untrusted_" : "", type & PTR_TRUSTED ? "trusted_" : "" ); snprintf(env->tmp_str_buf, TMP_STR_BUF_LEN, "%s%s%s", prefix, str[base_type(type)], postfix); return env->tmp_str_buf; } const char *dynptr_type_str(enum bpf_dynptr_type type) { switch (type) { case BPF_DYNPTR_TYPE_LOCAL: return "local"; case BPF_DYNPTR_TYPE_RINGBUF: return "ringbuf"; case BPF_DYNPTR_TYPE_SKB: return "skb"; case BPF_DYNPTR_TYPE_XDP: return "xdp"; case BPF_DYNPTR_TYPE_INVALID: return "<invalid>"; default: WARN_ONCE(1, "unknown dynptr type %d\n", type); return "<unknown>"; } } const char *iter_type_str(const struct btf *btf, u32 btf_id) { if (!btf || btf_id == 0) return "<invalid>"; /* we already validated that type is valid and has conforming name */ return btf_type_name(btf, btf_id) + sizeof(ITER_PREFIX) - 1; } const char *iter_state_str(enum bpf_iter_state state) { switch (state) { case BPF_ITER_STATE_ACTIVE: return "active"; case BPF_ITER_STATE_DRAINED: return "drained"; case BPF_ITER_STATE_INVALID: return "<invalid>"; default: WARN_ONCE(1, "unknown iter state %d\n", state); return "<unknown>"; } } static char slot_type_char[] = { [STACK_INVALID] = '?', [STACK_SPILL] = 'r', [STACK_MISC] = 'm', [STACK_ZERO] = '0', [STACK_DYNPTR] = 'd', [STACK_ITER] = 'i', [STACK_IRQ_FLAG] = 'f' }; static void print_liveness(struct bpf_verifier_env *env, enum bpf_reg_liveness live) { if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE)) verbose(env, "_"); if (live & REG_LIVE_READ) verbose(env, "r"); if (live & REG_LIVE_WRITTEN) verbose(env, "w"); if (live & REG_LIVE_DONE) verbose(env, "D"); } #define UNUM_MAX_DECIMAL U16_MAX #define SNUM_MAX_DECIMAL S16_MAX #define SNUM_MIN_DECIMAL S16_MIN static bool is_unum_decimal(u64 num) { return num <= UNUM_MAX_DECIMAL; } static bool is_snum_decimal(s64 num) { return num >= SNUM_MIN_DECIMAL && num <= SNUM_MAX_DECIMAL; } static void verbose_unum(struct bpf_verifier_env *env, u64 num) { if (is_unum_decimal(num)) verbose(env, "%llu", num); else verbose(env, "%#llx", num); } static void verbose_snum(struct bpf_verifier_env *env, s64 num) { if (is_snum_decimal(num)) verbose(env, "%lld", num); else verbose(env, "%#llx", num); } int tnum_strn(char *str, size_t size, struct tnum a) { /* print as a constant, if tnum is fully known */ if (a.mask == 0) { if (is_unum_decimal(a.value)) return snprintf(str, size, "%llu", a.value); else return snprintf(str, size, "%#llx", a.value); } return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask); } EXPORT_SYMBOL_GPL(tnum_strn); static void print_scalar_ranges(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, const char **sep) { /* For signed ranges, we want to unify 64-bit and 32-bit values in the * output as much as possible, but there is a bit of a complication. * If we choose to print values as decimals, this is natural to do, * because negative 64-bit and 32-bit values >= -S32_MIN have the same * representation due to sign extension. But if we choose to print * them in hex format (see is_snum_decimal()), then sign extension is * misleading. * E.g., smin=-2 and smin32=-2 are exactly the same in decimal, but in * hex they will be smin=0xfffffffffffffffe and smin32=0xfffffffe, two * very different numbers. * So we avoid sign extension if we choose to print values in hex. */ struct { const char *name; u64 val; bool omit; } minmaxs[] = { {"smin", reg->smin_value, reg->smin_value == S64_MIN}, {"smax", reg->smax_value, reg->smax_value == S64_MAX}, {"umin", reg->umin_value, reg->umin_value == 0}, {"umax", reg->umax_value, reg->umax_value == U64_MAX}, {"smin32", is_snum_decimal((s64)reg->s32_min_value) ? (s64)reg->s32_min_value : (u32)reg->s32_min_value, reg->s32_min_value == S32_MIN}, {"smax32", is_snum_decimal((s64)reg->s32_max_value) ? (s64)reg->s32_max_value : (u32)reg->s32_max_value, reg->s32_max_value == S32_MAX}, {"umin32", reg->u32_min_value, reg->u32_min_value == 0}, {"umax32", reg->u32_max_value, reg->u32_max_value == U32_MAX}, }, *m1, *m2, *mend = &minmaxs[ARRAY_SIZE(minmaxs)]; bool neg1, neg2; for (m1 = &minmaxs[0]; m1 < mend; m1++) { if (m1->omit) continue; neg1 = m1->name[0] == 's' && (s64)m1->val < 0; verbose(env, "%s%s=", *sep, m1->name); *sep = ","; for (m2 = m1 + 2; m2 < mend; m2 += 2) { if (m2->omit || m2->val != m1->val) continue; /* don't mix negatives with positives */ neg2 = m2->name[0] == 's' && (s64)m2->val < 0; if (neg2 != neg1) continue; m2->omit = true; verbose(env, "%s=", m2->name); } if (m1->name[0] == 's') verbose_snum(env, m1->val); else verbose_unum(env, m1->val); } } static bool type_is_map_ptr(enum bpf_reg_type t) { switch (base_type(t)) { case CONST_PTR_TO_MAP: case PTR_TO_MAP_KEY: case PTR_TO_MAP_VALUE: return true; default: return false; } } /* * _a stands for append, was shortened to avoid multiline statements below. * This macro is used to output a comma separated list of attributes. */ #define verbose_a(fmt, ...) ({ verbose(env, "%s" fmt, sep, ##__VA_ARGS__); sep = ","; }) static void print_reg_state(struct bpf_verifier_env *env, const struct bpf_func_state *state, const struct bpf_reg_state *reg) { enum bpf_reg_type t; const char *sep = ""; t = reg->type; if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) { verbose_snum(env, reg->var_off.value); return; } verbose(env, "%s", reg_type_str(env, t)); if (t == PTR_TO_ARENA) return; if (t == PTR_TO_STACK) { if (state->frameno != reg->frameno) verbose(env, "[%d]", reg->frameno); if (tnum_is_const(reg->var_off)) { verbose_snum(env, reg->var_off.value + reg->off); return; } } if (base_type(t) == PTR_TO_BTF_ID) verbose(env, "%s", btf_type_name(reg->btf, reg->btf_id)); verbose(env, "("); if (reg->id) verbose_a("id=%d", reg->id & ~BPF_ADD_CONST); if (reg->id & BPF_ADD_CONST) verbose(env, "%+d", reg->off); if (reg->ref_obj_id) verbose_a("ref_obj_id=%d", reg->ref_obj_id); if (type_is_non_owning_ref(reg->type)) verbose_a("%s", "non_own_ref"); if (type_is_map_ptr(t)) { if (reg->map_ptr->name[0]) verbose_a("map=%s", reg->map_ptr->name); verbose_a("ks=%d,vs=%d", reg->map_ptr->key_size, reg->map_ptr->value_size); } if (t != SCALAR_VALUE && reg->off) { verbose_a("off="); verbose_snum(env, reg->off); } if (type_is_pkt_pointer(t)) { verbose_a("r="); verbose_unum(env, reg->range); } if (base_type(t) == PTR_TO_MEM) { verbose_a("sz="); verbose_unum(env, reg->mem_size); } if (t == CONST_PTR_TO_DYNPTR) verbose_a("type=%s", dynptr_type_str(reg->dynptr.type)); if (tnum_is_const(reg->var_off)) { /* a pointer register with fixed offset */ if (reg->var_off.value) { verbose_a("imm="); verbose_snum(env, reg->var_off.value); } } else { print_scalar_ranges(env, reg, &sep); if (!tnum_is_unknown(reg->var_off)) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose_a("var_off=%s", tn_buf); } } verbose(env, ")"); } void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, u32 frameno, bool print_all) { const struct bpf_func_state *state = vstate->frame[frameno]; const struct bpf_reg_state *reg; int i; if (state->frameno) verbose(env, " frame%d:", state->frameno); for (i = 0; i < MAX_BPF_REG; i++) { reg = &state->regs[i]; if (reg->type == NOT_INIT) continue; if (!print_all && !reg_scratched(env, i)) continue; verbose(env, " R%d", i); print_liveness(env, reg->live); verbose(env, "="); print_reg_state(env, state, reg); } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { char types_buf[BPF_REG_SIZE + 1]; const char *sep = ""; bool valid = false; u8 slot_type; int j; if (!print_all && !stack_slot_scratched(env, i)) continue; for (j = 0; j < BPF_REG_SIZE; j++) { slot_type = state->stack[i].slot_type[j]; if (slot_type != STACK_INVALID) valid = true; types_buf[j] = slot_type_char[slot_type]; } types_buf[BPF_REG_SIZE] = 0; if (!valid) continue; reg = &state->stack[i].spilled_ptr; switch (state->stack[i].slot_type[BPF_REG_SIZE - 1]) { case STACK_SPILL: /* print MISC/ZERO/INVALID slots above subreg spill */ for (j = 0; j < BPF_REG_SIZE; j++) if (state->stack[i].slot_type[j] == STACK_SPILL) break; types_buf[j] = '\0'; verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); print_liveness(env, reg->live); verbose(env, "=%s", types_buf); print_reg_state(env, state, reg); break; case STACK_DYNPTR: /* skip to main dynptr slot */ i += BPF_DYNPTR_NR_SLOTS - 1; reg = &state->stack[i].spilled_ptr; verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); print_liveness(env, reg->live); verbose(env, "=dynptr_%s(", dynptr_type_str(reg->dynptr.type)); if (reg->id) verbose_a("id=%d", reg->id); if (reg->ref_obj_id) verbose_a("ref_id=%d", reg->ref_obj_id); if (reg->dynptr_id) verbose_a("dynptr_id=%d", reg->dynptr_id); verbose(env, ")"); break; case STACK_ITER: /* only main slot has ref_obj_id set; skip others */ if (!reg->ref_obj_id) continue; verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); print_liveness(env, reg->live); verbose(env, "=iter_%s(ref_id=%d,state=%s,depth=%u)", iter_type_str(reg->iter.btf, reg->iter.btf_id), reg->ref_obj_id, iter_state_str(reg->iter.state), reg->iter.depth); break; case STACK_MISC: case STACK_ZERO: default: verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); print_liveness(env, reg->live); verbose(env, "=%s", types_buf); break; } } if (vstate->acquired_refs && vstate->refs[0].id) { verbose(env, " refs=%d", vstate->refs[0].id); for (i = 1; i < vstate->acquired_refs; i++) if (vstate->refs[i].id) verbose(env, ",%d", vstate->refs[i].id); } if (state->in_callback_fn) verbose(env, " cb"); if (state->in_async_callback_fn) verbose(env, " async_cb"); verbose(env, "\n"); if (!print_all) mark_verifier_state_clean(env); } static inline u32 vlog_alignment(u32 pos) { return round_up(max(pos + BPF_LOG_MIN_ALIGNMENT / 2, BPF_LOG_ALIGNMENT), BPF_LOG_MIN_ALIGNMENT) - pos - 1; } void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate, u32 frameno) { if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) { /* remove new line character */ bpf_vlog_reset(&env->log, env->prev_log_pos - 1); verbose(env, "%*c;", vlog_alignment(env->prev_insn_print_pos), ' '); } else { verbose(env, "%d:", env->insn_idx); } print_verifier_state(env, vstate, frameno, false); }
524 357 340 92 10 10 719 51 50 7 7 9 9 8 7 8 8 8 9 9 9 8 8 519 520 15 4 8 528 519 522 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007 Jens Axboe <jens.axboe@oracle.com> * * Scatterlist handling helpers. */ #include <linux/export.h> #include <linux/slab.h> #include <linux/scatterlist.h> #include <linux/highmem.h> #include <linux/kmemleak.h> #include <linux/bvec.h> #include <linux/uio.h> #include <linux/folio_queue.h> /** * sg_next - return the next scatterlist entry in a list * @sg: The current sg entry * * Description: * Usually the next entry will be @sg@ + 1, but if this sg element is part * of a chained scatterlist, it could jump to the start of a new * scatterlist array. * **/ struct scatterlist *sg_next(struct scatterlist *sg) { if (sg_is_last(sg)) return NULL; sg++; if (unlikely(sg_is_chain(sg))) sg = sg_chain_ptr(sg); return sg; } EXPORT_SYMBOL(sg_next); /** * sg_nents - return total count of entries in scatterlist * @sg: The scatterlist * * Description: * Allows to know how many entries are in sg, taking into account * chaining as well * **/ int sg_nents(struct scatterlist *sg) { int nents; for (nents = 0; sg; sg = sg_next(sg)) nents++; return nents; } EXPORT_SYMBOL(sg_nents); /** * sg_nents_for_len - return total count of entries in scatterlist * needed to satisfy the supplied length * @sg: The scatterlist * @len: The total required length * * Description: * Determines the number of entries in sg that are required to meet * the supplied length, taking into account chaining as well * * Returns: * the number of sg entries needed, negative error on failure * **/ int sg_nents_for_len(struct scatterlist *sg, u64 len) { int nents; u64 total; if (!len) return 0; for (nents = 0, total = 0; sg; sg = sg_next(sg)) { nents++; total += sg->length; if (total >= len) return nents; } return -EINVAL; } EXPORT_SYMBOL(sg_nents_for_len); /** * sg_last - return the last scatterlist entry in a list * @sgl: First entry in the scatterlist * @nents: Number of entries in the scatterlist * * Description: * Should only be used casually, it (currently) scans the entire list * to get the last entry. * * Note that the @sgl@ pointer passed in need not be the first one, * the important bit is that @nents@ denotes the number of entries that * exist from @sgl@. * **/ struct scatterlist *sg_last(struct scatterlist *sgl, unsigned int nents) { struct scatterlist *sg, *ret = NULL; unsigned int i; for_each_sg(sgl, sg, nents, i) ret = sg; BUG_ON(!sg_is_last(ret)); return ret; } EXPORT_SYMBOL(sg_last); /** * sg_init_table - Initialize SG table * @sgl: The SG table * @nents: Number of entries in table * * Notes: * If this is part of a chained sg table, sg_mark_end() should be * used only on the last table part. * **/ void sg_init_table(struct scatterlist *sgl, unsigned int nents) { memset(sgl, 0, sizeof(*sgl) * nents); sg_init_marker(sgl, nents); } EXPORT_SYMBOL(sg_init_table); /** * sg_init_one - Initialize a single entry sg list * @sg: SG entry * @buf: Virtual address for IO * @buflen: IO length * **/ void sg_init_one(struct scatterlist *sg, const void *buf, unsigned int buflen) { sg_init_table(sg, 1); sg_set_buf(sg, buf, buflen); } EXPORT_SYMBOL(sg_init_one); /* * The default behaviour of sg_alloc_table() is to use these kmalloc/kfree * helpers. */ static struct scatterlist *sg_kmalloc(unsigned int nents, gfp_t gfp_mask) { if (nents == SG_MAX_SINGLE_ALLOC) { /* * Kmemleak doesn't track page allocations as they are not * commonly used (in a raw form) for kernel data structures. * As we chain together a list of pages and then a normal * kmalloc (tracked by kmemleak), in order to for that last * allocation not to become decoupled (and thus a * false-positive) we need to inform kmemleak of all the * intermediate allocations. */ void *ptr = (void *) __get_free_page(gfp_mask); kmemleak_alloc(ptr, PAGE_SIZE, 1, gfp_mask); return ptr; } else return kmalloc_array(nents, sizeof(struct scatterlist), gfp_mask); } static void sg_kfree(struct scatterlist *sg, unsigned int nents) { if (nents == SG_MAX_SINGLE_ALLOC) { kmemleak_free(sg); free_page((unsigned long) sg); } else kfree(sg); } /** * __sg_free_table - Free a previously mapped sg table * @table: The sg table header to use * @max_ents: The maximum number of entries per single scatterlist * @nents_first_chunk: Number of entries int the (preallocated) first * scatterlist chunk, 0 means no such preallocated first chunk * @free_fn: Free function * @num_ents: Number of entries in the table * * Description: * Free an sg table previously allocated and setup with * __sg_alloc_table(). The @max_ents value must be identical to * that previously used with __sg_alloc_table(). * **/ void __sg_free_table(struct sg_table *table, unsigned int max_ents, unsigned int nents_first_chunk, sg_free_fn *free_fn, unsigned int num_ents) { struct scatterlist *sgl, *next; unsigned curr_max_ents = nents_first_chunk ?: max_ents; if (unlikely(!table->sgl)) return; sgl = table->sgl; while (num_ents) { unsigned int alloc_size = num_ents; unsigned int sg_size; /* * If we have more than max_ents segments left, * then assign 'next' to the sg table after the current one. * sg_size is then one less than alloc size, since the last * element is the chain pointer. */ if (alloc_size > curr_max_ents) { next = sg_chain_ptr(&sgl[curr_max_ents - 1]); alloc_size = curr_max_ents; sg_size = alloc_size - 1; } else { sg_size = alloc_size; next = NULL; } num_ents -= sg_size; if (nents_first_chunk) nents_first_chunk = 0; else free_fn(sgl, alloc_size); sgl = next; curr_max_ents = max_ents; } table->sgl = NULL; } EXPORT_SYMBOL(__sg_free_table); /** * sg_free_append_table - Free a previously allocated append sg table. * @table: The mapped sg append table header * **/ void sg_free_append_table(struct sg_append_table *table) { __sg_free_table(&table->sgt, SG_MAX_SINGLE_ALLOC, 0, sg_kfree, table->total_nents); } EXPORT_SYMBOL(sg_free_append_table); /** * sg_free_table - Free a previously allocated sg table * @table: The mapped sg table header * **/ void sg_free_table(struct sg_table *table) { __sg_free_table(table, SG_MAX_SINGLE_ALLOC, 0, sg_kfree, table->orig_nents); } EXPORT_SYMBOL(sg_free_table); /** * __sg_alloc_table - Allocate and initialize an sg table with given allocator * @table: The sg table header to use * @nents: Number of entries in sg list * @max_ents: The maximum number of entries the allocator returns per call * @first_chunk: first SGL if preallocated (may be %NULL) * @nents_first_chunk: Number of entries in the (preallocated) first * scatterlist chunk, 0 means no such preallocated chunk provided by user * @gfp_mask: GFP allocation mask * @alloc_fn: Allocator to use * * Description: * This function returns a @table @nents long. The allocator is * defined to return scatterlist chunks of maximum size @max_ents. * Thus if @nents is bigger than @max_ents, the scatterlists will be * chained in units of @max_ents. * * Notes: * If this function returns non-0 (eg failure), the caller must call * __sg_free_table() to cleanup any leftover allocations. * **/ int __sg_alloc_table(struct sg_table *table, unsigned int nents, unsigned int max_ents, struct scatterlist *first_chunk, unsigned int nents_first_chunk, gfp_t gfp_mask, sg_alloc_fn *alloc_fn) { struct scatterlist *sg, *prv; unsigned int left; unsigned curr_max_ents = nents_first_chunk ?: max_ents; unsigned prv_max_ents; memset(table, 0, sizeof(*table)); if (nents == 0) return -EINVAL; #ifdef CONFIG_ARCH_NO_SG_CHAIN if (WARN_ON_ONCE(nents > max_ents)) return -EINVAL; #endif left = nents; prv = NULL; do { unsigned int sg_size, alloc_size = left; if (alloc_size > curr_max_ents) { alloc_size = curr_max_ents; sg_size = alloc_size - 1; } else sg_size = alloc_size; left -= sg_size; if (first_chunk) { sg = first_chunk; first_chunk = NULL; } else { sg = alloc_fn(alloc_size, gfp_mask); } if (unlikely(!sg)) { /* * Adjust entry count to reflect that the last * entry of the previous table won't be used for * linkage. Without this, sg_kfree() may get * confused. */ if (prv) table->nents = ++table->orig_nents; return -ENOMEM; } sg_init_table(sg, alloc_size); table->nents = table->orig_nents += sg_size; /* * If this is the first mapping, assign the sg table header. * If this is not the first mapping, chain previous part. */ if (prv) sg_chain(prv, prv_max_ents, sg); else table->sgl = sg; /* * If no more entries after this one, mark the end */ if (!left) sg_mark_end(&sg[sg_size - 1]); prv = sg; prv_max_ents = curr_max_ents; curr_max_ents = max_ents; } while (left); return 0; } EXPORT_SYMBOL(__sg_alloc_table); /** * sg_alloc_table - Allocate and initialize an sg table * @table: The sg table header to use * @nents: Number of entries in sg list * @gfp_mask: GFP allocation mask * * Description: * Allocate and initialize an sg table. If @nents@ is larger than * SG_MAX_SINGLE_ALLOC a chained sg table will be setup. * **/ int sg_alloc_table(struct sg_table *table, unsigned int nents, gfp_t gfp_mask) { int ret; ret = __sg_alloc_table(table, nents, SG_MAX_SINGLE_ALLOC, NULL, 0, gfp_mask, sg_kmalloc); if (unlikely(ret)) sg_free_table(table); return ret; } EXPORT_SYMBOL(sg_alloc_table); static struct scatterlist *get_next_sg(struct sg_append_table *table, struct scatterlist *cur, unsigned long needed_sges, gfp_t gfp_mask) { struct scatterlist *new_sg, *next_sg; unsigned int alloc_size; if (cur) { next_sg = sg_next(cur); /* Check if last entry should be keeped for chainning */ if (!sg_is_last(next_sg) || needed_sges == 1) return next_sg; } alloc_size = min_t(unsigned long, needed_sges, SG_MAX_SINGLE_ALLOC); new_sg = sg_kmalloc(alloc_size, gfp_mask); if (!new_sg) return ERR_PTR(-ENOMEM); sg_init_table(new_sg, alloc_size); if (cur) { table->total_nents += alloc_size - 1; __sg_chain(next_sg, new_sg); } else { table->sgt.sgl = new_sg; table->total_nents = alloc_size; } return new_sg; } static bool pages_are_mergeable(struct page *a, struct page *b) { if (page_to_pfn(a) != page_to_pfn(b) + 1) return false; if (!zone_device_pages_have_same_pgmap(a, b)) return false; return true; } /** * sg_alloc_append_table_from_pages - Allocate and initialize an append sg * table from an array of pages * @sgt_append: The sg append table to use * @pages: Pointer to an array of page pointers * @n_pages: Number of pages in the pages array * @offset: Offset from start of the first page to the start of a buffer * @size: Number of valid bytes in the buffer (after offset) * @max_segment: Maximum size of a scatterlist element in bytes * @left_pages: Left pages caller have to set after this call * @gfp_mask: GFP allocation mask * * Description: * In the first call it allocate and initialize an sg table from a list of * pages, else reuse the scatterlist from sgt_append. Contiguous ranges of * the pages are squashed into a single scatterlist entry up to the maximum * size specified in @max_segment. A user may provide an offset at a start * and a size of valid data in a buffer specified by the page array. The * returned sg table is released by sg_free_append_table * * Returns: * 0 on success, negative error on failure * * Notes: * If this function returns non-0 (eg failure), the caller must call * sg_free_append_table() to cleanup any leftover allocations. * * In the fist call, sgt_append must by initialized. */ int sg_alloc_append_table_from_pages(struct sg_append_table *sgt_append, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, unsigned int max_segment, unsigned int left_pages, gfp_t gfp_mask) { unsigned int chunks, cur_page, seg_len, i, prv_len = 0; unsigned int added_nents = 0; struct scatterlist *s = sgt_append->prv; struct page *last_pg; /* * The algorithm below requires max_segment to be aligned to PAGE_SIZE * otherwise it can overshoot. */ max_segment = ALIGN_DOWN(max_segment, PAGE_SIZE); if (WARN_ON(max_segment < PAGE_SIZE)) return -EINVAL; if (IS_ENABLED(CONFIG_ARCH_NO_SG_CHAIN) && sgt_append->prv) return -EOPNOTSUPP; if (sgt_append->prv) { unsigned long next_pfn; if (WARN_ON(offset)) return -EINVAL; /* Merge contiguous pages into the last SG */ prv_len = sgt_append->prv->length; next_pfn = (sg_phys(sgt_append->prv) + prv_len) / PAGE_SIZE; if (page_to_pfn(pages[0]) == next_pfn) { last_pg = pfn_to_page(next_pfn - 1); while (n_pages && pages_are_mergeable(pages[0], last_pg)) { if (sgt_append->prv->length + PAGE_SIZE > max_segment) break; sgt_append->prv->length += PAGE_SIZE; last_pg = pages[0]; pages++; n_pages--; } if (!n_pages) goto out; } } /* compute number of contiguous chunks */ chunks = 1; seg_len = 0; for (i = 1; i < n_pages; i++) { seg_len += PAGE_SIZE; if (seg_len >= max_segment || !pages_are_mergeable(pages[i], pages[i - 1])) { chunks++; seg_len = 0; } } /* merging chunks and putting them into the scatterlist */ cur_page = 0; for (i = 0; i < chunks; i++) { unsigned int j, chunk_size; /* look for the end of the current chunk */ seg_len = 0; for (j = cur_page + 1; j < n_pages; j++) { seg_len += PAGE_SIZE; if (seg_len >= max_segment || !pages_are_mergeable(pages[j], pages[j - 1])) break; } /* Pass how many chunks might be left */ s = get_next_sg(sgt_append, s, chunks - i + left_pages, gfp_mask); if (IS_ERR(s)) { /* * Adjust entry length to be as before function was * called. */ if (sgt_append->prv) sgt_append->prv->length = prv_len; return PTR_ERR(s); } chunk_size = ((j - cur_page) << PAGE_SHIFT) - offset; sg_set_page(s, pages[cur_page], min_t(unsigned long, size, chunk_size), offset); added_nents++; size -= chunk_size; offset = 0; cur_page = j; } sgt_append->sgt.nents += added_nents; sgt_append->sgt.orig_nents = sgt_append->sgt.nents; sgt_append->prv = s; out: if (!left_pages) sg_mark_end(s); return 0; } EXPORT_SYMBOL(sg_alloc_append_table_from_pages); /** * sg_alloc_table_from_pages_segment - Allocate and initialize an sg table from * an array of pages and given maximum * segment. * @sgt: The sg table header to use * @pages: Pointer to an array of page pointers * @n_pages: Number of pages in the pages array * @offset: Offset from start of the first page to the start of a buffer * @size: Number of valid bytes in the buffer (after offset) * @max_segment: Maximum size of a scatterlist element in bytes * @gfp_mask: GFP allocation mask * * Description: * Allocate and initialize an sg table from a list of pages. Contiguous * ranges of the pages are squashed into a single scatterlist node up to the * maximum size specified in @max_segment. A user may provide an offset at a * start and a size of valid data in a buffer specified by the page array. * * The returned sg table is released by sg_free_table. * * Returns: * 0 on success, negative error on failure */ int sg_alloc_table_from_pages_segment(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, unsigned int max_segment, gfp_t gfp_mask) { struct sg_append_table append = {}; int err; err = sg_alloc_append_table_from_pages(&append, pages, n_pages, offset, size, max_segment, 0, gfp_mask); if (err) { sg_free_append_table(&append); return err; } memcpy(sgt, &append.sgt, sizeof(*sgt)); WARN_ON(append.total_nents != sgt->orig_nents); return 0; } EXPORT_SYMBOL(sg_alloc_table_from_pages_segment); #ifdef CONFIG_SGL_ALLOC /** * sgl_alloc_order - allocate a scatterlist and its pages * @length: Length in bytes of the scatterlist. Must be at least one * @order: Second argument for alloc_pages() * @chainable: Whether or not to allocate an extra element in the scatterlist * for scatterlist chaining purposes * @gfp: Memory allocation flags * @nent_p: [out] Number of entries in the scatterlist that have pages * * Returns: A pointer to an initialized scatterlist or %NULL upon failure. */ struct scatterlist *sgl_alloc_order(unsigned long long length, unsigned int order, bool chainable, gfp_t gfp, unsigned int *nent_p) { struct scatterlist *sgl, *sg; struct page *page; unsigned int nent, nalloc; u32 elem_len; nent = round_up(length, PAGE_SIZE << order) >> (PAGE_SHIFT + order); /* Check for integer overflow */ if (length > (nent << (PAGE_SHIFT + order))) return NULL; nalloc = nent; if (chainable) { /* Check for integer overflow */ if (nalloc + 1 < nalloc) return NULL; nalloc++; } sgl = kmalloc_array(nalloc, sizeof(struct scatterlist), gfp & ~GFP_DMA); if (!sgl) return NULL; sg_init_table(sgl, nalloc); sg = sgl; while (length) { elem_len = min_t(u64, length, PAGE_SIZE << order); page = alloc_pages(gfp, order); if (!page) { sgl_free_order(sgl, order); return NULL; } sg_set_page(sg, page, elem_len, 0); length -= elem_len; sg = sg_next(sg); } WARN_ONCE(length, "length = %lld\n", length); if (nent_p) *nent_p = nent; return sgl; } EXPORT_SYMBOL(sgl_alloc_order); /** * sgl_alloc - allocate a scatterlist and its pages * @length: Length in bytes of the scatterlist * @gfp: Memory allocation flags * @nent_p: [out] Number of entries in the scatterlist * * Returns: A pointer to an initialized scatterlist or %NULL upon failure. */ struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, unsigned int *nent_p) { return sgl_alloc_order(length, 0, false, gfp, nent_p); } EXPORT_SYMBOL(sgl_alloc); /** * sgl_free_n_order - free a scatterlist and its pages * @sgl: Scatterlist with one or more elements * @nents: Maximum number of elements to free * @order: Second argument for __free_pages() * * Notes: * - If several scatterlists have been chained and each chain element is * freed separately then it's essential to set nents correctly to avoid that a * page would get freed twice. * - All pages in a chained scatterlist can be freed at once by setting @nents * to a high number. */ void sgl_free_n_order(struct scatterlist *sgl, int nents, int order) { struct scatterlist *sg; struct page *page; int i; for_each_sg(sgl, sg, nents, i) { if (!sg) break; page = sg_page(sg); if (page) __free_pages(page, order); } kfree(sgl); } EXPORT_SYMBOL(sgl_free_n_order); /** * sgl_free_order - free a scatterlist and its pages * @sgl: Scatterlist with one or more elements * @order: Second argument for __free_pages() */ void sgl_free_order(struct scatterlist *sgl, int order) { sgl_free_n_order(sgl, INT_MAX, order); } EXPORT_SYMBOL(sgl_free_order); /** * sgl_free - free a scatterlist and its pages * @sgl: Scatterlist with one or more elements */ void sgl_free(struct scatterlist *sgl) { sgl_free_order(sgl, 0); } EXPORT_SYMBOL(sgl_free); #endif /* CONFIG_SGL_ALLOC */ void __sg_page_iter_start(struct sg_page_iter *piter, struct scatterlist *sglist, unsigned int nents, unsigned long pgoffset) { piter->__pg_advance = 0; piter->__nents = nents; piter->sg = sglist; piter->sg_pgoffset = pgoffset; } EXPORT_SYMBOL(__sg_page_iter_start); static int sg_page_count(struct scatterlist *sg) { return PAGE_ALIGN(sg->offset + sg->length) >> PAGE_SHIFT; } bool __sg_page_iter_next(struct sg_page_iter *piter) { if (!piter->__nents || !piter->sg) return false; piter->sg_pgoffset += piter->__pg_advance; piter->__pg_advance = 1; while (piter->sg_pgoffset >= sg_page_count(piter->sg)) { piter->sg_pgoffset -= sg_page_count(piter->sg); piter->sg = sg_next(piter->sg); if (!--piter->__nents || !piter->sg) return false; } return true; } EXPORT_SYMBOL(__sg_page_iter_next); static int sg_dma_page_count(struct scatterlist *sg) { return PAGE_ALIGN(sg->offset + sg_dma_len(sg)) >> PAGE_SHIFT; } bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter) { struct sg_page_iter *piter = &dma_iter->base; if (!piter->__nents || !piter->sg) return false; piter->sg_pgoffset += piter->__pg_advance; piter->__pg_advance = 1; while (piter->sg_pgoffset >= sg_dma_page_count(piter->sg)) { piter->sg_pgoffset -= sg_dma_page_count(piter->sg); piter->sg = sg_next(piter->sg); if (!--piter->__nents || !piter->sg) return false; } return true; } EXPORT_SYMBOL(__sg_page_iter_dma_next); /** * sg_miter_start - start mapping iteration over a sg list * @miter: sg mapping iter to be started * @sgl: sg list to iterate over * @nents: number of sg entries * @flags: sg iterator flags * * Description: * Starts mapping iterator @miter. * * Context: * Don't care. */ void sg_miter_start(struct sg_mapping_iter *miter, struct scatterlist *sgl, unsigned int nents, unsigned int flags) { memset(miter, 0, sizeof(struct sg_mapping_iter)); __sg_page_iter_start(&miter->piter, sgl, nents, 0); WARN_ON(!(flags & (SG_MITER_TO_SG | SG_MITER_FROM_SG))); miter->__flags = flags; } EXPORT_SYMBOL(sg_miter_start); static bool sg_miter_get_next_page(struct sg_mapping_iter *miter) { if (!miter->__remaining) { struct scatterlist *sg; if (!__sg_page_iter_next(&miter->piter)) return false; sg = miter->piter.sg; miter->__offset = miter->piter.sg_pgoffset ? 0 : sg->offset; miter->piter.sg_pgoffset += miter->__offset >> PAGE_SHIFT; miter->__offset &= PAGE_SIZE - 1; miter->__remaining = sg->offset + sg->length - (miter->piter.sg_pgoffset << PAGE_SHIFT) - miter->__offset; miter->__remaining = min_t(unsigned long, miter->__remaining, PAGE_SIZE - miter->__offset); } return true; } /** * sg_miter_skip - reposition mapping iterator * @miter: sg mapping iter to be skipped * @offset: number of bytes to plus the current location * * Description: * Sets the offset of @miter to its current location plus @offset bytes. * If mapping iterator @miter has been proceeded by sg_miter_next(), this * stops @miter. * * Context: * Don't care. * * Returns: * true if @miter contains the valid mapping. false if end of sg * list is reached. */ bool sg_miter_skip(struct sg_mapping_iter *miter, off_t offset) { sg_miter_stop(miter); while (offset) { off_t consumed; if (!sg_miter_get_next_page(miter)) return false; consumed = min_t(off_t, offset, miter->__remaining); miter->__offset += consumed; miter->__remaining -= consumed; offset -= consumed; } return true; } EXPORT_SYMBOL(sg_miter_skip); /** * sg_miter_next - proceed mapping iterator to the next mapping * @miter: sg mapping iter to proceed * * Description: * Proceeds @miter to the next mapping. @miter should have been started * using sg_miter_start(). On successful return, @miter->page, * @miter->addr and @miter->length point to the current mapping. * * Context: * May sleep if !SG_MITER_ATOMIC. * * Returns: * true if @miter contains the next mapping. false if end of sg * list is reached. */ bool sg_miter_next(struct sg_mapping_iter *miter) { sg_miter_stop(miter); /* * Get to the next page if necessary. * __remaining, __offset is adjusted by sg_miter_stop */ if (!sg_miter_get_next_page(miter)) return false; miter->page = sg_page_iter_page(&miter->piter); miter->consumed = miter->length = miter->__remaining; if (miter->__flags & SG_MITER_ATOMIC) miter->addr = kmap_atomic(miter->page) + miter->__offset; else miter->addr = kmap(miter->page) + miter->__offset; return true; } EXPORT_SYMBOL(sg_miter_next); /** * sg_miter_stop - stop mapping iteration * @miter: sg mapping iter to be stopped * * Description: * Stops mapping iterator @miter. @miter should have been started * using sg_miter_start(). A stopped iteration can be resumed by * calling sg_miter_next() on it. This is useful when resources (kmap) * need to be released during iteration. * * Context: * Don't care otherwise. */ void sg_miter_stop(struct sg_mapping_iter *miter) { WARN_ON(miter->consumed > miter->length); /* drop resources from the last iteration */ if (miter->addr) { miter->__offset += miter->consumed; miter->__remaining -= miter->consumed; if (miter->__flags & SG_MITER_TO_SG) flush_dcache_page(miter->page); if (miter->__flags & SG_MITER_ATOMIC) { WARN_ON_ONCE(!pagefault_disabled()); kunmap_atomic(miter->addr); } else kunmap(miter->page); miter->page = NULL; miter->addr = NULL; miter->length = 0; miter->consumed = 0; } } EXPORT_SYMBOL(sg_miter_stop); /** * sg_copy_buffer - Copy data between a linear buffer and an SG list * @sgl: The SG list * @nents: Number of SG entries * @buf: Where to copy from * @buflen: The number of bytes to copy * @skip: Number of bytes to skip before copying * @to_buffer: transfer direction (true == from an sg list to a * buffer, false == from a buffer to an sg list) * * Returns the number of copied bytes. * **/ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip, bool to_buffer) { unsigned int offset = 0; struct sg_mapping_iter miter; unsigned int sg_flags = SG_MITER_ATOMIC; if (to_buffer) sg_flags |= SG_MITER_FROM_SG; else sg_flags |= SG_MITER_TO_SG; sg_miter_start(&miter, sgl, nents, sg_flags); if (!sg_miter_skip(&miter, skip)) return 0; while ((offset < buflen) && sg_miter_next(&miter)) { unsigned int len; len = min(miter.length, buflen - offset); if (to_buffer) memcpy(buf + offset, miter.addr, len); else memcpy(miter.addr, buf + offset, len); offset += len; } sg_miter_stop(&miter); return offset; } EXPORT_SYMBOL(sg_copy_buffer); /** * sg_copy_from_buffer - Copy from a linear buffer to an SG list * @sgl: The SG list * @nents: Number of SG entries * @buf: Where to copy from * @buflen: The number of bytes to copy * * Returns the number of copied bytes. * **/ size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen) { return sg_copy_buffer(sgl, nents, (void *)buf, buflen, 0, false); } EXPORT_SYMBOL(sg_copy_from_buffer); /** * sg_copy_to_buffer - Copy from an SG list to a linear buffer * @sgl: The SG list * @nents: Number of SG entries * @buf: Where to copy to * @buflen: The number of bytes to copy * * Returns the number of copied bytes. * **/ size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen) { return sg_copy_buffer(sgl, nents, buf, buflen, 0, true); } EXPORT_SYMBOL(sg_copy_to_buffer); /** * sg_pcopy_from_buffer - Copy from a linear buffer to an SG list * @sgl: The SG list * @nents: Number of SG entries * @buf: Where to copy from * @buflen: The number of bytes to copy * @skip: Number of bytes to skip before copying * * Returns the number of copied bytes. * **/ size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen, off_t skip) { return sg_copy_buffer(sgl, nents, (void *)buf, buflen, skip, false); } EXPORT_SYMBOL(sg_pcopy_from_buffer); /** * sg_pcopy_to_buffer - Copy from an SG list to a linear buffer * @sgl: The SG list * @nents: Number of SG entries * @buf: Where to copy to * @buflen: The number of bytes to copy * @skip: Number of bytes to skip before copying * * Returns the number of copied bytes. * **/ size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip) { return sg_copy_buffer(sgl, nents, buf, buflen, skip, true); } EXPORT_SYMBOL(sg_pcopy_to_buffer); /** * sg_zero_buffer - Zero-out a part of a SG list * @sgl: The SG list * @nents: Number of SG entries * @buflen: The number of bytes to zero out * @skip: Number of bytes to skip before zeroing * * Returns the number of bytes zeroed. **/ size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents, size_t buflen, off_t skip) { unsigned int offset = 0; struct sg_mapping_iter miter; unsigned int sg_flags = SG_MITER_ATOMIC | SG_MITER_TO_SG; sg_miter_start(&miter, sgl, nents, sg_flags); if (!sg_miter_skip(&miter, skip)) return false; while (offset < buflen && sg_miter_next(&miter)) { unsigned int len; len = min(miter.length, buflen - offset); memset(miter.addr, 0, len); offset += len; } sg_miter_stop(&miter); return offset; } EXPORT_SYMBOL(sg_zero_buffer); /* * Extract and pin a list of up to sg_max pages from UBUF- or IOVEC-class * iterators, and add them to the scatterlist. */ static ssize_t extract_user_to_sg(struct iov_iter *iter, ssize_t maxsize, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags) { struct scatterlist *sg = sgtable->sgl + sgtable->nents; struct page **pages; unsigned int npages; ssize_t ret = 0, res; size_t len, off; /* We decant the page list into the tail of the scatterlist */ pages = (void *)sgtable->sgl + array_size(sg_max, sizeof(struct scatterlist)); pages -= sg_max; do { res = iov_iter_extract_pages(iter, &pages, maxsize, sg_max, extraction_flags, &off); if (res <= 0) goto failed; len = res; maxsize -= len; ret += len; npages = DIV_ROUND_UP(off + len, PAGE_SIZE); sg_max -= npages; for (; npages > 0; npages--) { struct page *page = *pages; size_t seg = min_t(size_t, PAGE_SIZE - off, len); *pages++ = NULL; sg_set_page(sg, page, seg, off); sgtable->nents++; sg++; len -= seg; off = 0; } } while (maxsize > 0 && sg_max > 0); return ret; failed: while (sgtable->nents > sgtable->orig_nents) unpin_user_page(sg_page(&sgtable->sgl[--sgtable->nents])); return res; } /* * Extract up to sg_max pages from a BVEC-type iterator and add them to the * scatterlist. The pages are not pinned. */ static ssize_t extract_bvec_to_sg(struct iov_iter *iter, ssize_t maxsize, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags) { const struct bio_vec *bv = iter->bvec; struct scatterlist *sg = sgtable->sgl + sgtable->nents; unsigned long start = iter->iov_offset; unsigned int i; ssize_t ret = 0; for (i = 0; i < iter->nr_segs; i++) { size_t off, len; len = bv[i].bv_len; if (start >= len) { start -= len; continue; } len = min_t(size_t, maxsize, len - start); off = bv[i].bv_offset + start; sg_set_page(sg, bv[i].bv_page, len, off); sgtable->nents++; sg++; sg_max--; ret += len; maxsize -= len; if (maxsize <= 0 || sg_max == 0) break; start = 0; } if (ret > 0) iov_iter_advance(iter, ret); return ret; } /* * Extract up to sg_max pages from a KVEC-type iterator and add them to the * scatterlist. This can deal with vmalloc'd buffers as well as kmalloc'd or * static buffers. The pages are not pinned. */ static ssize_t extract_kvec_to_sg(struct iov_iter *iter, ssize_t maxsize, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags) { const struct kvec *kv = iter->kvec; struct scatterlist *sg = sgtable->sgl + sgtable->nents; unsigned long start = iter->iov_offset; unsigned int i; ssize_t ret = 0; for (i = 0; i < iter->nr_segs; i++) { struct page *page; unsigned long kaddr; size_t off, len, seg; len = kv[i].iov_len; if (start >= len) { start -= len; continue; } kaddr = (unsigned long)kv[i].iov_base + start; off = kaddr & ~PAGE_MASK; len = min_t(size_t, maxsize, len - start); kaddr &= PAGE_MASK; maxsize -= len; ret += len; do { seg = min_t(size_t, len, PAGE_SIZE - off); if (is_vmalloc_or_module_addr((void *)kaddr)) page = vmalloc_to_page((void *)kaddr); else page = virt_to_page((void *)kaddr); sg_set_page(sg, page, len, off); sgtable->nents++; sg++; sg_max--; len -= seg; kaddr += PAGE_SIZE; off = 0; } while (len > 0 && sg_max > 0); if (maxsize <= 0 || sg_max == 0) break; start = 0; } if (ret > 0) iov_iter_advance(iter, ret); return ret; } /* * Extract up to sg_max folios from an FOLIOQ-type iterator and add them to * the scatterlist. The pages are not pinned. */ static ssize_t extract_folioq_to_sg(struct iov_iter *iter, ssize_t maxsize, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags) { const struct folio_queue *folioq = iter->folioq; struct scatterlist *sg = sgtable->sgl + sgtable->nents; unsigned int slot = iter->folioq_slot; ssize_t ret = 0; size_t offset = iter->iov_offset; BUG_ON(!folioq); if (slot >= folioq_nr_slots(folioq)) { folioq = folioq->next; if (WARN_ON_ONCE(!folioq)) return 0; slot = 0; } do { struct folio *folio = folioq_folio(folioq, slot); size_t fsize = folioq_folio_size(folioq, slot); if (offset < fsize) { size_t part = umin(maxsize - ret, fsize - offset); sg_set_page(sg, folio_page(folio, 0), part, offset); sgtable->nents++; sg++; sg_max--; offset += part; ret += part; } if (offset >= fsize) { offset = 0; slot++; if (slot >= folioq_nr_slots(folioq)) { if (!folioq->next) { WARN_ON_ONCE(ret < iter->count); break; } folioq = folioq->next; slot = 0; } } } while (sg_max > 0 && ret < maxsize); iter->folioq = folioq; iter->folioq_slot = slot; iter->iov_offset = offset; iter->count -= ret; return ret; } /* * Extract up to sg_max folios from an XARRAY-type iterator and add them to * the scatterlist. The pages are not pinned. */ static ssize_t extract_xarray_to_sg(struct iov_iter *iter, ssize_t maxsize, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags) { struct scatterlist *sg = sgtable->sgl + sgtable->nents; struct xarray *xa = iter->xarray; struct folio *folio; loff_t start = iter->xarray_start + iter->iov_offset; pgoff_t index = start / PAGE_SIZE; ssize_t ret = 0; size_t offset, len; XA_STATE(xas, xa, index); rcu_read_lock(); xas_for_each(&xas, folio, ULONG_MAX) { if (xas_retry(&xas, folio)) continue; if (WARN_ON(xa_is_value(folio))) break; if (WARN_ON(folio_test_hugetlb(folio))) break; offset = offset_in_folio(folio, start); len = min_t(size_t, maxsize, folio_size(folio) - offset); sg_set_page(sg, folio_page(folio, 0), len, offset); sgtable->nents++; sg++; sg_max--; maxsize -= len; ret += len; if (maxsize <= 0 || sg_max == 0) break; } rcu_read_unlock(); if (ret > 0) iov_iter_advance(iter, ret); return ret; } /** * extract_iter_to_sg - Extract pages from an iterator and add to an sglist * @iter: The iterator to extract from * @maxsize: The amount of iterator to copy * @sgtable: The scatterlist table to fill in * @sg_max: Maximum number of elements in @sgtable that may be filled * @extraction_flags: Flags to qualify the request * * Extract the page fragments from the given amount of the source iterator and * add them to a scatterlist that refers to all of those bits, to a maximum * addition of @sg_max elements. * * The pages referred to by UBUF- and IOVEC-type iterators are extracted and * pinned; BVEC-, KVEC-, FOLIOQ- and XARRAY-type are extracted but aren't * pinned; DISCARD-type is not supported. * * No end mark is placed on the scatterlist; that's left to the caller. * * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA * be allowed on the pages extracted. * * If successful, @sgtable->nents is updated to include the number of elements * added and the number of bytes added is returned. @sgtable->orig_nents is * left unaltered. * * The iov_iter_extract_mode() function should be used to query how cleanup * should be performed. */ ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t maxsize, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags) { if (maxsize == 0) return 0; switch (iov_iter_type(iter)) { case ITER_UBUF: case ITER_IOVEC: return extract_user_to_sg(iter, maxsize, sgtable, sg_max, extraction_flags); case ITER_BVEC: return extract_bvec_to_sg(iter, maxsize, sgtable, sg_max, extraction_flags); case ITER_KVEC: return extract_kvec_to_sg(iter, maxsize, sgtable, sg_max, extraction_flags); case ITER_FOLIOQ: return extract_folioq_to_sg(iter, maxsize, sgtable, sg_max, extraction_flags); case ITER_XARRAY: return extract_xarray_to_sg(iter, maxsize, sgtable, sg_max, extraction_flags); default: pr_err("%s(%u) unsupported\n", __func__, iov_iter_type(iter)); WARN_ON_ONCE(1); return -EIO; } } EXPORT_SYMBOL_GPL(extract_iter_to_sg);
75 75 1 1 1 1 151 63 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 // SPDX-License-Identifier: GPL-2.0-or-later /* * Bridge per vlan tunnel port dst_metadata handling code * * Authors: * Roopa Prabhu <roopa@cumulusnetworks.com> */ #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <net/switchdev.h> #include <net/dst_metadata.h> #include "br_private.h" #include "br_private_tunnel.h" static inline int br_vlan_tunid_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct net_bridge_vlan *vle = ptr; __be64 tunid = *(__be64 *)arg->key; return vle->tinfo.tunnel_id != tunid; } static const struct rhashtable_params br_vlan_tunnel_rht_params = { .head_offset = offsetof(struct net_bridge_vlan, tnode), .key_offset = offsetof(struct net_bridge_vlan, tinfo.tunnel_id), .key_len = sizeof(__be64), .nelem_hint = 3, .obj_cmpfn = br_vlan_tunid_cmp, .automatic_shrinking = true, }; static struct net_bridge_vlan *br_vlan_tunnel_lookup(struct rhashtable *tbl, __be64 tunnel_id) { return rhashtable_lookup_fast(tbl, &tunnel_id, br_vlan_tunnel_rht_params); } static void vlan_tunnel_info_release(struct net_bridge_vlan *vlan) { struct metadata_dst *tdst = rtnl_dereference(vlan->tinfo.tunnel_dst); WRITE_ONCE(vlan->tinfo.tunnel_id, 0); RCU_INIT_POINTER(vlan->tinfo.tunnel_dst, NULL); dst_release(&tdst->dst); } void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan) { if (!rcu_access_pointer(vlan->tinfo.tunnel_dst)) return; rhashtable_remove_fast(&vg->tunnel_hash, &vlan->tnode, br_vlan_tunnel_rht_params); vlan_tunnel_info_release(vlan); } static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan, u32 tun_id) { struct metadata_dst *metadata = rtnl_dereference(vlan->tinfo.tunnel_dst); __be64 key = key32_to_tunnel_id(cpu_to_be32(tun_id)); IP_TUNNEL_DECLARE_FLAGS(flags) = { }; int err; if (metadata) return -EEXIST; __set_bit(IP_TUNNEL_KEY_BIT, flags); metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, flags, key, 0); if (!metadata) return -EINVAL; metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_BRIDGE; rcu_assign_pointer(vlan->tinfo.tunnel_dst, metadata); WRITE_ONCE(vlan->tinfo.tunnel_id, key); err = rhashtable_lookup_insert_fast(&vg->tunnel_hash, &vlan->tnode, br_vlan_tunnel_rht_params); if (err) goto out; return 0; out: vlan_tunnel_info_release(vlan); return err; } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. */ int nbp_vlan_tunnel_info_add(const struct net_bridge_port *port, u16 vid, u32 tun_id) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; ASSERT_RTNL(); vg = nbp_vlan_group(port); vlan = br_vlan_find(vg, vid); if (!vlan) return -EINVAL; return __vlan_tunnel_info_add(vg, vlan, tun_id); } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. */ int nbp_vlan_tunnel_info_delete(const struct net_bridge_port *port, u16 vid) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; ASSERT_RTNL(); vg = nbp_vlan_group(port); v = br_vlan_find(vg, vid); if (!v) return -ENOENT; vlan_tunnel_info_del(vg, v); return 0; } static void __vlan_tunnel_info_flush(struct net_bridge_vlan_group *vg) { struct net_bridge_vlan *vlan, *tmp; list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) vlan_tunnel_info_del(vg, vlan); } void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port) { struct net_bridge_vlan_group *vg; ASSERT_RTNL(); vg = nbp_vlan_group(port); __vlan_tunnel_info_flush(vg); } int vlan_tunnel_init(struct net_bridge_vlan_group *vg) { return rhashtable_init(&vg->tunnel_hash, &br_vlan_tunnel_rht_params); } void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg) { rhashtable_destroy(&vg->tunnel_hash); } void br_handle_ingress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_port *p, struct net_bridge_vlan_group *vg) { struct ip_tunnel_info *tinfo = skb_tunnel_info(skb); struct net_bridge_vlan *vlan; if (!vg || !tinfo) return; /* if already tagged, ignore */ if (skb_vlan_tagged(skb)) return; /* lookup vid, given tunnel id */ vlan = br_vlan_tunnel_lookup(&vg->tunnel_hash, tinfo->key.tun_id); if (!vlan) return; skb_dst_drop(skb); __vlan_hwaccel_put_tag(skb, p->br->vlan_proto, vlan->vid); } int br_handle_egress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_vlan *vlan) { IP_TUNNEL_DECLARE_FLAGS(flags) = { }; struct metadata_dst *tunnel_dst; __be64 tunnel_id; int err; if (!vlan) return 0; tunnel_id = READ_ONCE(vlan->tinfo.tunnel_id); if (!tunnel_id || unlikely(!skb_vlan_tag_present(skb))) return 0; skb_dst_drop(skb); err = skb_vlan_pop(skb); if (err) return err; if (BR_INPUT_SKB_CB(skb)->backup_nhid) { __set_bit(IP_TUNNEL_KEY_BIT, flags); tunnel_dst = __ip_tun_set_dst(0, 0, 0, 0, 0, flags, tunnel_id, 0); if (!tunnel_dst) return -ENOMEM; tunnel_dst->u.tun_info.mode |= IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_BRIDGE; tunnel_dst->u.tun_info.key.nhid = BR_INPUT_SKB_CB(skb)->backup_nhid; skb_dst_set(skb, &tunnel_dst->dst); return 0; } tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst); if (tunnel_dst && dst_hold_safe(&tunnel_dst->dst)) skb_dst_set(skb, &tunnel_dst->dst); return 0; }
43 43 27 18 21 19 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 /* * net/tipc/discover.c * * Copyright (c) 2003-2006, 2014-2018, Ericsson AB * Copyright (c) 2005-2006, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "node.h" #include "discover.h" /* min delay during bearer start up */ #define TIPC_DISC_INIT msecs_to_jiffies(125) /* max delay if bearer has no links */ #define TIPC_DISC_FAST msecs_to_jiffies(1000) /* max delay if bearer has links */ #define TIPC_DISC_SLOW msecs_to_jiffies(60000) /* indicates no timer in use */ #define TIPC_DISC_INACTIVE 0xffffffff /** * struct tipc_discoverer - information about an ongoing link setup request * @bearer_id: identity of bearer issuing requests * @net: network namespace instance * @dest: destination address for request messages * @domain: network domain to which links can be established * @num_nodes: number of nodes currently discovered (i.e. with an active link) * @lock: spinlock for controlling access to requests * @skb: request message to be (repeatedly) sent * @timer: timer governing period between requests * @timer_intv: current interval between requests (in ms) */ struct tipc_discoverer { u32 bearer_id; struct tipc_media_addr dest; struct net *net; u32 domain; int num_nodes; spinlock_t lock; struct sk_buff *skb; struct timer_list timer; unsigned long timer_intv; }; /** * tipc_disc_init_msg - initialize a link setup message * @net: the applicable net namespace * @skb: buffer containing message * @mtyp: message type (request or response) * @b: ptr to bearer issuing message */ static void tipc_disc_init_msg(struct net *net, struct sk_buff *skb, u32 mtyp, struct tipc_bearer *b) { struct tipc_net *tn = tipc_net(net); u32 dest_domain = b->domain; struct tipc_msg *hdr; hdr = buf_msg(skb); tipc_msg_init(tn->trial_addr, hdr, LINK_CONFIG, mtyp, MAX_H_SIZE, dest_domain); msg_set_size(hdr, MAX_H_SIZE + NODE_ID_LEN); msg_set_non_seq(hdr, 1); msg_set_node_sig(hdr, tn->random); msg_set_node_capabilities(hdr, TIPC_NODE_CAPABILITIES); msg_set_dest_domain(hdr, dest_domain); msg_set_bc_netid(hdr, tn->net_id); b->media->addr2msg(msg_media_addr(hdr), &b->addr); msg_set_peer_net_hash(hdr, tipc_net_hash_mixes(net, tn->random)); msg_set_node_id(hdr, tipc_own_id(net)); } static void tipc_disc_msg_xmit(struct net *net, u32 mtyp, u32 dst, u32 src, u32 sugg_addr, struct tipc_media_addr *maddr, struct tipc_bearer *b) { struct tipc_msg *hdr; struct sk_buff *skb; skb = tipc_buf_acquire(MAX_H_SIZE + NODE_ID_LEN, GFP_ATOMIC); if (!skb) return; hdr = buf_msg(skb); tipc_disc_init_msg(net, skb, mtyp, b); msg_set_sugg_node_addr(hdr, sugg_addr); msg_set_dest_domain(hdr, dst); tipc_bearer_xmit_skb(net, b->identity, skb, maddr); } /** * disc_dupl_alert - issue node address duplication alert * @b: pointer to bearer detecting duplication * @node_addr: duplicated node address * @media_addr: media address advertised by duplicated node */ static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr, struct tipc_media_addr *media_addr) { char media_addr_str[64]; tipc_media_addr_printf(media_addr_str, sizeof(media_addr_str), media_addr); pr_warn("Duplicate %x using %s seen on <%s>\n", node_addr, media_addr_str, b->name); } /* tipc_disc_addr_trial(): - handle an address uniqueness trial from peer * Returns true if message should be dropped by caller, i.e., if it is a * trial message or we are inside trial period. Otherwise false. */ static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, struct tipc_media_addr *maddr, struct tipc_bearer *b, u32 dst, u32 src, u32 sugg_addr, u8 *peer_id, int mtyp) { struct net *net = d->net; struct tipc_net *tn = tipc_net(net); u32 self = tipc_own_addr(net); bool trial = time_before(jiffies, tn->addr_trial_end) && !self; if (mtyp == DSC_TRIAL_FAIL_MSG) { if (!trial) return true; /* Ignore if somebody else already gave new suggestion */ if (dst != tn->trial_addr) return true; /* Otherwise update trial address and restart trial period */ tn->trial_addr = sugg_addr; msg_set_prevnode(buf_msg(d->skb), sugg_addr); tn->addr_trial_end = jiffies + msecs_to_jiffies(1000); return true; } /* Apply trial address if we just left trial period */ if (!trial && !self) { schedule_work(&tn->work); msg_set_prevnode(buf_msg(d->skb), tn->trial_addr); msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); } /* Accept regular link requests/responses only after trial period */ if (mtyp != DSC_TRIAL_MSG) return trial; sugg_addr = tipc_node_try_addr(net, peer_id, src); if (sugg_addr) tipc_disc_msg_xmit(net, DSC_TRIAL_FAIL_MSG, src, self, sugg_addr, maddr, b); return true; } /** * tipc_disc_rcv - handle incoming discovery message (request or response) * @net: applicable net namespace * @skb: buffer containing message * @b: bearer that message arrived on */ void tipc_disc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) { struct tipc_net *tn = tipc_net(net); struct tipc_msg *hdr = buf_msg(skb); u32 pnet_hash = msg_peer_net_hash(hdr); u16 caps = msg_node_capabilities(hdr); bool legacy = tn->legacy_addr_format; u32 sugg = msg_sugg_node_addr(hdr); u32 signature = msg_node_sig(hdr); u8 peer_id[NODE_ID_LEN] = {0,}; u32 dst = msg_dest_domain(hdr); u32 net_id = msg_bc_netid(hdr); struct tipc_media_addr maddr; u32 src = msg_prevnode(hdr); u32 mtyp = msg_type(hdr); bool dupl_addr = false; bool respond = false; u32 self; int err; if (skb_linearize(skb)) { kfree_skb(skb); return; } hdr = buf_msg(skb); if (caps & TIPC_NODE_ID128) memcpy(peer_id, msg_node_id(hdr), NODE_ID_LEN); else sprintf(peer_id, "%x", src); err = b->media->msg2addr(b, &maddr, msg_media_addr(hdr)); kfree_skb(skb); if (err || maddr.broadcast) { pr_warn_ratelimited("Rcv corrupt discovery message\n"); return; } /* Ignore discovery messages from own node */ if (!memcmp(&maddr, &b->addr, sizeof(maddr))) return; if (net_id != tn->net_id) return; if (tipc_disc_addr_trial_msg(b->disc, &maddr, b, dst, src, sugg, peer_id, mtyp)) return; self = tipc_own_addr(net); /* Message from somebody using this node's address */ if (in_own_node(net, src)) { disc_dupl_alert(b, self, &maddr); return; } if (!tipc_in_scope(legacy, dst, self)) return; if (!tipc_in_scope(legacy, b->domain, src)) return; tipc_node_check_dest(net, src, peer_id, b, caps, signature, pnet_hash, &maddr, &respond, &dupl_addr); if (dupl_addr) disc_dupl_alert(b, src, &maddr); if (!respond) return; if (mtyp != DSC_REQ_MSG) return; tipc_disc_msg_xmit(net, DSC_RESP_MSG, src, self, 0, &maddr, b); } /* tipc_disc_add_dest - increment set of discovered nodes */ void tipc_disc_add_dest(struct tipc_discoverer *d) { spin_lock_bh(&d->lock); d->num_nodes++; spin_unlock_bh(&d->lock); } /* tipc_disc_remove_dest - decrement set of discovered nodes */ void tipc_disc_remove_dest(struct tipc_discoverer *d) { int intv, num; spin_lock_bh(&d->lock); d->num_nodes--; num = d->num_nodes; intv = d->timer_intv; if (!num && (intv == TIPC_DISC_INACTIVE || intv > TIPC_DISC_FAST)) { d->timer_intv = TIPC_DISC_INIT; mod_timer(&d->timer, jiffies + d->timer_intv); } spin_unlock_bh(&d->lock); } /* tipc_disc_timeout - send a periodic link setup request * Called whenever a link setup request timer associated with a bearer expires. * - Keep doubling time between sent request until limit is reached; * - Hold at fast polling rate if we don't have any associated nodes * - Otherwise hold at slow polling rate */ static void tipc_disc_timeout(struct timer_list *t) { struct tipc_discoverer *d = from_timer(d, t, timer); struct tipc_net *tn = tipc_net(d->net); struct tipc_media_addr maddr; struct sk_buff *skb = NULL; struct net *net = d->net; u32 bearer_id; spin_lock_bh(&d->lock); /* Stop searching if only desired node has been found */ if (tipc_node(d->domain) && d->num_nodes) { d->timer_intv = TIPC_DISC_INACTIVE; goto exit; } /* Did we just leave trial period ? */ if (!time_before(jiffies, tn->addr_trial_end) && !tipc_own_addr(net)) { mod_timer(&d->timer, jiffies + TIPC_DISC_INIT); spin_unlock_bh(&d->lock); schedule_work(&tn->work); return; } /* Adjust timeout interval according to discovery phase */ if (time_before(jiffies, tn->addr_trial_end)) { d->timer_intv = TIPC_DISC_INIT; } else { d->timer_intv *= 2; if (d->num_nodes && d->timer_intv > TIPC_DISC_SLOW) d->timer_intv = TIPC_DISC_SLOW; else if (!d->num_nodes && d->timer_intv > TIPC_DISC_FAST) d->timer_intv = TIPC_DISC_FAST; msg_set_type(buf_msg(d->skb), DSC_REQ_MSG); msg_set_prevnode(buf_msg(d->skb), tn->trial_addr); } mod_timer(&d->timer, jiffies + d->timer_intv); memcpy(&maddr, &d->dest, sizeof(maddr)); skb = skb_clone(d->skb, GFP_ATOMIC); bearer_id = d->bearer_id; exit: spin_unlock_bh(&d->lock); if (skb) tipc_bearer_xmit_skb(net, bearer_id, skb, &maddr); } /** * tipc_disc_create - create object to send periodic link setup requests * @net: the applicable net namespace * @b: ptr to bearer issuing requests * @dest: destination address for request messages * @skb: pointer to created frame * * Return: 0 if successful, otherwise -errno. */ int tipc_disc_create(struct net *net, struct tipc_bearer *b, struct tipc_media_addr *dest, struct sk_buff **skb) { struct tipc_net *tn = tipc_net(net); struct tipc_discoverer *d; d = kmalloc(sizeof(*d), GFP_ATOMIC); if (!d) return -ENOMEM; d->skb = tipc_buf_acquire(MAX_H_SIZE + NODE_ID_LEN, GFP_ATOMIC); if (!d->skb) { kfree(d); return -ENOMEM; } tipc_disc_init_msg(net, d->skb, DSC_REQ_MSG, b); /* Do we need an address trial period first ? */ if (!tipc_own_addr(net)) { tn->addr_trial_end = jiffies + msecs_to_jiffies(1000); msg_set_type(buf_msg(d->skb), DSC_TRIAL_MSG); } memcpy(&d->dest, dest, sizeof(*dest)); d->net = net; d->bearer_id = b->identity; d->domain = b->domain; d->num_nodes = 0; d->timer_intv = TIPC_DISC_INIT; spin_lock_init(&d->lock); timer_setup(&d->timer, tipc_disc_timeout, 0); mod_timer(&d->timer, jiffies + d->timer_intv); b->disc = d; *skb = skb_clone(d->skb, GFP_ATOMIC); return 0; } /** * tipc_disc_delete - destroy object sending periodic link setup requests * @d: ptr to link dest structure */ void tipc_disc_delete(struct tipc_discoverer *d) { timer_shutdown_sync(&d->timer); kfree_skb(d->skb); kfree(d); } /** * tipc_disc_reset - reset object to send periodic link setup requests * @net: the applicable net namespace * @b: ptr to bearer issuing requests */ void tipc_disc_reset(struct net *net, struct tipc_bearer *b) { struct tipc_discoverer *d = b->disc; struct tipc_media_addr maddr; struct sk_buff *skb; spin_lock_bh(&d->lock); tipc_disc_init_msg(net, d->skb, DSC_REQ_MSG, b); d->net = net; d->bearer_id = b->identity; d->domain = b->domain; d->num_nodes = 0; d->timer_intv = TIPC_DISC_INIT; memcpy(&maddr, &d->dest, sizeof(maddr)); mod_timer(&d->timer, jiffies + d->timer_intv); skb = skb_clone(d->skb, GFP_ATOMIC); spin_unlock_bh(&d->lock); if (skb) tipc_bearer_xmit_skb(net, b->identity, skb, &maddr); }
190 1390 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PARAVIRT_H #define _ASM_X86_PARAVIRT_H /* Various instructions on x86 need to be replaced for * para-virtualization: those hooks are defined here. */ #include <asm/paravirt_types.h> #ifndef __ASSEMBLY__ struct mm_struct; #endif #ifdef CONFIG_PARAVIRT #include <asm/pgtable_types.h> #include <asm/asm.h> #include <asm/nospec-branch.h> #ifndef __ASSEMBLY__ #include <linux/bug.h> #include <linux/types.h> #include <linux/cpumask.h> #include <linux/static_call_types.h> #include <asm/frame.h> u64 dummy_steal_clock(int cpu); u64 dummy_sched_clock(void); DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock); DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock); void paravirt_set_sched_clock(u64 (*func)(void)); static __always_inline u64 paravirt_sched_clock(void) { return static_call(pv_sched_clock)(); } struct static_key; extern struct static_key paravirt_steal_enabled; extern struct static_key paravirt_steal_rq_enabled; __visible void __native_queued_spin_unlock(struct qspinlock *lock); bool pv_is_native_spin_unlock(void); __visible bool __native_vcpu_is_preempted(long cpu); bool pv_is_native_vcpu_is_preempted(void); static inline u64 paravirt_steal_clock(int cpu) { return static_call(pv_steal_clock)(cpu); } #ifdef CONFIG_PARAVIRT_SPINLOCKS void __init paravirt_set_cap(void); #endif /* The paravirtualized I/O functions */ static inline void slow_down_io(void) { PVOP_VCALL0(cpu.io_delay); #ifdef REALLY_SLOW_IO PVOP_VCALL0(cpu.io_delay); PVOP_VCALL0(cpu.io_delay); PVOP_VCALL0(cpu.io_delay); #endif } void native_flush_tlb_local(void); void native_flush_tlb_global(void); void native_flush_tlb_one_user(unsigned long addr); void native_flush_tlb_multi(const struct cpumask *cpumask, const struct flush_tlb_info *info); static inline void __flush_tlb_local(void) { PVOP_VCALL0(mmu.flush_tlb_user); } static inline void __flush_tlb_global(void) { PVOP_VCALL0(mmu.flush_tlb_kernel); } static inline void __flush_tlb_one_user(unsigned long addr) { PVOP_VCALL1(mmu.flush_tlb_one_user, addr); } static inline void __flush_tlb_multi(const struct cpumask *cpumask, const struct flush_tlb_info *info) { PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info); } static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) { PVOP_VCALL2(mmu.tlb_remove_table, tlb, table); } static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { PVOP_VCALL1(mmu.exit_mmap, mm); } static inline void notify_page_enc_status_changed(unsigned long pfn, int npages, bool enc) { PVOP_VCALL3(mmu.notify_page_enc_status_changed, pfn, npages, enc); } #ifdef CONFIG_PARAVIRT_XXL static inline void load_sp0(unsigned long sp0) { PVOP_VCALL1(cpu.load_sp0, sp0); } /* The paravirtualized CPUID instruction. */ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { PVOP_VCALL4(cpu.cpuid, eax, ebx, ecx, edx); } /* * These special macros can be used to get or set a debugging register */ static __always_inline unsigned long paravirt_get_debugreg(int reg) { return PVOP_CALL1(unsigned long, cpu.get_debugreg, reg); } #define get_debugreg(var, reg) var = paravirt_get_debugreg(reg) static __always_inline void set_debugreg(unsigned long val, int reg) { PVOP_VCALL2(cpu.set_debugreg, reg, val); } static inline unsigned long read_cr0(void) { return PVOP_CALL0(unsigned long, cpu.read_cr0); } static inline void write_cr0(unsigned long x) { PVOP_VCALL1(cpu.write_cr0, x); } static __always_inline unsigned long read_cr2(void) { return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2, "mov %%cr2, %%rax;", ALT_NOT_XEN); } static __always_inline void write_cr2(unsigned long x) { PVOP_VCALL1(mmu.write_cr2, x); } static inline unsigned long __read_cr3(void) { return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3, "mov %%cr3, %%rax;", ALT_NOT_XEN); } static inline void write_cr3(unsigned long x) { PVOP_ALT_VCALL1(mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN); } static inline void __write_cr4(unsigned long x) { PVOP_VCALL1(cpu.write_cr4, x); } static __always_inline void arch_safe_halt(void) { PVOP_VCALL0(irq.safe_halt); } static inline void halt(void) { PVOP_VCALL0(irq.halt); } static inline u64 paravirt_read_msr(unsigned msr) { return PVOP_CALL1(u64, cpu.read_msr, msr); } static inline void paravirt_write_msr(unsigned msr, unsigned low, unsigned high) { PVOP_VCALL3(cpu.write_msr, msr, low, high); } static inline u64 paravirt_read_msr_safe(unsigned msr, int *err) { return PVOP_CALL2(u64, cpu.read_msr_safe, msr, err); } static inline int paravirt_write_msr_safe(unsigned msr, unsigned low, unsigned high) { return PVOP_CALL3(int, cpu.write_msr_safe, msr, low, high); } #define rdmsr(msr, val1, val2) \ do { \ u64 _l = paravirt_read_msr(msr); \ val1 = (u32)_l; \ val2 = _l >> 32; \ } while (0) #define wrmsr(msr, val1, val2) \ do { \ paravirt_write_msr(msr, val1, val2); \ } while (0) #define rdmsrl(msr, val) \ do { \ val = paravirt_read_msr(msr); \ } while (0) static inline void wrmsrl(unsigned msr, u64 val) { wrmsr(msr, (u32)val, (u32)(val>>32)); } #define wrmsr_safe(msr, a, b) paravirt_write_msr_safe(msr, a, b) /* rdmsr with exception handling */ #define rdmsr_safe(msr, a, b) \ ({ \ int _err; \ u64 _l = paravirt_read_msr_safe(msr, &_err); \ (*a) = (u32)_l; \ (*b) = _l >> 32; \ _err; \ }) static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) { int err; *p = paravirt_read_msr_safe(msr, &err); return err; } static inline unsigned long long paravirt_read_pmc(int counter) { return PVOP_CALL1(u64, cpu.read_pmc, counter); } #define rdpmc(counter, low, high) \ do { \ u64 _l = paravirt_read_pmc(counter); \ low = (u32)_l; \ high = _l >> 32; \ } while (0) #define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(cpu.alloc_ldt, ldt, entries); } static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(cpu.free_ldt, ldt, entries); } static inline void load_TR_desc(void) { PVOP_VCALL0(cpu.load_tr_desc); } static inline void load_gdt(const struct desc_ptr *dtr) { PVOP_VCALL1(cpu.load_gdt, dtr); } static inline void load_idt(const struct desc_ptr *dtr) { PVOP_VCALL1(cpu.load_idt, dtr); } static inline void set_ldt(const void *addr, unsigned entries) { PVOP_VCALL2(cpu.set_ldt, addr, entries); } static inline unsigned long paravirt_store_tr(void) { return PVOP_CALL0(unsigned long, cpu.store_tr); } #define store_tr(tr) ((tr) = paravirt_store_tr()) static inline void load_TLS(struct thread_struct *t, unsigned cpu) { PVOP_VCALL2(cpu.load_tls, t, cpu); } static inline void load_gs_index(unsigned int gs) { PVOP_VCALL1(cpu.load_gs_index, gs); } static inline void write_ldt_entry(struct desc_struct *dt, int entry, const void *desc) { PVOP_VCALL3(cpu.write_ldt_entry, dt, entry, desc); } static inline void write_gdt_entry(struct desc_struct *dt, int entry, void *desc, int type) { PVOP_VCALL4(cpu.write_gdt_entry, dt, entry, desc, type); } static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) { PVOP_VCALL3(cpu.write_idt_entry, dt, entry, g); } #ifdef CONFIG_X86_IOPL_IOPERM static inline void tss_invalidate_io_bitmap(void) { PVOP_VCALL0(cpu.invalidate_io_bitmap); } static inline void tss_update_io_bitmap(void) { PVOP_VCALL0(cpu.update_io_bitmap); } #endif static inline void paravirt_enter_mmap(struct mm_struct *next) { PVOP_VCALL1(mmu.enter_mmap, next); } static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return PVOP_CALL1(int, mmu.pgd_alloc, mm); } static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) { PVOP_VCALL2(mmu.pgd_free, mm, pgd); } static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_pte, mm, pfn); } static inline void paravirt_release_pte(unsigned long pfn) { PVOP_VCALL1(mmu.release_pte, pfn); } static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_pmd, mm, pfn); } static inline void paravirt_release_pmd(unsigned long pfn) { PVOP_VCALL1(mmu.release_pmd, pfn); } static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_pud, mm, pfn); } static inline void paravirt_release_pud(unsigned long pfn) { PVOP_VCALL1(mmu.release_pud, pfn); } static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_p4d, mm, pfn); } static inline void paravirt_release_p4d(unsigned long pfn) { PVOP_VCALL1(mmu.release_p4d, pfn); } static inline pte_t __pte(pteval_t val) { return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val, "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pteval_t pte_val(pte_t pte) { return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline pgd_t __pgd(pgdval_t val) { return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val, "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pgdval_t pgd_val(pgd_t pgd) { return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd, "mov %%rdi, %%rax", ALT_NOT_XEN); } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pteval_t ret; ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, vma, addr, ptep); return (pte_t) { .pte = ret }; } static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { PVOP_VCALL4(mmu.ptep_modify_prot_commit, vma, addr, ptep, pte.pte); } static inline void set_pte(pte_t *ptep, pte_t pte) { PVOP_VCALL2(mmu.set_pte, ptep, pte.pte); } static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { PVOP_VCALL2(mmu.set_pmd, pmdp, native_pmd_val(pmd)); } static inline pmd_t __pmd(pmdval_t val) { return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val, "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pmdval_t pmd_val(pmd_t pmd) { return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void set_pud(pud_t *pudp, pud_t pud) { PVOP_VCALL2(mmu.set_pud, pudp, native_pud_val(pud)); } static inline pud_t __pud(pudval_t val) { pudval_t ret; ret = PVOP_ALT_CALLEE1(pudval_t, mmu.make_pud, val, "mov %%rdi, %%rax", ALT_NOT_XEN); return (pud_t) { ret }; } static inline pudval_t pud_val(pud_t pud) { return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void pud_clear(pud_t *pudp) { set_pud(pudp, native_make_pud(0)); } static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) { p4dval_t val = native_p4d_val(p4d); PVOP_VCALL2(mmu.set_p4d, p4dp, val); } #if CONFIG_PGTABLE_LEVELS >= 5 static inline p4d_t __p4d(p4dval_t val) { p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val, "mov %%rdi, %%rax", ALT_NOT_XEN); return (p4d_t) { ret }; } static inline p4dval_t p4d_val(p4d_t p4d) { return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) { PVOP_VCALL2(mmu.set_pgd, pgdp, native_pgd_val(pgd)); } #define set_pgd(pgdp, pgdval) do { \ if (pgtable_l5_enabled()) \ __set_pgd(pgdp, pgdval); \ else \ set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd }); \ } while (0) #define pgd_clear(pgdp) do { \ if (pgtable_l5_enabled()) \ set_pgd(pgdp, native_make_pgd(0)); \ } while (0) #endif /* CONFIG_PGTABLE_LEVELS == 5 */ static inline void p4d_clear(p4d_t *p4dp) { set_p4d(p4dp, native_make_p4d(0)); } static inline void set_pte_atomic(pte_t *ptep, pte_t pte) { set_pte(ptep, pte); } static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { set_pte(ptep, native_make_pte(0)); } static inline void pmd_clear(pmd_t *pmdp) { set_pmd(pmdp, native_make_pmd(0)); } #define __HAVE_ARCH_START_CONTEXT_SWITCH static inline void arch_start_context_switch(struct task_struct *prev) { PVOP_VCALL1(cpu.start_context_switch, prev); } static inline void arch_end_context_switch(struct task_struct *next) { PVOP_VCALL1(cpu.end_context_switch, next); } #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.enter); } static inline void arch_leave_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.leave); } static inline void arch_flush_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.flush); } static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags) { pv_ops.mmu.set_fixmap(idx, phys, flags); } #endif #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { PVOP_VCALL2(lock.queued_spin_lock_slowpath, lock, val); } static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock) { PVOP_ALT_VCALLEE1(lock.queued_spin_unlock, lock, "movb $0, (%%" _ASM_ARG1 ");", ALT_NOT(X86_FEATURE_PVUNLOCK)); } static __always_inline void pv_wait(u8 *ptr, u8 val) { PVOP_VCALL2(lock.wait, ptr, val); } static __always_inline void pv_kick(int cpu) { PVOP_VCALL1(lock.kick, cpu); } static __always_inline bool pv_vcpu_is_preempted(long cpu) { return PVOP_ALT_CALLEE1(bool, lock.vcpu_is_preempted, cpu, "xor %%" _ASM_AX ", %%" _ASM_AX ";", ALT_NOT(X86_FEATURE_VCPUPREEMPT)); } void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock); bool __raw_callee_save___native_vcpu_is_preempted(long cpu); #endif /* SMP && PARAVIRT_SPINLOCKS */ #ifdef CONFIG_X86_32 /* save and restore all caller-save registers, except return value */ #define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;" #define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;" #else /* save and restore all caller-save registers, except return value */ #define PV_SAVE_ALL_CALLER_REGS \ "push %rcx;" \ "push %rdx;" \ "push %rsi;" \ "push %rdi;" \ "push %r8;" \ "push %r9;" \ "push %r10;" \ "push %r11;" #define PV_RESTORE_ALL_CALLER_REGS \ "pop %r11;" \ "pop %r10;" \ "pop %r9;" \ "pop %r8;" \ "pop %rdi;" \ "pop %rsi;" \ "pop %rdx;" \ "pop %rcx;" #endif /* * Generate a thunk around a function which saves all caller-save * registers except for the return value. This allows C functions to * be called from assembler code where fewer than normal registers are * available. It may also help code generation around calls from C * code if the common case doesn't use many registers. * * When a callee is wrapped in a thunk, the caller can assume that all * arg regs and all scratch registers are preserved across the * call. The return value in rax/eax will not be saved, even for void * functions. */ #define PV_THUNK_NAME(func) "__raw_callee_save_" #func #define __PV_CALLEE_SAVE_REGS_THUNK(func, section) \ extern typeof(func) __raw_callee_save_##func; \ \ asm(".pushsection " section ", \"ax\";" \ ".globl " PV_THUNK_NAME(func) ";" \ ".type " PV_THUNK_NAME(func) ", @function;" \ ASM_FUNC_ALIGN \ PV_THUNK_NAME(func) ":" \ ASM_ENDBR \ FRAME_BEGIN \ PV_SAVE_ALL_CALLER_REGS \ "call " #func ";" \ PV_RESTORE_ALL_CALLER_REGS \ FRAME_END \ ASM_RET \ ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";" \ ".popsection") #define PV_CALLEE_SAVE_REGS_THUNK(func) \ __PV_CALLEE_SAVE_REGS_THUNK(func, ".text") /* Get a reference to a callee-save function */ #define PV_CALLEE_SAVE(func) \ ((struct paravirt_callee_save) { __raw_callee_save_##func }) /* Promise that "func" already uses the right calling convention */ #define __PV_IS_CALLEE_SAVE(func) \ ((struct paravirt_callee_save) { func }) #ifdef CONFIG_PARAVIRT_XXL static __always_inline unsigned long arch_local_save_flags(void) { return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;", ALT_NOT_XEN); } static __always_inline void arch_local_irq_disable(void) { PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT_XEN); } static __always_inline void arch_local_irq_enable(void) { PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT_XEN); } static __always_inline unsigned long arch_local_irq_save(void) { unsigned long f; f = arch_local_save_flags(); arch_local_irq_disable(); return f; } #endif /* Make sure as little as possible of this mess escapes. */ #undef PARAVIRT_CALL #undef __PVOP_CALL #undef __PVOP_VCALL #undef PVOP_VCALL0 #undef PVOP_CALL0 #undef PVOP_VCALL1 #undef PVOP_CALL1 #undef PVOP_VCALL2 #undef PVOP_CALL2 #undef PVOP_VCALL3 #undef PVOP_CALL3 #undef PVOP_VCALL4 #undef PVOP_CALL4 extern void default_banner(void); void native_pv_lock_init(void) __init; #else /* __ASSEMBLY__ */ #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT_XXL #ifdef CONFIG_DEBUG_ENTRY #define PARA_INDIRECT(addr) *addr(%rip) .macro PARA_IRQ_save_fl ANNOTATE_RETPOLINE_SAFE; call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); .endm #define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl;", \ "ALT_CALL_INSTR;", ALT_CALL_ALWAYS, \ "pushf; pop %rax;", ALT_NOT_XEN #endif #endif /* CONFIG_PARAVIRT_XXL */ #endif /* CONFIG_X86_64 */ #endif /* __ASSEMBLY__ */ #else /* CONFIG_PARAVIRT */ # define default_banner x86_init_noop #ifndef __ASSEMBLY__ static inline void native_pv_lock_init(void) { } #endif #endif /* !CONFIG_PARAVIRT */ #ifndef __ASSEMBLY__ #ifndef CONFIG_PARAVIRT_XXL static inline void paravirt_enter_mmap(struct mm_struct *mm) { } #endif #ifndef CONFIG_PARAVIRT static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { } #endif #ifndef CONFIG_PARAVIRT_SPINLOCKS static inline void paravirt_set_cap(void) { } #endif #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PARAVIRT_H */
9 1 8 2 5 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 // SPDX-License-Identifier: GPL-2.0-only /* * xt_HMARK - Netfilter module to set mark by means of hashing * * (C) 2012 by Hans Schillstrom <hans.schillstrom@ericsson.com> * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/icmp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_HMARK.h> #include <net/ip.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) #include <net/ipv6.h> #include <linux/netfilter_ipv6/ip6_tables.h> #endif MODULE_LICENSE("GPL"); MODULE_AUTHOR("Hans Schillstrom <hans.schillstrom@ericsson.com>"); MODULE_DESCRIPTION("Xtables: packet marking using hash calculation"); MODULE_ALIAS("ipt_HMARK"); MODULE_ALIAS("ip6t_HMARK"); struct hmark_tuple { __be32 src; __be32 dst; union hmark_ports uports; u8 proto; }; static inline __be32 hmark_addr6_mask(const __be32 *addr32, const __be32 *mask) { return (addr32[0] & mask[0]) ^ (addr32[1] & mask[1]) ^ (addr32[2] & mask[2]) ^ (addr32[3] & mask[3]); } static inline __be32 hmark_addr_mask(int l3num, const __be32 *addr32, const __be32 *mask) { switch (l3num) { case AF_INET: return *addr32 & *mask; case AF_INET6: return hmark_addr6_mask(addr32, mask); } return 0; } static inline void hmark_swap_ports(union hmark_ports *uports, const struct xt_hmark_info *info) { union hmark_ports hp; u16 src, dst; hp.b32 = (uports->b32 & info->port_mask.b32) | info->port_set.b32; src = ntohs(hp.b16.src); dst = ntohs(hp.b16.dst); if (dst > src) uports->v32 = (dst << 16) | src; else uports->v32 = (src << 16) | dst; } static int hmark_ct_set_htuple(const struct sk_buff *skb, struct hmark_tuple *t, const struct xt_hmark_info *info) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); struct nf_conntrack_tuple *otuple; struct nf_conntrack_tuple *rtuple; if (ct == NULL) return -1; otuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; rtuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; t->src = hmark_addr_mask(otuple->src.l3num, otuple->src.u3.ip6, info->src_mask.ip6); t->dst = hmark_addr_mask(otuple->src.l3num, rtuple->src.u3.ip6, info->dst_mask.ip6); if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3)) return 0; t->proto = nf_ct_protonum(ct); if (t->proto != IPPROTO_ICMP) { t->uports.b16.src = otuple->src.u.all; t->uports.b16.dst = rtuple->src.u.all; hmark_swap_ports(&t->uports, info); } return 0; #else return -1; #endif } /* This hash function is endian independent, to ensure consistent hashing if * the cluster is composed of big and little endian systems. */ static inline u32 hmark_hash(struct hmark_tuple *t, const struct xt_hmark_info *info) { u32 hash; u32 src = ntohl(t->src); u32 dst = ntohl(t->dst); if (dst < src) swap(src, dst); hash = jhash_3words(src, dst, t->uports.v32, info->hashrnd); hash = hash ^ (t->proto & info->proto_mask); return reciprocal_scale(hash, info->hmodulus) + info->hoffset; } static void hmark_set_tuple_ports(const struct sk_buff *skb, unsigned int nhoff, struct hmark_tuple *t, const struct xt_hmark_info *info) { int protoff; protoff = proto_ports_offset(t->proto); if (protoff < 0) return; nhoff += protoff; if (skb_copy_bits(skb, nhoff, &t->uports, sizeof(t->uports)) < 0) return; hmark_swap_ports(&t->uports, info); } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static int get_inner6_hdr(const struct sk_buff *skb, int *offset) { struct icmp6hdr *icmp6h, _ih6; icmp6h = skb_header_pointer(skb, *offset, sizeof(_ih6), &_ih6); if (icmp6h == NULL) return 0; if (icmp6h->icmp6_type && icmp6h->icmp6_type < 128) { *offset += sizeof(struct icmp6hdr); return 1; } return 0; } static int hmark_pkt_set_htuple_ipv6(const struct sk_buff *skb, struct hmark_tuple *t, const struct xt_hmark_info *info) { struct ipv6hdr *ip6, _ip6; int flag = IP6_FH_F_AUTH; unsigned int nhoff = 0; u16 fragoff = 0; int nexthdr; ip6 = (struct ipv6hdr *) (skb->data + skb_network_offset(skb)); nexthdr = ipv6_find_hdr(skb, &nhoff, -1, &fragoff, &flag); if (nexthdr < 0) return 0; /* No need to check for icmp errors on fragments */ if ((flag & IP6_FH_F_FRAG) || (nexthdr != IPPROTO_ICMPV6)) goto noicmp; /* Use inner header in case of ICMP errors */ if (get_inner6_hdr(skb, &nhoff)) { ip6 = skb_header_pointer(skb, nhoff, sizeof(_ip6), &_ip6); if (ip6 == NULL) return -1; /* If AH present, use SPI like in ESP. */ flag = IP6_FH_F_AUTH; nexthdr = ipv6_find_hdr(skb, &nhoff, -1, &fragoff, &flag); if (nexthdr < 0) return -1; } noicmp: t->src = hmark_addr6_mask(ip6->saddr.s6_addr32, info->src_mask.ip6); t->dst = hmark_addr6_mask(ip6->daddr.s6_addr32, info->dst_mask.ip6); if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3)) return 0; t->proto = nexthdr; if (t->proto == IPPROTO_ICMPV6) return 0; if (flag & IP6_FH_F_FRAG) return 0; hmark_set_tuple_ports(skb, nhoff, t, info); return 0; } static unsigned int hmark_tg_v6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_hmark_info *info = par->targinfo; struct hmark_tuple t; memset(&t, 0, sizeof(struct hmark_tuple)); if (info->flags & XT_HMARK_FLAG(XT_HMARK_CT)) { if (hmark_ct_set_htuple(skb, &t, info) < 0) return XT_CONTINUE; } else { if (hmark_pkt_set_htuple_ipv6(skb, &t, info) < 0) return XT_CONTINUE; } skb->mark = hmark_hash(&t, info); return XT_CONTINUE; } #endif static int get_inner_hdr(const struct sk_buff *skb, int iphsz, int *nhoff) { const struct icmphdr *icmph; struct icmphdr _ih; /* Not enough header? */ icmph = skb_header_pointer(skb, *nhoff + iphsz, sizeof(_ih), &_ih); if (icmph == NULL || icmph->type > NR_ICMP_TYPES) return 0; /* Error message? */ if (!icmp_is_err(icmph->type)) return 0; *nhoff += iphsz + sizeof(_ih); return 1; } static int hmark_pkt_set_htuple_ipv4(const struct sk_buff *skb, struct hmark_tuple *t, const struct xt_hmark_info *info) { struct iphdr *ip, _ip; int nhoff = skb_network_offset(skb); ip = (struct iphdr *) (skb->data + nhoff); if (ip->protocol == IPPROTO_ICMP) { /* Use inner header in case of ICMP errors */ if (get_inner_hdr(skb, ip->ihl * 4, &nhoff)) { ip = skb_header_pointer(skb, nhoff, sizeof(_ip), &_ip); if (ip == NULL) return -1; } } t->src = ip->saddr & info->src_mask.ip; t->dst = ip->daddr & info->dst_mask.ip; if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3)) return 0; t->proto = ip->protocol; /* ICMP has no ports, skip */ if (t->proto == IPPROTO_ICMP) return 0; /* follow-up fragments don't contain ports, skip all fragments */ if (ip_is_fragment(ip)) return 0; hmark_set_tuple_ports(skb, (ip->ihl * 4) + nhoff, t, info); return 0; } static unsigned int hmark_tg_v4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_hmark_info *info = par->targinfo; struct hmark_tuple t; memset(&t, 0, sizeof(struct hmark_tuple)); if (info->flags & XT_HMARK_FLAG(XT_HMARK_CT)) { if (hmark_ct_set_htuple(skb, &t, info) < 0) return XT_CONTINUE; } else { if (hmark_pkt_set_htuple_ipv4(skb, &t, info) < 0) return XT_CONTINUE; } skb->mark = hmark_hash(&t, info); return XT_CONTINUE; } static int hmark_tg_check(const struct xt_tgchk_param *par) { const struct xt_hmark_info *info = par->targinfo; const char *errmsg = "proto mask must be zero with L3 mode"; if (!info->hmodulus) return -EINVAL; if (info->proto_mask && (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3))) goto err; if (info->flags & XT_HMARK_FLAG(XT_HMARK_SPI_MASK) && (info->flags & (XT_HMARK_FLAG(XT_HMARK_SPORT_MASK) | XT_HMARK_FLAG(XT_HMARK_DPORT_MASK)))) return -EINVAL; if (info->flags & XT_HMARK_FLAG(XT_HMARK_SPI) && (info->flags & (XT_HMARK_FLAG(XT_HMARK_SPORT) | XT_HMARK_FLAG(XT_HMARK_DPORT)))) { errmsg = "spi-set and port-set can't be combined"; goto err; } return 0; err: pr_info_ratelimited("%s\n", errmsg); return -EINVAL; } static struct xt_target hmark_tg_reg[] __read_mostly = { { .name = "HMARK", .family = NFPROTO_IPV4, .target = hmark_tg_v4, .targetsize = sizeof(struct xt_hmark_info), .checkentry = hmark_tg_check, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "HMARK", .family = NFPROTO_IPV6, .target = hmark_tg_v6, .targetsize = sizeof(struct xt_hmark_info), .checkentry = hmark_tg_check, .me = THIS_MODULE, }, #endif }; static int __init hmark_tg_init(void) { return xt_register_targets(hmark_tg_reg, ARRAY_SIZE(hmark_tg_reg)); } static void __exit hmark_tg_exit(void) { xt_unregister_targets(hmark_tg_reg, ARRAY_SIZE(hmark_tg_reg)); } module_init(hmark_tg_init); module_exit(hmark_tg_exit);
4 11 7 7 7 2 5 7 8 8 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2008-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the list:set type */ #include <linux/module.h> #include <linux/ip.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_list.h> #define IPSET_TYPE_REV_MIN 0 /* 1 Counters support added */ /* 2 Comments support added */ #define IPSET_TYPE_REV_MAX 3 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("list:set", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_list:set"); /* Member elements */ struct set_elem { struct rcu_head rcu; struct list_head list; struct ip_set *set; /* Sigh, in order to cleanup reference */ ip_set_id_t id; } __aligned(__alignof__(u64)); struct set_adt_elem { ip_set_id_t id; ip_set_id_t refid; int before; }; /* Type structure */ struct list_set { u32 size; /* size of set list array */ struct timer_list gc; /* garbage collection */ struct ip_set *set; /* attached to this ip_set */ struct net *net; /* namespace */ struct list_head members; /* the set members */ }; static int list_set_ktest(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) { struct list_set *map = set->data; struct ip_set_ext *mext = &opt->ext; struct set_elem *e; u32 flags = opt->cmdflags; int ret; /* Don't lookup sub-counters at all */ opt->cmdflags &= ~IPSET_FLAG_MATCH_COUNTERS; if (opt->cmdflags & IPSET_FLAG_SKIP_SUBCOUNTER_UPDATE) opt->cmdflags |= IPSET_FLAG_SKIP_COUNTER_UPDATE; list_for_each_entry_rcu(e, &map->members, list) { ret = ip_set_test(e->id, skb, par, opt); if (ret <= 0) continue; if (ip_set_match_extensions(set, ext, mext, flags, e)) return 1; } return 0; } static int list_set_kadd(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) { struct list_set *map = set->data; struct set_elem *e; int ret; list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; ret = ip_set_add(e->id, skb, par, opt); if (ret == 0) return ret; } return 0; } static int list_set_kdel(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, struct ip_set_adt_opt *opt, const struct ip_set_ext *ext) { struct list_set *map = set->data; struct set_elem *e; int ret; list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; ret = ip_set_del(e->id, skb, par, opt); if (ret == 0) return ret; } return 0; } static int list_set_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); int ret = -EINVAL; rcu_read_lock(); switch (adt) { case IPSET_TEST: ret = list_set_ktest(set, skb, par, opt, &ext); break; case IPSET_ADD: ret = list_set_kadd(set, skb, par, opt, &ext); break; case IPSET_DEL: ret = list_set_kdel(set, skb, par, opt, &ext); break; default: break; } rcu_read_unlock(); return ret; } /* Userspace interfaces: we are protected by the nfnl mutex */ static void __list_set_del_rcu(struct rcu_head * rcu) { struct set_elem *e = container_of(rcu, struct set_elem, rcu); struct ip_set *set = e->set; ip_set_ext_destroy(set, e); kfree(e); } static void list_set_del(struct ip_set *set, struct set_elem *e) { struct list_set *map = set->data; set->elements--; list_del_rcu(&e->list); ip_set_put_byindex(map->net, e->id); call_rcu(&e->rcu, __list_set_del_rcu); } static void list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old) { struct list_set *map = set->data; list_replace_rcu(&old->list, &e->list); ip_set_put_byindex(map->net, old->id); call_rcu(&old->rcu, __list_set_del_rcu); } static void set_cleanup_entries(struct ip_set *set) { struct list_set *map = set->data; struct set_elem *e, *n; list_for_each_entry_safe(e, n, &map->members, list) if (ip_set_timeout_expired(ext_timeout(e, set))) list_set_del(set, e); } static int list_set_utest(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct list_set *map = set->data; struct set_adt_elem *d = value; struct set_elem *e, *next, *prev = NULL; int ret = 0; rcu_read_lock(); list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; else if (e->id != d->id) { prev = e; continue; } if (d->before == 0) { ret = 1; goto out; } else if (d->before > 0) { next = list_next_entry(e, list); ret = !list_is_last(&e->list, &map->members) && next->id == d->refid; } else { ret = prev && prev->id == d->refid; } goto out; } out: rcu_read_unlock(); return ret; } static void list_set_init_extensions(struct ip_set *set, const struct ip_set_ext *ext, struct set_elem *e) { if (SET_WITH_COUNTER(set)) ip_set_init_counter(ext_counter(e, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(set, ext_comment(e, set), ext); if (SET_WITH_SKBINFO(set)) ip_set_init_skbinfo(ext_skbinfo(e, set), ext); /* Update timeout last */ if (SET_WITH_TIMEOUT(set)) ip_set_timeout_set(ext_timeout(e, set), ext->timeout); } static int list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct list_set *map = set->data; struct set_adt_elem *d = value; struct set_elem *e, *n, *prev, *next; bool flag_exist = flags & IPSET_FLAG_EXIST; /* Find where to add the new entry */ n = prev = next = NULL; list_for_each_entry_rcu(e, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; else if (d->id == e->id) n = e; else if (d->before == 0 || e->id != d->refid) continue; else if (d->before > 0) next = e; else prev = e; } /* If before/after is used on an empty set */ if ((d->before > 0 && !next) || (d->before < 0 && !prev)) return -IPSET_ERR_REF_EXIST; /* Re-add already existing element */ if (n) { if (!flag_exist) return -IPSET_ERR_EXIST; /* Update extensions */ ip_set_ext_destroy(set, n); list_set_init_extensions(set, ext, n); /* Set is already added to the list */ ip_set_put_byindex(map->net, d->id); return 0; } /* Add new entry */ if (d->before == 0) { /* Append */ n = list_empty(&map->members) ? NULL : list_last_entry(&map->members, struct set_elem, list); } else if (d->before > 0) { /* Insert after next element */ if (!list_is_last(&next->list, &map->members)) n = list_next_entry(next, list); } else { /* Insert before prev element */ if (prev->list.prev != &map->members) n = list_prev_entry(prev, list); } /* Can we replace a timed out entry? */ if (n && !(SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(n, set)))) n = NULL; e = kzalloc(set->dsize, GFP_ATOMIC); if (!e) return -ENOMEM; e->id = d->id; e->set = set; INIT_LIST_HEAD(&e->list); list_set_init_extensions(set, ext, e); if (n) list_set_replace(set, e, n); else if (next) list_add_tail_rcu(&e->list, &next->list); else if (prev) list_add_rcu(&e->list, &prev->list); else list_add_tail_rcu(&e->list, &map->members); set->elements++; return 0; } static int list_set_udel(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct list_set *map = set->data; struct set_adt_elem *d = value; struct set_elem *e, *n, *next, *prev = NULL; list_for_each_entry_safe(e, n, &map->members, list) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set))) continue; else if (e->id != d->id) { prev = e; continue; } if (d->before > 0) { next = list_next_entry(e, list); if (list_is_last(&e->list, &map->members) || next->id != d->refid) return -IPSET_ERR_REF_EXIST; } else if (d->before < 0) { if (!prev || prev->id != d->refid) return -IPSET_ERR_REF_EXIST; } list_set_del(set, e); return 0; } return d->before != 0 ? -IPSET_ERR_REF_EXIST : -IPSET_ERR_EXIST; } static int list_set_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct list_set *map = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct set_adt_elem e = { .refid = IPSET_INVALID_ID }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); struct ip_set *s; int ret = 0; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_NAME] || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; e.id = ip_set_get_byname(map->net, nla_data(tb[IPSET_ATTR_NAME]), &s); if (e.id == IPSET_INVALID_ID) return -IPSET_ERR_NAME; /* "Loop detection" */ if (s->type->features & IPSET_TYPE_NAME) { ret = -IPSET_ERR_LOOP; goto finish; } if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 f = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); e.before = f & IPSET_FLAG_BEFORE; } if (e.before && !tb[IPSET_ATTR_NAMEREF]) { ret = -IPSET_ERR_BEFORE; goto finish; } if (tb[IPSET_ATTR_NAMEREF]) { e.refid = ip_set_get_byname(map->net, nla_data(tb[IPSET_ATTR_NAMEREF]), &s); if (e.refid == IPSET_INVALID_ID) { ret = -IPSET_ERR_NAMEREF; goto finish; } if (!e.before) e.before = -1; } if (adt != IPSET_TEST && SET_WITH_TIMEOUT(set)) set_cleanup_entries(set); ret = adtfn(set, &e, &ext, &ext, flags); finish: if (e.refid != IPSET_INVALID_ID) ip_set_put_byindex(map->net, e.refid); if (adt != IPSET_ADD || ret) ip_set_put_byindex(map->net, e.id); return ip_set_eexist(ret, flags) ? 0 : ret; } static void list_set_flush(struct ip_set *set) { struct list_set *map = set->data; struct set_elem *e, *n; list_for_each_entry_safe(e, n, &map->members, list) list_set_del(set, e); set->elements = 0; set->ext_size = 0; } static void list_set_destroy(struct ip_set *set) { struct list_set *map = set->data; WARN_ON_ONCE(!list_empty(&map->members)); kfree(map); set->data = NULL; } /* Calculate the actual memory size of the set data */ static size_t list_set_memsize(const struct list_set *map, size_t dsize) { struct set_elem *e; u32 n = 0; rcu_read_lock(); list_for_each_entry_rcu(e, &map->members, list) n++; rcu_read_unlock(); return (sizeof(*map) + n * dsize); } static int list_set_head(struct ip_set *set, struct sk_buff *skb) { const struct list_set *map = set->data; struct nlattr *nested; size_t memsize = list_set_memsize(map, set->dsize) + set->ext_size; nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) || nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; nla_nest_end(skb, nested); return 0; nla_put_failure: return -EMSGSIZE; } static int list_set_list(const struct ip_set *set, struct sk_buff *skb, struct netlink_callback *cb) { const struct list_set *map = set->data; struct nlattr *atd, *nested; u32 i = 0, first = cb->args[IPSET_CB_ARG0]; char name[IPSET_MAXNAMELEN]; struct set_elem *e; int ret = 0; atd = nla_nest_start(skb, IPSET_ATTR_ADT); if (!atd) return -EMSGSIZE; rcu_read_lock(); list_for_each_entry_rcu(e, &map->members, list) { if (i < first || (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(e, set)))) { i++; continue; } nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; ip_set_name_byindex(map->net, e->id, name); if (nla_put_string(skb, IPSET_ATTR_NAME, name)) goto nla_put_failure; if (ip_set_put_extensions(skb, set, e, true)) goto nla_put_failure; nla_nest_end(skb, nested); i++; } nla_nest_end(skb, atd); /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; goto out; nla_put_failure: nla_nest_cancel(skb, nested); if (unlikely(i == first)) { nla_nest_cancel(skb, atd); cb->args[IPSET_CB_ARG0] = 0; ret = -EMSGSIZE; } else { cb->args[IPSET_CB_ARG0] = i; nla_nest_end(skb, atd); } out: rcu_read_unlock(); return ret; } static bool list_set_same_set(const struct ip_set *a, const struct ip_set *b) { const struct list_set *x = a->data; const struct list_set *y = b->data; return x->size == y->size && a->timeout == b->timeout && a->extensions == b->extensions; } static void list_set_cancel_gc(struct ip_set *set) { struct list_set *map = set->data; if (SET_WITH_TIMEOUT(set)) timer_shutdown_sync(&map->gc); /* Flush list to drop references to other ipsets */ list_set_flush(set); } static const struct ip_set_type_variant set_variant = { .kadt = list_set_kadt, .uadt = list_set_uadt, .adt = { [IPSET_ADD] = list_set_uadd, [IPSET_DEL] = list_set_udel, [IPSET_TEST] = list_set_utest, }, .destroy = list_set_destroy, .flush = list_set_flush, .head = list_set_head, .list = list_set_list, .same_set = list_set_same_set, .cancel_gc = list_set_cancel_gc, }; static void list_set_gc(struct timer_list *t) { struct list_set *map = from_timer(map, t, gc); struct ip_set *set = map->set; spin_lock_bh(&set->lock); set_cleanup_entries(set); spin_unlock_bh(&set->lock); map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); } static void list_set_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t)) { struct list_set *map = set->data; timer_setup(&map->gc, gc, 0); mod_timer(&map->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ); } /* Create list:set type of sets */ static bool init_list_set(struct net *net, struct ip_set *set, u32 size) { struct list_set *map; map = kzalloc(sizeof(*map), GFP_KERNEL); if (!map) return false; map->size = size; map->net = net; map->set = set; INIT_LIST_HEAD(&map->members); set->data = map; return true; } static struct lock_class_key list_set_lockdep_key; static int list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) { u32 size = IP_SET_LIST_DEFAULT_SIZE; if (unlikely(!ip_set_optattr_netorder(tb, IPSET_ATTR_SIZE) || !ip_set_optattr_netorder(tb, IPSET_ATTR_TIMEOUT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (tb[IPSET_ATTR_SIZE]) size = ip_set_get_h32(tb[IPSET_ATTR_SIZE]); if (size < IP_SET_LIST_MIN_SIZE) size = IP_SET_LIST_MIN_SIZE; lockdep_set_class(&set->lock, &list_set_lockdep_key); set->variant = &set_variant; set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem), __alignof__(struct set_elem)); if (!init_list_set(net, set, size)) return -ENOMEM; if (tb[IPSET_ATTR_TIMEOUT]) { set->timeout = ip_set_timeout_uget(tb[IPSET_ATTR_TIMEOUT]); list_set_gc_init(set, list_set_gc); } return 0; } static struct ip_set_type list_set_type __read_mostly = { .name = "list:set", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_NAME | IPSET_DUMP_LAST, .dimension = IPSET_DIM_ONE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create = list_set_create, .create_policy = { [IPSET_ATTR_SIZE] = { .type = NLA_U32 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_NAME] = { .type = NLA_STRING, .len = IPSET_MAXNAMELEN }, [IPSET_ATTR_NAMEREF] = { .type = NLA_STRING, .len = IPSET_MAXNAMELEN }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init list_set_init(void) { return ip_set_type_register(&list_set_type); } static void __exit list_set_fini(void) { rcu_barrier(); ip_set_type_unregister(&list_set_type); } module_init(list_set_init); module_exit(list_set_fini);
182 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 // SPDX-License-Identifier: GPL-2.0-or-later /* * NET Generic infrastructure for Network protocols. * * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * From code originally in include/net/tcp.h */ #include <linux/module.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/tcp.h> #include <linux/vmalloc.h> #include <net/request_sock.h> /* * Maximum number of SYN_RECV sockets in queue per LISTEN socket. * One SYN_RECV socket costs about 80bytes on a 32bit machine. * It would be better to replace it with a global counter for all sockets * but then some measure against one socket starving all other sockets * would be needed. * * The minimum value of it is 128. Experiments with real servers show that * it is absolutely not enough even at 100conn/sec. 256 cures most * of problems. * This value is adjusted to 128 for low memory machines, * and it will increase in proportion to the memory of machine. * Note : Dont forget somaxconn that may limit backlog too. */ void reqsk_queue_alloc(struct request_sock_queue *queue) { queue->fastopenq.rskq_rst_head = NULL; queue->fastopenq.rskq_rst_tail = NULL; queue->fastopenq.qlen = 0; queue->rskq_accept_head = NULL; } /* * This function is called to set a Fast Open socket's "fastopen_rsk" field * to NULL when a TFO socket no longer needs to access the request_sock. * This happens only after 3WHS has been either completed or aborted (e.g., * RST is received). * * Before TFO, a child socket is created only after 3WHS is completed, * hence it never needs to access the request_sock. things get a lot more * complex with TFO. A child socket, accepted or not, has to access its * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts, * until 3WHS is either completed or aborted. Afterwards the req will stay * until either the child socket is accepted, or in the rare case when the * listener is closed before the child is accepted. * * In short, a request socket is only freed after BOTH 3WHS has completed * (or aborted) and the child socket has been accepted (or listener closed). * When a child socket is accepted, its corresponding req->sk is set to * NULL since it's no longer needed. More importantly, "req->sk == NULL" * will be used by the code below to determine if a child socket has been * accepted or not, and the check is protected by the fastopenq->lock * described below. * * Note that fastopen_rsk is only accessed from the child socket's context * with its socket lock held. But a request_sock (req) can be accessed by * both its child socket through fastopen_rsk, and a listener socket through * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created. * only in the rare case when both the listener and the child locks are held, * e.g., in inet_csk_listen_stop() do we not need to acquire the lock. * The lock also protects other fields such as fastopenq->qlen, which is * decremented by this function when fastopen_rsk is no longer needed. * * Note that another solution was to simply use the existing socket lock * from the listener. But first socket lock is difficult to use. It is not * a simple spin lock - one must consider sock_owned_by_user() and arrange * to use sk_add_backlog() stuff. But what really makes it infeasible is the * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to * acquire a child's lock while holding listener's socket lock. A corner * case might also exist in tcp_v4_hnd_req() that will trigger this locking * order. * * This function also sets "treq->tfo_listener" to false. * treq->tfo_listener is used by the listener so it is protected by the * fastopenq->lock in this function. */ void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, bool reset) { struct sock *lsk = req->rsk_listener; struct fastopen_queue *fastopenq; fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL); spin_lock_bh(&fastopenq->lock); fastopenq->qlen--; tcp_rsk(req)->tfo_listener = false; if (req->sk) /* the child socket hasn't been accepted yet */ goto out; if (!reset || lsk->sk_state != TCP_LISTEN) { /* If the listener has been closed don't bother with the * special RST handling below. */ spin_unlock_bh(&fastopenq->lock); reqsk_put(req); return; } /* Wait for 60secs before removing a req that has triggered RST. * This is a simple defense against TFO spoofing attack - by * counting the req against fastopen.max_qlen, and disabling * TFO when the qlen exceeds max_qlen. * * For more details see CoNext'11 "TCP Fast Open" paper. */ req->rsk_timer.expires = jiffies + 60*HZ; if (fastopenq->rskq_rst_head == NULL) fastopenq->rskq_rst_head = req; else fastopenq->rskq_rst_tail->dl_next = req; req->dl_next = NULL; fastopenq->rskq_rst_tail = req; fastopenq->qlen++; out: spin_unlock_bh(&fastopenq->lock); }
100 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FS_STRUCT_H #define _LINUX_FS_STRUCT_H #include <linux/path.h> #include <linux/spinlock.h> #include <linux/seqlock.h> struct fs_struct { int users; spinlock_t lock; seqcount_spinlock_t seq; int umask; int in_exec; struct path root, pwd; } __randomize_layout; extern struct kmem_cache *fs_cachep; extern void exit_fs(struct task_struct *); extern void set_fs_root(struct fs_struct *, const struct path *); extern void set_fs_pwd(struct fs_struct *, const struct path *); extern struct fs_struct *copy_fs_struct(struct fs_struct *); extern void free_fs_struct(struct fs_struct *); extern int unshare_fs_struct(void); static inline void get_fs_root(struct fs_struct *fs, struct path *root) { spin_lock(&fs->lock); *root = fs->root; path_get(root); spin_unlock(&fs->lock); } static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd) { spin_lock(&fs->lock); *pwd = fs->pwd; path_get(pwd); spin_unlock(&fs->lock); } extern bool current_chrooted(void); #endif /* _LINUX_FS_STRUCT_H */
87 87 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 // SPDX-License-Identifier: GPL-2.0 #include <linux/err.h> #include <linux/mm.h> #include <asm/current.h> #include <asm/traps.h> #include <asm/vdso.h> struct vdso_exception_table_entry { int insn, fixup; }; bool fixup_vdso_exception(struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr) { const struct vdso_image *image = current->mm->context.vdso_image; const struct vdso_exception_table_entry *extable; unsigned int nr_entries, i; unsigned long base; /* * Do not attempt to fixup #DB or #BP. It's impossible to identify * whether or not a #DB/#BP originated from within an SGX enclave and * SGX enclaves are currently the only use case for vDSO fixup. */ if (trapnr == X86_TRAP_DB || trapnr == X86_TRAP_BP) return false; if (!current->mm->context.vdso) return false; base = (unsigned long)current->mm->context.vdso + image->extable_base; nr_entries = image->extable_len / (sizeof(*extable)); extable = image->extable; for (i = 0; i < nr_entries; i++) { if (regs->ip == base + extable[i].insn) { regs->ip = base + extable[i].fixup; regs->di = trapnr; regs->si = error_code; regs->dx = fault_addr; return true; } } return false; }
8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 // SPDX-License-Identifier: GPL-2.0-only /* ipv6header match - matches IPv6 packets based on whether they contain certain headers */ /* Original idea: Brad Chapman * Rewritten by: Andras Kis-Szabo <kisza@sch.bme.hu> */ /* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <linux/types.h> #include <net/checksum.h> #include <net/ipv6.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_ipv6/ip6t_ipv6header.h> MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: IPv6 header types match"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); static bool ipv6header_mt6(const struct sk_buff *skb, struct xt_action_param *par) { const struct ip6t_ipv6header_info *info = par->matchinfo; unsigned int temp; int len; u8 nexthdr; unsigned int ptr; /* Make sure this isn't an evil packet */ /* type of the 1st exthdr */ nexthdr = ipv6_hdr(skb)->nexthdr; /* pointer to the 1st exthdr */ ptr = sizeof(struct ipv6hdr); /* available length */ len = skb->len - ptr; temp = 0; while (nf_ip6_ext_hdr(nexthdr)) { const struct ipv6_opt_hdr *hp; struct ipv6_opt_hdr _hdr; int hdrlen; /* No more exthdr -> evaluate */ if (nexthdr == NEXTHDR_NONE) { temp |= MASK_NONE; break; } /* Is there enough space for the next ext header? */ if (len < (int)sizeof(struct ipv6_opt_hdr)) return false; /* ESP -> evaluate */ if (nexthdr == NEXTHDR_ESP) { temp |= MASK_ESP; break; } hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); if (!hp) { par->hotdrop = true; return false; } /* Calculate the header length */ if (nexthdr == NEXTHDR_FRAGMENT) hdrlen = 8; else if (nexthdr == NEXTHDR_AUTH) hdrlen = ipv6_authlen(hp); else hdrlen = ipv6_optlen(hp); /* set the flag */ switch (nexthdr) { case NEXTHDR_HOP: temp |= MASK_HOPOPTS; break; case NEXTHDR_ROUTING: temp |= MASK_ROUTING; break; case NEXTHDR_FRAGMENT: temp |= MASK_FRAGMENT; break; case NEXTHDR_AUTH: temp |= MASK_AH; break; case NEXTHDR_DEST: temp |= MASK_DSTOPTS; break; default: return false; } nexthdr = hp->nexthdr; len -= hdrlen; ptr += hdrlen; if (ptr > skb->len) break; } if (nexthdr != NEXTHDR_NONE && nexthdr != NEXTHDR_ESP) temp |= MASK_PROTO; if (info->modeflag) return !((temp ^ info->matchflags ^ info->invflags) & info->matchflags); else { if (info->invflags) return temp != info->matchflags; else return temp == info->matchflags; } } static int ipv6header_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_ipv6header_info *info = par->matchinfo; /* invflags is 0 or 0xff in hard mode */ if ((!info->modeflag) && info->invflags != 0x00 && info->invflags != 0xFF) return -EINVAL; return 0; } static struct xt_match ipv6header_mt6_reg __read_mostly = { .name = "ipv6header", .family = NFPROTO_IPV6, .match = ipv6header_mt6, .matchsize = sizeof(struct ip6t_ipv6header_info), .checkentry = ipv6header_mt6_check, .destroy = NULL, .me = THIS_MODULE, }; static int __init ipv6header_mt6_init(void) { return xt_register_match(&ipv6header_mt6_reg); } static void __exit ipv6header_mt6_exit(void) { xt_unregister_match(&ipv6header_mt6_reg); } module_init(ipv6header_mt6_init); module_exit(ipv6header_mt6_exit);
7 1 1 1 2 1 1 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/capability.h> #include <linux/if.h> #include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/tcp.h> #include <net/ip.h> #include <net/tcp.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_log.h> #include <linux/netfilter/nfnetlink_osf.h> /* * Indexed by dont-fragment bit. * It is the only constant value in the fingerprint. */ struct list_head nf_osf_fingers[2]; EXPORT_SYMBOL_GPL(nf_osf_fingers); static inline int nf_osf_ttl(const struct sk_buff *skb, int ttl_check, unsigned char f_ttl) { struct in_device *in_dev = __in_dev_get_rcu(skb->dev); const struct iphdr *ip = ip_hdr(skb); const struct in_ifaddr *ifa; int ret = 0; if (ttl_check == NF_OSF_TTL_TRUE) return ip->ttl == f_ttl; if (ttl_check == NF_OSF_TTL_NOCHECK) return 1; else if (ip->ttl <= f_ttl) return 1; in_dev_for_each_ifa_rcu(ifa, in_dev) { if (inet_ifa_match(ip->saddr, ifa)) { ret = (ip->ttl == f_ttl); break; } } return ret; } struct nf_osf_hdr_ctx { bool df; u16 window; u16 totlen; const unsigned char *optp; unsigned int optsize; }; static bool nf_osf_match_one(const struct sk_buff *skb, const struct nf_osf_user_finger *f, int ttl_check, struct nf_osf_hdr_ctx *ctx) { const __u8 *optpinit = ctx->optp; unsigned int check_WSS = 0; int fmatch = FMATCH_WRONG; int foptsize, optnum; u16 mss = 0; if (ctx->totlen != f->ss || !nf_osf_ttl(skb, ttl_check, f->ttl)) return false; /* * Should not happen if userspace parser was written correctly. */ if (f->wss.wc >= OSF_WSS_MAX) return false; /* Check options */ foptsize = 0; for (optnum = 0; optnum < f->opt_num; ++optnum) foptsize += f->opt[optnum].length; if (foptsize > MAX_IPOPTLEN || ctx->optsize > MAX_IPOPTLEN || ctx->optsize != foptsize) return false; check_WSS = f->wss.wc; for (optnum = 0; optnum < f->opt_num; ++optnum) { if (f->opt[optnum].kind == *ctx->optp) { __u32 len = f->opt[optnum].length; const __u8 *optend = ctx->optp + len; fmatch = FMATCH_OK; switch (*ctx->optp) { case OSFOPT_MSS: mss = ctx->optp[3]; mss <<= 8; mss |= ctx->optp[2]; mss = ntohs((__force __be16)mss); break; case OSFOPT_TS: break; } ctx->optp = optend; } else fmatch = FMATCH_OPT_WRONG; if (fmatch != FMATCH_OK) break; } if (fmatch != FMATCH_OPT_WRONG) { fmatch = FMATCH_WRONG; switch (check_WSS) { case OSF_WSS_PLAIN: if (f->wss.val == 0 || ctx->window == f->wss.val) fmatch = FMATCH_OK; break; case OSF_WSS_MSS: /* * Some smart modems decrease mangle MSS to * SMART_MSS_2, so we check standard, decreased * and the one provided in the fingerprint MSS * values. */ #define SMART_MSS_1 1460 #define SMART_MSS_2 1448 if (ctx->window == f->wss.val * mss || ctx->window == f->wss.val * SMART_MSS_1 || ctx->window == f->wss.val * SMART_MSS_2) fmatch = FMATCH_OK; break; case OSF_WSS_MTU: if (ctx->window == f->wss.val * (mss + 40) || ctx->window == f->wss.val * (SMART_MSS_1 + 40) || ctx->window == f->wss.val * (SMART_MSS_2 + 40)) fmatch = FMATCH_OK; break; case OSF_WSS_MODULO: if ((ctx->window % f->wss.val) == 0) fmatch = FMATCH_OK; break; } } if (fmatch != FMATCH_OK) ctx->optp = optpinit; return fmatch == FMATCH_OK; } static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx, const struct sk_buff *skb, const struct iphdr *ip, unsigned char *opts, struct tcphdr *_tcph) { const struct tcphdr *tcp; tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), _tcph); if (!tcp) return NULL; if (!tcp->syn) return NULL; ctx->totlen = ntohs(ip->tot_len); ctx->df = ntohs(ip->frag_off) & IP_DF; ctx->window = ntohs(tcp->window); if (tcp->doff * 4 > sizeof(struct tcphdr)) { ctx->optsize = tcp->doff * 4 - sizeof(struct tcphdr); ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) + sizeof(struct tcphdr), ctx->optsize, opts); if (!ctx->optp) return NULL; } return tcp; } bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, int hooknum, struct net_device *in, struct net_device *out, const struct nf_osf_info *info, struct net *net, const struct list_head *nf_osf_fingers) { const struct iphdr *ip = ip_hdr(skb); const struct nf_osf_user_finger *f; unsigned char opts[MAX_IPOPTLEN]; const struct nf_osf_finger *kf; int fcount = 0, ttl_check; int fmatch = FMATCH_WRONG; struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; struct tcphdr _tcph; memset(&ctx, 0, sizeof(ctx)); tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph); if (!tcp) return false; ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : 0; list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { f = &kf->finger; if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre)) continue; if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) continue; fmatch = FMATCH_OK; fcount++; if (info->flags & NF_OSF_LOG) nf_log_packet(net, family, hooknum, skb, in, out, NULL, "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", f->genre, f->version, f->subtype, &ip->saddr, ntohs(tcp->source), &ip->daddr, ntohs(tcp->dest), f->ttl - ip->ttl); if ((info->flags & NF_OSF_LOG) && info->loglevel == NF_OSF_LOGLEVEL_FIRST) break; } if (!fcount && (info->flags & NF_OSF_LOG)) nf_log_packet(net, family, hooknum, skb, in, out, NULL, "Remote OS is not known: %pI4:%u -> %pI4:%u\n", &ip->saddr, ntohs(tcp->source), &ip->daddr, ntohs(tcp->dest)); if (fcount) fmatch = FMATCH_OK; return fmatch == FMATCH_OK; } EXPORT_SYMBOL_GPL(nf_osf_match); bool nf_osf_find(const struct sk_buff *skb, const struct list_head *nf_osf_fingers, const int ttl_check, struct nf_osf_data *data) { const struct iphdr *ip = ip_hdr(skb); const struct nf_osf_user_finger *f; unsigned char opts[MAX_IPOPTLEN]; const struct nf_osf_finger *kf; struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; struct tcphdr _tcph; bool found = false; memset(&ctx, 0, sizeof(ctx)); tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts, &_tcph); if (!tcp) return false; list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { f = &kf->finger; if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) continue; data->genre = f->genre; data->version = f->version; found = true; break; } return found; } EXPORT_SYMBOL_GPL(nf_osf_find); static const struct nla_policy nfnl_osf_policy[OSF_ATTR_MAX + 1] = { [OSF_ATTR_FINGER] = { .len = sizeof(struct nf_osf_user_finger) }, }; static int nfnl_osf_add_callback(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const osf_attrs[]) { struct nf_osf_user_finger *f; struct nf_osf_finger *kf = NULL, *sf; int err = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) return -EINVAL; f = nla_data(osf_attrs[OSF_ATTR_FINGER]); if (f->opt_num > ARRAY_SIZE(f->opt)) return -EINVAL; if (!memchr(f->genre, 0, MAXGENRELEN) || !memchr(f->subtype, 0, MAXGENRELEN) || !memchr(f->version, 0, MAXGENRELEN)) return -EINVAL; kf = kmalloc(sizeof(struct nf_osf_finger), GFP_KERNEL); if (!kf) return -ENOMEM; memcpy(&kf->finger, f, sizeof(struct nf_osf_user_finger)); list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) { if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger))) continue; kfree(kf); kf = NULL; if (info->nlh->nlmsg_flags & NLM_F_EXCL) err = -EEXIST; break; } /* * We are protected by nfnl mutex. */ if (kf) list_add_tail_rcu(&kf->finger_entry, &nf_osf_fingers[!!f->df]); return err; } static int nfnl_osf_remove_callback(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const osf_attrs[]) { struct nf_osf_user_finger *f; struct nf_osf_finger *sf; int err = -ENOENT; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; f = nla_data(osf_attrs[OSF_ATTR_FINGER]); list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) { if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger))) continue; /* * We are protected by nfnl mutex. */ list_del_rcu(&sf->finger_entry); kfree_rcu(sf, rcu_head); err = 0; break; } return err; } static const struct nfnl_callback nfnl_osf_callbacks[OSF_MSG_MAX] = { [OSF_MSG_ADD] = { .call = nfnl_osf_add_callback, .type = NFNL_CB_MUTEX, .attr_count = OSF_ATTR_MAX, .policy = nfnl_osf_policy, }, [OSF_MSG_REMOVE] = { .call = nfnl_osf_remove_callback, .type = NFNL_CB_MUTEX, .attr_count = OSF_ATTR_MAX, .policy = nfnl_osf_policy, }, }; static const struct nfnetlink_subsystem nfnl_osf_subsys = { .name = "osf", .subsys_id = NFNL_SUBSYS_OSF, .cb_count = OSF_MSG_MAX, .cb = nfnl_osf_callbacks, }; static int __init nfnl_osf_init(void) { int err = -EINVAL; int i; for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i) INIT_LIST_HEAD(&nf_osf_fingers[i]); err = nfnetlink_subsys_register(&nfnl_osf_subsys); if (err < 0) { pr_err("Failed to register OSF nsfnetlink helper (%d)\n", err); goto err_out_exit; } return 0; err_out_exit: return err; } static void __exit nfnl_osf_fini(void) { struct nf_osf_finger *f; int i; nfnetlink_subsys_unregister(&nfnl_osf_subsys); rcu_read_lock(); for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i) { list_for_each_entry_rcu(f, &nf_osf_fingers[i], finger_entry) { list_del_rcu(&f->finger_entry); kfree_rcu(f, rcu_head); } } rcu_read_unlock(); rcu_barrier(); } module_init(nfnl_osf_init); module_exit(nfnl_osf_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Passive OS fingerprint matching"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
86 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 /* SPDX-License-Identifier: GPL-2.0 */ /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner */ #ifndef _NET_BATMAN_ADV_SOFT_INTERFACE_H_ #define _NET_BATMAN_ADV_SOFT_INTERFACE_H_ #include "main.h" #include <linux/kref.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/types.h> #include <net/rtnetlink.h> int batadv_skb_head_push(struct sk_buff *skb, unsigned int len); void batadv_interface_rx(struct net_device *soft_iface, struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node); bool batadv_softif_is_valid(const struct net_device *net_dev); extern struct rtnl_link_ops batadv_link_ops; int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid); void batadv_softif_vlan_release(struct kref *ref); struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv, unsigned short vid); /** * batadv_softif_vlan_put() - decrease the vlan object refcounter and * possibly release it * @vlan: the vlan object to release */ static inline void batadv_softif_vlan_put(struct batadv_softif_vlan *vlan) { if (!vlan) return; kref_put(&vlan->refcount, batadv_softif_vlan_release); } #endif /* _NET_BATMAN_ADV_SOFT_INTERFACE_H_ */
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2020 Facebook */ #include <linux/fs.h> #include <linux/anon_inodes.h> #include <linux/filter.h> #include <linux/bpf.h> #include <linux/rcupdate_trace.h> struct bpf_iter_target_info { struct list_head list; const struct bpf_iter_reg *reg_info; u32 btf_id; /* cached value */ }; struct bpf_iter_link { struct bpf_link link; struct bpf_iter_aux_info aux; struct bpf_iter_target_info *tinfo; }; struct bpf_iter_priv_data { struct bpf_iter_target_info *tinfo; const struct bpf_iter_seq_info *seq_info; struct bpf_prog *prog; u64 session_id; u64 seq_num; bool done_stop; u8 target_private[] __aligned(8); }; static struct list_head targets = LIST_HEAD_INIT(targets); static DEFINE_MUTEX(targets_mutex); /* protect bpf_iter_link changes */ static DEFINE_MUTEX(link_mutex); /* incremented on every opened seq_file */ static atomic64_t session_id; static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, const struct bpf_iter_seq_info *seq_info); static void bpf_iter_inc_seq_num(struct seq_file *seq) { struct bpf_iter_priv_data *iter_priv; iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); iter_priv->seq_num++; } static void bpf_iter_dec_seq_num(struct seq_file *seq) { struct bpf_iter_priv_data *iter_priv; iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); iter_priv->seq_num--; } static void bpf_iter_done_stop(struct seq_file *seq) { struct bpf_iter_priv_data *iter_priv; iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); iter_priv->done_stop = true; } static inline bool bpf_iter_target_support_resched(const struct bpf_iter_target_info *tinfo) { return tinfo->reg_info->feature & BPF_ITER_RESCHED; } static bool bpf_iter_support_resched(struct seq_file *seq) { struct bpf_iter_priv_data *iter_priv; iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); return bpf_iter_target_support_resched(iter_priv->tinfo); } /* maximum visited objects before bailing out */ #define MAX_ITER_OBJECTS 1000000 /* bpf_seq_read, a customized and simpler version for bpf iterator. * The following are differences from seq_read(): * . fixed buffer size (PAGE_SIZE) * . assuming NULL ->llseek() * . stop() may call bpf program, handling potential overflow there */ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; size_t n, offs, copied = 0; int err = 0, num_objs = 0; bool can_resched; void *p; mutex_lock(&seq->lock); if (!seq->buf) { seq->size = PAGE_SIZE << 3; seq->buf = kvmalloc(seq->size, GFP_KERNEL); if (!seq->buf) { err = -ENOMEM; goto done; } } if (seq->count) { n = min(seq->count, size); err = copy_to_user(buf, seq->buf + seq->from, n); if (err) { err = -EFAULT; goto done; } seq->count -= n; seq->from += n; copied = n; goto done; } seq->from = 0; p = seq->op->start(seq, &seq->index); if (!p) goto stop; if (IS_ERR(p)) { err = PTR_ERR(p); seq->op->stop(seq, p); seq->count = 0; goto done; } err = seq->op->show(seq, p); if (err > 0) { /* object is skipped, decrease seq_num, so next * valid object can reuse the same seq_num. */ bpf_iter_dec_seq_num(seq); seq->count = 0; } else if (err < 0 || seq_has_overflowed(seq)) { if (!err) err = -E2BIG; seq->op->stop(seq, p); seq->count = 0; goto done; } can_resched = bpf_iter_support_resched(seq); while (1) { loff_t pos = seq->index; num_objs++; offs = seq->count; p = seq->op->next(seq, p, &seq->index); if (pos == seq->index) { pr_info_ratelimited("buggy seq_file .next function %ps " "did not updated position index\n", seq->op->next); seq->index++; } if (IS_ERR_OR_NULL(p)) break; /* got a valid next object, increase seq_num */ bpf_iter_inc_seq_num(seq); if (seq->count >= size) break; if (num_objs >= MAX_ITER_OBJECTS) { if (offs == 0) { err = -EAGAIN; seq->op->stop(seq, p); goto done; } break; } err = seq->op->show(seq, p); if (err > 0) { bpf_iter_dec_seq_num(seq); seq->count = offs; } else if (err < 0 || seq_has_overflowed(seq)) { seq->count = offs; if (offs == 0) { if (!err) err = -E2BIG; seq->op->stop(seq, p); goto done; } break; } if (can_resched) cond_resched(); } stop: offs = seq->count; if (IS_ERR(p)) { seq->op->stop(seq, NULL); err = PTR_ERR(p); goto done; } /* bpf program called if !p */ seq->op->stop(seq, p); if (!p) { if (!seq_has_overflowed(seq)) { bpf_iter_done_stop(seq); } else { seq->count = offs; if (offs == 0) { err = -E2BIG; goto done; } } } n = min(seq->count, size); err = copy_to_user(buf, seq->buf, n); if (err) { err = -EFAULT; goto done; } copied = n; seq->count -= n; seq->from = n; done: if (!copied) copied = err; else *ppos += copied; mutex_unlock(&seq->lock); return copied; } static const struct bpf_iter_seq_info * __get_seq_info(struct bpf_iter_link *link) { const struct bpf_iter_seq_info *seq_info; if (link->aux.map) { seq_info = link->aux.map->ops->iter_seq_info; if (seq_info) return seq_info; } return link->tinfo->reg_info->seq_info; } static int iter_open(struct inode *inode, struct file *file) { struct bpf_iter_link *link = inode->i_private; return prepare_seq_file(file, link, __get_seq_info(link)); } static int iter_release(struct inode *inode, struct file *file) { struct bpf_iter_priv_data *iter_priv; struct seq_file *seq; seq = file->private_data; if (!seq) return 0; iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); if (iter_priv->seq_info->fini_seq_private) iter_priv->seq_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; return seq_release_private(inode, file); } const struct file_operations bpf_iter_fops = { .open = iter_open, .read = bpf_seq_read, .release = iter_release, }; /* The argument reg_info will be cached in bpf_iter_target_info. * The common practice is to declare target reg_info as * a const static variable and passed as an argument to * bpf_iter_reg_target(). */ int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; tinfo = kzalloc(sizeof(*tinfo), GFP_KERNEL); if (!tinfo) return -ENOMEM; tinfo->reg_info = reg_info; INIT_LIST_HEAD(&tinfo->list); mutex_lock(&targets_mutex); list_add(&tinfo->list, &targets); mutex_unlock(&targets_mutex); return 0; } void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; bool found = false; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { if (reg_info == tinfo->reg_info) { list_del(&tinfo->list); kfree(tinfo); found = true; break; } } mutex_unlock(&targets_mutex); WARN_ON(found == false); } static void cache_btf_id(struct bpf_iter_target_info *tinfo, struct bpf_prog *prog) { tinfo->btf_id = prog->aux->attach_btf_id; } bool bpf_iter_prog_supported(struct bpf_prog *prog) { const char *attach_fname = prog->aux->attach_func_name; struct bpf_iter_target_info *tinfo = NULL, *iter; u32 prog_btf_id = prog->aux->attach_btf_id; const char *prefix = BPF_ITER_FUNC_PREFIX; int prefix_len = strlen(prefix); if (strncmp(attach_fname, prefix, prefix_len)) return false; mutex_lock(&targets_mutex); list_for_each_entry(iter, &targets, list) { if (iter->btf_id && iter->btf_id == prog_btf_id) { tinfo = iter; break; } if (!strcmp(attach_fname + prefix_len, iter->reg_info->target)) { cache_btf_id(iter, prog); tinfo = iter; break; } } mutex_unlock(&targets_mutex); if (tinfo) { prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; } return tinfo != NULL; } const struct bpf_func_proto * bpf_iter_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_iter_target_info *tinfo; const struct bpf_func_proto *fn = NULL; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { if (tinfo->btf_id == prog->aux->attach_btf_id) { const struct bpf_iter_reg *reg_info; reg_info = tinfo->reg_info; if (reg_info->get_func_proto) fn = reg_info->get_func_proto(func_id, prog); break; } } mutex_unlock(&targets_mutex); return fn; } static void bpf_iter_link_release(struct bpf_link *link) { struct bpf_iter_link *iter_link = container_of(link, struct bpf_iter_link, link); if (iter_link->tinfo->reg_info->detach_target) iter_link->tinfo->reg_info->detach_target(&iter_link->aux); } static void bpf_iter_link_dealloc(struct bpf_link *link) { struct bpf_iter_link *iter_link = container_of(link, struct bpf_iter_link, link); kfree(iter_link); } static int bpf_iter_link_replace(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog) { int ret = 0; mutex_lock(&link_mutex); if (old_prog && link->prog != old_prog) { ret = -EPERM; goto out_unlock; } if (link->prog->type != new_prog->type || link->prog->expected_attach_type != new_prog->expected_attach_type || link->prog->aux->attach_btf_id != new_prog->aux->attach_btf_id) { ret = -EINVAL; goto out_unlock; } old_prog = xchg(&link->prog, new_prog); bpf_prog_put(old_prog); out_unlock: mutex_unlock(&link_mutex); return ret; } static void bpf_iter_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_iter_link *iter_link = container_of(link, struct bpf_iter_link, link); bpf_iter_show_fdinfo_t show_fdinfo; seq_printf(seq, "target_name:\t%s\n", iter_link->tinfo->reg_info->target); show_fdinfo = iter_link->tinfo->reg_info->show_fdinfo; if (show_fdinfo) show_fdinfo(&iter_link->aux, seq); } static int bpf_iter_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_iter_link *iter_link = container_of(link, struct bpf_iter_link, link); char __user *ubuf = u64_to_user_ptr(info->iter.target_name); bpf_iter_fill_link_info_t fill_link_info; u32 ulen = info->iter.target_name_len; const char *target_name; u32 target_len; if (!ulen ^ !ubuf) return -EINVAL; target_name = iter_link->tinfo->reg_info->target; target_len = strlen(target_name); info->iter.target_name_len = target_len + 1; if (ubuf) { if (ulen >= target_len + 1) { if (copy_to_user(ubuf, target_name, target_len + 1)) return -EFAULT; } else { char zero = '\0'; if (copy_to_user(ubuf, target_name, ulen - 1)) return -EFAULT; if (put_user(zero, ubuf + ulen - 1)) return -EFAULT; return -ENOSPC; } } fill_link_info = iter_link->tinfo->reg_info->fill_link_info; if (fill_link_info) return fill_link_info(&iter_link->aux, info); return 0; } static const struct bpf_link_ops bpf_iter_link_lops = { .release = bpf_iter_link_release, .dealloc = bpf_iter_link_dealloc, .update_prog = bpf_iter_link_replace, .show_fdinfo = bpf_iter_link_show_fdinfo, .fill_link_info = bpf_iter_link_fill_link_info, }; bool bpf_link_is_iter(struct bpf_link *link) { return link->ops == &bpf_iter_link_lops; } int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog) { struct bpf_iter_target_info *tinfo = NULL, *iter; struct bpf_link_primer link_primer; union bpf_iter_link_info linfo; struct bpf_iter_link *link; u32 prog_btf_id, linfo_len; bpfptr_t ulinfo; int err; if (attr->link_create.target_fd || attr->link_create.flags) return -EINVAL; memset(&linfo, 0, sizeof(union bpf_iter_link_info)); ulinfo = make_bpfptr(attr->link_create.iter_info, uattr.is_kernel); linfo_len = attr->link_create.iter_info_len; if (bpfptr_is_null(ulinfo) ^ !linfo_len) return -EINVAL; if (!bpfptr_is_null(ulinfo)) { err = bpf_check_uarg_tail_zero(ulinfo, sizeof(linfo), linfo_len); if (err) return err; linfo_len = min_t(u32, linfo_len, sizeof(linfo)); if (copy_from_bpfptr(&linfo, ulinfo, linfo_len)) return -EFAULT; } prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); list_for_each_entry(iter, &targets, list) { if (iter->btf_id == prog_btf_id) { tinfo = iter; break; } } mutex_unlock(&targets_mutex); if (!tinfo) return -ENOENT; /* Only allow sleepable program for resched-able iterator */ if (prog->sleepable && !bpf_iter_target_support_resched(tinfo)) return -EINVAL; link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); if (!link) return -ENOMEM; bpf_link_init(&link->link, BPF_LINK_TYPE_ITER, &bpf_iter_link_lops, prog); link->tinfo = tinfo; err = bpf_link_prime(&link->link, &link_primer); if (err) { kfree(link); return err; } if (tinfo->reg_info->attach_target) { err = tinfo->reg_info->attach_target(prog, &linfo, &link->aux); if (err) { bpf_link_cleanup(&link_primer); return err; } } return bpf_link_settle(&link_primer); } static void init_seq_meta(struct bpf_iter_priv_data *priv_data, struct bpf_iter_target_info *tinfo, const struct bpf_iter_seq_info *seq_info, struct bpf_prog *prog) { priv_data->tinfo = tinfo; priv_data->seq_info = seq_info; priv_data->prog = prog; priv_data->session_id = atomic64_inc_return(&session_id); priv_data->seq_num = 0; priv_data->done_stop = false; } static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, const struct bpf_iter_seq_info *seq_info) { struct bpf_iter_priv_data *priv_data; struct bpf_iter_target_info *tinfo; struct bpf_prog *prog; u32 total_priv_dsize; struct seq_file *seq; int err = 0; mutex_lock(&link_mutex); prog = link->link.prog; bpf_prog_inc(prog); mutex_unlock(&link_mutex); tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + seq_info->seq_priv_size; priv_data = __seq_open_private(file, seq_info->seq_ops, total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } if (seq_info->init_seq_private) { err = seq_info->init_seq_private(priv_data->target_private, &link->aux); if (err) goto release_seq_file; } init_seq_meta(priv_data, tinfo, seq_info, prog); seq = file->private_data; seq->private = priv_data->target_private; return 0; release_seq_file: seq_release_private(file->f_inode, file); file->private_data = NULL; release_prog: bpf_prog_put(prog); return err; } int bpf_iter_new_fd(struct bpf_link *link) { struct bpf_iter_link *iter_link; struct file *file; unsigned int flags; int err, fd; if (link->ops != &bpf_iter_link_lops) return -EINVAL; flags = O_RDONLY | O_CLOEXEC; fd = get_unused_fd_flags(flags); if (fd < 0) return fd; file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags); if (IS_ERR(file)) { err = PTR_ERR(file); goto free_fd; } iter_link = container_of(link, struct bpf_iter_link, link); err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link)); if (err) goto free_file; fd_install(fd, file); return fd; free_file: fput(file); free_fd: put_unused_fd(fd); return err; } struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) { struct bpf_iter_priv_data *iter_priv; struct seq_file *seq; void *seq_priv; seq = meta->seq; if (seq->file->f_op != &bpf_iter_fops) return NULL; seq_priv = seq->private; iter_priv = container_of(seq_priv, struct bpf_iter_priv_data, target_private); if (in_stop && iter_priv->done_stop) return NULL; meta->session_id = iter_priv->session_id; meta->seq_num = iter_priv->seq_num; return iter_priv->prog; } int bpf_iter_run_prog(struct bpf_prog *prog, void *ctx) { struct bpf_run_ctx run_ctx, *old_run_ctx; int ret; if (prog->sleepable) { rcu_read_lock_trace(); migrate_disable(); might_fault(); old_run_ctx = bpf_set_run_ctx(&run_ctx); ret = bpf_prog_run(prog, ctx); bpf_reset_run_ctx(old_run_ctx); migrate_enable(); rcu_read_unlock_trace(); } else { rcu_read_lock(); migrate_disable(); old_run_ctx = bpf_set_run_ctx(&run_ctx); ret = bpf_prog_run(prog, ctx); bpf_reset_run_ctx(old_run_ctx); migrate_enable(); rcu_read_unlock(); } /* bpf program can only return 0 or 1: * 0 : okay * 1 : retry the same object * The bpf_iter_run_prog() return value * will be seq_ops->show() return value. */ return ret == 0 ? 0 : -EAGAIN; } BPF_CALL_4(bpf_for_each_map_elem, struct bpf_map *, map, void *, callback_fn, void *, callback_ctx, u64, flags) { return map->ops->map_for_each_callback(map, callback_fn, callback_ctx, flags); } const struct bpf_func_proto bpf_for_each_map_elem_proto = { .func = bpf_for_each_map_elem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_FUNC, .arg3_type = ARG_PTR_TO_STACK_OR_NULL, .arg4_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx, u64, flags) { bpf_callback_t callback = (bpf_callback_t)callback_fn; u64 ret; u32 i; /* Note: these safety checks are also verified when bpf_loop * is inlined, be careful to modify this code in sync. See * function verifier.c:inline_bpf_loop. */ if (flags) return -EINVAL; if (nr_loops > BPF_MAX_LOOPS) return -E2BIG; for (i = 0; i < nr_loops; i++) { ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0); /* return value: 0 - continue, 1 - stop and return */ if (ret) return i + 1; } return i; } const struct bpf_func_proto bpf_loop_proto = { .func = bpf_loop, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_ANYTHING, .arg2_type = ARG_PTR_TO_FUNC, .arg3_type = ARG_PTR_TO_STACK_OR_NULL, .arg4_type = ARG_ANYTHING, }; struct bpf_iter_num_kern { int cur; /* current value, inclusive */ int end; /* final value, exclusive */ } __aligned(8); __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) { struct bpf_iter_num_kern *s = (void *)it; BUILD_BUG_ON(sizeof(struct bpf_iter_num_kern) != sizeof(struct bpf_iter_num)); BUILD_BUG_ON(__alignof__(struct bpf_iter_num_kern) != __alignof__(struct bpf_iter_num)); /* start == end is legit, it's an empty range and we'll just get NULL * on first (and any subsequent) bpf_iter_num_next() call */ if (start > end) { s->cur = s->end = 0; return -EINVAL; } /* avoid overflows, e.g., if start == INT_MIN and end == INT_MAX */ if ((s64)end - (s64)start > BPF_MAX_LOOPS) { s->cur = s->end = 0; return -E2BIG; } /* user will call bpf_iter_num_next() first, * which will set s->cur to exactly start value; * underflow shouldn't matter */ s->cur = start - 1; s->end = end; return 0; } __bpf_kfunc int *bpf_iter_num_next(struct bpf_iter_num* it) { struct bpf_iter_num_kern *s = (void *)it; /* check failed initialization or if we are done (same behavior); * need to be careful about overflow, so convert to s64 for checks, * e.g., if s->cur == s->end == INT_MAX, we can't just do * s->cur + 1 >= s->end */ if ((s64)(s->cur + 1) >= s->end) { s->cur = s->end = 0; return NULL; } s->cur++; return &s->cur; } __bpf_kfunc void bpf_iter_num_destroy(struct bpf_iter_num *it) { struct bpf_iter_num_kern *s = (void *)it; s->cur = s->end = 0; } __bpf_kfunc_end_defs();
55 55 3459 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM sched #if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SCHED_H #include <linux/kthread.h> #include <linux/sched/numa_balancing.h> #include <linux/tracepoint.h> #include <linux/binfmts.h> /* * Tracepoint for calling kthread_stop, performed to end a kthread: */ TRACE_EVENT(sched_kthread_stop, TP_PROTO(struct task_struct *t), TP_ARGS(t), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) ), TP_fast_assign( memcpy(__entry->comm, t->comm, TASK_COMM_LEN); __entry->pid = t->pid; ), TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid) ); /* * Tracepoint for the return value of the kthread stopping: */ TRACE_EVENT(sched_kthread_stop_ret, TP_PROTO(int ret), TP_ARGS(ret), TP_STRUCT__entry( __field( int, ret ) ), TP_fast_assign( __entry->ret = ret; ), TP_printk("ret=%d", __entry->ret) ); /** * sched_kthread_work_queue_work - called when a work gets queued * @worker: pointer to the kthread_worker * @work: pointer to struct kthread_work * * This event occurs when a work is queued immediately or once a * delayed work is actually queued (ie: once the delay has been * reached). */ TRACE_EVENT(sched_kthread_work_queue_work, TP_PROTO(struct kthread_worker *worker, struct kthread_work *work), TP_ARGS(worker, work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) __field( void *, worker) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; __entry->worker = worker; ), TP_printk("work struct=%p function=%ps worker=%p", __entry->work, __entry->function, __entry->worker) ); /** * sched_kthread_work_execute_start - called immediately before the work callback * @work: pointer to struct kthread_work * * Allows to track kthread work execution. */ TRACE_EVENT(sched_kthread_work_execute_start, TP_PROTO(struct kthread_work *work), TP_ARGS(work), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = work->func; ), TP_printk("work struct %p: function %ps", __entry->work, __entry->function) ); /** * sched_kthread_work_execute_end - called immediately after the work callback * @work: pointer to struct work_struct * @function: pointer to worker function * * Allows to track workqueue execution. */ TRACE_EVENT(sched_kthread_work_execute_end, TP_PROTO(struct kthread_work *work, kthread_work_func_t function), TP_ARGS(work, function), TP_STRUCT__entry( __field( void *, work ) __field( void *, function) ), TP_fast_assign( __entry->work = work; __entry->function = function; ), TP_printk("work struct %p: function %ps", __entry->work, __entry->function) ); /* * Tracepoint for waking up a task: */ DECLARE_EVENT_CLASS(sched_wakeup_template, TP_PROTO(struct task_struct *p), TP_ARGS(__perf_task(p)), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) __field( int, target_cpu ) ), TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ __entry->target_cpu = task_cpu(p); ), TP_printk("comm=%s pid=%d prio=%d target_cpu=%03d", __entry->comm, __entry->pid, __entry->prio, __entry->target_cpu) ); /* * Tracepoint called when waking a task; this tracepoint is guaranteed to be * called from the waking context. */ DEFINE_EVENT(sched_wakeup_template, sched_waking, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint called when the task is actually woken; p->state == TASK_RUNNING. * It is not always called from the waking context. */ DEFINE_EVENT(sched_wakeup_template, sched_wakeup, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint for waking up a new task: */ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new, TP_PROTO(struct task_struct *p), TP_ARGS(p)); #ifdef CREATE_TRACE_POINTS static inline long __trace_sched_switch_state(bool preempt, unsigned int prev_state, struct task_struct *p) { unsigned int state; #ifdef CONFIG_SCHED_DEBUG BUG_ON(p != current); #endif /* CONFIG_SCHED_DEBUG */ /* * Preemption ignores task state, therefore preempted tasks are always * RUNNING (we will not have dequeued if state != RUNNING). */ if (preempt) return TASK_REPORT_MAX; /* * task_state_index() uses fls() and returns a value from 0-8 range. * Decrement it by 1 (except TASK_RUNNING state i.e 0) before using * it for left shift operation to get the correct task->state * mapping. */ state = __task_state_index(prev_state, p->exit_state); return state ? (1 << (state - 1)) : state; } #endif /* CREATE_TRACE_POINTS */ /* * Tracepoint for task switches, performed by the scheduler: */ TRACE_EVENT(sched_switch, TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next, unsigned int prev_state), TP_ARGS(preempt, prev, next, prev_state), TP_STRUCT__entry( __array( char, prev_comm, TASK_COMM_LEN ) __field( pid_t, prev_pid ) __field( int, prev_prio ) __field( long, prev_state ) __array( char, next_comm, TASK_COMM_LEN ) __field( pid_t, next_pid ) __field( int, next_prio ) ), TP_fast_assign( memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->prev_pid = prev->pid; __entry->prev_prio = prev->prio; __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev); memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, (__entry->prev_state & (TASK_REPORT_MAX - 1)) ? __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", { TASK_INTERRUPTIBLE, "S" }, { TASK_UNINTERRUPTIBLE, "D" }, { __TASK_STOPPED, "T" }, { __TASK_TRACED, "t" }, { EXIT_DEAD, "X" }, { EXIT_ZOMBIE, "Z" }, { TASK_PARKED, "P" }, { TASK_DEAD, "I" }) : "R", __entry->prev_state & TASK_REPORT_MAX ? "+" : "", __entry->next_comm, __entry->next_pid, __entry->next_prio) ); /* * Tracepoint for a task being migrated: */ TRACE_EVENT(sched_migrate_task, TP_PROTO(struct task_struct *p, int dest_cpu), TP_ARGS(p, dest_cpu), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) __field( int, orig_cpu ) __field( int, dest_cpu ) ), TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ __entry->orig_cpu = task_cpu(p); __entry->dest_cpu = dest_cpu; ), TP_printk("comm=%s pid=%d prio=%d orig_cpu=%d dest_cpu=%d", __entry->comm, __entry->pid, __entry->prio, __entry->orig_cpu, __entry->dest_cpu) ); DECLARE_EVENT_CLASS(sched_process_template, TP_PROTO(struct task_struct *p), TP_ARGS(p), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) ), TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("comm=%s pid=%d prio=%d", __entry->comm, __entry->pid, __entry->prio) ); /* * Tracepoint for freeing a task: */ DEFINE_EVENT(sched_process_template, sched_process_free, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint for a task exiting: */ DEFINE_EVENT(sched_process_template, sched_process_exit, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint for waiting on task to unschedule: */ DEFINE_EVENT(sched_process_template, sched_wait_task, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint for a waiting task: */ TRACE_EVENT(sched_process_wait, TP_PROTO(struct pid *pid), TP_ARGS(pid), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) ), TP_fast_assign( memcpy(__entry->comm, current->comm, TASK_COMM_LEN); __entry->pid = pid_nr(pid); __entry->prio = current->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("comm=%s pid=%d prio=%d", __entry->comm, __entry->pid, __entry->prio) ); /* * Tracepoint for kernel_clone: */ TRACE_EVENT(sched_process_fork, TP_PROTO(struct task_struct *parent, struct task_struct *child), TP_ARGS(parent, child), TP_STRUCT__entry( __array( char, parent_comm, TASK_COMM_LEN ) __field( pid_t, parent_pid ) __array( char, child_comm, TASK_COMM_LEN ) __field( pid_t, child_pid ) ), TP_fast_assign( memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); __entry->parent_pid = parent->pid; memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); __entry->child_pid = child->pid; ), TP_printk("comm=%s pid=%d child_comm=%s child_pid=%d", __entry->parent_comm, __entry->parent_pid, __entry->child_comm, __entry->child_pid) ); /* * Tracepoint for exec: */ TRACE_EVENT(sched_process_exec, TP_PROTO(struct task_struct *p, pid_t old_pid, struct linux_binprm *bprm), TP_ARGS(p, old_pid, bprm), TP_STRUCT__entry( __string( filename, bprm->filename ) __field( pid_t, pid ) __field( pid_t, old_pid ) ), TP_fast_assign( __assign_str(filename); __entry->pid = p->pid; __entry->old_pid = old_pid; ), TP_printk("filename=%s pid=%d old_pid=%d", __get_str(filename), __entry->pid, __entry->old_pid) ); /** * sched_prepare_exec - called before setting up new exec * @task: pointer to the current task * @bprm: pointer to linux_binprm used for new exec * * Called before flushing the old exec, where @task is still unchanged, but at * the point of no return during switching to the new exec. At the point it is * called the exec will either succeed, or on failure terminate the task. Also * see the "sched_process_exec" tracepoint, which is called right after @task * has successfully switched to the new exec. */ TRACE_EVENT(sched_prepare_exec, TP_PROTO(struct task_struct *task, struct linux_binprm *bprm), TP_ARGS(task, bprm), TP_STRUCT__entry( __string( interp, bprm->interp ) __string( filename, bprm->filename ) __field( pid_t, pid ) __string( comm, task->comm ) ), TP_fast_assign( __assign_str(interp); __assign_str(filename); __entry->pid = task->pid; __assign_str(comm); ), TP_printk("interp=%s filename=%s pid=%d comm=%s", __get_str(interp), __get_str(filename), __entry->pid, __get_str(comm)) ); #ifdef CONFIG_SCHEDSTATS #define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT #define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS #else #define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT_NOP #define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS_NOP #endif /* * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE * adding sched_stat support to SCHED_FIFO/RR would be welcome. */ DECLARE_EVENT_CLASS_SCHEDSTAT(sched_stat_template, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(__perf_task(tsk), __perf_count(delay)), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( u64, delay ) ), TP_fast_assign( memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; __entry->delay = delay; ), TP_printk("comm=%s pid=%d delay=%Lu [ns]", __entry->comm, __entry->pid, (unsigned long long)__entry->delay) ); /* * Tracepoint for accounting wait time (time the task is runnable * but not actually running due to scheduler contention). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_wait, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting sleep time (time the task is not runnable, * including iowait, see below). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_sleep, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting iowait time (time the task is not runnable * due to waiting on IO to complete). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_iowait, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting blocked time (time the task is in uninterruptible). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_blocked, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting runtime (time the task is executing * on a CPU). */ DECLARE_EVENT_CLASS(sched_stat_runtime, TP_PROTO(struct task_struct *tsk, u64 runtime), TP_ARGS(tsk, __perf_count(runtime)), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( u64, runtime ) ), TP_fast_assign( memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; __entry->runtime = runtime; ), TP_printk("comm=%s pid=%d runtime=%Lu [ns]", __entry->comm, __entry->pid, (unsigned long long)__entry->runtime) ); DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime, TP_PROTO(struct task_struct *tsk, u64 runtime), TP_ARGS(tsk, runtime)); /* * Tracepoint for showing priority inheritance modifying a tasks * priority. */ TRACE_EVENT(sched_pi_setprio, TP_PROTO(struct task_struct *tsk, struct task_struct *pi_task), TP_ARGS(tsk, pi_task), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, oldprio ) __field( int, newprio ) ), TP_fast_assign( memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; __entry->oldprio = tsk->prio; __entry->newprio = pi_task ? min(tsk->normal_prio, pi_task->prio) : tsk->normal_prio; /* XXX SCHED_DEADLINE bits missing */ ), TP_printk("comm=%s pid=%d oldprio=%d newprio=%d", __entry->comm, __entry->pid, __entry->oldprio, __entry->newprio) ); #ifdef CONFIG_DETECT_HUNG_TASK TRACE_EVENT(sched_process_hang, TP_PROTO(struct task_struct *tsk), TP_ARGS(tsk), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) ), TP_fast_assign( memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; ), TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid) ); #endif /* CONFIG_DETECT_HUNG_TASK */ /* * Tracks migration of tasks from one runqueue to another. Can be used to * detect if automatic NUMA balancing is bouncing between nodes. */ TRACE_EVENT(sched_move_numa, TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), TP_ARGS(tsk, src_cpu, dst_cpu), TP_STRUCT__entry( __field( pid_t, pid ) __field( pid_t, tgid ) __field( pid_t, ngid ) __field( int, src_cpu ) __field( int, src_nid ) __field( int, dst_cpu ) __field( int, dst_nid ) ), TP_fast_assign( __entry->pid = task_pid_nr(tsk); __entry->tgid = task_tgid_nr(tsk); __entry->ngid = task_numa_group_id(tsk); __entry->src_cpu = src_cpu; __entry->src_nid = cpu_to_node(src_cpu); __entry->dst_cpu = dst_cpu; __entry->dst_nid = cpu_to_node(dst_cpu); ), TP_printk("pid=%d tgid=%d ngid=%d src_cpu=%d src_nid=%d dst_cpu=%d dst_nid=%d", __entry->pid, __entry->tgid, __entry->ngid, __entry->src_cpu, __entry->src_nid, __entry->dst_cpu, __entry->dst_nid) ); DECLARE_EVENT_CLASS(sched_numa_pair_template, TP_PROTO(struct task_struct *src_tsk, int src_cpu, struct task_struct *dst_tsk, int dst_cpu), TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu), TP_STRUCT__entry( __field( pid_t, src_pid ) __field( pid_t, src_tgid ) __field( pid_t, src_ngid ) __field( int, src_cpu ) __field( int, src_nid ) __field( pid_t, dst_pid ) __field( pid_t, dst_tgid ) __field( pid_t, dst_ngid ) __field( int, dst_cpu ) __field( int, dst_nid ) ), TP_fast_assign( __entry->src_pid = task_pid_nr(src_tsk); __entry->src_tgid = task_tgid_nr(src_tsk); __entry->src_ngid = task_numa_group_id(src_tsk); __entry->src_cpu = src_cpu; __entry->src_nid = cpu_to_node(src_cpu); __entry->dst_pid = dst_tsk ? task_pid_nr(dst_tsk) : 0; __entry->dst_tgid = dst_tsk ? task_tgid_nr(dst_tsk) : 0; __entry->dst_ngid = dst_tsk ? task_numa_group_id(dst_tsk) : 0; __entry->dst_cpu = dst_cpu; __entry->dst_nid = dst_cpu >= 0 ? cpu_to_node(dst_cpu) : -1; ), TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d", __entry->src_pid, __entry->src_tgid, __entry->src_ngid, __entry->src_cpu, __entry->src_nid, __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid, __entry->dst_cpu, __entry->dst_nid) ); DEFINE_EVENT(sched_numa_pair_template, sched_stick_numa, TP_PROTO(struct task_struct *src_tsk, int src_cpu, struct task_struct *dst_tsk, int dst_cpu), TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu) ); DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa, TP_PROTO(struct task_struct *src_tsk, int src_cpu, struct task_struct *dst_tsk, int dst_cpu), TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu) ); #ifdef CONFIG_NUMA_BALANCING #define NUMAB_SKIP_REASON \ EM( NUMAB_SKIP_UNSUITABLE, "unsuitable" ) \ EM( NUMAB_SKIP_SHARED_RO, "shared_ro" ) \ EM( NUMAB_SKIP_INACCESSIBLE, "inaccessible" ) \ EM( NUMAB_SKIP_SCAN_DELAY, "scan_delay" ) \ EM( NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) \ EM( NUMAB_SKIP_IGNORE_PID, "ignore_pid_inactive" ) \ EMe(NUMAB_SKIP_SEQ_COMPLETED, "seq_completed" ) /* Redefine for export. */ #undef EM #undef EMe #define EM(a, b) TRACE_DEFINE_ENUM(a); #define EMe(a, b) TRACE_DEFINE_ENUM(a); NUMAB_SKIP_REASON /* Redefine for symbolic printing. */ #undef EM #undef EMe #define EM(a, b) { a, b }, #define EMe(a, b) { a, b } TRACE_EVENT(sched_skip_vma_numa, TP_PROTO(struct mm_struct *mm, struct vm_area_struct *vma, enum numa_vmaskip_reason reason), TP_ARGS(mm, vma, reason), TP_STRUCT__entry( __field(unsigned long, numa_scan_offset) __field(unsigned long, vm_start) __field(unsigned long, vm_end) __field(enum numa_vmaskip_reason, reason) ), TP_fast_assign( __entry->numa_scan_offset = mm->numa_scan_offset; __entry->vm_start = vma->vm_start; __entry->vm_end = vma->vm_end; __entry->reason = reason; ), TP_printk("numa_scan_offset=%lX vm_start=%lX vm_end=%lX reason=%s", __entry->numa_scan_offset, __entry->vm_start, __entry->vm_end, __print_symbolic(__entry->reason, NUMAB_SKIP_REASON)) ); #endif /* CONFIG_NUMA_BALANCING */ /* * Tracepoint for waking a polling cpu without an IPI. */ TRACE_EVENT(sched_wake_idle_without_ipi, TP_PROTO(int cpu), TP_ARGS(cpu), TP_STRUCT__entry( __field( int, cpu ) ), TP_fast_assign( __entry->cpu = cpu; ), TP_printk("cpu=%d", __entry->cpu) ); /* * Following tracepoints are not exported in tracefs and provide hooking * mechanisms only for testing and debugging purposes. * * Postfixed with _tp to make them easily identifiable in the code. */ DECLARE_TRACE(pelt_cfs_tp, TP_PROTO(struct cfs_rq *cfs_rq), TP_ARGS(cfs_rq)); DECLARE_TRACE(pelt_rt_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_dl_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_hw_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_irq_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_se_tp, TP_PROTO(struct sched_entity *se), TP_ARGS(se)); DECLARE_TRACE(sched_cpu_capacity_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(sched_overutilized_tp, TP_PROTO(struct root_domain *rd, bool overutilized), TP_ARGS(rd, overutilized)); DECLARE_TRACE(sched_util_est_cfs_tp, TP_PROTO(struct cfs_rq *cfs_rq), TP_ARGS(cfs_rq)); DECLARE_TRACE(sched_util_est_se_tp, TP_PROTO(struct sched_entity *se), TP_ARGS(se)); DECLARE_TRACE(sched_update_nr_running_tp, TP_PROTO(struct rq *rq, int change), TP_ARGS(rq, change)); DECLARE_TRACE(sched_compute_energy_tp, TP_PROTO(struct task_struct *p, int dst_cpu, unsigned long energy, unsigned long max_util, unsigned long busy_time), TP_ARGS(p, dst_cpu, energy, max_util, busy_time)); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
6 5 1 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 #include <linux/errno.h> #include <linux/ip.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/types.h> #include <net/checksum.h> #include <net/ip.h> #include <net/ip6_fib.h> #include <net/lwtunnel.h> #include <net/protocol.h> #include <uapi/linux/ila.h> #include "ila.h" void ila_init_saved_csum(struct ila_params *p) { if (!p->locator_match.v64) return; p->csum_diff = compute_csum_diff8( (__be32 *)&p->locator, (__be32 *)&p->locator_match); } static __wsum get_csum_diff_iaddr(struct ila_addr *iaddr, struct ila_params *p) { if (p->locator_match.v64) return p->csum_diff; else return compute_csum_diff8((__be32 *)&p->locator, (__be32 *)&iaddr->loc); } static __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p) { return get_csum_diff_iaddr(ila_a2i(&ip6h->daddr), p); } static void ila_csum_do_neutral_fmt(struct ila_addr *iaddr, struct ila_params *p) { __sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3]; __wsum diff, fval; diff = get_csum_diff_iaddr(iaddr, p); fval = (__force __wsum)(ila_csum_neutral_set(iaddr->ident) ? CSUM_NEUTRAL_FLAG : ~CSUM_NEUTRAL_FLAG); diff = csum_add(diff, fval); *adjust = ~csum_fold(csum_add(diff, csum_unfold(*adjust))); /* Flip the csum-neutral bit. Either we are doing a SIR->ILA * translation with ILA_CSUM_NEUTRAL_MAP as the csum_method * and the C-bit is not set, or we are doing an ILA-SIR * tranlsation and the C-bit is set. */ iaddr->ident.csum_neutral ^= 1; } static void ila_csum_do_neutral_nofmt(struct ila_addr *iaddr, struct ila_params *p) { __sum16 *adjust = (__force __sum16 *)&iaddr->ident.v16[3]; __wsum diff; diff = get_csum_diff_iaddr(iaddr, p); *adjust = ~csum_fold(csum_add(diff, csum_unfold(*adjust))); } static void ila_csum_adjust_transport(struct sk_buff *skb, struct ila_params *p) { size_t nhoff = sizeof(struct ipv6hdr); struct ipv6hdr *ip6h = ipv6_hdr(skb); __wsum diff; switch (ip6h->nexthdr) { case NEXTHDR_TCP: if (likely(pskb_may_pull(skb, nhoff + sizeof(struct tcphdr)))) { struct tcphdr *th = (struct tcphdr *) (skb_network_header(skb) + nhoff); diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&th->check, skb, diff, true); } break; case NEXTHDR_UDP: if (likely(pskb_may_pull(skb, nhoff + sizeof(struct udphdr)))) { struct udphdr *uh = (struct udphdr *) (skb_network_header(skb) + nhoff); if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&uh->check, skb, diff, true); if (!uh->check) uh->check = CSUM_MANGLED_0; } } break; case NEXTHDR_ICMP: if (likely(pskb_may_pull(skb, nhoff + sizeof(struct icmp6hdr)))) { struct icmp6hdr *ih = (struct icmp6hdr *) (skb_network_header(skb) + nhoff); diff = get_csum_diff(ip6h, p); inet_proto_csum_replace_by_diff(&ih->icmp6_cksum, skb, diff, true); } break; } } void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p, bool sir2ila) { struct ipv6hdr *ip6h = ipv6_hdr(skb); struct ila_addr *iaddr = ila_a2i(&ip6h->daddr); switch (p->csum_mode) { case ILA_CSUM_ADJUST_TRANSPORT: ila_csum_adjust_transport(skb, p); break; case ILA_CSUM_NEUTRAL_MAP: if (sir2ila) { if (WARN_ON(ila_csum_neutral_set(iaddr->ident))) { /* Checksum flag should never be * set in a formatted SIR address. */ break; } } else if (!ila_csum_neutral_set(iaddr->ident)) { /* ILA to SIR translation and C-bit isn't * set so we're good. */ break; } ila_csum_do_neutral_fmt(iaddr, p); break; case ILA_CSUM_NEUTRAL_MAP_AUTO: ila_csum_do_neutral_nofmt(iaddr, p); break; case ILA_CSUM_NO_ACTION: break; } /* Now change destination address */ iaddr->loc = p->locator; }
4926 4921 4925 16282 16268 4924 4922 4823 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 // SPDX-License-Identifier: GPL-2.0-or-later /* * printk_safe.c - Safe printk for printk-deadlock-prone contexts */ #include <linux/preempt.h> #include <linux/kdb.h> #include <linux/smp.h> #include <linux/cpumask.h> #include <linux/printk.h> #include <linux/kprobes.h> #include "internal.h" /* Context where printk messages are never suppressed */ static atomic_t force_con; void printk_force_console_enter(void) { atomic_inc(&force_con); } void printk_force_console_exit(void) { atomic_dec(&force_con); } bool is_printk_force_console(void) { return atomic_read(&force_con); } static DEFINE_PER_CPU(int, printk_context); /* Can be preempted by NMI. */ void __printk_safe_enter(void) { this_cpu_inc(printk_context); } /* Can be preempted by NMI. */ void __printk_safe_exit(void) { this_cpu_dec(printk_context); } void __printk_deferred_enter(void) { cant_migrate(); __printk_safe_enter(); } void __printk_deferred_exit(void) { cant_migrate(); __printk_safe_exit(); } bool is_printk_legacy_deferred(void) { /* * The per-CPU variable @printk_context can be read safely in any * context. CPU migration is always disabled when set. * * A context holding the printk_cpu_sync must not spin waiting for * another CPU. For legacy printing, it could be the console_lock * or the port lock. */ return (force_legacy_kthread() || this_cpu_read(printk_context) || in_nmi() || is_printk_cpu_sync_owner()); } asmlinkage int vprintk(const char *fmt, va_list args) { #ifdef CONFIG_KGDB_KDB /* Allow to pass printk() to kdb but avoid a recursion. */ if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); #endif return vprintk_default(fmt, args); } EXPORT_SYMBOL(vprintk);
38 39 39 38 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2002 Richard Henderson * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. * Copyright (C) 2023 Luis Chamberlain <mcgrof@kernel.org> * Copyright (C) 2024 Mike Rapoport IBM. */ #define pr_fmt(fmt) "execmem: " fmt #include <linux/mm.h> #include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/execmem.h> #include <linux/maple_tree.h> #include <linux/set_memory.h> #include <linux/moduleloader.h> #include <linux/text-patching.h> #include <asm/tlbflush.h> #include "internal.h" static struct execmem_info *execmem_info __ro_after_init; static struct execmem_info default_execmem_info __ro_after_init; #ifdef CONFIG_MMU static void *execmem_vmalloc(struct execmem_range *range, size_t size, pgprot_t pgprot, unsigned long vm_flags) { bool kasan = range->flags & EXECMEM_KASAN_SHADOW; gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN; unsigned int align = range->alignment; unsigned long start = range->start; unsigned long end = range->end; void *p; if (kasan) vm_flags |= VM_DEFER_KMEMLEAK; if (vm_flags & VM_ALLOW_HUGE_VMAP) align = PMD_SIZE; p = __vmalloc_node_range(size, align, start, end, gfp_flags, pgprot, vm_flags, NUMA_NO_NODE, __builtin_return_address(0)); if (!p && range->fallback_start) { start = range->fallback_start; end = range->fallback_end; p = __vmalloc_node_range(size, align, start, end, gfp_flags, pgprot, vm_flags, NUMA_NO_NODE, __builtin_return_address(0)); } if (!p) { pr_warn_ratelimited("unable to allocate memory\n"); return NULL; } if (kasan && (kasan_alloc_module_shadow(p, size, GFP_KERNEL) < 0)) { vfree(p); return NULL; } return p; } struct vm_struct *execmem_vmap(size_t size) { struct execmem_range *range = &execmem_info->ranges[EXECMEM_MODULE_DATA]; struct vm_struct *area; area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC, range->start, range->end, NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0)); if (!area && range->fallback_start) area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC, range->fallback_start, range->fallback_end, NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0)); return area; } #else static void *execmem_vmalloc(struct execmem_range *range, size_t size, pgprot_t pgprot, unsigned long vm_flags) { return vmalloc(size); } #endif /* CONFIG_MMU */ #ifdef CONFIG_ARCH_HAS_EXECMEM_ROX struct execmem_cache { struct mutex mutex; struct maple_tree busy_areas; struct maple_tree free_areas; }; static struct execmem_cache execmem_cache = { .mutex = __MUTEX_INITIALIZER(execmem_cache.mutex), .busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN, execmem_cache.mutex), .free_areas = MTREE_INIT_EXT(free_areas, MT_FLAGS_LOCK_EXTERN, execmem_cache.mutex), }; static inline unsigned long mas_range_len(struct ma_state *mas) { return mas->last - mas->index + 1; } static int execmem_set_direct_map_valid(struct vm_struct *vm, bool valid) { unsigned int nr = (1 << get_vm_area_page_order(vm)); unsigned int updated = 0; int err = 0; for (int i = 0; i < vm->nr_pages; i += nr) { err = set_direct_map_valid_noflush(vm->pages[i], nr, valid); if (err) goto err_restore; updated += nr; } return 0; err_restore: for (int i = 0; i < updated; i += nr) set_direct_map_valid_noflush(vm->pages[i], nr, !valid); return err; } static void execmem_cache_clean(struct work_struct *work) { struct maple_tree *free_areas = &execmem_cache.free_areas; struct mutex *mutex = &execmem_cache.mutex; MA_STATE(mas, free_areas, 0, ULONG_MAX); void *area; mutex_lock(mutex); mas_for_each(&mas, area, ULONG_MAX) { size_t size = mas_range_len(&mas); if (IS_ALIGNED(size, PMD_SIZE) && IS_ALIGNED(mas.index, PMD_SIZE)) { struct vm_struct *vm = find_vm_area(area); execmem_set_direct_map_valid(vm, true); mas_store_gfp(&mas, NULL, GFP_KERNEL); vfree(area); } } mutex_unlock(mutex); } static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean); static int execmem_cache_add(void *ptr, size_t size) { struct maple_tree *free_areas = &execmem_cache.free_areas; struct mutex *mutex = &execmem_cache.mutex; unsigned long addr = (unsigned long)ptr; MA_STATE(mas, free_areas, addr - 1, addr + 1); unsigned long lower, upper; void *area = NULL; int err; lower = addr; upper = addr + size - 1; mutex_lock(mutex); area = mas_walk(&mas); if (area && mas.last == addr - 1) lower = mas.index; area = mas_next(&mas, ULONG_MAX); if (area && mas.index == addr + size) upper = mas.last; mas_set_range(&mas, lower, upper); err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL); mutex_unlock(mutex); if (err) return err; return 0; } static bool within_range(struct execmem_range *range, struct ma_state *mas, size_t size) { unsigned long addr = mas->index; if (addr >= range->start && addr + size < range->end) return true; if (range->fallback_start && addr >= range->fallback_start && addr + size < range->fallback_end) return true; return false; } static void *__execmem_cache_alloc(struct execmem_range *range, size_t size) { struct maple_tree *free_areas = &execmem_cache.free_areas; struct maple_tree *busy_areas = &execmem_cache.busy_areas; MA_STATE(mas_free, free_areas, 0, ULONG_MAX); MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX); struct mutex *mutex = &execmem_cache.mutex; unsigned long addr, last, area_size = 0; void *area, *ptr = NULL; int err; mutex_lock(mutex); mas_for_each(&mas_free, area, ULONG_MAX) { area_size = mas_range_len(&mas_free); if (area_size >= size && within_range(range, &mas_free, size)) break; } if (area_size < size) goto out_unlock; addr = mas_free.index; last = mas_free.last; /* insert allocated size to busy_areas at range [addr, addr + size) */ mas_set_range(&mas_busy, addr, addr + size - 1); err = mas_store_gfp(&mas_busy, (void *)addr, GFP_KERNEL); if (err) goto out_unlock; mas_store_gfp(&mas_free, NULL, GFP_KERNEL); if (area_size > size) { void *ptr = (void *)(addr + size); /* * re-insert remaining free size to free_areas at range * [addr + size, last] */ mas_set_range(&mas_free, addr + size, last); err = mas_store_gfp(&mas_free, ptr, GFP_KERNEL); if (err) { mas_store_gfp(&mas_busy, NULL, GFP_KERNEL); goto out_unlock; } } ptr = (void *)addr; out_unlock: mutex_unlock(mutex); return ptr; } static int execmem_cache_populate(struct execmem_range *range, size_t size) { unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; unsigned long start, end; struct vm_struct *vm; size_t alloc_size; int err = -ENOMEM; void *p; alloc_size = round_up(size, PMD_SIZE); p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags); if (!p) return err; vm = find_vm_area(p); if (!vm) goto err_free_mem; /* fill memory with instructions that will trap */ execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); start = (unsigned long)p; end = start + alloc_size; vunmap_range(start, end); err = execmem_set_direct_map_valid(vm, false); if (err) goto err_free_mem; err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages, PMD_SHIFT); if (err) goto err_free_mem; err = execmem_cache_add(p, alloc_size); if (err) goto err_free_mem; return 0; err_free_mem: vfree(p); return err; } static void *execmem_cache_alloc(struct execmem_range *range, size_t size) { void *p; int err; p = __execmem_cache_alloc(range, size); if (p) return p; err = execmem_cache_populate(range, size); if (err) return NULL; return __execmem_cache_alloc(range, size); } static bool execmem_cache_free(void *ptr) { struct maple_tree *busy_areas = &execmem_cache.busy_areas; struct mutex *mutex = &execmem_cache.mutex; unsigned long addr = (unsigned long)ptr; MA_STATE(mas, busy_areas, addr, addr); size_t size; void *area; mutex_lock(mutex); area = mas_walk(&mas); if (!area) { mutex_unlock(mutex); return false; } size = mas_range_len(&mas); mas_store_gfp(&mas, NULL, GFP_KERNEL); mutex_unlock(mutex); execmem_fill_trapping_insns(ptr, size, /* writable = */ false); execmem_cache_add(ptr, size); schedule_work(&execmem_cache_clean_work); return true; } #else /* CONFIG_ARCH_HAS_EXECMEM_ROX */ static void *execmem_cache_alloc(struct execmem_range *range, size_t size) { return NULL; } static bool execmem_cache_free(void *ptr) { return false; } #endif /* CONFIG_ARCH_HAS_EXECMEM_ROX */ void *execmem_alloc(enum execmem_type type, size_t size) { struct execmem_range *range = &execmem_info->ranges[type]; bool use_cache = range->flags & EXECMEM_ROX_CACHE; unsigned long vm_flags = VM_FLUSH_RESET_PERMS; pgprot_t pgprot = range->pgprot; void *p; if (use_cache) p = execmem_cache_alloc(range, size); else p = execmem_vmalloc(range, size, pgprot, vm_flags); return kasan_reset_tag(p); } void execmem_free(void *ptr) { /* * This memory may be RO, and freeing RO memory in an interrupt is not * supported by vmalloc. */ WARN_ON(in_interrupt()); if (!execmem_cache_free(ptr)) vfree(ptr); } void *execmem_update_copy(void *dst, const void *src, size_t size) { return text_poke_copy(dst, src, size); } bool execmem_is_rox(enum execmem_type type) { return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE); } static bool execmem_validate(struct execmem_info *info) { struct execmem_range *r = &info->ranges[EXECMEM_DEFAULT]; if (!r->alignment || !r->start || !r->end || !pgprot_val(r->pgprot)) { pr_crit("Invalid parameters for execmem allocator, module loading will fail"); return false; } if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX)) { for (int i = EXECMEM_DEFAULT; i < EXECMEM_TYPE_MAX; i++) { r = &info->ranges[i]; if (r->flags & EXECMEM_ROX_CACHE) { pr_warn_once("ROX cache is not supported\n"); r->flags &= ~EXECMEM_ROX_CACHE; } } } return true; } static void execmem_init_missing(struct execmem_info *info) { struct execmem_range *default_range = &info->ranges[EXECMEM_DEFAULT]; for (int i = EXECMEM_DEFAULT + 1; i < EXECMEM_TYPE_MAX; i++) { struct execmem_range *r = &info->ranges[i]; if (!r->start) { if (i == EXECMEM_MODULE_DATA) r->pgprot = PAGE_KERNEL; else r->pgprot = default_range->pgprot; r->alignment = default_range->alignment; r->start = default_range->start; r->end = default_range->end; r->flags = default_range->flags; r->fallback_start = default_range->fallback_start; r->fallback_end = default_range->fallback_end; } } } struct execmem_info * __weak execmem_arch_setup(void) { return NULL; } static void __init __execmem_init(void) { struct execmem_info *info = execmem_arch_setup(); if (!info) { info = execmem_info = &default_execmem_info; info->ranges[EXECMEM_DEFAULT].start = VMALLOC_START; info->ranges[EXECMEM_DEFAULT].end = VMALLOC_END; info->ranges[EXECMEM_DEFAULT].pgprot = PAGE_KERNEL_EXEC; info->ranges[EXECMEM_DEFAULT].alignment = 1; } if (!execmem_validate(info)) return; execmem_init_missing(info); execmem_info = info; } #ifdef CONFIG_ARCH_WANTS_EXECMEM_LATE static int __init execmem_late_init(void) { __execmem_init(); return 0; } core_initcall(execmem_late_init); #else void __init execmem_init(void) { __execmem_init(); } #endif
6 6 6 1 5 2 3 6 3 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 // SPDX-License-Identifier: GPL-2.0 /* * xfrm4_input.c * * Changes: * YOSHIFUJI Hideaki @USAGI * Split up af-specific portion * Derek Atkins <derek@ihtfp.com> * Add Encapsulation support * */ #include <linux/slab.h> #include <linux/module.h> #include <linux/string.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/protocol.h> #include <net/gro.h> static int xfrm4_rcv_encap_finish2(struct net *net, struct sock *sk, struct sk_buff *skb) { return dst_input(skb); } static inline int xfrm4_rcv_encap_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { if (!skb_dst(skb)) { const struct iphdr *iph = ip_hdr(skb); if (ip_route_input_noref(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), skb->dev)) goto drop; } if (xfrm_trans_queue(skb, xfrm4_rcv_encap_finish2)) goto drop; return 0; drop: kfree_skb(skb); return NET_RX_DROP; } int xfrm4_transport_finish(struct sk_buff *skb, int async) { struct xfrm_offload *xo = xfrm_offload(skb); struct iphdr *iph = ip_hdr(skb); iph->protocol = XFRM_MODE_SKB_CB(skb)->protocol; #ifndef CONFIG_NETFILTER if (!async) return -iph->protocol; #endif __skb_push(skb, -skb_network_offset(skb)); iph->tot_len = htons(skb->len); ip_send_check(iph); if (xo && (xo->flags & XFRM_GRO)) { /* The full l2 header needs to be preserved so that re-injecting the packet at l2 * works correctly in the presence of vlan tags. */ skb_mac_header_rebuild_full(skb, xo->orig_mac_len); skb_reset_network_header(skb); skb_reset_transport_header(skb); return 0; } NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, dev_net(skb->dev), NULL, skb, skb->dev, NULL, xfrm4_rcv_encap_finish); return 0; } static int __xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb, bool pull) { struct udp_sock *up = udp_sk(sk); struct udphdr *uh; struct iphdr *iph; int iphlen, len; __u8 *udpdata; __be32 *udpdata32; u16 encap_type; encap_type = READ_ONCE(up->encap_type); /* if this is not encapsulated socket, then just return now */ if (!encap_type) return 1; /* If this is a paged skb, make sure we pull up * whatever data we need to look at. */ len = skb->len - sizeof(struct udphdr); if (!pskb_may_pull(skb, sizeof(struct udphdr) + min(len, 8))) return 1; /* Now we can get the pointers */ uh = udp_hdr(skb); udpdata = (__u8 *)uh + sizeof(struct udphdr); udpdata32 = (__be32 *)udpdata; switch (encap_type) { default: case UDP_ENCAP_ESPINUDP: /* Check if this is a keepalive packet. If so, eat it. */ if (len == 1 && udpdata[0] == 0xff) { return -EINVAL; } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0) { /* ESP Packet without Non-ESP header */ len = sizeof(struct udphdr); } else /* Must be an IKE packet.. pass it through */ return 1; break; } /* At this point we are sure that this is an ESPinUDP packet, * so we need to remove 'len' bytes from the packet (the UDP * header and optional ESP marker bytes) and then modify the * protocol to ESP, and then call into the transform receiver. */ if (skb_unclone(skb, GFP_ATOMIC)) return -EINVAL; /* Now we can update and verify the packet length... */ iph = ip_hdr(skb); iphlen = iph->ihl << 2; iph->tot_len = htons(ntohs(iph->tot_len) - len); if (skb->len < iphlen + len) { /* packet is too small!?! */ return -EINVAL; } /* pull the data buffer up to the ESP header and set the * transport header to point to ESP. Keep UDP on the stack * for later. */ if (pull) { __skb_pull(skb, len); skb_reset_transport_header(skb); } else { skb_set_transport_header(skb, len); } /* process ESP */ return 0; } /* If it's a keepalive packet, then just eat it. * If it's an encapsulated packet, then pass it to the * IPsec xfrm input. * Returns 0 if skb passed to xfrm or was dropped. * Returns >0 if skb should be passed to UDP. * Returns <0 if skb should be resubmitted (-ret is protocol) */ int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) { int ret; ret = __xfrm4_udp_encap_rcv(sk, skb, true); if (!ret) return xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, udp_sk(sk)->encap_type); if (ret < 0) { kfree_skb(skb); return 0; } return ret; } EXPORT_SYMBOL(xfrm4_udp_encap_rcv); struct sk_buff *xfrm4_gro_udp_encap_rcv(struct sock *sk, struct list_head *head, struct sk_buff *skb) { int offset = skb_gro_offset(skb); const struct net_offload *ops; struct sk_buff *pp = NULL; int ret; offset = offset - sizeof(struct udphdr); if (!pskb_pull(skb, offset)) return NULL; rcu_read_lock(); ops = rcu_dereference(inet_offloads[IPPROTO_ESP]); if (!ops || !ops->callbacks.gro_receive) goto out; ret = __xfrm4_udp_encap_rcv(sk, skb, false); if (ret) goto out; skb_push(skb, offset); NAPI_GRO_CB(skb)->proto = IPPROTO_UDP; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); rcu_read_unlock(); return pp; out: rcu_read_unlock(); skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 1; return NULL; } EXPORT_SYMBOL(xfrm4_gro_udp_encap_rcv); int xfrm4_rcv(struct sk_buff *skb) { return xfrm4_rcv_spi(skb, ip_hdr(skb)->protocol, 0); } EXPORT_SYMBOL(xfrm4_rcv);
117 117 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 /* SPDX-License-Identifier: GPL-2.0-only */ /* * kernfs.h - pseudo filesystem decoupled from vfs locking */ #ifndef __LINUX_KERNFS_H #define __LINUX_KERNFS_H #include <linux/err.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/idr.h> #include <linux/lockdep.h> #include <linux/rbtree.h> #include <linux/atomic.h> #include <linux/bug.h> #include <linux/types.h> #include <linux/uidgid.h> #include <linux/wait.h> #include <linux/rwsem.h> #include <linux/cache.h> struct file; struct dentry; struct iattr; struct seq_file; struct vm_area_struct; struct vm_operations_struct; struct super_block; struct file_system_type; struct poll_table_struct; struct fs_context; struct kernfs_fs_context; struct kernfs_open_node; struct kernfs_iattrs; /* * NR_KERNFS_LOCK_BITS determines size (NR_KERNFS_LOCKS) of hash * table of locks. * Having a small hash table would impact scalability, since * more and more kernfs_node objects will end up using same lock * and having a very large hash table would waste memory. * * At the moment size of hash table of locks is being set based on * the number of CPUs as follows: * * NR_CPU NR_KERNFS_LOCK_BITS NR_KERNFS_LOCKS * 1 1 2 * 2-3 2 4 * 4-7 4 16 * 8-15 6 64 * 16-31 8 256 * 32 and more 10 1024 * * The above relation between NR_CPU and number of locks is based * on some internal experimentation which involved booting qemu * with different values of smp, performing some sysfs operations * on all CPUs and observing how increase in number of locks impacts * completion time of these sysfs operations on each CPU. */ #ifdef CONFIG_SMP #define NR_KERNFS_LOCK_BITS (2 * (ilog2(NR_CPUS < 32 ? NR_CPUS : 32))) #else #define NR_KERNFS_LOCK_BITS 1 #endif #define NR_KERNFS_LOCKS (1 << NR_KERNFS_LOCK_BITS) /* * There's one kernfs_open_file for each open file and one kernfs_open_node * for each kernfs_node with one or more open files. * * filp->private_data points to seq_file whose ->private points to * kernfs_open_file. * * kernfs_open_files are chained at kernfs_open_node->files, which is * protected by kernfs_global_locks.open_file_mutex[i]. * * To reduce possible contention in sysfs access, arising due to single * locks, use an array of locks (e.g. open_file_mutex) and use kernfs_node * object address as hash keys to get the index of these locks. * * Hashed mutexes are safe to use here because operations using these don't * rely on global exclusion. * * In future we intend to replace other global locks with hashed ones as well. * kernfs_global_locks acts as a holder for all such hash tables. */ struct kernfs_global_locks { struct mutex open_file_mutex[NR_KERNFS_LOCKS]; }; enum kernfs_node_type { KERNFS_DIR = 0x0001, KERNFS_FILE = 0x0002, KERNFS_LINK = 0x0004, }; #define KERNFS_TYPE_MASK 0x000f #define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK #define KERNFS_MAX_USER_XATTRS 128 #define KERNFS_USER_XATTR_SIZE_LIMIT (128 << 10) enum kernfs_node_flag { KERNFS_ACTIVATED = 0x0010, KERNFS_NS = 0x0020, KERNFS_HAS_SEQ_SHOW = 0x0040, KERNFS_HAS_MMAP = 0x0080, KERNFS_LOCKDEP = 0x0100, KERNFS_HIDDEN = 0x0200, KERNFS_SUICIDAL = 0x0400, KERNFS_SUICIDED = 0x0800, KERNFS_EMPTY_DIR = 0x1000, KERNFS_HAS_RELEASE = 0x2000, KERNFS_REMOVING = 0x4000, }; /* @flags for kernfs_create_root() */ enum kernfs_root_flag { /* * kernfs_nodes are created in the deactivated state and invisible. * They require explicit kernfs_activate() to become visible. This * can be used to make related nodes become visible atomically * after all nodes are created successfully. */ KERNFS_ROOT_CREATE_DEACTIVATED = 0x0001, /* * For regular files, if the opener has CAP_DAC_OVERRIDE, open(2) * succeeds regardless of the RW permissions. sysfs had an extra * layer of enforcement where open(2) fails with -EACCES regardless * of CAP_DAC_OVERRIDE if the permission doesn't have the * respective read or write access at all (none of S_IRUGO or * S_IWUGO) or the respective operation isn't implemented. The * following flag enables that behavior. */ KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK = 0x0002, /* * The filesystem supports exportfs operation, so userspace can use * fhandle to access nodes of the fs. */ KERNFS_ROOT_SUPPORT_EXPORTOP = 0x0004, /* * Support user xattrs to be written to nodes rooted at this root. */ KERNFS_ROOT_SUPPORT_USER_XATTR = 0x0008, }; /* type-specific structures for kernfs_node union members */ struct kernfs_elem_dir { unsigned long subdirs; /* children rbtree starts here and goes through kn->rb */ struct rb_root children; /* * The kernfs hierarchy this directory belongs to. This fits * better directly in kernfs_node but is here to save space. */ struct kernfs_root *root; /* * Monotonic revision counter, used to identify if a directory * node has changed during negative dentry revalidation. */ unsigned long rev; }; struct kernfs_elem_symlink { struct kernfs_node *target_kn; }; struct kernfs_elem_attr { const struct kernfs_ops *ops; struct kernfs_open_node __rcu *open; loff_t size; struct kernfs_node *notify_next; /* for kernfs_notify() */ }; /* * kernfs_node - the building block of kernfs hierarchy. Each and every * kernfs node is represented by single kernfs_node. Most fields are * private to kernfs and shouldn't be accessed directly by kernfs users. * * As long as count reference is held, the kernfs_node itself is * accessible. Dereferencing elem or any other outer entity requires * active reference. */ struct kernfs_node { atomic_t count; atomic_t active; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif /* * Use kernfs_get_parent() and kernfs_name/path() instead of * accessing the following two fields directly. If the node is * never moved to a different parent, it is safe to access the * parent directly. */ struct kernfs_node *parent; const char *name; struct rb_node rb; const void *ns; /* namespace tag */ unsigned int hash; /* ns + name hash */ unsigned short flags; umode_t mode; union { struct kernfs_elem_dir dir; struct kernfs_elem_symlink symlink; struct kernfs_elem_attr attr; }; /* * 64bit unique ID. On 64bit ino setups, id is the ino. On 32bit, * the low 32bits are ino and upper generation. */ u64 id; void *priv; struct kernfs_iattrs *iattr; struct rcu_head rcu; }; /* * kernfs_syscall_ops may be specified on kernfs_create_root() to support * syscalls. These optional callbacks are invoked on the matching syscalls * and can perform any kernfs operations which don't necessarily have to be * the exact operation requested. An active reference is held for each * kernfs_node parameter. */ struct kernfs_syscall_ops { int (*show_options)(struct seq_file *sf, struct kernfs_root *root); int (*mkdir)(struct kernfs_node *parent, const char *name, umode_t mode); int (*rmdir)(struct kernfs_node *kn); int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name); int (*show_path)(struct seq_file *sf, struct kernfs_node *kn, struct kernfs_root *root); }; struct kernfs_node *kernfs_root_to_node(struct kernfs_root *root); struct kernfs_open_file { /* published fields */ struct kernfs_node *kn; struct file *file; struct seq_file *seq_file; void *priv; /* private fields, do not use outside kernfs proper */ struct mutex mutex; struct mutex prealloc_mutex; int event; struct list_head list; char *prealloc_buf; size_t atomic_write_len; bool mmapped:1; bool released:1; const struct vm_operations_struct *vm_ops; }; struct kernfs_ops { /* * Optional open/release methods. Both are called with * @of->seq_file populated. */ int (*open)(struct kernfs_open_file *of); void (*release)(struct kernfs_open_file *of); /* * Read is handled by either seq_file or raw_read(). * * If seq_show() is present, seq_file path is active. Other seq * operations are optional and if not implemented, the behavior is * equivalent to single_open(). @sf->private points to the * associated kernfs_open_file. * * read() is bounced through kernel buffer and a read larger than * PAGE_SIZE results in partial operation of PAGE_SIZE. */ int (*seq_show)(struct seq_file *sf, void *v); void *(*seq_start)(struct seq_file *sf, loff_t *ppos); void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); void (*seq_stop)(struct seq_file *sf, void *v); ssize_t (*read)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off); /* * write() is bounced through kernel buffer. If atomic_write_len * is not set, a write larger than PAGE_SIZE results in partial * operations of PAGE_SIZE chunks. If atomic_write_len is set, * writes upto the specified size are executed atomically but * larger ones are rejected with -E2BIG. */ size_t atomic_write_len; /* * "prealloc" causes a buffer to be allocated at open for * all read/write requests. As ->seq_show uses seq_read() * which does its own allocation, it is incompatible with * ->prealloc. Provide ->read and ->write with ->prealloc. */ bool prealloc; ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off); __poll_t (*poll)(struct kernfs_open_file *of, struct poll_table_struct *pt); int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma); loff_t (*llseek)(struct kernfs_open_file *of, loff_t offset, int whence); }; /* * The kernfs superblock creation/mount parameter context. */ struct kernfs_fs_context { struct kernfs_root *root; /* Root of the hierarchy being mounted */ void *ns_tag; /* Namespace tag of the mount (or NULL) */ unsigned long magic; /* File system specific magic number */ /* The following are set/used by kernfs_mount() */ bool new_sb_created; /* Set to T if we allocated a new sb */ }; #ifdef CONFIG_KERNFS static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) { return kn->flags & KERNFS_TYPE_MASK; } static inline ino_t kernfs_id_ino(u64 id) { /* id is ino if ino_t is 64bit; otherwise, low 32bits */ if (sizeof(ino_t) >= sizeof(u64)) return id; else return (u32)id; } static inline u32 kernfs_id_gen(u64 id) { /* gen is fixed at 1 if ino_t is 64bit; otherwise, high 32bits */ if (sizeof(ino_t) >= sizeof(u64)) return 1; else return id >> 32; } static inline ino_t kernfs_ino(struct kernfs_node *kn) { return kernfs_id_ino(kn->id); } static inline ino_t kernfs_gen(struct kernfs_node *kn) { return kernfs_id_gen(kn->id); } /** * kernfs_enable_ns - enable namespace under a directory * @kn: directory of interest, should be empty * * This is to be called right after @kn is created to enable namespace * under it. All children of @kn must have non-NULL namespace tags and * only the ones which match the super_block's tag will be visible. */ static inline void kernfs_enable_ns(struct kernfs_node *kn) { WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR); WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->dir.children)); kn->flags |= KERNFS_NS; } /** * kernfs_ns_enabled - test whether namespace is enabled * @kn: the node to test * * Test whether namespace filtering is enabled for the children of @ns. */ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) { return kn->flags & KERNFS_NS; } int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn, char *buf, size_t buflen); void pr_cont_kernfs_name(struct kernfs_node *kn); void pr_cont_kernfs_path(struct kernfs_node *kn); struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn); struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns); struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns); void kernfs_get(struct kernfs_node *kn); void kernfs_put(struct kernfs_node *kn); struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry); struct kernfs_root *kernfs_root_from_sb(struct super_block *sb); struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); struct dentry *kernfs_node_dentry(struct kernfs_node *kn, struct super_block *sb); struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv); void kernfs_destroy_root(struct kernfs_root *root); struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, void *priv, const void *ns); struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, const char *name); struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key); struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target); void kernfs_activate(struct kernfs_node *kn); void kernfs_show(struct kernfs_node *kn, bool show); void kernfs_remove(struct kernfs_node *kn); void kernfs_break_active_protection(struct kernfs_node *kn); void kernfs_unbreak_active_protection(struct kernfs_node *kn); bool kernfs_remove_self(struct kernfs_node *kn); int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns); int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns); int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); __poll_t kernfs_generic_poll(struct kernfs_open_file *of, struct poll_table_struct *pt); void kernfs_notify(struct kernfs_node *kn); int kernfs_xattr_get(struct kernfs_node *kn, const char *name, void *value, size_t size); int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags); const void *kernfs_super_ns(struct super_block *sb); int kernfs_get_tree(struct fs_context *fc); void kernfs_free_fs_context(struct fs_context *fc); void kernfs_kill_sb(struct super_block *sb); void kernfs_init(void); struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, u64 id); #else /* CONFIG_KERNFS */ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) { return 0; } /* whatever */ static inline void kernfs_enable_ns(struct kernfs_node *kn) { } static inline bool kernfs_ns_enabled(struct kernfs_node *kn) { return false; } static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { return -ENOSYS; } static inline int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn, char *buf, size_t buflen) { return -ENOSYS; } static inline void pr_cont_kernfs_name(struct kernfs_node *kn) { } static inline void pr_cont_kernfs_path(struct kernfs_node *kn) { } static inline struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) { return NULL; } static inline struct kernfs_node * kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { return NULL; } static inline struct kernfs_node * kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns) { return NULL; } static inline void kernfs_get(struct kernfs_node *kn) { } static inline void kernfs_put(struct kernfs_node *kn) { } static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { return NULL; } static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) { return NULL; } static inline struct inode * kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { return NULL; } static inline struct kernfs_root * kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv) { return ERR_PTR(-ENOSYS); } static inline void kernfs_destroy_root(struct kernfs_root *root) { } static inline struct kernfs_node * kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, void *priv, const void *ns) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * __kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target) { return ERR_PTR(-ENOSYS); } static inline void kernfs_activate(struct kernfs_node *kn) { } static inline void kernfs_remove(struct kernfs_node *kn) { } static inline bool kernfs_remove_self(struct kernfs_node *kn) { return false; } static inline int kernfs_remove_by_name_ns(struct kernfs_node *kn, const char *name, const void *ns) { return -ENOSYS; } static inline int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns) { return -ENOSYS; } static inline int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { return -ENOSYS; } static inline __poll_t kernfs_generic_poll(struct kernfs_open_file *of, struct poll_table_struct *pt) { return -ENOSYS; } static inline void kernfs_notify(struct kernfs_node *kn) { } static inline int kernfs_xattr_get(struct kernfs_node *kn, const char *name, void *value, size_t size) { return -ENOSYS; } static inline int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags) { return -ENOSYS; } static inline const void *kernfs_super_ns(struct super_block *sb) { return NULL; } static inline int kernfs_get_tree(struct fs_context *fc) { return -ENOSYS; } static inline void kernfs_free_fs_context(struct fs_context *fc) { } static inline void kernfs_kill_sb(struct super_block *sb) { } static inline void kernfs_init(void) { } #endif /* CONFIG_KERNFS */ /** * kernfs_path - build full path of a given node * @kn: kernfs_node of interest * @buf: buffer to copy @kn's name into * @buflen: size of @buf * * If @kn is NULL result will be "(null)". * * Returns the length of the full path. If the full length is equal to or * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ static inline int kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) { return kernfs_path_from_node(kn, NULL, buf, buflen); } static inline struct kernfs_node * kernfs_find_and_get(struct kernfs_node *kn, const char *name) { return kernfs_find_and_get_ns(kn, name, NULL); } static inline struct kernfs_node * kernfs_walk_and_get(struct kernfs_node *kn, const char *path) { return kernfs_walk_and_get_ns(kn, path, NULL); } static inline struct kernfs_node * kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode, void *priv) { return kernfs_create_dir_ns(parent, name, mode, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, priv, NULL); } static inline int kernfs_remove_by_name(struct kernfs_node *parent, const char *name) { return kernfs_remove_by_name_ns(parent, name, NULL); } static inline int kernfs_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name) { return kernfs_rename_ns(kn, new_parent, new_name, NULL); } #endif /* __LINUX_KERNFS_H */
1 1 1 1 1 1 1 3 3 2 2 2 17 16 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 /* gf128mul.c - GF(2^128) multiplication functions * * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. * Copyright (c) 2006, Rik Snel <rsnel@cube.dyndns.org> * * Based on Dr Brian Gladman's (GPL'd) work published at * http://gladman.plushost.co.uk/oldsite/cryptography_technology/index.php * See the original copyright notice below. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. */ /* --------------------------------------------------------------------------- Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. LICENSE TERMS The free distribution and use of this software in both source and binary form is allowed (with or without changes) provided that: 1. distributions of this source code include the above copyright notice, this list of conditions and the following disclaimer; 2. distributions in binary form include the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other associated materials; 3. the copyright holder's name is not used to endorse products built using this software without specific written permission. ALTERNATIVELY, provided that this notice is retained in full, this product may be distributed under the terms of the GNU General Public License (GPL), in which case the provisions of the GPL apply INSTEAD OF those given above. DISCLAIMER This software is provided 'as is' with no explicit or implied warranties in respect of its properties, including, but not limited to, correctness and/or fitness for purpose. --------------------------------------------------------------------------- Issue 31/01/2006 This file provides fast multiplication in GF(2^128) as required by several cryptographic authentication modes */ #include <crypto/gf128mul.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #define gf128mul_dat(q) { \ q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\ q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\ q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\ q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\ q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\ q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\ q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\ q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\ q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\ q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\ q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\ q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\ q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\ q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\ q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\ q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\ q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\ q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\ q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\ q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\ q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\ q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\ q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\ q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\ q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\ q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\ q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\ q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\ q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\ q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\ q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\ q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \ } /* * Given a value i in 0..255 as the byte overflow when a field element * in GF(2^128) is multiplied by x^8, the following macro returns the * 16-bit value that must be XOR-ed into the low-degree end of the * product to reduce it modulo the polynomial x^128 + x^7 + x^2 + x + 1. * * There are two versions of the macro, and hence two tables: one for * the "be" convention where the highest-order bit is the coefficient of * the highest-degree polynomial term, and one for the "le" convention * where the highest-order bit is the coefficient of the lowest-degree * polynomial term. In both cases the values are stored in CPU byte * endianness such that the coefficients are ordered consistently across * bytes, i.e. in the "be" table bits 15..0 of the stored value * correspond to the coefficients of x^15..x^0, and in the "le" table * bits 15..0 correspond to the coefficients of x^0..x^15. * * Therefore, provided that the appropriate byte endianness conversions * are done by the multiplication functions (and these must be in place * anyway to support both little endian and big endian CPUs), the "be" * table can be used for multiplications of both "bbe" and "ble" * elements, and the "le" table can be used for multiplications of both * "lle" and "lbe" elements. */ #define xda_be(i) ( \ (i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \ (i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \ (i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \ (i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \ ) #define xda_le(i) ( \ (i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \ (i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \ (i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \ (i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 0x01c2 : 0) \ ) static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le); static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be); /* * The following functions multiply a field element by x^8 in * the polynomial field representation. They use 64-bit word operations * to gain speed but compensate for machine endianness and hence work * correctly on both styles of machine. */ static void gf128mul_x8_lle(be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); u64 _tt = gf128mul_table_le[b & 0xff]; x->b = cpu_to_be64((b >> 8) | (a << 56)); x->a = cpu_to_be64((a >> 8) ^ (_tt << 48)); } /* time invariant version of gf128mul_x8_lle */ static void gf128mul_x8_lle_ti(be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); u64 _tt = xda_le(b & 0xff); /* avoid table lookup */ x->b = cpu_to_be64((b >> 8) | (a << 56)); x->a = cpu_to_be64((a >> 8) ^ (_tt << 48)); } static void gf128mul_x8_bbe(be128 *x) { u64 a = be64_to_cpu(x->a); u64 b = be64_to_cpu(x->b); u64 _tt = gf128mul_table_be[a >> 56]; x->a = cpu_to_be64((a << 8) | (b >> 56)); x->b = cpu_to_be64((b << 8) ^ _tt); } void gf128mul_x8_ble(le128 *r, const le128 *x) { u64 a = le64_to_cpu(x->a); u64 b = le64_to_cpu(x->b); u64 _tt = gf128mul_table_be[a >> 56]; r->a = cpu_to_le64((a << 8) | (b >> 56)); r->b = cpu_to_le64((b << 8) ^ _tt); } EXPORT_SYMBOL(gf128mul_x8_ble); void gf128mul_lle(be128 *r, const be128 *b) { /* * The p array should be aligned to twice the size of its element type, * so that every even/odd pair is guaranteed to share a cacheline * (assuming a cacheline size of 32 bytes or more, which is by far the * most common). This ensures that each be128_xor() call in the loop * takes the same amount of time regardless of the value of 'ch', which * is derived from function parameter 'b', which is commonly used as a * key, e.g., for GHASH. The odd array elements are all set to zero, * making each be128_xor() a NOP if its associated bit in 'ch' is not * set, and this is equivalent to calling be128_xor() conditionally. * This approach aims to avoid leaking information about such keys * through execution time variances. * * Unfortunately, __aligned(16) or higher does not work on x86 for * variables on the stack so we need to perform the alignment by hand. */ be128 array[16 + 3] = {}; be128 *p = PTR_ALIGN(&array[0], 2 * sizeof(be128)); int i; p[0] = *r; for (i = 0; i < 7; ++i) gf128mul_x_lle(&p[2 * i + 2], &p[2 * i]); memset(r, 0, sizeof(*r)); for (i = 0;;) { u8 ch = ((u8 *)b)[15 - i]; be128_xor(r, r, &p[ 0 + !(ch & 0x80)]); be128_xor(r, r, &p[ 2 + !(ch & 0x40)]); be128_xor(r, r, &p[ 4 + !(ch & 0x20)]); be128_xor(r, r, &p[ 6 + !(ch & 0x10)]); be128_xor(r, r, &p[ 8 + !(ch & 0x08)]); be128_xor(r, r, &p[10 + !(ch & 0x04)]); be128_xor(r, r, &p[12 + !(ch & 0x02)]); be128_xor(r, r, &p[14 + !(ch & 0x01)]); if (++i >= 16) break; gf128mul_x8_lle_ti(r); /* use the time invariant version */ } } EXPORT_SYMBOL(gf128mul_lle); /* This version uses 64k bytes of table space. A 16 byte buffer has to be multiplied by a 16 byte key value in GF(2^128). If we consider a GF(2^128) value in the buffer's lowest byte, we can construct a table of the 256 16 byte values that result from the 256 values of this byte. This requires 4096 bytes. But we also need tables for each of the 16 higher bytes in the buffer as well, which makes 64 kbytes in total. */ /* additional explanation * t[0][BYTE] contains g*BYTE * t[1][BYTE] contains g*x^8*BYTE * .. * t[15][BYTE] contains g*x^120*BYTE */ struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g) { struct gf128mul_64k *t; int i, j, k; t = kzalloc(sizeof(*t), GFP_KERNEL); if (!t) goto out; for (i = 0; i < 16; i++) { t->t[i] = kzalloc(sizeof(*t->t[i]), GFP_KERNEL); if (!t->t[i]) { gf128mul_free_64k(t); t = NULL; goto out; } } t->t[0]->t[1] = *g; for (j = 1; j <= 64; j <<= 1) gf128mul_x_bbe(&t->t[0]->t[j + j], &t->t[0]->t[j]); for (i = 0;;) { for (j = 2; j < 256; j += j) for (k = 1; k < j; ++k) be128_xor(&t->t[i]->t[j + k], &t->t[i]->t[j], &t->t[i]->t[k]); if (++i >= 16) break; for (j = 128; j > 0; j >>= 1) { t->t[i]->t[j] = t->t[i - 1]->t[j]; gf128mul_x8_bbe(&t->t[i]->t[j]); } } out: return t; } EXPORT_SYMBOL(gf128mul_init_64k_bbe); void gf128mul_free_64k(struct gf128mul_64k *t) { int i; for (i = 0; i < 16; i++) kfree_sensitive(t->t[i]); kfree_sensitive(t); } EXPORT_SYMBOL(gf128mul_free_64k); void gf128mul_64k_bbe(be128 *a, const struct gf128mul_64k *t) { u8 *ap = (u8 *)a; be128 r[1]; int i; *r = t->t[0]->t[ap[15]]; for (i = 1; i < 16; ++i) be128_xor(r, r, &t->t[i]->t[ap[15 - i]]); *a = *r; } EXPORT_SYMBOL(gf128mul_64k_bbe); /* This version uses 4k bytes of table space. A 16 byte buffer has to be multiplied by a 16 byte key value in GF(2^128). If we consider a GF(2^128) value in a single byte, we can construct a table of the 256 16 byte values that result from the 256 values of this byte. This requires 4096 bytes. If we take the highest byte in the buffer and use this table to get the result, we then have to multiply by x^120 to get the final value. For the next highest byte the result has to be multiplied by x^112 and so on. But we can do this by accumulating the result in an accumulator starting with the result for the top byte. We repeatedly multiply the accumulator value by x^8 and then add in (i.e. xor) the 16 bytes of the next lower byte in the buffer, stopping when we reach the lowest byte. This requires a 4096 byte table. */ struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g) { struct gf128mul_4k *t; int j, k; t = kzalloc(sizeof(*t), GFP_KERNEL); if (!t) goto out; t->t[128] = *g; for (j = 64; j > 0; j >>= 1) gf128mul_x_lle(&t->t[j], &t->t[j+j]); for (j = 2; j < 256; j += j) for (k = 1; k < j; ++k) be128_xor(&t->t[j + k], &t->t[j], &t->t[k]); out: return t; } EXPORT_SYMBOL(gf128mul_init_4k_lle); void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t) { u8 *ap = (u8 *)a; be128 r[1]; int i = 15; *r = t->t[ap[15]]; while (i--) { gf128mul_x8_lle(r); be128_xor(r, r, &t->t[ap[i]]); } *a = *r; } EXPORT_SYMBOL(gf128mul_4k_lle); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");
33 33 33 33 33 33 42 24 24 24 11 11 11 20 6 20 4 16 1 17 6 38 39 39 2 10 11 21 4 4 4 17 17 17 1 15 2 3 8 5 3 2 24 24 14 9 1 1 24 3 24 24 27 27 15 11 24 24 16 7 3 3 3 9 5 109 109 103 107 6 4 7 2 12 14 13 13 41 30 1 2 4 1 28 23 2 5 22 11 4 1 1 1 110 105 105 105 105 9 8 1 5 2 5 2 5 2 5 2 5 2 6 1 8 5 3 1 1 1 1 189 221 220 2 7 1 3 85 85 85 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPv6 virtual tunneling interface * * Copyright (C) 2013 secunet Security Networks AG * * Author: * Steffen Klassert <steffen.klassert@secunet.com> * * Based on: * net/ipv6/ip6_tunnel.c */ #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/ip6_tunnel.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <linux/etherdevice.h> #define IP6_VTI_HASH_SIZE_SHIFT 5 #define IP6_VTI_HASH_SIZE (1 << IP6_VTI_HASH_SIZE_SHIFT) static u32 HASH(const struct in6_addr *addr1, const struct in6_addr *addr2) { u32 hash = ipv6_addr_hash(addr1) ^ ipv6_addr_hash(addr2); return hash_32(hash, IP6_VTI_HASH_SIZE_SHIFT); } static int vti6_dev_init(struct net_device *dev); static void vti6_dev_setup(struct net_device *dev); static struct rtnl_link_ops vti6_link_ops __read_mostly; static unsigned int vti6_net_id __read_mostly; struct vti6_net { /* the vti6 tunnel fallback device */ struct net_device *fb_tnl_dev; /* lists for storing tunnels in use */ struct ip6_tnl __rcu *tnls_r_l[IP6_VTI_HASH_SIZE]; struct ip6_tnl __rcu *tnls_wc[1]; struct ip6_tnl __rcu **tnls[2]; }; #define for_each_vti6_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) /** * vti6_tnl_lookup - fetch tunnel matching the end-point addresses * @net: network namespace * @remote: the address of the tunnel exit-point * @local: the address of the tunnel entry-point * * Return: * tunnel matching given end-points if found, * else fallback tunnel if its device is up, * else %NULL **/ static struct ip6_tnl * vti6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local) { unsigned int hash = HASH(remote, local); struct ip6_tnl *t; struct vti6_net *ip6n = net_generic(net, vti6_net_id); struct in6_addr any; for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr) && (t->dev->flags & IFF_UP)) return t; } memset(&any, 0, sizeof(any)); hash = HASH(&any, local); for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (ipv6_addr_equal(local, &t->parms.laddr) && (t->dev->flags & IFF_UP)) return t; } hash = HASH(remote, &any); for_each_vti6_tunnel_rcu(ip6n->tnls_r_l[hash]) { if (ipv6_addr_equal(remote, &t->parms.raddr) && (t->dev->flags & IFF_UP)) return t; } t = rcu_dereference(ip6n->tnls_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; return NULL; } /** * vti6_tnl_bucket - get head of list matching given tunnel parameters * @ip6n: the private data for ip6_vti in the netns * @p: parameters containing tunnel end-points * * Description: * vti6_tnl_bucket() returns the head of the list matching the * &struct in6_addr entries laddr and raddr in @p. * * Return: head of IPv6 tunnel list **/ static struct ip6_tnl __rcu ** vti6_tnl_bucket(struct vti6_net *ip6n, const struct __ip6_tnl_parm *p) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; unsigned int h = 0; int prio = 0; if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { prio = 1; h = HASH(remote, local); } return &ip6n->tnls[prio][h]; } static void vti6_tnl_link(struct vti6_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = vti6_tnl_bucket(ip6n, &t->parms); rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } static void vti6_tnl_unlink(struct vti6_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp; struct ip6_tnl *iter; for (tp = vti6_tnl_bucket(ip6n, &t->parms); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { rcu_assign_pointer(*tp, t->next); break; } } } static int vti6_tnl_create2(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); int err; dev->rtnl_link_ops = &vti6_link_ops; err = register_netdevice(dev); if (err < 0) goto out; strcpy(t->parms.name, dev->name); vti6_tnl_link(ip6n, t); return 0; out: return err; } static struct ip6_tnl *vti6_tnl_create(struct net *net, struct __ip6_tnl_parm *p) { struct net_device *dev; struct ip6_tnl *t; char name[IFNAMSIZ]; int err; if (p->name[0]) { if (!dev_valid_name(p->name)) goto failed; strscpy(name, p->name, IFNAMSIZ); } else { sprintf(name, "ip6_vti%%d"); } dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, vti6_dev_setup); if (!dev) goto failed; dev_net_set(dev, net); t = netdev_priv(dev); t->parms = *p; t->net = dev_net(dev); err = vti6_tnl_create2(dev); if (err < 0) goto failed_free; return t; failed_free: free_netdev(dev); failed: return NULL; } /** * vti6_locate - find or create tunnel matching given parameters * @net: network namespace * @p: tunnel parameters * @create: != 0 if allowed to create new tunnel if no match found * * Description: * vti6_locate() first tries to locate an existing tunnel * based on @parms. If this is unsuccessful, but @create is set a new * tunnel device is created and registered for use. * * Return: * matching tunnel or NULL **/ static struct ip6_tnl *vti6_locate(struct net *net, struct __ip6_tnl_parm *p, int create) { const struct in6_addr *remote = &p->raddr; const struct in6_addr *local = &p->laddr; struct ip6_tnl __rcu **tp; struct ip6_tnl *t; struct vti6_net *ip6n = net_generic(net, vti6_net_id); for (tp = vti6_tnl_bucket(ip6n, p); (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr)) { if (create) return NULL; return t; } } if (!create) return NULL; return vti6_tnl_create(net, p); } /** * vti6_dev_uninit - tunnel device uninitializer * @dev: the device to be destroyed * * Description: * vti6_dev_uninit() removes tunnel from its list **/ static void vti6_dev_uninit(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct vti6_net *ip6n = net_generic(t->net, vti6_net_id); if (dev == ip6n->fb_tnl_dev) RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL); else vti6_tnl_unlink(ip6n, t); netdev_put(dev, &t->dev_tracker); } static int vti6_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { struct ip6_tnl *t; const struct ipv6hdr *ipv6h = ipv6_hdr(skb); rcu_read_lock(); t = vti6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr); if (t) { if (t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) { rcu_read_unlock(); goto discard; } if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { rcu_read_unlock(); goto discard; } ipv6h = ipv6_hdr(skb); if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) { DEV_STATS_INC(t->dev, rx_dropped); rcu_read_unlock(); goto discard; } rcu_read_unlock(); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); return xfrm_input(skb, nexthdr, spi, encap_type); } rcu_read_unlock(); return -EINVAL; discard: kfree_skb(skb); return 0; } static int vti6_rcv(struct sk_buff *skb) { int nexthdr = skb_network_header(skb)[IP6CB(skb)->nhoff]; return vti6_input_proto(skb, nexthdr, 0, 0); } static int vti6_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; struct net_device *dev; struct xfrm_state *x; const struct xfrm_mode *inner_mode; struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6; u32 orig_mark = skb->mark; int ret; if (!t) return 1; dev = t->dev; if (err) { DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_dropped); return 0; } x = xfrm_input_state(skb); inner_mode = &x->inner_mode; if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); if (inner_mode == NULL) { XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINSTATEMODEERROR); return -EINVAL; } } family = inner_mode->family; skb->mark = be32_to_cpu(t->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); skb->mark = orig_mark; if (!ret) return -EPERM; skb_scrub_packet(skb, !net_eq(t->net, dev_net(skb->dev))); skb->dev = dev; dev_sw_netstats_rx_add(dev, skb->len); return 0; } /** * vti6_addr_conflict - compare packet addresses to tunnel's own * @t: the outgoing tunnel device * @hdr: IPv6 header from the incoming packet * * Description: * Avoid trivial tunneling loop by checking that tunnel exit-point * doesn't match source of incoming packet. * * Return: * 1 if conflict, * 0 else **/ static inline bool vti6_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr) { return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); } static bool vti6_state_check(const struct xfrm_state *x, const struct in6_addr *dst, const struct in6_addr *src) { xfrm_address_t *daddr = (xfrm_address_t *)dst; xfrm_address_t *saddr = (xfrm_address_t *)src; /* if there is no transform then this tunnel is not functional. * Or if the xfrm is not mode tunnel. */ if (!x || x->props.mode != XFRM_MODE_TUNNEL || x->props.family != AF_INET6) return false; if (ipv6_addr_any(dst)) return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET6); if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET6)) return false; return true; } /** * vti6_xmit - send a packet * @skb: the outgoing socket buffer * @dev: the outgoing tunnel device * @fl: the flow informations for the xfrm_lookup **/ static int vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct ip6_tnl *t = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; struct xfrm_state *x; int pkt_len = skb->len; int err = -1; int mtu; if (!dst) { switch (skb->protocol) { case htons(ETH_P_IP): { struct rtable *rt; fl->u.ip4.flowi4_oif = dev->ifindex; fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4); if (IS_ERR(rt)) goto tx_err_link_failure; dst = &rt->dst; skb_dst_set(skb, dst); break; } case htons(ETH_P_IPV6): fl->u.ip6.flowi6_oif = dev->ifindex; fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6); if (dst->error) { dst_release(dst); dst = NULL; goto tx_err_link_failure; } skb_dst_set(skb, dst); break; default: goto tx_err_link_failure; } } dst_hold(dst); dst = xfrm_lookup_route(t->net, dst, fl, NULL, 0); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } if (dst->flags & DST_XFRM_QUEUE) goto xmit; x = dst->xfrm; if (!vti6_state_check(x, &t->parms.raddr, &t->parms.laddr)) goto tx_err_link_failure; if (!ip6_tnl_xmit_ctl(t, (const struct in6_addr *)&x->props.saddr, (const struct in6_addr *)&x->id.daddr)) goto tx_err_link_failure; tdev = dst->dev; if (tdev == dev) { DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", t->parms.name); goto tx_err_dst_release; } mtu = dst_mtu(dst); if (skb->len > mtu) { skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); } else { if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) goto xmit; icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); } err = -EMSGSIZE; goto tx_err_dst_release; } xmit: skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; err = dst_output(t->net, skb->sk, skb); if (net_xmit_eval(err) == 0) err = pkt_len; iptunnel_xmit_stats(dev, err); return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } static netdev_tx_t vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct flowi fl; int ret; if (!pskb_inet_may_pull(skb)) goto tx_err; memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IPV6): if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) || vti6_addr_conflict(t, ipv6_hdr(skb))) goto tx_err; memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6); break; case htons(ETH_P_IP): memset(IPCB(skb), 0, sizeof(*IPCB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET); break; default: goto tx_err; } /* override mark with tunnel output key */ fl.flowi_mark = be32_to_cpu(t->parms.o_key); ret = vti6_xmit(skb, dev, &fl); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static int vti6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { __be32 spi; __u32 mark; struct xfrm_state *x; struct ip6_tnl *t; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah; struct ip_comp_hdr *ipch; struct net *net = dev_net(skb->dev); const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; int protocol = iph->nexthdr; t = vti6_tnl_lookup(dev_net(skb->dev), &iph->daddr, &iph->saddr); if (!t) return -1; mark = be32_to_cpu(t->parms.o_key); switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data + offset); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data + offset); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data + offset); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } if (type != ICMPV6_PKT_TOOBIG && type != NDISC_REDIRECT) return 0; x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET6); if (!x) return 0; if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); else ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); xfrm_state_put(x); return 0; } static void vti6_link_config(struct ip6_tnl *t, bool keep_mtu) { struct net_device *dev = t->dev; struct __ip6_tnl_parm *p = &t->parms; struct net_device *tdev = NULL; int mtu; __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); p->flags &= ~(IP6_TNL_F_CAP_XMIT | IP6_TNL_F_CAP_RCV | IP6_TNL_F_CAP_PER_PACKET); p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr); if (p->flags & IP6_TNL_F_CAP_XMIT && p->flags & IP6_TNL_F_CAP_RCV) dev->flags |= IFF_POINTOPOINT; else dev->flags &= ~IFF_POINTOPOINT; if (keep_mtu && dev->mtu) { WRITE_ONCE(dev->mtu, clamp(dev->mtu, dev->min_mtu, dev->max_mtu)); return; } if (p->flags & IP6_TNL_F_CAP_XMIT) { int strict = (ipv6_addr_type(&p->raddr) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)); struct rt6_info *rt = rt6_lookup(t->net, &p->raddr, &p->laddr, p->link, NULL, strict); if (rt) tdev = rt->dst.dev; ip6_rt_put(rt); } if (!tdev && p->link) tdev = __dev_get_by_index(t->net, p->link); if (tdev) mtu = tdev->mtu - sizeof(struct ipv6hdr); else mtu = ETH_DATA_LEN - LL_MAX_HEADER - sizeof(struct ipv6hdr); dev->mtu = max_t(int, mtu, IPV4_MIN_MTU); } /** * vti6_tnl_change - update the tunnel parameters * @t: tunnel to be changed * @p: tunnel configuration parameters * @keep_mtu: MTU was set from userspace, don't re-compute it * * Description: * vti6_tnl_change() updates the tunnel parameters **/ static int vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p, bool keep_mtu) { t->parms.laddr = p->laddr; t->parms.raddr = p->raddr; t->parms.link = p->link; t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; t->parms.proto = p->proto; t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); vti6_link_config(t, keep_mtu); return 0; } static int vti6_update(struct ip6_tnl *t, struct __ip6_tnl_parm *p, bool keep_mtu) { struct net *net = dev_net(t->dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); int err; vti6_tnl_unlink(ip6n, t); synchronize_net(); err = vti6_tnl_change(t, p, keep_mtu); vti6_tnl_link(ip6n, t); netdev_state_change(t->dev); return err; } static void vti6_parm_from_user(struct __ip6_tnl_parm *p, const struct ip6_tnl_parm2 *u) { p->laddr = u->laddr; p->raddr = u->raddr; p->link = u->link; p->i_key = u->i_key; p->o_key = u->o_key; p->proto = u->proto; memcpy(p->name, u->name, sizeof(u->name)); } static void vti6_parm_to_user(struct ip6_tnl_parm2 *u, const struct __ip6_tnl_parm *p) { u->laddr = p->laddr; u->raddr = p->raddr; u->link = p->link; u->i_key = p->i_key; u->o_key = p->o_key; if (u->i_key) u->i_flags |= GRE_KEY; if (u->o_key) u->o_flags |= GRE_KEY; u->proto = p->proto; memcpy(u->name, p->name, sizeof(u->name)); } /** * vti6_siocdevprivate - configure vti6 tunnels from userspace * @dev: virtual device associated with tunnel * @ifr: unused * @data: parameters passed from userspace * @cmd: command to be performed * * Description: * vti6_siocdevprivate() is used for managing vti6 tunnels * from userspace. * * The possible commands are the following: * %SIOCGETTUNNEL: get tunnel parameters for device * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters * %SIOCCHGTUNNEL: change tunnel parameters to those given * %SIOCDELTUNNEL: delete tunnel * * The fallback device "ip6_vti0", created during module * initialization, can be used for creating other tunnel devices. * * Return: * 0 on success, * %-EFAULT if unable to copy data to or from userspace, * %-EPERM if current process hasn't %CAP_NET_ADMIN set * %-EINVAL if passed tunnel parameters are invalid, * %-EEXIST if changing a tunnel's parameters would cause a conflict * %-ENODEV if attempting to change or delete a nonexisting device **/ static int vti6_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { int err = 0; struct ip6_tnl_parm2 p; struct __ip6_tnl_parm p1; struct ip6_tnl *t = NULL; struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); memset(&p1, 0, sizeof(p1)); switch (cmd) { case SIOCGETTUNNEL: if (dev == ip6n->fb_tnl_dev) { if (copy_from_user(&p, data, sizeof(p))) { err = -EFAULT; break; } vti6_parm_from_user(&p1, &p); t = vti6_locate(net, &p1, 0); } else { memset(&p, 0, sizeof(p)); } if (!t) t = netdev_priv(dev); vti6_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; break; case SIOCADDTUNNEL: case SIOCCHGTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -EINVAL; if (p.proto != IPPROTO_IPV6 && p.proto != 0) break; vti6_parm_from_user(&p1, &p); t = vti6_locate(net, &p1, cmd == SIOCADDTUNNEL); if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) { if (t) { if (t->dev != dev) { err = -EEXIST; break; } } else t = netdev_priv(dev); err = vti6_update(t, &p1, false); } if (t) { err = 0; vti6_parm_to_user(&p, &t->parms); if (copy_to_user(data, &p, sizeof(p))) err = -EFAULT; } else err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT); break; case SIOCDELTUNNEL: err = -EPERM; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) break; if (dev == ip6n->fb_tnl_dev) { err = -EFAULT; if (copy_from_user(&p, data, sizeof(p))) break; err = -ENOENT; vti6_parm_from_user(&p1, &p); t = vti6_locate(net, &p1, 0); if (!t) break; err = -EPERM; if (t->dev == ip6n->fb_tnl_dev) break; dev = t->dev; } err = 0; unregister_netdevice(dev); break; default: err = -EINVAL; } return err; } static const struct net_device_ops vti6_netdev_ops = { .ndo_init = vti6_dev_init, .ndo_uninit = vti6_dev_uninit, .ndo_start_xmit = vti6_tnl_xmit, .ndo_siocdevprivate = vti6_siocdevprivate, .ndo_get_iflink = ip6_tnl_get_iflink, }; /** * vti6_dev_setup - setup virtual tunnel device * @dev: virtual device associated with tunnel * * Description: * Initialize function pointers and device parameters **/ static void vti6_dev_setup(struct net_device *dev) { dev->netdev_ops = &vti6_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->type = ARPHRD_TUNNEL6; dev->min_mtu = IPV4_MIN_MTU; dev->max_mtu = IP_MAX_MTU - sizeof(struct ipv6hdr); dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); netif_keep_dst(dev); /* This perm addr will be used as interface identifier by IPv6 */ dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->perm_addr); } /** * vti6_dev_init_gen - general initializer for all tunnel devices * @dev: virtual device associated with tunnel **/ static inline int vti6_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); t->dev = dev; t->net = dev_net(dev); netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); netdev_lockdep_set_classes(dev); return 0; } /** * vti6_dev_init - initializer for all non fallback tunnel devices * @dev: virtual device associated with tunnel **/ static int vti6_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); int err = vti6_dev_init_gen(dev); if (err) return err; vti6_link_config(t, true); return 0; } /** * vti6_fb_tnl_dev_init - initializer for fallback tunnel device * @dev: fallback device * * Return: 0 **/ static int __net_init vti6_fb_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); t->parms.proto = IPPROTO_IPV6; rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; } static int vti6_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { return 0; } static void vti6_netlink_parms(struct nlattr *data[], struct __ip6_tnl_parm *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_VTI_LINK]) parms->link = nla_get_u32(data[IFLA_VTI_LINK]); if (data[IFLA_VTI_LOCAL]) parms->laddr = nla_get_in6_addr(data[IFLA_VTI_LOCAL]); if (data[IFLA_VTI_REMOTE]) parms->raddr = nla_get_in6_addr(data[IFLA_VTI_REMOTE]); if (data[IFLA_VTI_IKEY]) parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]); if (data[IFLA_VTI_OKEY]) parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]); if (data[IFLA_VTI_FWMARK]) parms->fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } static int vti6_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); struct ip6_tnl *nt; nt = netdev_priv(dev); vti6_netlink_parms(data, &nt->parms); nt->parms.proto = IPPROTO_IPV6; if (vti6_locate(net, &nt->parms, 0)) return -EEXIST; return vti6_tnl_create2(dev); } static void vti6_dellink(struct net_device *dev, struct list_head *head) { struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); if (dev != ip6n->fb_tnl_dev) unregister_netdevice_queue(dev, head); } static int vti6_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip6_tnl *t; struct __ip6_tnl_parm p; struct net *net = dev_net(dev); struct vti6_net *ip6n = net_generic(net, vti6_net_id); if (dev == ip6n->fb_tnl_dev) return -EINVAL; vti6_netlink_parms(data, &p); t = vti6_locate(net, &p, 0); if (t) { if (t->dev != dev) return -EEXIST; } else t = netdev_priv(dev); return vti6_update(t, &p, tb && tb[IFLA_MTU]); } static size_t vti6_get_size(const struct net_device *dev) { return /* IFLA_VTI_LINK */ nla_total_size(4) + /* IFLA_VTI_LOCAL */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VTI_REMOTE */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VTI_IKEY */ nla_total_size(4) + /* IFLA_VTI_OKEY */ nla_total_size(4) + /* IFLA_VTI_FWMARK */ nla_total_size(4) + 0; } static int vti6_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip6_tnl *tunnel = netdev_priv(dev); struct __ip6_tnl_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_VTI_LINK, parm->link) || nla_put_in6_addr(skb, IFLA_VTI_LOCAL, &parm->laddr) || nla_put_in6_addr(skb, IFLA_VTI_REMOTE, &parm->raddr) || nla_put_be32(skb, IFLA_VTI_IKEY, parm->i_key) || nla_put_be32(skb, IFLA_VTI_OKEY, parm->o_key) || nla_put_u32(skb, IFLA_VTI_FWMARK, parm->fwmark)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static const struct nla_policy vti6_policy[IFLA_VTI_MAX + 1] = { [IFLA_VTI_LINK] = { .type = NLA_U32 }, [IFLA_VTI_LOCAL] = { .len = sizeof(struct in6_addr) }, [IFLA_VTI_REMOTE] = { .len = sizeof(struct in6_addr) }, [IFLA_VTI_IKEY] = { .type = NLA_U32 }, [IFLA_VTI_OKEY] = { .type = NLA_U32 }, [IFLA_VTI_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops vti6_link_ops __read_mostly = { .kind = "vti6", .maxtype = IFLA_VTI_MAX, .policy = vti6_policy, .priv_size = sizeof(struct ip6_tnl), .setup = vti6_dev_setup, .validate = vti6_validate, .newlink = vti6_newlink, .dellink = vti6_dellink, .changelink = vti6_changelink, .get_size = vti6_get_size, .fill_info = vti6_fill_info, .get_link_net = ip6_tnl_get_link_net, }; static void __net_exit vti6_destroy_tunnels(struct vti6_net *ip6n, struct list_head *list) { int h; struct ip6_tnl *t; for (h = 0; h < IP6_VTI_HASH_SIZE; h++) { t = rtnl_dereference(ip6n->tnls_r_l[h]); while (t) { unregister_netdevice_queue(t->dev, list); t = rtnl_dereference(t->next); } } t = rtnl_dereference(ip6n->tnls_wc[0]); if (t) unregister_netdevice_queue(t->dev, list); } static int __net_init vti6_init_net(struct net *net) { struct vti6_net *ip6n = net_generic(net, vti6_net_id); struct ip6_tnl *t = NULL; int err; ip6n->tnls[0] = ip6n->tnls_wc; ip6n->tnls[1] = ip6n->tnls_r_l; if (!net_has_fallback_tunnels(net)) return 0; err = -ENOMEM; ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6_vti0", NET_NAME_UNKNOWN, vti6_dev_setup); if (!ip6n->fb_tnl_dev) goto err_alloc_dev; dev_net_set(ip6n->fb_tnl_dev, net); ip6n->fb_tnl_dev->rtnl_link_ops = &vti6_link_ops; err = vti6_fb_tnl_dev_init(ip6n->fb_tnl_dev); if (err < 0) goto err_register; err = register_netdev(ip6n->fb_tnl_dev); if (err < 0) goto err_register; t = netdev_priv(ip6n->fb_tnl_dev); strcpy(t->parms.name, ip6n->fb_tnl_dev->name); return 0; err_register: free_netdev(ip6n->fb_tnl_dev); err_alloc_dev: return err; } static void __net_exit vti6_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_to_kill) { struct vti6_net *ip6n; struct net *net; ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) { ip6n = net_generic(net, vti6_net_id); vti6_destroy_tunnels(ip6n, dev_to_kill); } } static struct pernet_operations vti6_net_ops = { .init = vti6_init_net, .exit_batch_rtnl = vti6_exit_batch_rtnl, .id = &vti6_net_id, .size = sizeof(struct vti6_net), }; static struct xfrm6_protocol vti_esp6_protocol __read_mostly = { .handler = vti6_rcv, .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, }; static struct xfrm6_protocol vti_ah6_protocol __read_mostly = { .handler = vti6_rcv, .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, }; static struct xfrm6_protocol vti_ipcomp6_protocol __read_mostly = { .handler = vti6_rcv, .input_handler = vti6_input_proto, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 100, }; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) static int vti6_rcv_tunnel(struct sk_buff *skb) { const xfrm_address_t *saddr; __be32 spi; saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr; spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr); return vti6_input_proto(skb, IPPROTO_IPV6, spi, 0); } static struct xfrm6_tunnel vti_ipv6_handler __read_mostly = { .handler = vti6_rcv_tunnel, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 0, }; static struct xfrm6_tunnel vti_ip6ip_handler __read_mostly = { .handler = vti6_rcv_tunnel, .cb_handler = vti6_rcv_cb, .err_handler = vti6_err, .priority = 0, }; #endif /** * vti6_tunnel_init - register protocol and reserve needed resources * * Return: 0 on success **/ static int __init vti6_tunnel_init(void) { const char *msg; int err; msg = "tunnel device"; err = register_pernet_device(&vti6_net_ops); if (err < 0) goto pernet_dev_failed; msg = "tunnel protocols"; err = xfrm6_protocol_register(&vti_esp6_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm6_protocol_register(&vti_ah6_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm6_protocol_register(&vti_ipcomp6_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) msg = "ipv6 tunnel"; err = xfrm6_tunnel_register(&vti_ipv6_handler, AF_INET6); if (err < 0) goto vti_tunnel_ipv6_failed; err = xfrm6_tunnel_register(&vti_ip6ip_handler, AF_INET); if (err < 0) goto vti_tunnel_ip6ip_failed; #endif msg = "netlink interface"; err = rtnl_link_register(&vti6_link_ops); if (err < 0) goto rtnl_link_failed; return 0; rtnl_link_failed: #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) err = xfrm6_tunnel_deregister(&vti_ip6ip_handler, AF_INET); vti_tunnel_ip6ip_failed: err = xfrm6_tunnel_deregister(&vti_ipv6_handler, AF_INET6); vti_tunnel_ipv6_failed: #endif xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP); xfrm_proto_comp_failed: xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: unregister_pernet_device(&vti6_net_ops); pernet_dev_failed: pr_err("vti6 init: failed to register %s\n", msg); return err; } /** * vti6_tunnel_cleanup - free resources and unregister protocol **/ static void __exit vti6_tunnel_cleanup(void) { rtnl_link_unregister(&vti6_link_ops); #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) xfrm6_tunnel_deregister(&vti_ip6ip_handler, AF_INET); xfrm6_tunnel_deregister(&vti_ipv6_handler, AF_INET6); #endif xfrm6_protocol_deregister(&vti_ipcomp6_protocol, IPPROTO_COMP); xfrm6_protocol_deregister(&vti_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&vti_esp6_protocol, IPPROTO_ESP); unregister_pernet_device(&vti6_net_ops); } module_init(vti6_tunnel_init); module_exit(vti6_tunnel_cleanup); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("vti6"); MODULE_ALIAS_NETDEV("ip6_vti0"); MODULE_AUTHOR("Steffen Klassert"); MODULE_DESCRIPTION("IPv6 virtual tunnel interface");
24 606 1207 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* fs/ internal definitions * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ struct super_block; struct file_system_type; struct iomap; struct iomap_ops; struct linux_binprm; struct path; struct mount; struct shrink_control; struct fs_context; struct pipe_inode_info; struct iov_iter; struct mnt_idmap; struct ns_common; /* * block/bdev.c */ #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); #else static inline void bdev_cache_init(void) { } #endif /* CONFIG_BLOCK */ /* * buffer.c */ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block, const struct iomap *iomap); /* * char_dev.c */ extern void __init chrdev_init(void); /* * fs_context.c */ extern const struct fs_context_operations legacy_fs_context_ops; extern int parse_monolithic_mount_data(struct fs_context *, void *); extern void vfs_clean_context(struct fs_context *fc); extern int finish_clean_context(struct fs_context *fc); /* * namei.c */ extern int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root); int do_rmdir(int dfd, struct filename *name); int do_unlinkat(int dfd, struct filename *name); int may_linkat(struct mnt_idmap *idmap, const struct path *link); int do_renameat2(int olddfd, struct filename *oldname, int newdfd, struct filename *newname, unsigned int flags); int do_mkdirat(int dfd, struct filename *name, umode_t mode); int do_symlinkat(struct filename *from, int newdfd, struct filename *to); int do_linkat(int olddfd, struct filename *old, int newdfd, struct filename *new, int flags); int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, struct file *file, umode_t mode); /* * namespace.c */ extern struct vfsmount *lookup_mnt(const struct path *); extern int finish_automount(struct vfsmount *, const struct path *); extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); int mnt_get_write_access_file(struct file *file); void mnt_put_write_access_file(struct file *file); extern void dissolve_on_fput(struct vfsmount *); extern bool may_mount(void); int path_mount(const char *dev_name, struct path *path, const char *type_page, unsigned long flags, void *data_page); int path_umount(struct path *path, int flags); int show_path(struct seq_file *m, struct dentry *root); /* * fs_struct.c */ extern void chroot_fs_refs(const struct path *, const struct path *); /* * file_table.c */ struct file *alloc_empty_file(int flags, const struct cred *cred); struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred); struct file *alloc_empty_backing_file(int flags, const struct cred *cred); static inline void file_put_write_access(struct file *file) { put_write_access(file->f_inode); mnt_put_write_access(file->f_path.mnt); if (unlikely(file->f_mode & FMODE_BACKING)) mnt_put_write_access(backing_file_user_path(file)->mnt); } static inline void put_file_access(struct file *file) { if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_dec(file->f_inode); } else if (file->f_mode & FMODE_WRITER) { file_put_write_access(file); } } /* * super.c */ extern int reconfigure_super(struct fs_context *); extern bool super_trylock_shared(struct super_block *sb); struct super_block *user_get_super(dev_t, bool excl); void put_super(struct super_block *sb); extern bool mount_capable(struct fs_context *); int sb_init_dio_done_wq(struct super_block *sb); /* * Prepare superblock for changing its read-only state (i.e., either remount * read-write superblock read-only or vice versa). After this function returns * mnt_is_readonly() will return true for any mount of the superblock if its * caller is able to observe any changes done by the remount. This holds until * sb_end_ro_state_change() is called. */ static inline void sb_start_ro_state_change(struct super_block *sb) { WRITE_ONCE(sb->s_readonly_remount, 1); /* * For RO->RW transition, the barrier pairs with the barrier in * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY * cleared, it will see s_readonly_remount set. * For RW->RO transition, the barrier pairs with the barrier in * mnt_get_write_access() before the mnt_is_readonly() check. * The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD * already cleared, it will see s_readonly_remount set. */ smp_wmb(); } /* * Ends section changing read-only state of the superblock. After this function * returns if mnt_is_readonly() returns false, the caller will be able to * observe all the changes remount did to the superblock. */ static inline void sb_end_ro_state_change(struct super_block *sb) { /* * This barrier provides release semantics that pairs with * the smp_rmb() acquire semantics in mnt_is_readonly(). * This barrier pair ensure that when mnt_is_readonly() sees * 0 for sb->s_readonly_remount, it will also see all the * preceding flag changes that were made during the RO state * change. */ smp_wmb(); WRITE_ONCE(sb->s_readonly_remount, 0); } /* * open.c */ struct open_flags { int open_flag; umode_t mode; int acc_mode; int intent; int lookup_flags; }; extern struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op); extern struct file *do_file_open_root(const struct path *, const char *, const struct open_flags *); extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); struct file *file_close_fd_locked(struct files_struct *files, unsigned fd); long do_ftruncate(struct file *file, loff_t length, int small); long do_sys_ftruncate(unsigned int fd, loff_t length, int small); int chmod_common(const struct path *path, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); int chown_common(const struct path *path, uid_t user, gid_t group); extern int vfs_open(const struct path *, struct file *); /* * inode.c */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry); bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); /* * fs-writeback.c */ extern long get_nr_dirty_inodes(void); void invalidate_inodes(struct super_block *sb); /* * dcache.c */ extern int d_set_mounted(struct dentry *dentry); extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); extern struct dentry *d_alloc_cursor(struct dentry *); extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *); extern char *simple_dname(struct dentry *, char *, int); extern void dput_to_list(struct dentry *, struct list_head *); extern void shrink_dentry_list(struct list_head *); extern void shrink_dcache_for_umount(struct super_block *); extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, unsigned *seq); extern void d_genocide(struct dentry *); /* * pipe.c */ extern const struct file_operations pipefifo_fops; /* * fs_pin.c */ extern void group_pin_kill(struct hlist_head *p); extern void mnt_pin_kill(struct mount *m); /* * fs/nsfs.c */ extern const struct dentry_operations ns_dentry_operations; int open_namespace(struct ns_common *ns); /* * fs/stat.c: */ int do_statx(int dfd, struct filename *filename, unsigned int flags, unsigned int mask, struct statx __user *buffer); int do_statx_fd(int fd, unsigned int flags, unsigned int mask, struct statx __user *buffer); /* * fs/splice.c: */ ssize_t splice_file_to_pipe(struct file *in, struct pipe_inode_info *opipe, loff_t *offset, size_t len, unsigned int flags); /* * fs/xattr.c: */ struct xattr_name { char name[XATTR_NAME_MAX + 1]; }; struct kernel_xattr_ctx { /* Value of attribute */ union { const void __user *cvalue; void __user *value; }; void *kvalue; size_t size; /* Attribute name */ struct xattr_name *kname; unsigned int flags; }; ssize_t file_getxattr(struct file *file, struct kernel_xattr_ctx *ctx); ssize_t filename_getxattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct kernel_xattr_ctx *ctx); int file_setxattr(struct file *file, struct kernel_xattr_ctx *ctx); int filename_setxattr(int dfd, struct filename *filename, unsigned int lookup_flags, struct kernel_xattr_ctx *ctx); int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx); int import_xattr_name(struct xattr_name *kname, const char __user *name); int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode); #ifdef CONFIG_FS_POSIX_ACL int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size); ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size); #else static inline int do_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, const void *kvalue, size_t size) { return -EOPNOTSUPP; } static inline ssize_t do_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, const char *acl_name, void *kvalue, size_t size) { return -EOPNOTSUPP; } #endif ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *pos); /* * fs/attr.c */ struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns); struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap); void mnt_idmap_put(struct mnt_idmap *idmap); struct stashed_operations { void (*put_data)(void *data); int (*init_inode)(struct inode *inode, void *data); }; int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path); void stashed_dentry_prune(struct dentry *dentry); /** * path_mounted - check whether path is mounted * @path: path to check * * Determine whether @path refers to the root of a mount. * * Return: true if @path is the root of a mount, false if not. */ static inline bool path_mounted(const struct path *path) { return path->mnt->mnt_root == path->dentry; } void file_f_owner_release(struct file *file);
153 147 27 123 117 6 116 7 329 371 335 153 329 330 327 5 330 329 327 5 277 1 330 69 329 1 329 329 328 6 4 52 2 52 6 1 1 3 43 121 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2003, 2004 * * This file is part of the SCTP kernel implementation * * This file contains the code relating the chunk abstraction. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Jon Grimm <jgrimm@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/kernel.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> /* This file is mostly in anticipation of future work, but initially * populate with fragment tracking for an outbound message. */ /* Initialize datamsg from memory. */ static void sctp_datamsg_init(struct sctp_datamsg *msg) { refcount_set(&msg->refcnt, 1); msg->send_failed = 0; msg->send_error = 0; msg->can_delay = 1; msg->abandoned = 0; msg->expires_at = 0; INIT_LIST_HEAD(&msg->chunks); } /* Allocate and initialize datamsg. */ static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp) { struct sctp_datamsg *msg; msg = kmalloc(sizeof(struct sctp_datamsg), gfp); if (msg) { sctp_datamsg_init(msg); SCTP_DBG_OBJCNT_INC(datamsg); } return msg; } void sctp_datamsg_free(struct sctp_datamsg *msg) { struct sctp_chunk *chunk; /* This doesn't have to be a _safe vairant because * sctp_chunk_free() only drops the refs. */ list_for_each_entry(chunk, &msg->chunks, frag_list) sctp_chunk_free(chunk); sctp_datamsg_put(msg); } /* Final destructruction of datamsg memory. */ static void sctp_datamsg_destroy(struct sctp_datamsg *msg) { struct sctp_association *asoc = NULL; struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_ulpevent *ev; int error, sent; /* Release all references. */ list_for_each_safe(pos, temp, &msg->chunks) { list_del_init(pos); chunk = list_entry(pos, struct sctp_chunk, frag_list); if (!msg->send_failed) { sctp_chunk_put(chunk); continue; } asoc = chunk->asoc; error = msg->send_error ?: asoc->outqueue.error; sent = chunk->has_tsn ? SCTP_DATA_SENT : SCTP_DATA_UNSENT; if (sctp_ulpevent_type_enabled(asoc->subscribe, SCTP_SEND_FAILED)) { ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent, error, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } if (sctp_ulpevent_type_enabled(asoc->subscribe, SCTP_SEND_FAILED_EVENT)) { ev = sctp_ulpevent_make_send_failed_event(asoc, chunk, sent, error, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } sctp_chunk_put(chunk); } SCTP_DBG_OBJCNT_DEC(datamsg); kfree(msg); } /* Hold a reference. */ static void sctp_datamsg_hold(struct sctp_datamsg *msg) { refcount_inc(&msg->refcnt); } /* Release a reference. */ void sctp_datamsg_put(struct sctp_datamsg *msg) { if (refcount_dec_and_test(&msg->refcnt)) sctp_datamsg_destroy(msg); } /* Assign a chunk to this datamsg. */ static void sctp_datamsg_assign(struct sctp_datamsg *msg, struct sctp_chunk *chunk) { sctp_datamsg_hold(msg); chunk->msg = msg; } /* A data chunk can have a maximum payload of (2^16 - 20). Break * down any such message into smaller chunks. Opportunistically, fragment * the chunks down to the current MTU constraints. We may get refragmented * later if the PMTU changes, but it is _much better_ to fragment immediately * with a reasonable guess than always doing our fragmentation on the * soft-interrupt. */ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, struct iov_iter *from) { size_t len, first_len, max_data, remaining; size_t msg_len = iov_iter_count(from); struct sctp_shared_key *shkey = NULL; struct list_head *pos, *temp; struct sctp_chunk *chunk; struct sctp_datamsg *msg; int err; msg = sctp_datamsg_new(GFP_KERNEL); if (!msg) return ERR_PTR(-ENOMEM); /* Note: Calculate this outside of the loop, so that all fragments * have the same expiration. */ if (asoc->peer.prsctp_capable && sinfo->sinfo_timetolive && (SCTP_PR_TTL_ENABLED(sinfo->sinfo_flags) || !SCTP_PR_POLICY(sinfo->sinfo_flags))) msg->expires_at = jiffies + msecs_to_jiffies(sinfo->sinfo_timetolive); /* This is the biggest possible DATA chunk that can fit into * the packet */ max_data = asoc->frag_point; if (unlikely(!max_data)) { max_data = sctp_min_frag_point(sctp_sk(asoc->base.sk), sctp_datachk_len(&asoc->stream)); pr_warn_ratelimited("%s: asoc:%p frag_point is zero, forcing max_data to default minimum (%zu)", __func__, asoc, max_data); } /* If the peer requested that we authenticate DATA chunks * we need to account for bundling of the AUTH chunks along with * DATA. */ if (sctp_auth_send_cid(SCTP_CID_DATA, asoc)) { struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc); if (hmac_desc) max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) + hmac_desc->hmac_len); if (sinfo->sinfo_tsn && sinfo->sinfo_ssn != asoc->active_key_id) { shkey = sctp_auth_get_shkey(asoc, sinfo->sinfo_ssn); if (!shkey) { err = -EINVAL; goto errout; } } else { shkey = asoc->shkey; } } /* Set first_len and then account for possible bundles on first frag */ first_len = max_data; /* Check to see if we have a pending SACK and try to let it be bundled * with this message. Do this if we don't have any data queued already. * To check that, look at out_qlen and retransmit list. * NOTE: we will not reduce to account for SACK, if the message would * not have been fragmented. */ if (timer_pending(&asoc->timers[SCTP_EVENT_TIMEOUT_SACK]) && asoc->outqueue.out_qlen == 0 && list_empty(&asoc->outqueue.retransmit) && msg_len > max_data) first_len -= SCTP_PAD4(sizeof(struct sctp_sack_chunk)); /* Encourage Cookie-ECHO bundling. */ if (asoc->state < SCTP_STATE_COOKIE_ECHOED) first_len -= SCTP_ARBITRARY_COOKIE_ECHO_LEN; /* Account for a different sized first fragment */ if (msg_len >= first_len) { msg->can_delay = 0; if (msg_len > first_len) SCTP_INC_STATS(asoc->base.net, SCTP_MIB_FRAGUSRMSGS); } else { /* Which may be the only one... */ first_len = msg_len; } /* Create chunks for all DATA chunks. */ for (remaining = msg_len; remaining; remaining -= len) { u8 frag = SCTP_DATA_MIDDLE_FRAG; if (remaining == msg_len) { /* First frag, which may also be the last */ frag |= SCTP_DATA_FIRST_FRAG; len = first_len; } else { /* Middle frags */ len = max_data; } if (len >= remaining) { /* Last frag, which may also be the first */ len = remaining; frag |= SCTP_DATA_LAST_FRAG; /* The application requests to set the I-bit of the * last DATA chunk of a user message when providing * the user message to the SCTP implementation. */ if ((sinfo->sinfo_flags & SCTP_EOF) || (sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY)) frag |= SCTP_DATA_SACK_IMM; } chunk = asoc->stream.si->make_datafrag(asoc, sinfo, len, frag, GFP_KERNEL); if (!chunk) { err = -ENOMEM; goto errout; } err = sctp_user_addto_chunk(chunk, len, from); if (err < 0) goto errout_chunk_free; chunk->shkey = shkey; /* Put the chunk->skb back into the form expected by send. */ __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr - chunk->skb->data); sctp_datamsg_assign(msg, chunk); list_add_tail(&chunk->frag_list, &msg->chunks); } return msg; errout_chunk_free: sctp_chunk_free(chunk); errout: list_for_each_safe(pos, temp, &msg->chunks) { list_del_init(pos); chunk = list_entry(pos, struct sctp_chunk, frag_list); sctp_chunk_free(chunk); } sctp_datamsg_put(msg); return ERR_PTR(err); } /* Check whether this message has expired. */ int sctp_chunk_abandoned(struct sctp_chunk *chunk) { if (!chunk->asoc->peer.prsctp_capable) return 0; if (chunk->msg->abandoned) return 1; if (!chunk->has_tsn && !(chunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG)) return 0; if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && time_after(jiffies, chunk->msg->expires_at)) { struct sctp_stream_out *streamout = SCTP_SO(&chunk->asoc->stream, chunk->sinfo.sinfo_stream); if (chunk->sent_count) { chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(TTL)]++; } else { chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; streamout->ext->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; } chunk->msg->abandoned = 1; return 1; } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && chunk->sent_count > chunk->sinfo.sinfo_timetolive) { struct sctp_stream_out *streamout = SCTP_SO(&chunk->asoc->stream, chunk->sinfo.sinfo_stream); chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; streamout->ext->abandoned_sent[SCTP_PR_INDEX(RTX)]++; chunk->msg->abandoned = 1; return 1; } else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) && chunk->msg->expires_at && time_after(jiffies, chunk->msg->expires_at)) { chunk->msg->abandoned = 1; return 1; } /* PRIO policy is processed by sendmsg, not here */ return 0; } /* This chunk (and consequently entire message) has failed in its sending. */ void sctp_chunk_fail(struct sctp_chunk *chunk, int error) { chunk->msg->send_failed = 1; chunk->msg->send_error = error; }
843 850 3 8 15 8 11 5 11 6 9 15 15 1 8 8 8 8 12 728 728 8 5 5 5 727 2181 2183 729 7 3 4 11 11 11 10 3 6 6 6 4 4 1 3 1 1 1 5 5 8 1 5 5 7 7 19 1 1 17 17 15 5 2 4 6 4 6 11 11 4 6 4 20 2 2 2 2 7 1 1 7 1 1 18 14 1 1 18 18 2 17 2 1 2 88 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 // SPDX-License-Identifier: GPL-2.0-or-later /** -*- linux-c -*- *********************************************************** * Linux PPP over Ethernet (PPPoX/PPPoE) Sockets * * PPPoX --- Generic PPP encapsulation socket family * PPPoE --- PPP over Ethernet (RFC 2516) * * Version: 0.7.0 * * 070228 : Fix to allow multiple sessions with same remote MAC and same * session id by including the local device ifindex in the * tuple identifying a session. This also ensures packets can't * be injected into a session from interfaces other than the one * specified by userspace. Florian Zumbiehl <florz@florz.de> * (Oh, BTW, this one is YYMMDD, in case you were wondering ...) * 220102 : Fix module use count on failure in pppoe_create, pppox_sk -acme * 030700 : Fixed connect logic to allow for disconnect. * 270700 : Fixed potential SMP problems; we must protect against * simultaneous invocation of ppp_input * and ppp_unregister_channel. * 040800 : Respect reference count mechanisms on net-devices. * 200800 : fix kfree(skb) in pppoe_rcv (acme) * Module reference count is decremented in the right spot now, * guards against sock_put not actually freeing the sk * in pppoe_release. * 051000 : Initialization cleanup. * 111100 : Fix recvmsg. * 050101 : Fix PADT processing. * 140501 : Use pppoe_rcv_core to handle all backlog. (Alexey) * 170701 : Do not lock_sock with rwlock held. (DaveM) * Ignore discovery frames if user has socket * locked. (DaveM) * Ignore return value of dev_queue_xmit in __pppoe_xmit * or else we may kfree an SKB twice. (DaveM) * 190701 : When doing copies of skb's in __pppoe_xmit, always delete * the original skb that was passed in on success, never on * failure. Delete the copy of the skb on failure to avoid * a memory leak. * 081001 : Misc. cleanup (licence string, non-blocking, prevent * reference of device on close). * 121301 : New ppp channels interface; cannot unregister a channel * from interrupts. Thus, we mark the socket as a ZOMBIE * and do the unregistration later. * 081002 : seq_file support for proc stuff -acme * 111602 : Merge all 2.4 fixes into 2.5/2.6 tree. Label 2.5/2.6 * as version 0.7. Spacing cleanup. * Author: Michal Ostrowski <mostrows@speakeasy.net> * Contributors: * Arnaldo Carvalho de Melo <acme@conectiva.com.br> * David S. Miller (davem@redhat.com) * * License: */ #include <linux/string.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/net.h> #include <linux/inetdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/if_ether.h> #include <linux/if_pppox.h> #include <linux/ppp_channel.h> #include <linux/ppp_defs.h> #include <linux/ppp-ioctl.h> #include <linux/notifier.h> #include <linux/file.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/nsproxy.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> #include <linux/uaccess.h> #define PPPOE_HASH_BITS CONFIG_PPPOE_HASH_BITS #define PPPOE_HASH_SIZE (1 << PPPOE_HASH_BITS) #define PPPOE_HASH_MASK (PPPOE_HASH_SIZE - 1) static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb); static const struct proto_ops pppoe_ops; static const struct ppp_channel_ops pppoe_chan_ops; /* per-net private data for this module */ static unsigned int pppoe_net_id __read_mostly; struct pppoe_net { /* * we could use _single_ hash table for all * nets by injecting net id into the hash but * it would increase hash chains and add * a few additional math comparisons messy * as well, moreover in case of SMP less locking * controversy here */ struct pppox_sock *hash_table[PPPOE_HASH_SIZE]; rwlock_t hash_lock; }; /* * PPPoE could be in the following stages: * 1) Discovery stage (to obtain remote MAC and Session ID) * 2) Session stage (MAC and SID are known) * * Ethernet frames have a special tag for this but * we use simpler approach based on session id */ static inline bool stage_session(__be16 sid) { return sid != 0; } static inline struct pppoe_net *pppoe_pernet(struct net *net) { return net_generic(net, pppoe_net_id); } static inline int cmp_2_addr(struct pppoe_addr *a, struct pppoe_addr *b) { return a->sid == b->sid && ether_addr_equal(a->remote, b->remote); } static inline int cmp_addr(struct pppoe_addr *a, __be16 sid, char *addr) { return a->sid == sid && ether_addr_equal(a->remote, addr); } #if 8 % PPPOE_HASH_BITS #error 8 must be a multiple of PPPOE_HASH_BITS #endif static int hash_item(__be16 sid, unsigned char *addr) { unsigned char hash = 0; unsigned int i; for (i = 0; i < ETH_ALEN; i++) hash ^= addr[i]; for (i = 0; i < sizeof(sid_t) * 8; i += 8) hash ^= (__force __u32)sid >> i; for (i = 8; (i >>= 1) >= PPPOE_HASH_BITS;) hash ^= hash >> i; return hash & PPPOE_HASH_MASK; } /********************************************************************** * * Set/get/delete/rehash items (internal versions) * **********************************************************************/ static struct pppox_sock *__get_item(struct pppoe_net *pn, __be16 sid, unsigned char *addr, int ifindex) { int hash = hash_item(sid, addr); struct pppox_sock *ret; ret = pn->hash_table[hash]; while (ret) { if (cmp_addr(&ret->pppoe_pa, sid, addr) && ret->pppoe_ifindex == ifindex) return ret; ret = ret->next; } return NULL; } static int __set_item(struct pppoe_net *pn, struct pppox_sock *po) { int hash = hash_item(po->pppoe_pa.sid, po->pppoe_pa.remote); struct pppox_sock *ret; ret = pn->hash_table[hash]; while (ret) { if (cmp_2_addr(&ret->pppoe_pa, &po->pppoe_pa) && ret->pppoe_ifindex == po->pppoe_ifindex) return -EALREADY; ret = ret->next; } po->next = pn->hash_table[hash]; pn->hash_table[hash] = po; return 0; } static void __delete_item(struct pppoe_net *pn, __be16 sid, char *addr, int ifindex) { int hash = hash_item(sid, addr); struct pppox_sock *ret, **src; ret = pn->hash_table[hash]; src = &pn->hash_table[hash]; while (ret) { if (cmp_addr(&ret->pppoe_pa, sid, addr) && ret->pppoe_ifindex == ifindex) { *src = ret->next; break; } src = &ret->next; ret = ret->next; } } /********************************************************************** * * Set/get/delete/rehash items * **********************************************************************/ static inline struct pppox_sock *get_item(struct pppoe_net *pn, __be16 sid, unsigned char *addr, int ifindex) { struct pppox_sock *po; read_lock_bh(&pn->hash_lock); po = __get_item(pn, sid, addr, ifindex); if (po) sock_hold(sk_pppox(po)); read_unlock_bh(&pn->hash_lock); return po; } static inline struct pppox_sock *get_item_by_addr(struct net *net, struct sockaddr_pppox *sp) { struct net_device *dev; struct pppoe_net *pn; struct pppox_sock *pppox_sock = NULL; int ifindex; rcu_read_lock(); dev = dev_get_by_name_rcu(net, sp->sa_addr.pppoe.dev); if (dev) { ifindex = dev->ifindex; pn = pppoe_pernet(net); pppox_sock = get_item(pn, sp->sa_addr.pppoe.sid, sp->sa_addr.pppoe.remote, ifindex); } rcu_read_unlock(); return pppox_sock; } static inline void delete_item(struct pppoe_net *pn, __be16 sid, char *addr, int ifindex) { write_lock_bh(&pn->hash_lock); __delete_item(pn, sid, addr, ifindex); write_unlock_bh(&pn->hash_lock); } /*************************************************************************** * * Handler for device events. * Certain device events require that sockets be unconnected. * **************************************************************************/ static void pppoe_flush_dev(struct net_device *dev) { struct pppoe_net *pn; int i; pn = pppoe_pernet(dev_net(dev)); write_lock_bh(&pn->hash_lock); for (i = 0; i < PPPOE_HASH_SIZE; i++) { struct pppox_sock *po = pn->hash_table[i]; struct sock *sk; while (po) { while (po && po->pppoe_dev != dev) { po = po->next; } if (!po) break; sk = sk_pppox(po); /* We always grab the socket lock, followed by the * hash_lock, in that order. Since we should hold the * sock lock while doing any unbinding, we need to * release the lock we're holding. Hold a reference to * the sock so it doesn't disappear as we're jumping * between locks. */ sock_hold(sk); write_unlock_bh(&pn->hash_lock); lock_sock(sk); if (po->pppoe_dev == dev && sk->sk_state & (PPPOX_CONNECTED | PPPOX_BOUND)) { pppox_unbind_sock(sk); sk->sk_state_change(sk); po->pppoe_dev = NULL; dev_put(dev); } release_sock(sk); sock_put(sk); /* Restart the process from the start of the current * hash chain. We dropped locks so the world may have * change from underneath us. */ BUG_ON(pppoe_pernet(dev_net(dev)) == NULL); write_lock_bh(&pn->hash_lock); po = pn->hash_table[i]; } } write_unlock_bh(&pn->hash_lock); } static int pppoe_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); /* Only look at sockets that are using this specific device. */ switch (event) { case NETDEV_CHANGEADDR: case NETDEV_CHANGEMTU: /* A change in mtu or address is a bad thing, requiring * LCP re-negotiation. */ case NETDEV_GOING_DOWN: case NETDEV_DOWN: /* Find every socket on this device and kill it. */ pppoe_flush_dev(dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block pppoe_notifier = { .notifier_call = pppoe_device_event, }; /************************************************************************ * * Do the real work of receiving a PPPoE Session frame. * ***********************************************************************/ static int pppoe_rcv_core(struct sock *sk, struct sk_buff *skb) { struct pppox_sock *po = pppox_sk(sk); struct pppox_sock *relay_po; /* Backlog receive. Semantics of backlog rcv preclude any code from * executing in lock_sock()/release_sock() bounds; meaning sk->sk_state * can't change. */ if (skb->pkt_type == PACKET_OTHERHOST) goto abort_kfree; if (sk->sk_state & PPPOX_BOUND) { ppp_input(&po->chan, skb); } else if (sk->sk_state & PPPOX_RELAY) { relay_po = get_item_by_addr(sock_net(sk), &po->pppoe_relay); if (relay_po == NULL) goto abort_kfree; if ((sk_pppox(relay_po)->sk_state & PPPOX_CONNECTED) == 0) goto abort_put; if (!__pppoe_xmit(sk_pppox(relay_po), skb)) goto abort_put; sock_put(sk_pppox(relay_po)); } else { if (sock_queue_rcv_skb(sk, skb)) goto abort_kfree; } return NET_RX_SUCCESS; abort_put: sock_put(sk_pppox(relay_po)); abort_kfree: kfree_skb(skb); return NET_RX_DROP; } /************************************************************************ * * Receive wrapper called in BH context. * ***********************************************************************/ static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct pppoe_hdr *ph; struct pppox_sock *po; struct pppoe_net *pn; int len; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out; if (skb_mac_header_len(skb) < ETH_HLEN) goto drop; if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr))) goto drop; ph = pppoe_hdr(skb); len = ntohs(ph->length); skb_pull_rcsum(skb, sizeof(*ph)); if (skb->len < len) goto drop; if (pskb_trim_rcsum(skb, len)) goto drop; ph = pppoe_hdr(skb); pn = pppoe_pernet(dev_net(dev)); /* Note that get_item does a sock_hold(), so sk_pppox(po) * is known to be safe. */ po = get_item(pn, ph->sid, eth_hdr(skb)->h_source, dev->ifindex); if (!po) goto drop; return sk_receive_skb(sk_pppox(po), skb, 0); drop: kfree_skb(skb); out: return NET_RX_DROP; } static void pppoe_unbind_sock_work(struct work_struct *work) { struct pppox_sock *po = container_of(work, struct pppox_sock, proto.pppoe.padt_work); struct sock *sk = sk_pppox(po); lock_sock(sk); if (po->pppoe_dev) { dev_put(po->pppoe_dev); po->pppoe_dev = NULL; } pppox_unbind_sock(sk); release_sock(sk); sock_put(sk); } /************************************************************************ * * Receive a PPPoE Discovery frame. * This is solely for detection of PADT frames * ***********************************************************************/ static int pppoe_disc_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct pppoe_hdr *ph; struct pppox_sock *po; struct pppoe_net *pn; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out; if (skb->pkt_type != PACKET_HOST) goto abort; if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr))) goto abort; ph = pppoe_hdr(skb); if (ph->code != PADT_CODE) goto abort; pn = pppoe_pernet(dev_net(dev)); po = get_item(pn, ph->sid, eth_hdr(skb)->h_source, dev->ifindex); if (po) if (!schedule_work(&po->proto.pppoe.padt_work)) sock_put(sk_pppox(po)); abort: kfree_skb(skb); out: return NET_RX_SUCCESS; /* Lies... :-) */ } static struct packet_type pppoes_ptype __read_mostly = { .type = cpu_to_be16(ETH_P_PPP_SES), .func = pppoe_rcv, }; static struct packet_type pppoed_ptype __read_mostly = { .type = cpu_to_be16(ETH_P_PPP_DISC), .func = pppoe_disc_rcv, }; static struct proto pppoe_sk_proto __read_mostly = { .name = "PPPOE", .owner = THIS_MODULE, .obj_size = sizeof(struct pppox_sock), }; /*********************************************************************** * * Initialize a new struct sock. * **********************************************************************/ static int pppoe_create(struct net *net, struct socket *sock, int kern) { struct sock *sk; sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sock->state = SS_UNCONNECTED; sock->ops = &pppoe_ops; sk->sk_backlog_rcv = pppoe_rcv_core; sk->sk_state = PPPOX_NONE; sk->sk_type = SOCK_STREAM; sk->sk_family = PF_PPPOX; sk->sk_protocol = PX_PROTO_OE; INIT_WORK(&pppox_sk(sk)->proto.pppoe.padt_work, pppoe_unbind_sock_work); return 0; } static int pppoe_release(struct socket *sock) { struct sock *sk = sock->sk; struct pppox_sock *po; struct pppoe_net *pn; struct net *net = NULL; if (!sk) return 0; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD)) { release_sock(sk); return -EBADF; } po = pppox_sk(sk); if (po->pppoe_dev) { dev_put(po->pppoe_dev); po->pppoe_dev = NULL; } pppox_unbind_sock(sk); /* Signal the death of the socket. */ sk->sk_state = PPPOX_DEAD; net = sock_net(sk); pn = pppoe_pernet(net); /* * protect "po" from concurrent updates * on pppoe_flush_dev */ delete_item(pn, po->pppoe_pa.sid, po->pppoe_pa.remote, po->pppoe_ifindex); sock_orphan(sk); sock->sk = NULL; skb_queue_purge(&sk->sk_receive_queue); release_sock(sk); sock_put(sk); return 0; } static int pppoe_connect(struct socket *sock, struct sockaddr *uservaddr, int sockaddr_len, int flags) { struct sock *sk = sock->sk; struct sockaddr_pppox *sp = (struct sockaddr_pppox *)uservaddr; struct pppox_sock *po = pppox_sk(sk); struct net_device *dev = NULL; struct pppoe_net *pn; struct net *net = NULL; int error; lock_sock(sk); error = -EINVAL; if (sockaddr_len != sizeof(struct sockaddr_pppox)) goto end; if (sp->sa_protocol != PX_PROTO_OE) goto end; /* Check for already bound sockets */ error = -EBUSY; if ((sk->sk_state & PPPOX_CONNECTED) && stage_session(sp->sa_addr.pppoe.sid)) goto end; /* Check for already disconnected sockets, on attempts to disconnect */ error = -EALREADY; if ((sk->sk_state & PPPOX_DEAD) && !stage_session(sp->sa_addr.pppoe.sid)) goto end; error = 0; /* Delete the old binding */ if (stage_session(po->pppoe_pa.sid)) { pppox_unbind_sock(sk); pn = pppoe_pernet(sock_net(sk)); delete_item(pn, po->pppoe_pa.sid, po->pppoe_pa.remote, po->pppoe_ifindex); if (po->pppoe_dev) { dev_put(po->pppoe_dev); po->pppoe_dev = NULL; } po->pppoe_ifindex = 0; memset(&po->pppoe_pa, 0, sizeof(po->pppoe_pa)); memset(&po->pppoe_relay, 0, sizeof(po->pppoe_relay)); memset(&po->chan, 0, sizeof(po->chan)); po->next = NULL; po->num = 0; sk->sk_state = PPPOX_NONE; } /* Re-bind in session stage only */ if (stage_session(sp->sa_addr.pppoe.sid)) { error = -ENODEV; net = sock_net(sk); dev = dev_get_by_name(net, sp->sa_addr.pppoe.dev); if (!dev) goto err_put; po->pppoe_dev = dev; po->pppoe_ifindex = dev->ifindex; pn = pppoe_pernet(net); if (!(dev->flags & IFF_UP)) { goto err_put; } memcpy(&po->pppoe_pa, &sp->sa_addr.pppoe, sizeof(struct pppoe_addr)); write_lock_bh(&pn->hash_lock); error = __set_item(pn, po); write_unlock_bh(&pn->hash_lock); if (error < 0) goto err_put; po->chan.hdrlen = (sizeof(struct pppoe_hdr) + dev->hard_header_len); po->chan.mtu = dev->mtu - sizeof(struct pppoe_hdr) - 2; po->chan.private = sk; po->chan.ops = &pppoe_chan_ops; error = ppp_register_net_channel(dev_net(dev), &po->chan); if (error) { delete_item(pn, po->pppoe_pa.sid, po->pppoe_pa.remote, po->pppoe_ifindex); goto err_put; } sk->sk_state = PPPOX_CONNECTED; } po->num = sp->sa_addr.pppoe.sid; end: release_sock(sk); return error; err_put: if (po->pppoe_dev) { dev_put(po->pppoe_dev); po->pppoe_dev = NULL; } goto end; } static int pppoe_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { int len = sizeof(struct sockaddr_pppox); struct sockaddr_pppox sp; sp.sa_family = AF_PPPOX; sp.sa_protocol = PX_PROTO_OE; memcpy(&sp.sa_addr.pppoe, &pppox_sk(sock->sk)->pppoe_pa, sizeof(struct pppoe_addr)); memcpy(uaddr, &sp, len); return len; } static int pppoe_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct pppox_sock *po = pppox_sk(sk); int val; int err; switch (cmd) { case PPPIOCGMRU: err = -ENXIO; if (!(sk->sk_state & PPPOX_CONNECTED)) break; err = -EFAULT; if (put_user(po->pppoe_dev->mtu - sizeof(struct pppoe_hdr) - PPP_HDRLEN, (int __user *)arg)) break; err = 0; break; case PPPIOCSMRU: err = -ENXIO; if (!(sk->sk_state & PPPOX_CONNECTED)) break; err = -EFAULT; if (get_user(val, (int __user *)arg)) break; if (val < (po->pppoe_dev->mtu - sizeof(struct pppoe_hdr) - PPP_HDRLEN)) err = 0; else err = -EINVAL; break; case PPPIOCSFLAGS: err = -EFAULT; if (get_user(val, (int __user *)arg)) break; err = 0; break; case PPPOEIOCSFWD: { struct pppox_sock *relay_po; err = -EBUSY; if (sk->sk_state & (PPPOX_BOUND | PPPOX_DEAD)) break; err = -ENOTCONN; if (!(sk->sk_state & PPPOX_CONNECTED)) break; /* PPPoE address from the user specifies an outbound PPPoE address which frames are forwarded to */ err = -EFAULT; if (copy_from_user(&po->pppoe_relay, (void __user *)arg, sizeof(struct sockaddr_pppox))) break; err = -EINVAL; if (po->pppoe_relay.sa_family != AF_PPPOX || po->pppoe_relay.sa_protocol != PX_PROTO_OE) break; /* Check that the socket referenced by the address actually exists. */ relay_po = get_item_by_addr(sock_net(sk), &po->pppoe_relay); if (!relay_po) break; sock_put(sk_pppox(relay_po)); sk->sk_state |= PPPOX_RELAY; err = 0; break; } case PPPOEIOCDFWD: err = -EALREADY; if (!(sk->sk_state & PPPOX_RELAY)) break; sk->sk_state &= ~PPPOX_RELAY; err = 0; break; default: err = -ENOTTY; } return err; } static int pppoe_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { struct sk_buff *skb; struct sock *sk = sock->sk; struct pppox_sock *po = pppox_sk(sk); int error; struct pppoe_hdr hdr; struct pppoe_hdr *ph; struct net_device *dev; char *start; int hlen; lock_sock(sk); if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) { error = -ENOTCONN; goto end; } hdr.ver = 1; hdr.type = 1; hdr.code = 0; hdr.sid = po->num; dev = po->pppoe_dev; error = -EMSGSIZE; if (total_len > (dev->mtu + dev->hard_header_len)) goto end; hlen = LL_RESERVED_SPACE(dev); skb = sock_wmalloc(sk, hlen + sizeof(*ph) + total_len + dev->needed_tailroom, 0, GFP_KERNEL); if (!skb) { error = -ENOMEM; goto end; } /* Reserve space for headers. */ skb_reserve(skb, hlen); skb_reset_network_header(skb); skb->dev = dev; skb->priority = READ_ONCE(sk->sk_priority); skb->protocol = cpu_to_be16(ETH_P_PPP_SES); ph = skb_put(skb, total_len + sizeof(struct pppoe_hdr)); start = (char *)&ph->tag[0]; error = memcpy_from_msg(start, m, total_len); if (error < 0) { kfree_skb(skb); goto end; } error = total_len; dev_hard_header(skb, dev, ETH_P_PPP_SES, po->pppoe_pa.remote, NULL, total_len); memcpy(ph, &hdr, sizeof(struct pppoe_hdr)); ph->length = htons(total_len); dev_queue_xmit(skb); end: release_sock(sk); return error; } /************************************************************************ * * xmit function for internal use. * ***********************************************************************/ static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb) { struct pppox_sock *po = pppox_sk(sk); struct net_device *dev = po->pppoe_dev; struct pppoe_hdr *ph; int data_len = skb->len; /* The higher-level PPP code (ppp_unregister_channel()) ensures the PPP * xmit operations conclude prior to an unregistration call. Thus * sk->sk_state cannot change, so we don't need to do lock_sock(). * But, we also can't do a lock_sock since that introduces a potential * deadlock as we'd reverse the lock ordering used when calling * ppp_unregister_channel(). */ if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED)) goto abort; if (!dev) goto abort; /* Copy the data if there is no space for the header or if it's * read-only. */ if (skb_cow_head(skb, LL_RESERVED_SPACE(dev) + sizeof(*ph))) goto abort; __skb_push(skb, sizeof(*ph)); skb_reset_network_header(skb); ph = pppoe_hdr(skb); ph->ver = 1; ph->type = 1; ph->code = 0; ph->sid = po->num; ph->length = htons(data_len); skb->protocol = cpu_to_be16(ETH_P_PPP_SES); skb->dev = dev; dev_hard_header(skb, dev, ETH_P_PPP_SES, po->pppoe_pa.remote, NULL, data_len); dev_queue_xmit(skb); return 1; abort: kfree_skb(skb); return 1; } /************************************************************************ * * xmit function called by generic PPP driver * sends PPP frame over PPPoE socket * ***********************************************************************/ static int pppoe_xmit(struct ppp_channel *chan, struct sk_buff *skb) { struct sock *sk = chan->private; return __pppoe_xmit(sk, skb); } static int pppoe_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path, const struct ppp_channel *chan) { struct sock *sk = chan->private; struct pppox_sock *po = pppox_sk(sk); struct net_device *dev = po->pppoe_dev; if (sock_flag(sk, SOCK_DEAD) || !(sk->sk_state & PPPOX_CONNECTED) || !dev) return -1; path->type = DEV_PATH_PPPOE; path->encap.proto = htons(ETH_P_PPP_SES); path->encap.id = be16_to_cpu(po->num); memcpy(path->encap.h_dest, po->pppoe_pa.remote, ETH_ALEN); memcpy(ctx->daddr, po->pppoe_pa.remote, ETH_ALEN); path->dev = ctx->dev; ctx->dev = dev; return 0; } static const struct ppp_channel_ops pppoe_chan_ops = { .start_xmit = pppoe_xmit, .fill_forward_path = pppoe_fill_forward_path, }; static int pppoe_recvmsg(struct socket *sock, struct msghdr *m, size_t total_len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int error = 0; if (sk->sk_state & PPPOX_BOUND) return -EIO; skb = skb_recv_datagram(sk, flags, &error); if (!skb) return error; total_len = min_t(size_t, total_len, skb->len); error = skb_copy_datagram_msg(skb, 0, m, total_len); if (error == 0) { consume_skb(skb); return total_len; } kfree_skb(skb); return error; } #ifdef CONFIG_PROC_FS static int pppoe_seq_show(struct seq_file *seq, void *v) { struct pppox_sock *po; char *dev_name; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Id Address Device\n"); goto out; } po = v; dev_name = po->pppoe_pa.dev; seq_printf(seq, "%08X %pM %8s\n", po->pppoe_pa.sid, po->pppoe_pa.remote, dev_name); out: return 0; } static inline struct pppox_sock *pppoe_get_idx(struct pppoe_net *pn, loff_t pos) { struct pppox_sock *po; int i; for (i = 0; i < PPPOE_HASH_SIZE; i++) { po = pn->hash_table[i]; while (po) { if (!pos--) goto out; po = po->next; } } out: return po; } static void *pppoe_seq_start(struct seq_file *seq, loff_t *pos) __acquires(pn->hash_lock) { struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq)); loff_t l = *pos; read_lock_bh(&pn->hash_lock); return l ? pppoe_get_idx(pn, --l) : SEQ_START_TOKEN; } static void *pppoe_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq)); struct pppox_sock *po; ++*pos; if (v == SEQ_START_TOKEN) { po = pppoe_get_idx(pn, 0); goto out; } po = v; if (po->next) po = po->next; else { int hash = hash_item(po->pppoe_pa.sid, po->pppoe_pa.remote); po = NULL; while (++hash < PPPOE_HASH_SIZE) { po = pn->hash_table[hash]; if (po) break; } } out: return po; } static void pppoe_seq_stop(struct seq_file *seq, void *v) __releases(pn->hash_lock) { struct pppoe_net *pn = pppoe_pernet(seq_file_net(seq)); read_unlock_bh(&pn->hash_lock); } static const struct seq_operations pppoe_seq_ops = { .start = pppoe_seq_start, .next = pppoe_seq_next, .stop = pppoe_seq_stop, .show = pppoe_seq_show, }; #endif /* CONFIG_PROC_FS */ static const struct proto_ops pppoe_ops = { .family = AF_PPPOX, .owner = THIS_MODULE, .release = pppoe_release, .bind = sock_no_bind, .connect = pppoe_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pppoe_getname, .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .sendmsg = pppoe_sendmsg, .recvmsg = pppoe_recvmsg, .mmap = sock_no_mmap, .ioctl = pppox_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = pppox_compat_ioctl, #endif }; static const struct pppox_proto pppoe_proto = { .create = pppoe_create, .ioctl = pppoe_ioctl, .owner = THIS_MODULE, }; static __net_init int pppoe_init_net(struct net *net) { struct pppoe_net *pn = pppoe_pernet(net); struct proc_dir_entry *pde; rwlock_init(&pn->hash_lock); pde = proc_create_net("pppoe", 0444, net->proc_net, &pppoe_seq_ops, sizeof(struct seq_net_private)); #ifdef CONFIG_PROC_FS if (!pde) return -ENOMEM; #endif return 0; } static __net_exit void pppoe_exit_net(struct net *net) { remove_proc_entry("pppoe", net->proc_net); } static struct pernet_operations pppoe_net_ops = { .init = pppoe_init_net, .exit = pppoe_exit_net, .id = &pppoe_net_id, .size = sizeof(struct pppoe_net), }; static int __init pppoe_init(void) { int err; err = register_pernet_device(&pppoe_net_ops); if (err) goto out; err = proto_register(&pppoe_sk_proto, 0); if (err) goto out_unregister_net_ops; err = register_pppox_proto(PX_PROTO_OE, &pppoe_proto); if (err) goto out_unregister_pppoe_proto; dev_add_pack(&pppoes_ptype); dev_add_pack(&pppoed_ptype); register_netdevice_notifier(&pppoe_notifier); return 0; out_unregister_pppoe_proto: proto_unregister(&pppoe_sk_proto); out_unregister_net_ops: unregister_pernet_device(&pppoe_net_ops); out: return err; } static void __exit pppoe_exit(void) { unregister_netdevice_notifier(&pppoe_notifier); dev_remove_pack(&pppoed_ptype); dev_remove_pack(&pppoes_ptype); unregister_pppox_proto(PX_PROTO_OE); proto_unregister(&pppoe_sk_proto); unregister_pernet_device(&pppoe_net_ops); } module_init(pppoe_init); module_exit(pppoe_exit); MODULE_AUTHOR("Michal Ostrowski <mostrows@speakeasy.net>"); MODULE_DESCRIPTION("PPP over Ethernet driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_OE);
8 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_MROUTE6_H #define __LINUX_MROUTE6_H #include <linux/pim.h> #include <linux/skbuff.h> /* for struct sk_buff_head */ #include <net/net_namespace.h> #include <uapi/linux/mroute6.h> #include <linux/mroute_base.h> #include <linux/sockptr.h> #include <net/fib_rules.h> #ifdef CONFIG_IPV6_MROUTE static inline int ip6_mroute_opt(int opt) { return (opt >= MRT6_BASE) && (opt <= MRT6_MAX); } #else static inline int ip6_mroute_opt(int opt) { return 0; } #endif struct sock; #ifdef CONFIG_IPV6_MROUTE extern int ip6_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int); extern int ip6_mroute_getsockopt(struct sock *, int, sockptr_t, sockptr_t); extern int ip6_mr_input(struct sk_buff *skb); extern int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); extern int ip6_mr_init(void); extern void ip6_mr_cleanup(void); int ip6mr_ioctl(struct sock *sk, int cmd, void *arg); #else static inline int ip6_mroute_setsockopt(struct sock *sock, int optname, sockptr_t optval, unsigned int optlen) { return -ENOPROTOOPT; } static inline int ip6_mroute_getsockopt(struct sock *sock, int optname, sockptr_t optval, sockptr_t optlen) { return -ENOPROTOOPT; } static inline int ip6mr_ioctl(struct sock *sk, int cmd, void *arg) { return -ENOIOCTLCMD; } static inline int ip6_mr_init(void) { return 0; } static inline void ip6_mr_cleanup(void) { return; } #endif #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES bool ip6mr_rule_default(const struct fib_rule *rule); #else static inline bool ip6mr_rule_default(const struct fib_rule *rule) { return true; } #endif #define VIFF_STATIC 0x8000 struct mfc6_cache_cmp_arg { struct in6_addr mf6c_mcastgrp; struct in6_addr mf6c_origin; }; struct mfc6_cache { struct mr_mfc _c; union { struct { struct in6_addr mf6c_mcastgrp; struct in6_addr mf6c_origin; }; struct mfc6_cache_cmp_arg cmparg; }; }; #define MFC_ASSERT_THRESH (3*HZ) /* Maximal freq. of asserts */ struct rtmsg; extern int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, u32 portid); #ifdef CONFIG_IPV6_MROUTE bool mroute6_is_socket(struct net *net, struct sk_buff *skb); extern int ip6mr_sk_done(struct sock *sk); static inline int ip6mr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { switch (cmd) { /* These userspace buffers will be consumed by ip6mr_ioctl() */ case SIOCGETMIFCNT_IN6: { struct sioc_mif_req6 buffer; return sock_ioctl_inout(sk, cmd, arg, &buffer, sizeof(buffer)); } case SIOCGETSGCNT_IN6: { struct sioc_sg_req6 buffer; return sock_ioctl_inout(sk, cmd, arg, &buffer, sizeof(buffer)); } } return 1; } #else static inline bool mroute6_is_socket(struct net *net, struct sk_buff *skb) { return false; } static inline int ip6mr_sk_done(struct sock *sk) { return 0; } static inline int ip6mr_sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { return 1; } #endif #endif
2 2 5 5 2 2 5 2 6 6 2 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 // SPDX-License-Identifier: GPL-2.0-or-later /* * PCBC: Propagating Cipher Block Chaining mode * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * Derived from cbc.c * - Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/algapi.h> #include <crypto/internal/cipher.h> #include <crypto/internal/skcipher.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> static int crypto_pcbc_encrypt_segment(struct skcipher_request *req, struct skcipher_walk *walk, struct crypto_cipher *tfm) { int bsize = crypto_cipher_blocksize(tfm); unsigned int nbytes = walk->nbytes; u8 *src = walk->src.virt.addr; u8 *dst = walk->dst.virt.addr; u8 * const iv = walk->iv; do { crypto_xor(iv, src, bsize); crypto_cipher_encrypt_one(tfm, dst, iv); crypto_xor_cpy(iv, dst, src, bsize); src += bsize; dst += bsize; } while ((nbytes -= bsize) >= bsize); return nbytes; } static int crypto_pcbc_encrypt_inplace(struct skcipher_request *req, struct skcipher_walk *walk, struct crypto_cipher *tfm) { int bsize = crypto_cipher_blocksize(tfm); unsigned int nbytes = walk->nbytes; u8 *src = walk->src.virt.addr; u8 * const iv = walk->iv; u8 tmpbuf[MAX_CIPHER_BLOCKSIZE]; do { memcpy(tmpbuf, src, bsize); crypto_xor(iv, src, bsize); crypto_cipher_encrypt_one(tfm, src, iv); crypto_xor_cpy(iv, tmpbuf, src, bsize); src += bsize; } while ((nbytes -= bsize) >= bsize); return nbytes; } static int crypto_pcbc_encrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_cipher *cipher = skcipher_cipher_simple(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while (walk.nbytes) { if (walk.src.virt.addr == walk.dst.virt.addr) nbytes = crypto_pcbc_encrypt_inplace(req, &walk, cipher); else nbytes = crypto_pcbc_encrypt_segment(req, &walk, cipher); err = skcipher_walk_done(&walk, nbytes); } return err; } static int crypto_pcbc_decrypt_segment(struct skcipher_request *req, struct skcipher_walk *walk, struct crypto_cipher *tfm) { int bsize = crypto_cipher_blocksize(tfm); unsigned int nbytes = walk->nbytes; u8 *src = walk->src.virt.addr; u8 *dst = walk->dst.virt.addr; u8 * const iv = walk->iv; do { crypto_cipher_decrypt_one(tfm, dst, src); crypto_xor(dst, iv, bsize); crypto_xor_cpy(iv, dst, src, bsize); src += bsize; dst += bsize; } while ((nbytes -= bsize) >= bsize); return nbytes; } static int crypto_pcbc_decrypt_inplace(struct skcipher_request *req, struct skcipher_walk *walk, struct crypto_cipher *tfm) { int bsize = crypto_cipher_blocksize(tfm); unsigned int nbytes = walk->nbytes; u8 *src = walk->src.virt.addr; u8 * const iv = walk->iv; u8 tmpbuf[MAX_CIPHER_BLOCKSIZE] __aligned(__alignof__(u32)); do { memcpy(tmpbuf, src, bsize); crypto_cipher_decrypt_one(tfm, src, src); crypto_xor(src, iv, bsize); crypto_xor_cpy(iv, src, tmpbuf, bsize); src += bsize; } while ((nbytes -= bsize) >= bsize); return nbytes; } static int crypto_pcbc_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_cipher *cipher = skcipher_cipher_simple(tfm); struct skcipher_walk walk; unsigned int nbytes; int err; err = skcipher_walk_virt(&walk, req, false); while (walk.nbytes) { if (walk.src.virt.addr == walk.dst.virt.addr) nbytes = crypto_pcbc_decrypt_inplace(req, &walk, cipher); else nbytes = crypto_pcbc_decrypt_segment(req, &walk, cipher); err = skcipher_walk_done(&walk, nbytes); } return err; } static int crypto_pcbc_create(struct crypto_template *tmpl, struct rtattr **tb) { struct skcipher_instance *inst; int err; inst = skcipher_alloc_instance_simple(tmpl, tb); if (IS_ERR(inst)) return PTR_ERR(inst); inst->alg.encrypt = crypto_pcbc_encrypt; inst->alg.decrypt = crypto_pcbc_decrypt; err = skcipher_register_instance(tmpl, inst); if (err) inst->free(inst); return err; } static struct crypto_template crypto_pcbc_tmpl = { .name = "pcbc", .create = crypto_pcbc_create, .module = THIS_MODULE, }; static int __init crypto_pcbc_module_init(void) { return crypto_register_template(&crypto_pcbc_tmpl); } static void __exit crypto_pcbc_module_exit(void) { crypto_unregister_template(&crypto_pcbc_tmpl); } subsys_initcall(crypto_pcbc_module_init); module_exit(crypto_pcbc_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("PCBC block cipher mode of operation"); MODULE_ALIAS_CRYPTO("pcbc"); MODULE_IMPORT_NS("CRYPTO_INTERNAL");
172 11 1 18 18 1 9 8 25 37 8 2 36 534 2 8 1 1 1 1 2 27 14 12 7 12 12 12 15 15 12 144 123 47 82 428 428 428 423 16 428 93 93 94 94 2 16 16 93 66 55 167 15 14 14 15 15 2 5 14 15 15 11 6 15 15 15 3 11 14 14 7 9 9 14 14 9 9 7 14 14 57 57 56 48 30 14 14 202 33 36 7 7 7 9 173 9 29 4 456 453 2 12 10 296 425 355 393 1 1 391 8 443 12 453 164 35 129 1 6 161 6 3 17 18 1 17 17 18 27 18 18 4 91 5 5 149 21 149 146 3 166 168 168 168 149 21 167 149 21 168 161 8 167 148 21 167 168 1 84 83 167 167 82 86 86 86 8 76 23 23 2 12 4 5 23 6 9 9 9 6 46 46 41 43 21 1 22 43 43 13 5 5 5 7 6 17 17 21 8 40 402 401 52 23 29 52 37 46 16 5 27 10 36 4 10 6 36 36 4 6 10 24 21 1 3 10 31 31 18 31 10 1 9 7 4 1 3 201 30 198 216 5 4 2 6 201 6 161 4 36 198 32 171 8 24 28 194 2 2 168 2 2 2 1 23 194 31 211 219 219 67 6 2 5 2 52 45 20 20 50 3 43 50 49 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 // SPDX-License-Identifier: GPL-2.0-or-later /* * fs/eventpoll.c (Efficient event retrieval implementation) * Copyright (C) 2001,...,2009 Davide Libenzi * * Davide Libenzi <davidel@xmailserver.org> */ #include <linux/init.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/signal.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/string.h> #include <linux/list.h> #include <linux/hash.h> #include <linux/spinlock.h> #include <linux/syscalls.h> #include <linux/rbtree.h> #include <linux/wait.h> #include <linux/eventpoll.h> #include <linux/mount.h> #include <linux/bitops.h> #include <linux/mutex.h> #include <linux/anon_inodes.h> #include <linux/device.h> #include <linux/uaccess.h> #include <asm/io.h> #include <asm/mman.h> #include <linux/atomic.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/compat.h> #include <linux/rculist.h> #include <linux/capability.h> #include <net/busy_poll.h> /* * LOCKING: * There are three level of locking required by epoll : * * 1) epnested_mutex (mutex) * 2) ep->mtx (mutex) * 3) ep->lock (rwlock) * * The acquire order is the one listed above, from 1 to 3. * We need a rwlock (ep->lock) because we manipulate objects * from inside the poll callback, that might be triggered from * a wake_up() that in turn might be called from IRQ context. * So we can't sleep inside the poll callback and hence we need * a spinlock. During the event transfer loop (from kernel to * user space) we could end up sleeping due a copy_to_user(), so * we need a lock that will allow us to sleep. This lock is a * mutex (ep->mtx). It is acquired during the event transfer loop, * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file(). * The epnested_mutex is acquired when inserting an epoll fd onto another * epoll fd. We do this so that we walk the epoll tree and ensure that this * insertion does not create a cycle of epoll file descriptors, which * could lead to deadlock. We need a global mutex to prevent two * simultaneous inserts (A into B and B into A) from racing and * constructing a cycle without either insert observing that it is * going to. * It is necessary to acquire multiple "ep->mtx"es at once in the * case when one epoll fd is added to another. In this case, we * always acquire the locks in the order of nesting (i.e. after * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired * before e2->mtx). Since we disallow cycles of epoll file * descriptors, this ensures that the mutexes are well-ordered. In * order to communicate this nesting to lockdep, when walking a tree * of epoll file descriptors, we use the current recursion depth as * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global * mutex "epnested_mutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. * Events that require holding "epnested_mutex" are very rare, while for * normal operations the epoll private "ep->mtx" will guarantee * a better scalability. */ /* Epoll private bits inside the event mask */ #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE) #define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT) #define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \ EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE) /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) #define EP_UNACTIVE_PTR ((void *) -1L) #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) struct epoll_filefd { struct file *file; int fd; } __packed; /* Wait structure used by the poll hooks */ struct eppoll_entry { /* List header used to link this structure to the "struct epitem" */ struct eppoll_entry *next; /* The "base" pointer is set to the container "struct epitem" */ struct epitem *base; /* * Wait queue item that will be linked to the target file wait * queue head. */ wait_queue_entry_t wait; /* The wait queue head that linked the "wait" wait queue item */ wait_queue_head_t *whead; }; /* * Each file descriptor added to the eventpoll interface will * have an entry of this type linked to the "rbr" RB tree. * Avoid increasing the size of this struct, there can be many thousands * of these on a server and we do not want this to take another cache line. */ struct epitem { union { /* RB tree node links this structure to the eventpoll RB tree */ struct rb_node rbn; /* Used to free the struct epitem */ struct rcu_head rcu; }; /* List header used to link this structure to the eventpoll ready list */ struct list_head rdllink; /* * Works together "struct eventpoll"->ovflist in keeping the * single linked chain of items. */ struct epitem *next; /* The file descriptor information this item refers to */ struct epoll_filefd ffd; /* * Protected by file->f_lock, true for to-be-released epitem already * removed from the "struct file" items list; together with * eventpoll->refcount orchestrates "struct eventpoll" disposal */ bool dying; /* List containing poll wait queues */ struct eppoll_entry *pwqlist; /* The "container" of this item */ struct eventpoll *ep; /* List header used to link this item to the "struct file" items list */ struct hlist_node fllink; /* wakeup_source used when EPOLLWAKEUP is set */ struct wakeup_source __rcu *ws; /* The structure that describe the interested events and the source fd */ struct epoll_event event; }; /* * This structure is stored inside the "private_data" member of the file * structure and represents the main data structure for the eventpoll * interface. */ struct eventpoll { /* * This mutex is used to ensure that files are not removed * while epoll is using them. This is held during the event * collection loop, the file cleanup path, the epoll file exit * code and the ctl operations. */ struct mutex mtx; /* Wait queue used by sys_epoll_wait() */ wait_queue_head_t wq; /* Wait queue used by file->poll() */ wait_queue_head_t poll_wait; /* List of ready file descriptors */ struct list_head rdllist; /* Lock which protects rdllist and ovflist */ rwlock_t lock; /* RB tree root used to store monitored fd structs */ struct rb_root_cached rbr; /* * This is a single linked list that chains all the "struct epitem" that * happened while transferring ready events to userspace w/out * holding ->lock. */ struct epitem *ovflist; /* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */ struct wakeup_source *ws; /* The user that created the eventpoll descriptor */ struct user_struct *user; struct file *file; /* used to optimize loop detection check */ u64 gen; struct hlist_head refs; /* * usage count, used together with epitem->dying to * orchestrate the disposal of this struct */ refcount_t refcount; #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; /* busy poll timeout */ u32 busy_poll_usecs; /* busy poll packet budget */ u16 busy_poll_budget; bool prefer_busy_poll; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC /* tracks wakeup nests for lockdep validation */ u8 nests; #endif }; /* Wrapper struct used by poll queueing */ struct ep_pqueue { poll_table pt; struct epitem *epi; }; /* * Configuration options available inside /proc/sys/fs/epoll/ */ /* Maximum number of epoll watched descriptors, per user */ static long max_user_watches __read_mostly; /* Used for cycles detection */ static DEFINE_MUTEX(epnested_mutex); static u64 loop_check_gen = 0; /* Used to check for epoll file descriptor inclusion loops */ static struct eventpoll *inserting_into; /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __ro_after_init; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __ro_after_init; /* * List of files with newly added links, where we may need to limit the number * of emanating paths. Protected by the epnested_mutex. */ struct epitems_head { struct hlist_head epitems; struct epitems_head *next; }; static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR; static struct kmem_cache *ephead_cache __ro_after_init; static inline void free_ephead(struct epitems_head *head) { if (head) kmem_cache_free(ephead_cache, head); } static void list_file(struct file *file) { struct epitems_head *head; head = container_of(file->f_ep, struct epitems_head, epitems); if (!head->next) { head->next = tfile_check_list; tfile_check_list = head; } } static void unlist_file(struct epitems_head *head) { struct epitems_head *to_free = head; struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems)); if (p) { struct epitem *epi= container_of(p, struct epitem, fllink); spin_lock(&epi->ffd.file->f_lock); if (!hlist_empty(&head->epitems)) to_free = NULL; head->next = NULL; spin_unlock(&epi->ffd.file->f_lock); } free_ephead(to_free); } #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> static long long_zero; static long long_max = LONG_MAX; static const struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, .maxlen = sizeof(max_user_watches), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = &long_zero, .extra2 = &long_max, }, }; static void __init epoll_sysctls_init(void) { register_sysctl("fs/epoll", epoll_table); } #else #define epoll_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ static const struct file_operations eventpoll_fops; static inline int is_file_epoll(struct file *f) { return f->f_op == &eventpoll_fops; } /* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, struct file *file, int fd) { ffd->file = file; ffd->fd = fd; } /* Compare RB tree keys */ static inline int ep_cmp_ffd(struct epoll_filefd *p1, struct epoll_filefd *p2) { return (p1->file > p2->file ? +1: (p1->file < p2->file ? -1 : p1->fd - p2->fd)); } /* Tells us if the item is currently linked */ static inline int ep_is_linked(struct epitem *epi) { return !list_empty(&epi->rdllink); } static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait); } /* Get the "struct epitem" from a wait queue pointer */ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait)->base; } /** * ep_events_available - Checks if ready events might be available. * * @ep: Pointer to the eventpoll context. * * Return: a value different than %zero if ready events are available, * or %zero otherwise. */ static inline int ep_events_available(struct eventpoll *ep) { return !list_empty_careful(&ep->rdllist) || READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR; } #ifdef CONFIG_NET_RX_BUSY_POLL /** * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value * from the epoll instance ep is preferred, but if it is not set fallback to * the system-wide global via busy_loop_timeout. * * @start_time: The start time used to compute the remaining time until timeout. * @ep: Pointer to the eventpoll context. * * Return: true if the timeout has expired, false otherwise. */ static bool busy_loop_ep_timeout(unsigned long start_time, struct eventpoll *ep) { unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } else { return busy_loop_timeout(start_time); } } static bool ep_busy_loop_on(struct eventpoll *ep) { return !!READ_ONCE(ep->busy_poll_usecs) || READ_ONCE(ep->prefer_busy_poll) || net_busy_loop_on(); } static bool ep_busy_loop_end(void *p, unsigned long start_time) { struct eventpoll *ep = p; return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep); } /* * Busy poll if globally on and supporting sockets found && no events, * busy loop will return if need_resched or ep_events_available. * * we must do our busy polling with irqs enabled */ static bool ep_busy_loop(struct eventpoll *ep, int nonblock) { unsigned int napi_id = READ_ONCE(ep->napi_id); u16 budget = READ_ONCE(ep->busy_poll_budget); bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); if (!budget) budget = BUSY_POLL_BUDGET; if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) { napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, prefer_busy_poll, budget); if (ep_events_available(ep)) return true; /* * Busy poll timed out. Drop NAPI ID for now, we can add * it back in when we have moved a socket with a valid NAPI * ID onto the ready list. */ if (prefer_busy_poll) napi_resume_irqs(napi_id); ep->napi_id = 0; return false; } return false; } /* * Set epoll busy poll NAPI ID from sk. */ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { struct eventpoll *ep = epi->ep; unsigned int napi_id; struct socket *sock; struct sock *sk; if (!ep_busy_loop_on(ep)) return; sock = sock_from_file(epi->ffd.file); if (!sock) return; sk = sock->sk; if (!sk) return; napi_id = READ_ONCE(sk->sk_napi_id); /* Non-NAPI IDs can be rejected * or * Nothing to do if we already have this ID */ if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id) return; /* record NAPI ID for use in next busy poll */ ep->napi_id = napi_id; } static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct eventpoll *ep = file->private_data; void __user *uarg = (void __user *)arg; struct epoll_params epoll_params; switch (cmd) { case EPIOCSPARAMS: if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params))) return -EFAULT; /* pad byte must be zero */ if (epoll_params.__pad) return -EINVAL; if (epoll_params.busy_poll_usecs > S32_MAX) return -EINVAL; if (epoll_params.prefer_busy_poll > 1) return -EINVAL; if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT && !capable(CAP_NET_ADMIN)) return -EPERM; WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs); WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget); WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll); return 0; case EPIOCGPARAMS: memset(&epoll_params, 0, sizeof(epoll_params)); epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs); epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget); epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll); if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params))) return -EFAULT; return 0; default: return -ENOIOCTLCMD; } } static void ep_suspend_napi_irqs(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll)) napi_suspend_irqs(napi_id); } static void ep_resume_napi_irqs(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll)) napi_resume_irqs(napi_id); } #else static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock) { return false; } static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { } static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return -EOPNOTSUPP; } static void ep_suspend_napi_irqs(struct eventpoll *ep) { } static void ep_resume_napi_irqs(struct eventpoll *ep) { } #endif /* CONFIG_NET_RX_BUSY_POLL */ /* * As described in commit 0ccf831cb lockdep: annotate epoll * the use of wait queues used by epoll is done in a very controlled * manner. Wake ups can nest inside each other, but are never done * with the same locking. For example: * * dfd = socket(...); * efd1 = epoll_create(); * efd2 = epoll_create(); * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...); * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...); * * When a packet arrives to the device underneath "dfd", the net code will * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a * callback wakeup entry on that queue, and the wake_up() performed by the * "dfd" net code will end up in ep_poll_callback(). At this point epoll * (efd1) notices that it may have some event ready, so it needs to wake up * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake() * that ends up in another wake_up(), after having checked about the * recursion constraints. That are, no more than EP_MAX_NESTS, to avoid * stack blasting. * * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle * this special case of epoll. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, unsigned pollflags) { struct eventpoll *ep_src; unsigned long flags; u8 nests = 0; /* * To set the subclass or nesting level for spin_lock_irqsave_nested() * it might be natural to create a per-cpu nest count. However, since * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can * schedule() in the -rt kernel, the per-cpu variable are no longer * protected. Thus, we are introducing a per eventpoll nest field. * If we are not being call from ep_poll_callback(), epi is NULL and * we are at the first level of nesting, 0. Otherwise, we are being * called from ep_poll_callback() and if a previous wakeup source is * not an epoll file itself, we are at depth 1 since the wakeup source * is depth 0. If the wakeup source is a previous epoll file in the * wakeup chain then we use its nests value and record ours as * nests + 1. The previous epoll file nests value is stable since its * already holding its own poll_wait.lock. */ if (epi) { if ((is_file_epoll(epi->ffd.file))) { ep_src = epi->ffd.file->private_data; nests = ep_src->nests; } else { nests = 1; } } spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); ep->nests = nests + 1; wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags); ep->nests = 0; spin_unlock_irqrestore(&ep->poll_wait.lock, flags); } #else static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, __poll_t pollflags) { wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); } #endif static void ep_remove_wait_queue(struct eppoll_entry *pwq) { wait_queue_head_t *whead; rcu_read_lock(); /* * If it is cleared by POLLFREE, it should be rcu-safe. * If we read NULL we need a barrier paired with * smp_store_release() in ep_poll_callback(), otherwise * we rely on whead->lock. */ whead = smp_load_acquire(&pwq->whead); if (whead) remove_wait_queue(whead, &pwq->wait); rcu_read_unlock(); } /* * This function unregisters poll callbacks from the associated file * descriptor. Must be called with "mtx" held. */ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) { struct eppoll_entry **p = &epi->pwqlist; struct eppoll_entry *pwq; while ((pwq = *p) != NULL) { *p = pwq->next; ep_remove_wait_queue(pwq); kmem_cache_free(pwq_cache, pwq); } } /* call only when ep->mtx is held */ static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi) { return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx)); } /* call only when ep->mtx is held */ static inline void ep_pm_stay_awake(struct epitem *epi) { struct wakeup_source *ws = ep_wakeup_source(epi); if (ws) __pm_stay_awake(ws); } static inline bool ep_has_wakeup_source(struct epitem *epi) { return rcu_access_pointer(epi->ws) ? true : false; } /* call when ep->mtx cannot be held (ep_poll_callback) */ static inline void ep_pm_stay_awake_rcu(struct epitem *epi) { struct wakeup_source *ws; rcu_read_lock(); ws = rcu_dereference(epi->ws); if (ws) __pm_stay_awake(ws); rcu_read_unlock(); } /* * ep->mutex needs to be held because we could be hit by * eventpoll_release_file() and epoll_ctl(). */ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist) { /* * Steal the ready list, and re-init the original one to the * empty list. Also, set ep->ovflist to NULL so that events * happening while looping w/out locks, are not lost. We cannot * have the poll callback to queue directly on ep->rdllist, * because we want the "sproc" callback to be able to do it * in a lockless way. */ lockdep_assert_irqs_enabled(); write_lock_irq(&ep->lock); list_splice_init(&ep->rdllist, txlist); WRITE_ONCE(ep->ovflist, NULL); write_unlock_irq(&ep->lock); } static void ep_done_scan(struct eventpoll *ep, struct list_head *txlist) { struct epitem *epi, *nepi; write_lock_irq(&ep->lock); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. * We re-insert them inside the main ready-list here. */ for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { /* * We need to check if the item is already in the list. * During the "sproc" callback execution time, items are * queued into ->ovflist but the "txlist" might already * contain them, and the list_splice() below takes care of them. */ if (!ep_is_linked(epi)) { /* * ->ovflist is LIFO, so we have to reverse it in order * to keep in FIFO. */ list_add(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } /* * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after * releasing the lock, events will be queued in the normal way inside * ep->rdllist. */ WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR); /* * Quickly re-inject items left on "txlist". */ list_splice(txlist, &ep->rdllist); __pm_relax(ep->ws); if (!list_empty(&ep->rdllist)) { if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); } write_unlock_irq(&ep->lock); } static void ep_get(struct eventpoll *ep) { refcount_inc(&ep->refcount); } /* * Returns true if the event poll can be disposed */ static bool ep_refcount_dec_and_test(struct eventpoll *ep) { if (!refcount_dec_and_test(&ep->refcount)) return false; WARN_ON_ONCE(!RB_EMPTY_ROOT(&ep->rbr.rb_root)); return true; } static void ep_free(struct eventpoll *ep) { ep_resume_napi_irqs(ep); mutex_destroy(&ep->mtx); free_uid(ep->user); wakeup_source_unregister(ep->ws); kfree(ep); } /* * Removes a "struct epitem" from the eventpoll RB tree and deallocates * all the associated resources. Must be called with "mtx" held. * If the dying flag is set, do the removal only if force is true. * This prevents ep_clear_and_put() from dropping all the ep references * while running concurrently with eventpoll_release_file(). * Returns true if the eventpoll can be disposed. */ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) { struct file *file = epi->ffd.file; struct epitems_head *to_free; struct hlist_head *head; lockdep_assert_irqs_enabled(); /* * Removes poll wait queue hooks. */ ep_unregister_pollwait(ep, epi); /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); if (epi->dying && !force) { spin_unlock(&file->f_lock); return false; } to_free = NULL; head = file->f_ep; if (head->first == &epi->fllink && !epi->fllink.next) { /* See eventpoll_release() for details. */ WRITE_ONCE(file->f_ep, NULL); if (!is_file_epoll(file)) { struct epitems_head *v; v = container_of(head, struct epitems_head, epitems); if (!smp_load_acquire(&v->next)) to_free = v; } } hlist_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); free_ephead(to_free); rb_erase_cached(&epi->rbn, &ep->rbr); write_lock_irq(&ep->lock); if (ep_is_linked(epi)) list_del_init(&epi->rdllink); write_unlock_irq(&ep->lock); wakeup_source_unregister(ep_wakeup_source(epi)); /* * At this point it is safe to free the eventpoll item. Use the union * field epi->rcu, since we are trying to minimize the size of * 'struct epitem'. The 'rbn' field is no longer in use. Protected by * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make * use of the rbn field. */ kfree_rcu(epi, rcu); percpu_counter_dec(&ep->user->epoll_watches); return ep_refcount_dec_and_test(ep); } /* * ep_remove variant for callers owing an additional reference to the ep */ static void ep_remove_safe(struct eventpoll *ep, struct epitem *epi) { WARN_ON_ONCE(__ep_remove(ep, epi, false)); } static void ep_clear_and_put(struct eventpoll *ep) { struct rb_node *rbp, *next; struct epitem *epi; bool dispose; /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) ep_poll_safewake(ep, NULL, 0); mutex_lock(&ep->mtx); /* * Walks through the whole tree by unregistering poll callbacks. */ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); ep_unregister_pollwait(ep, epi); cond_resched(); } /* * Walks through the whole tree and try to free each "struct epitem". * Note that ep_remove_safe() will not remove the epitem in case of a * racing eventpoll_release_file(); the latter will do the removal. * At this point we are sure no poll callbacks will be lingering around. * Since we still own a reference to the eventpoll struct, the loop can't * dispose it. */ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = next) { next = rb_next(rbp); epi = rb_entry(rbp, struct epitem, rbn); ep_remove_safe(ep, epi); cond_resched(); } dispose = ep_refcount_dec_and_test(ep); mutex_unlock(&ep->mtx); if (dispose) ep_free(ep); } static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { int ret; if (!is_file_epoll(file)) return -EINVAL; switch (cmd) { case EPIOCSPARAMS: case EPIOCGPARAMS: ret = ep_eventpoll_bp_ioctl(file, cmd, arg); break; default: ret = -EINVAL; break; } return ret; } static int ep_eventpoll_release(struct inode *inode, struct file *file) { struct eventpoll *ep = file->private_data; if (ep) ep_clear_and_put(ep); return 0; } static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth); static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth) { struct eventpoll *ep = file->private_data; LIST_HEAD(txlist); struct epitem *epi, *tmp; poll_table pt; __poll_t res = 0; init_poll_funcptr(&pt, NULL); /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); /* * Proceed to find out if wanted events are really available inside * the ready list. */ mutex_lock_nested(&ep->mtx, depth); ep_start_scan(ep, &txlist); list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { if (ep_item_poll(epi, &pt, depth + 1)) { res = EPOLLIN | EPOLLRDNORM; break; } else { /* * Item has been dropped into the ready list by the poll * callback, but it's not actually ready, as far as * caller requested events goes. We can remove it here. */ __pm_relax(ep_wakeup_source(epi)); list_del_init(&epi->rdllink); } } ep_done_scan(ep, &txlist); mutex_unlock(&ep->mtx); return res; } /* * The ffd.file pointer may be in the process of being torn down due to * being closed, but we may not have finished eventpoll_release() yet. * * Normally, even with the atomic_long_inc_not_zero, the file may have * been free'd and then gotten re-allocated to something else (since * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU). * * But for epoll, users hold the ep->mtx mutex, and as such any file in * the process of being free'd will block in eventpoll_release_file() * and thus the underlying file allocation will not be free'd, and the * file re-use cannot happen. * * For the same reason we can avoid a rcu_read_lock() around the * operation - 'ffd.file' cannot go away even if the refcount has * reached zero (but we must still not call out to ->poll() functions * etc). */ static struct file *epi_fget(const struct epitem *epi) { struct file *file; file = epi->ffd.file; if (!file_ref_get(&file->f_ref)) file = NULL; return file; } /* * Differs from ep_eventpoll_poll() in that internal callers already have * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() * is correctly annotated. */ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth) { struct file *file = epi_fget(epi); __poll_t res; /* * We could return EPOLLERR | EPOLLHUP or something, but let's * treat this more as "file doesn't exist, poll didn't happen". */ if (!file) return 0; pt->_key = epi->event.events; if (!is_file_epoll(file)) res = vfs_poll(file, pt); else res = __ep_eventpoll_poll(file, pt, depth); fput(file); return res & epi->event.events; } static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) { return __ep_eventpoll_poll(file, wait, 0); } #ifdef CONFIG_PROC_FS static void ep_show_fdinfo(struct seq_file *m, struct file *f) { struct eventpoll *ep = f->private_data; struct rb_node *rbp; mutex_lock(&ep->mtx); for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { struct epitem *epi = rb_entry(rbp, struct epitem, rbn); struct inode *inode = file_inode(epi->ffd.file); seq_printf(m, "tfd: %8d events: %8x data: %16llx " " pos:%lli ino:%lx sdev:%x\n", epi->ffd.fd, epi->event.events, (long long)epi->event.data, (long long)epi->ffd.file->f_pos, inode->i_ino, inode->i_sb->s_dev); if (seq_has_overflowed(m)) break; } mutex_unlock(&ep->mtx); } #endif /* File callbacks that implement the eventpoll file behaviour */ static const struct file_operations eventpoll_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = ep_show_fdinfo, #endif .release = ep_eventpoll_release, .poll = ep_eventpoll_poll, .llseek = noop_llseek, .unlocked_ioctl = ep_eventpoll_ioctl, .compat_ioctl = compat_ptr_ioctl, }; /* * This is called from eventpoll_release() to unlink files from the eventpoll * interface. We need to have this facility to cleanup correctly files that are * closed without being removed from the eventpoll interface. */ void eventpoll_release_file(struct file *file) { struct eventpoll *ep; struct epitem *epi; bool dispose; /* * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from * touching the epitems list before eventpoll_release_file() can access * the ep->mtx. */ again: spin_lock(&file->f_lock); if (file->f_ep && file->f_ep->first) { epi = hlist_entry(file->f_ep->first, struct epitem, fllink); epi->dying = true; spin_unlock(&file->f_lock); /* * ep access is safe as we still own a reference to the ep * struct */ ep = epi->ep; mutex_lock(&ep->mtx); dispose = __ep_remove(ep, epi, true); mutex_unlock(&ep->mtx); if (dispose) ep_free(ep); goto again; } spin_unlock(&file->f_lock); } static int ep_alloc(struct eventpoll **pep) { struct eventpoll *ep; ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (unlikely(!ep)) return -ENOMEM; mutex_init(&ep->mtx); rwlock_init(&ep->lock); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); ep->rbr = RB_ROOT_CACHED; ep->ovflist = EP_UNACTIVE_PTR; ep->user = get_current_user(); refcount_set(&ep->refcount, 1); *pep = ep; return 0; } /* * Search the file inside the eventpoll tree. The RB tree operations * are protected by the "mtx" mutex, and ep_find() must be called with * "mtx" held. */ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) { int kcmp; struct rb_node *rbp; struct epitem *epi, *epir = NULL; struct epoll_filefd ffd; ep_set_ffd(&ffd, file, fd); for (rbp = ep->rbr.rb_root.rb_node; rbp; ) { epi = rb_entry(rbp, struct epitem, rbn); kcmp = ep_cmp_ffd(&ffd, &epi->ffd); if (kcmp > 0) rbp = rbp->rb_right; else if (kcmp < 0) rbp = rbp->rb_left; else { epir = epi; break; } } return epir; } #ifdef CONFIG_KCMP static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) { struct rb_node *rbp; struct epitem *epi; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (epi->ffd.fd == tfd) { if (toff == 0) return epi; else toff--; } cond_resched(); } return NULL; } struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff) { struct file *file_raw; struct eventpoll *ep; struct epitem *epi; if (!is_file_epoll(file)) return ERR_PTR(-EINVAL); ep = file->private_data; mutex_lock(&ep->mtx); epi = ep_find_tfd(ep, tfd, toff); if (epi) file_raw = epi->ffd.file; else file_raw = ERR_PTR(-ENOENT); mutex_unlock(&ep->mtx); return file_raw; } #endif /* CONFIG_KCMP */ /* * Adds a new entry to the tail of the list in a lockless way, i.e. * multiple CPUs are allowed to call this function concurrently. * * Beware: it is necessary to prevent any other modifications of the * existing list until all changes are completed, in other words * concurrent list_add_tail_lockless() calls should be protected * with a read lock, where write lock acts as a barrier which * makes sure all list_add_tail_lockless() calls are fully * completed. * * Also an element can be locklessly added to the list only in one * direction i.e. either to the tail or to the head, otherwise * concurrent access will corrupt the list. * * Return: %false if element has been already added to the list, %true * otherwise. */ static inline bool list_add_tail_lockless(struct list_head *new, struct list_head *head) { struct list_head *prev; /* * This is simple 'new->next = head' operation, but cmpxchg() * is used in order to detect that same element has been just * added to the list from another CPU: the winner observes * new->next == new. */ if (!try_cmpxchg(&new->next, &new, head)) return false; /* * Initially ->next of a new element must be updated with the head * (we are inserting to the tail) and only then pointers are atomically * exchanged. XCHG guarantees memory ordering, thus ->next should be * updated before pointers are actually swapped and pointers are * swapped before prev->next is updated. */ prev = xchg(&head->prev, new); /* * It is safe to modify prev->next and new->prev, because a new element * is added only to the tail and new->next is updated before XCHG. */ prev->next = new; new->prev = prev; return true; } /* * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, * i.e. multiple CPUs are allowed to call this function concurrently. * * Return: %false if epi element has been already chained, %true otherwise. */ static inline bool chain_epi_lockless(struct epitem *epi) { struct eventpoll *ep = epi->ep; /* Fast preliminary check */ if (epi->next != EP_UNACTIVE_PTR) return false; /* Check that the same epi has not been just chained from another CPU */ if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) return false; /* Atomically exchange tail */ epi->next = xchg(&ep->ovflist, epi); return true; } /* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they * have events to report. * * This callback takes a read lock in order not to contend with concurrent * events from another file descriptor, thus all modifications to ->rdllist * or ->ovflist are lockless. Read lock is paired with the write lock from * ep_start/done_scan(), which stops all list modifications and guarantees * that lists state is seen correctly. * * Another thing worth to mention is that ep_poll_callback() can be called * concurrently for the same @epi from different CPUs if poll table was inited * with several wait queues entries. Plural wakeup from different CPUs of a * single wait queue is serialized by wq.lock, but the case when multiple wait * queues are used should be detected accordingly. This is detected using * cmpxchg() operation. */ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int pwake = 0; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; __poll_t pollflags = key_to_poll(key); unsigned long flags; int ewake = 0; read_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); /* * If the event mask does not contain any poll(2) event, we consider the * descriptor to be disabled. This condition is likely the effect of the * EPOLLONESHOT bit that disables the descriptor when an event is received, * until the next EPOLL_CTL_MOD will be issued. */ if (!(epi->event.events & ~EP_PRIVATE_BITS)) goto out_unlock; /* * Check the events coming with the callback. At this stage, not * every device reports the events in the "key" parameter of the * callback. We need to be able to handle both cases here, hence the * test for "key" != NULL before the event match test. */ if (pollflags && !(pollflags & epi->event.events)) goto out_unlock; /* * If we are transferring events to userspace, we can hold no locks * (because we're accessing user memory, and because of linux f_op->poll() * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. */ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { if (chain_epi_lockless(epi)) ep_pm_stay_awake_rcu(epi); } else if (!ep_is_linked(epi)) { /* In the usual case, add event to ready list. */ if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) ep_pm_stay_awake_rcu(epi); } /* * Wake up ( if active ) both the eventpoll wait list and the ->poll() * wait list. */ if (waitqueue_active(&ep->wq)) { if ((epi->event.events & EPOLLEXCLUSIVE) && !(pollflags & POLLFREE)) { switch (pollflags & EPOLLINOUT_BITS) { case EPOLLIN: if (epi->event.events & EPOLLIN) ewake = 1; break; case EPOLLOUT: if (epi->event.events & EPOLLOUT) ewake = 1; break; case 0: ewake = 1; break; } } if (sync) wake_up_sync(&ep->wq); else wake_up(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) pwake++; out_unlock: read_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE); if (!(epi->event.events & EPOLLEXCLUSIVE)) ewake = 1; if (pollflags & POLLFREE) { /* * If we race with ep_remove_wait_queue() it can miss * ->whead = NULL and do another remove_wait_queue() after * us, so we can't use __remove_wait_queue(). */ list_del_init(&wait->entry); /* * ->whead != NULL protects us from the race with * ep_clear_and_put() or ep_remove(), ep_remove_wait_queue() * takes whead->lock held by the caller. Once we nullify it, * nothing protects ep/epi or even wait. */ smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); } return ewake; } /* * This is the callback that is used to add our wait queue to the * target file wakeup lists. */ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt) { struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt); struct epitem *epi = epq->epi; struct eppoll_entry *pwq; if (unlikely(!epi)) // an earlier allocation has failed return; pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL); if (unlikely(!pwq)) { epq->epi = NULL; return; } init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); pwq->whead = whead; pwq->base = epi; if (epi->event.events & EPOLLEXCLUSIVE) add_wait_queue_exclusive(whead, &pwq->wait); else add_wait_queue(whead, &pwq->wait); pwq->next = epi->pwqlist; epi->pwqlist = pwq; } static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) { int kcmp; struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL; struct epitem *epic; bool leftmost = true; while (*p) { parent = *p; epic = rb_entry(parent, struct epitem, rbn); kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd); if (kcmp > 0) { p = &parent->rb_right; leftmost = false; } else p = &parent->rb_left; } rb_link_node(&epi->rbn, parent, p); rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost); } #define PATH_ARR_SIZE 5 /* * These are the number paths of length 1 to 5, that we are allowing to emanate * from a single file of interest. For example, we allow 1000 paths of length * 1, to emanate from each file of interest. This essentially represents the * potential wakeup paths, which need to be limited in order to avoid massive * uncontrolled wakeup storms. The common use case should be a single ep which * is connected to n file sources. In this case each file source has 1 path * of length 1. Thus, the numbers below should be more than sufficient. These * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify * and delete can't add additional paths. Protected by the epnested_mutex. */ static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; static int path_count[PATH_ARR_SIZE]; static int path_count_inc(int nests) { /* Allow an arbitrary number of depth 1 paths */ if (nests == 0) return 0; if (++path_count[nests] > path_limits[nests]) return -1; return 0; } static void path_count_init(void) { int i; for (i = 0; i < PATH_ARR_SIZE; i++) path_count[i] = 0; } static int reverse_path_check_proc(struct hlist_head *refs, int depth) { int error = 0; struct epitem *epi; if (depth > EP_MAX_NESTS) /* too deep nesting */ return -1; /* CTL_DEL can remove links here, but that can't increase our count */ hlist_for_each_entry_rcu(epi, refs, fllink) { struct hlist_head *refs = &epi->ep->refs; if (hlist_empty(refs)) error = path_count_inc(depth); else error = reverse_path_check_proc(refs, depth + 1); if (error != 0) break; } return error; } /** * reverse_path_check - The tfile_check_list is list of epitem_head, which have * links that are proposed to be newly added. We need to * make sure that those added links don't add too many * paths such that we will spend all our time waking up * eventpoll objects. * * Return: %zero if the proposed links don't create too many paths, * %-1 otherwise. */ static int reverse_path_check(void) { struct epitems_head *p; for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) { int error; path_count_init(); rcu_read_lock(); error = reverse_path_check_proc(&p->epitems, 0); rcu_read_unlock(); if (error) return error; } return 0; } static int ep_create_wakeup_source(struct epitem *epi) { struct name_snapshot n; struct wakeup_source *ws; if (!epi->ep->ws) { epi->ep->ws = wakeup_source_register(NULL, "eventpoll"); if (!epi->ep->ws) return -ENOMEM; } take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry); ws = wakeup_source_register(NULL, n.name.name); release_dentry_name_snapshot(&n); if (!ws) return -ENOMEM; rcu_assign_pointer(epi->ws, ws); return 0; } /* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */ static noinline void ep_destroy_wakeup_source(struct epitem *epi) { struct wakeup_source *ws = ep_wakeup_source(epi); RCU_INIT_POINTER(epi->ws, NULL); /* * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is * used internally by wakeup_source_remove, too (called by * wakeup_source_unregister), so we cannot use call_rcu */ synchronize_rcu(); wakeup_source_unregister(ws); } static int attach_epitem(struct file *file, struct epitem *epi) { struct epitems_head *to_free = NULL; struct hlist_head *head = NULL; struct eventpoll *ep = NULL; if (is_file_epoll(file)) ep = file->private_data; if (ep) { head = &ep->refs; } else if (!READ_ONCE(file->f_ep)) { allocate: to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL); if (!to_free) return -ENOMEM; head = &to_free->epitems; } spin_lock(&file->f_lock); if (!file->f_ep) { if (unlikely(!head)) { spin_unlock(&file->f_lock); goto allocate; } /* See eventpoll_release() for details. */ WRITE_ONCE(file->f_ep, head); to_free = NULL; } hlist_add_head_rcu(&epi->fllink, file->f_ep); spin_unlock(&file->f_lock); free_ephead(to_free); return 0; } /* * Must be called with "mtx" held. */ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, struct file *tfile, int fd, int full_check) { int error, pwake = 0; __poll_t revents; struct epitem *epi; struct ep_pqueue epq; struct eventpoll *tep = NULL; if (is_file_epoll(tfile)) tep = tfile->private_data; lockdep_assert_irqs_enabled(); if (unlikely(percpu_counter_compare(&ep->user->epoll_watches, max_user_watches) >= 0)) return -ENOSPC; percpu_counter_inc(&ep->user->epoll_watches); if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) { percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); epi->ep = ep; ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; epi->next = EP_UNACTIVE_PTR; if (tep) mutex_lock_nested(&tep->mtx, 1); /* Add the current item to the list of active epoll hook for this file */ if (unlikely(attach_epitem(tfile, epi) < 0)) { if (tep) mutex_unlock(&tep->mtx); kmem_cache_free(epi_cache, epi); percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } if (full_check && !tep) list_file(tfile); /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. */ ep_rbtree_insert(ep, epi); if (tep) mutex_unlock(&tep->mtx); /* * ep_remove_safe() calls in the later error paths can't lead to * ep_free() as the ep file itself still holds an ep reference. */ ep_get(ep); /* now check if we've created too many backpaths */ if (unlikely(full_check && reverse_path_check())) { ep_remove_safe(ep, epi); return -EINVAL; } if (epi->event.events & EPOLLWAKEUP) { error = ep_create_wakeup_source(epi); if (error) { ep_remove_safe(ep, epi); return error; } } /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); /* * Attach the item to the poll hooks and get current event bits. * We can safely use the file* here because its usage count has * been increased by the caller of this function. Note that after * this operation completes, the poll callback can start hitting * the new item. */ revents = ep_item_poll(epi, &epq.pt, 1); /* * We have to check if something went wrong during the poll wait queue * install process. Namely an allocation for a wait queue failed due * high memory pressure. */ if (unlikely(!epq.epi)) { ep_remove_safe(ep, epi); return -ENOMEM; } /* We have to drop the new item inside our item list to keep track of it */ write_lock_irq(&ep->lock); /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); /* If the file is already "ready" we drop it inside the ready list */ if (revents && !ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } write_unlock_irq(&ep->lock); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, NULL, 0); return 0; } /* * Modify the interest event mask by dropping an event if the new mask * has a match in the current file status. Must be called with "mtx" held. */ static int ep_modify(struct eventpoll *ep, struct epitem *epi, const struct epoll_event *event) { int pwake = 0; poll_table pt; lockdep_assert_irqs_enabled(); init_poll_funcptr(&pt, NULL); /* * Set the new event interest mask before calling f_op->poll(); * otherwise we might miss an event that happens between the * f_op->poll() call and the new event set registering. */ epi->event.events = event->events; /* need barrier below */ epi->event.data = event->data; /* protected by mtx */ if (epi->event.events & EPOLLWAKEUP) { if (!ep_has_wakeup_source(epi)) ep_create_wakeup_source(epi); } else if (ep_has_wakeup_source(epi)) { ep_destroy_wakeup_source(epi); } /* * The following barrier has two effects: * * 1) Flush epi changes above to other CPUs. This ensures * we do not miss events from ep_poll_callback if an * event occurs immediately after we call f_op->poll(). * We need this because we did not take ep->lock while * changing epi above (but ep_poll_callback does take * ep->lock). * * 2) We also need to ensure we do not miss _past_ events * when calling f_op->poll(). This barrier also * pairs with the barrier in wq_has_sleeper (see * comments for wq_has_sleeper). * * This barrier will now guarantee ep_poll_callback or f_op->poll * (or both) will notice the readiness of an item. */ smp_mb(); /* * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. * If the item is "hot" and it is not registered inside the ready * list, push it inside. */ if (ep_item_poll(epi, &pt, 1)) { write_lock_irq(&ep->lock); if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } write_unlock_irq(&ep->lock); } /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, NULL, 0); return 0; } static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, int maxevents) { struct epitem *epi, *tmp; LIST_HEAD(txlist); poll_table pt; int res = 0; /* * Always short-circuit for fatal signals to allow threads to make a * timely exit without the chance of finding more events available and * fetching repeatedly. */ if (fatal_signal_pending(current)) return -EINTR; init_poll_funcptr(&pt, NULL); mutex_lock(&ep->mtx); ep_start_scan(ep, &txlist); /* * We can loop without lock because we are passed a task private list. * Items cannot vanish during the loop we are holding ep->mtx. */ list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { struct wakeup_source *ws; __poll_t revents; if (res >= maxevents) break; /* * Activate ep->ws before deactivating epi->ws to prevent * triggering auto-suspend here (in case we reactive epi->ws * below). * * This could be rearranged to delay the deactivation of epi->ws * instead, but then epi->ws would temporarily be out of sync * with ep_is_linked(). */ ws = ep_wakeup_source(epi); if (ws) { if (ws->active) __pm_stay_awake(ep->ws); __pm_relax(ws); } list_del_init(&epi->rdllink); /* * If the event mask intersect the caller-requested one, * deliver the event to userspace. Again, we are holding ep->mtx, * so no operations coming from userspace can change the item. */ revents = ep_item_poll(epi, &pt, 1); if (!revents) continue; events = epoll_put_uevent(revents, epi->event.data, events); if (!events) { list_add(&epi->rdllink, &txlist); ep_pm_stay_awake(epi); if (!res) res = -EFAULT; break; } res++; if (epi->event.events & EPOLLONESHOT) epi->event.events &= EP_PRIVATE_BITS; else if (!(epi->event.events & EPOLLET)) { /* * If this file has been added with Level * Trigger mode, we need to insert back inside * the ready list, so that the next call to * epoll_wait() will check again the events * availability. At this point, no one can insert * into ep->rdllist besides us. The epoll_ctl() * callers are locked out by * ep_send_events() holding "mtx" and the * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } ep_done_scan(ep, &txlist); mutex_unlock(&ep->mtx); return res; } static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms) { struct timespec64 now; if (ms < 0) return NULL; if (!ms) { to->tv_sec = 0; to->tv_nsec = 0; return to; } to->tv_sec = ms / MSEC_PER_SEC; to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC); ktime_get_ts64(&now); *to = timespec64_add_safe(now, *to); return to; } /* * autoremove_wake_function, but remove even on failure to wake up, because we * know that default_wake_function/ttwu will only fail if the thread is already * woken, and in that case the ep_poll loop will remove the entry anyways, not * try to reuse it. */ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, int sync, void *key) { int ret = default_wake_function(wq_entry, mode, sync, key); /* * Pairs with list_empty_careful in ep_poll, and ensures future loop * iterations see the cause of this wakeup. */ list_del_init_careful(&wq_entry->entry); return ret; } /** * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. * * @ep: Pointer to the eventpoll context. * @events: Pointer to the userspace buffer where the ready events should be * stored. * @maxevents: Size (in terms of number of events) of the caller event buffer. * @timeout: Maximum timeout for the ready events fetch operation, in * timespec. If the timeout is zero, the function will not block, * while if the @timeout ptr is NULL, the function will block * until at least one event has been retrieved (or an error * occurred). * * Return: the number of ready events which have been fetched, or an * error code, in case of error. */ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, struct timespec64 *timeout) { int res, eavail, timed_out = 0; u64 slack = 0; wait_queue_entry_t wait; ktime_t expires, *to = NULL; lockdep_assert_irqs_enabled(); if (timeout && (timeout->tv_sec | timeout->tv_nsec)) { slack = select_estimate_accuracy(timeout); to = &expires; *to = timespec64_to_ktime(*timeout); } else if (timeout) { /* * Avoid the unnecessary trip to the wait queue loop, if the * caller specified a non blocking operation. */ timed_out = 1; } /* * This call is racy: We may or may not see events that are being added * to the ready list under the lock (e.g., in IRQ callbacks). For cases * with a non-zero timeout, this thread will check the ready list under * lock and will add to the wait queue. For cases with a zero * timeout, the user by definition should not care and will have to * recheck again. */ eavail = ep_events_available(ep); while (1) { if (eavail) { /* * Try to transfer events to user space. In case we get * 0 events and there's still timeout left over, we go * trying again in search of more luck. */ res = ep_send_events(ep, events, maxevents); if (res) { if (res > 0) ep_suspend_napi_irqs(ep); return res; } } if (timed_out) return 0; eavail = ep_busy_loop(ep, timed_out); if (eavail) continue; if (signal_pending(current)) return -EINTR; /* * Internally init_wait() uses autoremove_wake_function(), * thus wait entry is removed from the wait queue on each * wakeup. Why it is important? In case of several waiters * each new wakeup will hit the next waiter, giving it the * chance to harvest new event. Otherwise wakeup can be * lost. This is also good performance-wise, because on * normal wakeup path no need to call __remove_wait_queue() * explicitly, thus ep->lock is not taken, which halts the * event delivery. * * In fact, we now use an even more aggressive function that * unconditionally removes, because we don't reuse the wait * entry between loop iterations. This lets us also avoid the * performance issue if a process is killed, causing all of its * threads to wake up without being removed normally. */ init_wait(&wait); wait.func = ep_autoremove_wake_function; write_lock_irq(&ep->lock); /* * Barrierless variant, waitqueue_active() is called under * the same lock on wakeup ep_poll_callback() side, so it * is safe to avoid an explicit barrier. */ __set_current_state(TASK_INTERRUPTIBLE); /* * Do the final check under the lock. ep_start/done_scan() * plays with two lists (->rdllist and ->ovflist) and there * is always a race when both lists are empty for short * period of time although events are pending, so lock is * important. */ eavail = ep_events_available(ep); if (!eavail) __add_wait_queue_exclusive(&ep->wq, &wait); write_unlock_irq(&ep->lock); if (!eavail) timed_out = !schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* * We were woken up, thus go and try to harvest some events. * If timed out and still on the wait queue, recheck eavail * carefully under lock, below. */ eavail = 1; if (!list_empty_careful(&wait.entry)) { write_lock_irq(&ep->lock); /* * If the thread timed out and is not on the wait queue, * it means that the thread was woken up after its * timeout expired before it could reacquire the lock. * Thus, when wait.entry is empty, it needs to harvest * events. */ if (timed_out) eavail = list_empty(&wait.entry); __remove_wait_queue(&ep->wq, &wait); write_unlock_irq(&ep->lock); } } } /** * ep_loop_check_proc - verify that adding an epoll file inside another * epoll structure does not violate the constraints, in * terms of closed loops, or too deep chains (which can * result in excessive stack usage). * * @ep: the &struct eventpoll to be currently checked. * @depth: Current depth of the path being checked. * * Return: %zero if adding the epoll @file inside current epoll * structure @ep does not violate the constraints, or %-1 otherwise. */ static int ep_loop_check_proc(struct eventpoll *ep, int depth) { int error = 0; struct rb_node *rbp; struct epitem *epi; mutex_lock_nested(&ep->mtx, depth + 1); ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { struct eventpoll *ep_tovisit; ep_tovisit = epi->ffd.file->private_data; if (ep_tovisit->gen == loop_check_gen) continue; if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) error = -1; else error = ep_loop_check_proc(ep_tovisit, depth + 1); if (error != 0) break; } else { /* * If we've reached a file that is not associated with * an ep, then we need to check if the newly added * links are going to add too many wakeup paths. We do * this by adding it to the tfile_check_list, if it's * not already there, and calling reverse_path_check() * during ep_insert(). */ list_file(epi->ffd.file); } } mutex_unlock(&ep->mtx); return error; } /** * ep_loop_check - Performs a check to verify that adding an epoll file (@to) * into another epoll file (represented by @ep) does not create * closed loops or too deep chains. * * @ep: Pointer to the epoll we are inserting into. * @to: Pointer to the epoll to be inserted. * * Return: %zero if adding the epoll @to inside the epoll @from * does not violate the constraints, or %-1 otherwise. */ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) { inserting_into = ep; return ep_loop_check_proc(to, 0); } static void clear_tfile_check_list(void) { rcu_read_lock(); while (tfile_check_list != EP_UNACTIVE_PTR) { struct epitems_head *head = tfile_check_list; tfile_check_list = head->next; unlist_file(head); } rcu_read_unlock(); } /* * Open an eventpoll file descriptor. */ static int do_epoll_create(int flags) { int error, fd; struct eventpoll *ep = NULL; struct file *file; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); if (flags & ~EPOLL_CLOEXEC) return -EINVAL; /* * Create the internal data structure ("struct eventpoll"). */ error = ep_alloc(&ep); if (error < 0) return error; /* * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); if (fd < 0) { error = fd; goto out_free_ep; } file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, O_RDWR | (flags & O_CLOEXEC)); if (IS_ERR(file)) { error = PTR_ERR(file); goto out_free_fd; } ep->file = file; fd_install(fd, file); return fd; out_free_fd: put_unused_fd(fd); out_free_ep: ep_clear_and_put(ep); return error; } SYSCALL_DEFINE1(epoll_create1, int, flags) { return do_epoll_create(flags); } SYSCALL_DEFINE1(epoll_create, int, size) { if (size <= 0) return -EINVAL; return do_epoll_create(0); } #ifdef CONFIG_PM_SLEEP static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) { if ((epev->events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND)) epev->events &= ~EPOLLWAKEUP; } #else static inline void ep_take_care_of_epollwakeup(struct epoll_event *epev) { epev->events &= ~EPOLLWAKEUP; } #endif static inline int epoll_mutex_lock(struct mutex *mutex, int depth, bool nonblock) { if (!nonblock) { mutex_lock_nested(mutex, depth); return 0; } if (mutex_trylock(mutex)) return 0; return -EAGAIN; } int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, bool nonblock) { int error; int full_check = 0; struct eventpoll *ep; struct epitem *epi; struct eventpoll *tep = NULL; CLASS(fd, f)(epfd); if (fd_empty(f)) return -EBADF; /* Get the "struct file *" for the target file */ CLASS(fd, tf)(fd); if (fd_empty(tf)) return -EBADF; /* The target file descriptor must support poll */ if (!file_can_poll(fd_file(tf))) return -EPERM; /* Check if EPOLLWAKEUP is allowed */ if (ep_op_has_event(op)) ep_take_care_of_epollwakeup(epds); /* * We have to check that the file structure underneath the file descriptor * the user passed to us _is_ an eventpoll file. And also we do not permit * adding an epoll file descriptor inside itself. */ error = -EINVAL; if (fd_file(f) == fd_file(tf) || !is_file_epoll(fd_file(f))) goto error_tgt_fput; /* * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only, * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. * Also, we do not currently supported nested exclusive wakeups. */ if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { if (op == EPOLL_CTL_MOD) goto error_tgt_fput; if (op == EPOLL_CTL_ADD && (is_file_epoll(fd_file(tf)) || (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) goto error_tgt_fput; } /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = fd_file(f)->private_data; /* * When we insert an epoll file descriptor inside another epoll file * descriptor, there is the chance of creating closed loops, which are * better be handled here, than in more critical paths. While we are * checking for loops we also determine the list of files reachable * and hang them on the tfile_check_list, so we can check that we * haven't created too many possible wakeup paths. * * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when * the epoll file descriptor is attaching directly to a wakeup source, * unless the epoll file descriptor is nested. The purpose of taking the * 'epnested_mutex' on add is to prevent complex toplogies such as loops and * deep wakeup paths from forming in parallel through multiple * EPOLL_CTL_ADD operations. */ error = epoll_mutex_lock(&ep->mtx, 0, nonblock); if (error) goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { if (READ_ONCE(fd_file(f)->f_ep) || ep->gen == loop_check_gen || is_file_epoll(fd_file(tf))) { mutex_unlock(&ep->mtx); error = epoll_mutex_lock(&epnested_mutex, 0, nonblock); if (error) goto error_tgt_fput; loop_check_gen++; full_check = 1; if (is_file_epoll(fd_file(tf))) { tep = fd_file(tf)->private_data; error = -ELOOP; if (ep_loop_check(ep, tep) != 0) goto error_tgt_fput; } error = epoll_mutex_lock(&ep->mtx, 0, nonblock); if (error) goto error_tgt_fput; } } /* * Try to lookup the file inside our RB tree. Since we grabbed "mtx" * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. */ epi = ep_find(ep, fd_file(tf), fd); error = -EINVAL; switch (op) { case EPOLL_CTL_ADD: if (!epi) { epds->events |= EPOLLERR | EPOLLHUP; error = ep_insert(ep, epds, fd_file(tf), fd, full_check); } else error = -EEXIST; break; case EPOLL_CTL_DEL: if (epi) { /* * The eventpoll itself is still alive: the refcount * can't go to zero here. */ ep_remove_safe(ep, epi); error = 0; } else { error = -ENOENT; } break; case EPOLL_CTL_MOD: if (epi) { if (!(epi->event.events & EPOLLEXCLUSIVE)) { epds->events |= EPOLLERR | EPOLLHUP; error = ep_modify(ep, epi, epds); } } else error = -ENOENT; break; } mutex_unlock(&ep->mtx); error_tgt_fput: if (full_check) { clear_tfile_check_list(); loop_check_gen++; mutex_unlock(&epnested_mutex); } return error; } /* * The following function implements the controller interface for * the eventpoll file that enables the insertion/removal/change of * file descriptors inside the interest set. */ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event) { struct epoll_event epds; if (ep_op_has_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event))) return -EFAULT; return do_epoll_ctl(epfd, op, fd, &epds, false); } /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). */ static int do_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to) { struct eventpoll *ep; /* The maximum number of event must be greater than zero */ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) return -EINVAL; /* Verify that the area passed by the user is writeable */ if (!access_ok(events, maxevents * sizeof(struct epoll_event))) return -EFAULT; /* Get the "struct file *" for the eventpoll file */ CLASS(fd, f)(epfd); if (fd_empty(f)) return -EBADF; /* * We have to check that the file structure underneath the fd * the user passed to us _is_ an eventpoll file. */ if (!is_file_epoll(fd_file(f))) return -EINVAL; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = fd_file(f)->private_data; /* Time to fish for events ... */ return ep_poll(ep, events, maxevents, to); } SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout) { struct timespec64 to; return do_epoll_wait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout)); } /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_pwait(2). */ static int do_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to, const sigset_t __user *sigmask, size_t sigsetsize) { int error; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ error = set_user_sigmask(sigmask, sigsetsize); if (error) return error; error = do_epoll_wait(epfd, events, maxevents, to); restore_saved_sigmask_unless(error == -EINTR); return error; } SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 to; return do_epoll_pwait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout), sigmask, sigsetsize); } SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events, int, maxevents, const struct __kernel_timespec __user *, timeout, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, *to = NULL; if (timeout) { if (get_timespec64(&ts, timeout)) return -EFAULT; to = &ts; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } return do_epoll_pwait(epfd, events, maxevents, to, sigmask, sigsetsize); } #ifdef CONFIG_COMPAT static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *timeout, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize) { long err; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ err = set_compat_user_sigmask(sigmask, sigsetsize); if (err) return err; err = do_epoll_wait(epfd, events, maxevents, timeout); restore_saved_sigmask_unless(err == -EINTR); return err; } COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 to; return do_compat_epoll_pwait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout), sigmask, sigsetsize); } COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events, int, maxevents, const struct __kernel_timespec __user *, timeout, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, *to = NULL; if (timeout) { if (get_timespec64(&ts, timeout)) return -EFAULT; to = &ts; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } return do_compat_epoll_pwait(epfd, events, maxevents, to, sigmask, sigsetsize); } #endif static int __init eventpoll_init(void) { struct sysinfo si; si_meminfo(&si); /* * Allows top 4% of lomem to be allocated for epoll watches (per user). */ max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) / EP_ITEM_COST; BUG_ON(max_user_watches < 0); /* * We can have many thousands of epitems, so prevent this from * using an extra cache line on 64-bit (and smaller) CPUs */ BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128); /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); epoll_sysctls_init(); ephead_cache = kmem_cache_create("ep_head", sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); return 0; } fs_initcall(eventpoll_init);
1100 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM pagemap #if !defined(_TRACE_PAGEMAP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PAGEMAP_H #include <linux/tracepoint.h> #include <linux/mm.h> #define PAGEMAP_MAPPED 0x0001u #define PAGEMAP_ANONYMOUS 0x0002u #define PAGEMAP_FILE 0x0004u #define PAGEMAP_SWAPCACHE 0x0008u #define PAGEMAP_SWAPBACKED 0x0010u #define PAGEMAP_MAPPEDDISK 0x0020u #define PAGEMAP_BUFFERS 0x0040u #define trace_pagemap_flags(folio) ( \ (folio_test_anon(folio) ? PAGEMAP_ANONYMOUS : PAGEMAP_FILE) | \ (folio_mapped(folio) ? PAGEMAP_MAPPED : 0) | \ (folio_test_swapcache(folio) ? PAGEMAP_SWAPCACHE : 0) | \ (folio_test_swapbacked(folio) ? PAGEMAP_SWAPBACKED : 0) | \ (folio_test_mappedtodisk(folio) ? PAGEMAP_MAPPEDDISK : 0) | \ (folio_test_private(folio) ? PAGEMAP_BUFFERS : 0) \ ) TRACE_EVENT(mm_lru_insertion, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(struct folio *, folio ) __field(unsigned long, pfn ) __field(enum lru_list, lru ) __field(unsigned long, flags ) ), TP_fast_assign( __entry->folio = folio; __entry->pfn = folio_pfn(folio); __entry->lru = folio_lru_list(folio); __entry->flags = trace_pagemap_flags(folio); ), /* Flag format is based on page-types.c formatting for pagemap */ TP_printk("folio=%p pfn=0x%lx lru=%d flags=%s%s%s%s%s%s", __entry->folio, __entry->pfn, __entry->lru, __entry->flags & PAGEMAP_MAPPED ? "M" : " ", __entry->flags & PAGEMAP_ANONYMOUS ? "a" : "f", __entry->flags & PAGEMAP_SWAPCACHE ? "s" : " ", __entry->flags & PAGEMAP_SWAPBACKED ? "b" : " ", __entry->flags & PAGEMAP_MAPPEDDISK ? "d" : " ", __entry->flags & PAGEMAP_BUFFERS ? "B" : " ") ); TRACE_EVENT(mm_lru_activate, TP_PROTO(struct folio *folio), TP_ARGS(folio), TP_STRUCT__entry( __field(struct folio *, folio ) __field(unsigned long, pfn ) ), TP_fast_assign( __entry->folio = folio; __entry->pfn = folio_pfn(folio); ), TP_printk("folio=%p pfn=0x%lx", __entry->folio, __entry->pfn) ); #endif /* _TRACE_PAGEMAP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
14 2 151 1 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Definitions for the 'struct skb_array' datastructure. * * Author: * Michael S. Tsirkin <mst@redhat.com> * * Copyright (C) 2016 Red Hat, Inc. * * Limited-size FIFO of skbs. Can be used more or less whenever * sk_buff_head can be used, except you need to know the queue size in * advance. * Implemented as a type-safe wrapper around ptr_ring. */ #ifndef _LINUX_SKB_ARRAY_H #define _LINUX_SKB_ARRAY_H 1 #ifdef __KERNEL__ #include <linux/ptr_ring.h> #include <linux/skbuff.h> #include <linux/if_vlan.h> #endif struct skb_array { struct ptr_ring ring; }; /* Might be slightly faster than skb_array_full below, but callers invoking * this in a loop must use a compiler barrier, for example cpu_relax(). */ static inline bool __skb_array_full(struct skb_array *a) { return __ptr_ring_full(&a->ring); } static inline bool skb_array_full(struct skb_array *a) { return ptr_ring_full(&a->ring); } static inline int skb_array_produce(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce(&a->ring, skb); } static inline int skb_array_produce_irq(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_irq(&a->ring, skb); } static inline int skb_array_produce_bh(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_bh(&a->ring, skb); } static inline int skb_array_produce_any(struct skb_array *a, struct sk_buff *skb) { return ptr_ring_produce_any(&a->ring, skb); } /* Might be slightly faster than skb_array_empty below, but only safe if the * array is never resized. Also, callers invoking this in a loop must take care * to use a compiler barrier, for example cpu_relax(). */ static inline bool __skb_array_empty(struct skb_array *a) { return __ptr_ring_empty(&a->ring); } static inline struct sk_buff *__skb_array_peek(struct skb_array *a) { return __ptr_ring_peek(&a->ring); } static inline bool skb_array_empty(struct skb_array *a) { return ptr_ring_empty(&a->ring); } static inline bool skb_array_empty_bh(struct skb_array *a) { return ptr_ring_empty_bh(&a->ring); } static inline bool skb_array_empty_irq(struct skb_array *a) { return ptr_ring_empty_irq(&a->ring); } static inline bool skb_array_empty_any(struct skb_array *a) { return ptr_ring_empty_any(&a->ring); } static inline struct sk_buff *__skb_array_consume(struct skb_array *a) { return __ptr_ring_consume(&a->ring); } static inline struct sk_buff *skb_array_consume(struct skb_array *a) { return ptr_ring_consume(&a->ring); } static inline int skb_array_consume_batched(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_irq(struct skb_array *a) { return ptr_ring_consume_irq(&a->ring); } static inline int skb_array_consume_batched_irq(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_irq(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_any(struct skb_array *a) { return ptr_ring_consume_any(&a->ring); } static inline int skb_array_consume_batched_any(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_any(&a->ring, (void **)array, n); } static inline struct sk_buff *skb_array_consume_bh(struct skb_array *a) { return ptr_ring_consume_bh(&a->ring); } static inline int skb_array_consume_batched_bh(struct skb_array *a, struct sk_buff **array, int n) { return ptr_ring_consume_batched_bh(&a->ring, (void **)array, n); } static inline int __skb_array_len_with_tag(struct sk_buff *skb) { if (likely(skb)) { int len = skb->len; if (skb_vlan_tag_present(skb)) len += VLAN_HLEN; return len; } else { return 0; } } static inline int skb_array_peek_len(struct skb_array *a) { return PTR_RING_PEEK_CALL(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_irq(struct skb_array *a) { return PTR_RING_PEEK_CALL_IRQ(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_bh(struct skb_array *a) { return PTR_RING_PEEK_CALL_BH(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_peek_len_any(struct skb_array *a) { return PTR_RING_PEEK_CALL_ANY(&a->ring, __skb_array_len_with_tag); } static inline int skb_array_init_noprof(struct skb_array *a, int size, gfp_t gfp) { return ptr_ring_init_noprof(&a->ring, size, gfp); } #define skb_array_init(...) alloc_hooks(skb_array_init_noprof(__VA_ARGS__)) static void __skb_array_destroy_skb(void *ptr) { kfree_skb(ptr); } static inline void skb_array_unconsume(struct skb_array *a, struct sk_buff **skbs, int n) { ptr_ring_unconsume(&a->ring, (void **)skbs, n, __skb_array_destroy_skb); } static inline int skb_array_resize(struct skb_array *a, int size, gfp_t gfp) { return ptr_ring_resize(&a->ring, size, gfp, __skb_array_destroy_skb); } static inline int skb_array_resize_multiple_bh_noprof(struct skb_array **rings, int nrings, unsigned int size, gfp_t gfp) { BUILD_BUG_ON(offsetof(struct skb_array, ring)); return ptr_ring_resize_multiple_bh_noprof((struct ptr_ring **)rings, nrings, size, gfp, __skb_array_destroy_skb); } #define skb_array_resize_multiple_bh(...) \ alloc_hooks(skb_array_resize_multiple_bh_noprof(__VA_ARGS__)) static inline void skb_array_cleanup(struct skb_array *a) { ptr_ring_cleanup(&a->ring, __skb_array_destroy_skb); } #endif /* _LINUX_SKB_ARRAY_H */
635 163 163 683 683 1 116 5 685 65 66 66 63 685 640 783 635 637 636 3 635 5860 5788 786 957 235 63 5864 5797 681 135 5812 135 5872 785 785 956 5873 671 121 3 66 3 682 679 3 681 65 682 683 683 681 683 683 632 635 635 634 6 65 636 3 1 3 637 636 636 637 637 638 65 66 631 3 637 127 5858 781 5873 5854 671 668 3 745 5582 1403 683 683 683 683 606 645 646 65 643 646 643 3 644 633 642 3 646 645 8 8 5 3 3 3 3 745 33 193 640 638 8 65 66 3 66 66 66 66 630 630 630 631 66 66 3 66 66 66 6 6 6 6 37 64 2 2 37 37 37 37 37 66 66 66 66 8 66 65 66 66 66 66 66 66 66 66 99 639 66 66 66 66 66 66 66 66 66 66 66 66 34 34 66 64 1 33 43 41 3 44 66 18 63 44 25 3 26 25 23 5 26 3 62 3 57 7 60 3 58 58 58 784 781 116 785 785 20 3 18 20 20 19 13 11 20 66 3 66 66 1 1 20 1 1 1 1 66 66 1 66 66 66 66 66 65 3 66 66 66 66 66 3 65 66 66 66 65 66 66 66 66 66 66 11 13 19 13 11 17 5 20 20 20 19 5798 5815 5798 5409 4966 4964 5816 5816 5795 5802 20 20 20 1 20 20 20 20 20 20 20 20 20 20 19 19 19 19 12 10 19 17 3 63 63 63 63 3 62 62 3 63 63 63 3 62 63 63 63 63 63 63 63 60 62 62 33 6 36 42 6 36 42 36 6 35 63 35 6 36 63 62 3 63 121 121 2 122 121 447 18 680 16 8 781 785 785 784 785 83 17 18 18 18 3 1 4 4 2 2 605 4 1 1 1 1 1 1 18 18 18 14 4 18 18 638 604 281 34 630 637 636 344 1 636 9 81 632 637 7 7 7 14 67 4 7 4 85 6 278 684 144 144 143 778 63 671 532 59 144 7 638 18 674 16 782 5 704 121 669 63 3 641 783 671 785 785 783 89 784 744 745 38 745 144 641 774 773 772 674 127 2 134 135 135 135 135 135 135 682 682 24 24 24 23 3 24 24 681 848 852 35 847 846 851 24 851 853 851 34 682 682 682 3 682 683 683 681 682 682 683 681 682 1748 1747 1753 683 681 33 1749 690 1742 682 1748 1757 1751 682 679 5 5 5 5 5 5 5 5 5 5 2 5 5 1 5 5 5 5 134 134 134 134 134 134 3 134 134 3 831 605 1414 604 605 134 3 135 135 2 134 134 134 134 134 5 5 5 5 5 5 5 3 3 3 3 3 3 3 3 3 3 116 116 3 3 115 3 116 670 669 670 664 664 14 663 663 661 682 681 680 634 636 637 783 782 781 88 87 681 670 139 672 672 119 22 22 851 849 8 8 853 851 128 127 682 683 620 683 683 1 604 681 683 4 4 4 4 772 1 1 1 1 1 135 135 4 667 667 602 116 5145 5141 5140 5141 6 1121 1122 5132 5 5144 683 683 682 683 683 683 682 682 683 681 682 680 682 683 683 682 681 683 681 680 683 682 682 681 680 682 683 683 682 681 683 682 681 682 683 679 679 681 683 683 683 681 681 683 683 683 682 682 683 683 680 683 681 683 681 683 682 681 683 681 681 680 683 682 683 683 683 683 683 683 683 683 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 // SPDX-License-Identifier: GPL-2.0+ /* * Maple Tree implementation * Copyright (c) 2018-2022 Oracle Corporation * Authors: Liam R. Howlett <Liam.Howlett@oracle.com> * Matthew Wilcox <willy@infradead.org> * Copyright (c) 2023 ByteDance * Author: Peng Zhang <zhangpeng.00@bytedance.com> */ /* * DOC: Interesting implementation details of the Maple Tree * * Each node type has a number of slots for entries and a number of slots for * pivots. In the case of dense nodes, the pivots are implied by the position * and are simply the slot index + the minimum of the node. * * In regular B-Tree terms, pivots are called keys. The term pivot is used to * indicate that the tree is specifying ranges. Pivots may appear in the * subtree with an entry attached to the value whereas keys are unique to a * specific position of a B-tree. Pivot values are inclusive of the slot with * the same index. * * * The following illustrates the layout of a range64 nodes slots and pivots. * * * Slots -> | 0 | 1 | 2 | ... | 12 | 13 | 14 | 15 | * ┬ ┬ ┬ ┬ ┬ ┬ ┬ ┬ ┬ * │ │ │ │ │ │ │ │ └─ Implied maximum * │ │ │ │ │ │ │ └─ Pivot 14 * │ │ │ │ │ │ └─ Pivot 13 * │ │ │ │ │ └─ Pivot 12 * │ │ │ │ └─ Pivot 11 * │ │ │ └─ Pivot 2 * │ │ └─ Pivot 1 * │ └─ Pivot 0 * └─ Implied minimum * * Slot contents: * Internal (non-leaf) nodes contain pointers to other nodes. * Leaf nodes contain entries. * * The location of interest is often referred to as an offset. All offsets have * a slot, but the last offset has an implied pivot from the node above (or * UINT_MAX for the root node. * * Ranges complicate certain write activities. When modifying any of * the B-tree variants, it is known that one entry will either be added or * deleted. When modifying the Maple Tree, one store operation may overwrite * the entire data set, or one half of the tree, or the middle half of the tree. * */ #include <linux/maple_tree.h> #include <linux/xarray.h> #include <linux/types.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/limits.h> #include <asm/barrier.h> #define CREATE_TRACE_POINTS #include <trace/events/maple_tree.h> /* * Kernel pointer hashing renders much of the maple tree dump useless as tagged * pointers get hashed to arbitrary values. * * If CONFIG_DEBUG_VM_MAPLE_TREE is set we are in a debug mode where it is * permissible to bypass this. Otherwise remain cautious and retain the hashing. * * Userland doesn't know about %px so also use %p there. */ #if defined(__KERNEL__) && defined(CONFIG_DEBUG_VM_MAPLE_TREE) #define PTR_FMT "%px" #else #define PTR_FMT "%p" #endif #define MA_ROOT_PARENT 1 /* * Maple state flags * * MA_STATE_BULK - Bulk insert mode * * MA_STATE_REBALANCE - Indicate a rebalance during bulk insert * * MA_STATE_PREALLOC - Preallocated nodes, WARN_ON allocation */ #define MA_STATE_BULK 1 #define MA_STATE_REBALANCE 2 #define MA_STATE_PREALLOC 4 #define ma_parent_ptr(x) ((struct maple_pnode *)(x)) #define mas_tree_parent(x) ((unsigned long)(x->tree) | MA_ROOT_PARENT) #define ma_mnode_ptr(x) ((struct maple_node *)(x)) #define ma_enode_ptr(x) ((struct maple_enode *)(x)) static struct kmem_cache *maple_node_cache; #ifdef CONFIG_DEBUG_MAPLE_TREE static const unsigned long mt_max[] = { [maple_dense] = MAPLE_NODE_SLOTS, [maple_leaf_64] = ULONG_MAX, [maple_range_64] = ULONG_MAX, [maple_arange_64] = ULONG_MAX, }; #define mt_node_max(x) mt_max[mte_node_type(x)] #endif static const unsigned char mt_slots[] = { [maple_dense] = MAPLE_NODE_SLOTS, [maple_leaf_64] = MAPLE_RANGE64_SLOTS, [maple_range_64] = MAPLE_RANGE64_SLOTS, [maple_arange_64] = MAPLE_ARANGE64_SLOTS, }; #define mt_slot_count(x) mt_slots[mte_node_type(x)] static const unsigned char mt_pivots[] = { [maple_dense] = 0, [maple_leaf_64] = MAPLE_RANGE64_SLOTS - 1, [maple_range_64] = MAPLE_RANGE64_SLOTS - 1, [maple_arange_64] = MAPLE_ARANGE64_SLOTS - 1, }; #define mt_pivot_count(x) mt_pivots[mte_node_type(x)] static const unsigned char mt_min_slots[] = { [maple_dense] = MAPLE_NODE_SLOTS / 2, [maple_leaf_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_range_64] = (MAPLE_RANGE64_SLOTS / 2) - 2, [maple_arange_64] = (MAPLE_ARANGE64_SLOTS / 2) - 1, }; #define mt_min_slot_count(x) mt_min_slots[mte_node_type(x)] #define MAPLE_BIG_NODE_SLOTS (MAPLE_RANGE64_SLOTS * 2 + 2) #define MAPLE_BIG_NODE_GAPS (MAPLE_ARANGE64_SLOTS * 2 + 1) struct maple_big_node { unsigned long pivot[MAPLE_BIG_NODE_SLOTS - 1]; union { struct maple_enode *slot[MAPLE_BIG_NODE_SLOTS]; struct { unsigned long padding[MAPLE_BIG_NODE_GAPS]; unsigned long gap[MAPLE_BIG_NODE_GAPS]; }; }; unsigned char b_end; enum maple_type type; }; /* * The maple_subtree_state is used to build a tree to replace a segment of an * existing tree in a more atomic way. Any walkers of the older tree will hit a * dead node and restart on updates. */ struct maple_subtree_state { struct ma_state *orig_l; /* Original left side of subtree */ struct ma_state *orig_r; /* Original right side of subtree */ struct ma_state *l; /* New left side of subtree */ struct ma_state *m; /* New middle of subtree (rare) */ struct ma_state *r; /* New right side of subtree */ struct ma_topiary *free; /* nodes to be freed */ struct ma_topiary *destroy; /* Nodes to be destroyed (walked and freed) */ struct maple_big_node *bn; }; #ifdef CONFIG_KASAN_STACK /* Prevent mas_wr_bnode() from exceeding the stack frame limit */ #define noinline_for_kasan noinline_for_stack #else #define noinline_for_kasan inline #endif /* Functions */ static inline struct maple_node *mt_alloc_one(gfp_t gfp) { return kmem_cache_alloc(maple_node_cache, gfp); } static inline int mt_alloc_bulk(gfp_t gfp, size_t size, void **nodes) { return kmem_cache_alloc_bulk(maple_node_cache, gfp, size, nodes); } static inline void mt_free_one(struct maple_node *node) { kmem_cache_free(maple_node_cache, node); } static inline void mt_free_bulk(size_t size, void __rcu **nodes) { kmem_cache_free_bulk(maple_node_cache, size, (void **)nodes); } static void mt_free_rcu(struct rcu_head *head) { struct maple_node *node = container_of(head, struct maple_node, rcu); kmem_cache_free(maple_node_cache, node); } /* * ma_free_rcu() - Use rcu callback to free a maple node * @node: The node to free * * The maple tree uses the parent pointer to indicate this node is no longer in * use and will be freed. */ static void ma_free_rcu(struct maple_node *node) { WARN_ON(node->parent != ma_parent_ptr(node)); call_rcu(&node->rcu, mt_free_rcu); } static void mas_set_height(struct ma_state *mas) { unsigned int new_flags = mas->tree->ma_flags; new_flags &= ~MT_FLAGS_HEIGHT_MASK; MAS_BUG_ON(mas, mas->depth > MAPLE_HEIGHT_MAX); new_flags |= mas->depth << MT_FLAGS_HEIGHT_OFFSET; mas->tree->ma_flags = new_flags; } static unsigned int mas_mt_height(struct ma_state *mas) { return mt_height(mas->tree); } static inline unsigned int mt_attr(struct maple_tree *mt) { return mt->ma_flags & ~MT_FLAGS_HEIGHT_MASK; } static __always_inline enum maple_type mte_node_type( const struct maple_enode *entry) { return ((unsigned long)entry >> MAPLE_NODE_TYPE_SHIFT) & MAPLE_NODE_TYPE_MASK; } static __always_inline bool ma_is_dense(const enum maple_type type) { return type < maple_leaf_64; } static __always_inline bool ma_is_leaf(const enum maple_type type) { return type < maple_range_64; } static __always_inline bool mte_is_leaf(const struct maple_enode *entry) { return ma_is_leaf(mte_node_type(entry)); } /* * We also reserve values with the bottom two bits set to '10' which are * below 4096 */ static __always_inline bool mt_is_reserved(const void *entry) { return ((unsigned long)entry < MAPLE_RESERVED_RANGE) && xa_is_internal(entry); } static __always_inline void mas_set_err(struct ma_state *mas, long err) { mas->node = MA_ERROR(err); mas->status = ma_error; } static __always_inline bool mas_is_ptr(const struct ma_state *mas) { return mas->status == ma_root; } static __always_inline bool mas_is_start(const struct ma_state *mas) { return mas->status == ma_start; } static __always_inline bool mas_is_none(const struct ma_state *mas) { return mas->status == ma_none; } static __always_inline bool mas_is_paused(const struct ma_state *mas) { return mas->status == ma_pause; } static __always_inline bool mas_is_overflow(struct ma_state *mas) { return mas->status == ma_overflow; } static inline bool mas_is_underflow(struct ma_state *mas) { return mas->status == ma_underflow; } static __always_inline struct maple_node *mte_to_node( const struct maple_enode *entry) { return (struct maple_node *)((unsigned long)entry & ~MAPLE_NODE_MASK); } /* * mte_to_mat() - Convert a maple encoded node to a maple topiary node. * @entry: The maple encoded node * * Return: a maple topiary pointer */ static inline struct maple_topiary *mte_to_mat(const struct maple_enode *entry) { return (struct maple_topiary *) ((unsigned long)entry & ~MAPLE_NODE_MASK); } /* * mas_mn() - Get the maple state node. * @mas: The maple state * * Return: the maple node (not encoded - bare pointer). */ static inline struct maple_node *mas_mn(const struct ma_state *mas) { return mte_to_node(mas->node); } /* * mte_set_node_dead() - Set a maple encoded node as dead. * @mn: The maple encoded node. */ static inline void mte_set_node_dead(struct maple_enode *mn) { mte_to_node(mn)->parent = ma_parent_ptr(mte_to_node(mn)); smp_wmb(); /* Needed for RCU */ } /* Bit 1 indicates the root is a node */ #define MAPLE_ROOT_NODE 0x02 /* maple_type stored bit 3-6 */ #define MAPLE_ENODE_TYPE_SHIFT 0x03 /* Bit 2 means a NULL somewhere below */ #define MAPLE_ENODE_NULL 0x04 static inline struct maple_enode *mt_mk_node(const struct maple_node *node, enum maple_type type) { return (void *)((unsigned long)node | (type << MAPLE_ENODE_TYPE_SHIFT) | MAPLE_ENODE_NULL); } static inline void *mte_mk_root(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ROOT_NODE); } static inline void *mte_safe_root(const struct maple_enode *node) { return (void *)((unsigned long)node & ~MAPLE_ROOT_NODE); } static inline void __maybe_unused *mte_set_full(const struct maple_enode *node) { return (void *)((unsigned long)node & ~MAPLE_ENODE_NULL); } static inline void __maybe_unused *mte_clear_full(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ENODE_NULL); } static inline bool __maybe_unused mte_has_null(const struct maple_enode *node) { return (unsigned long)node & MAPLE_ENODE_NULL; } static __always_inline bool ma_is_root(struct maple_node *node) { return ((unsigned long)node->parent & MA_ROOT_PARENT); } static __always_inline bool mte_is_root(const struct maple_enode *node) { return ma_is_root(mte_to_node(node)); } static inline bool mas_is_root_limits(const struct ma_state *mas) { return !mas->min && mas->max == ULONG_MAX; } static __always_inline bool mt_is_alloc(struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_ALLOC_RANGE); } /* * The Parent Pointer * Excluding root, the parent pointer is 256B aligned like all other tree nodes. * When storing a 32 or 64 bit values, the offset can fit into 5 bits. The 16 * bit values need an extra bit to store the offset. This extra bit comes from * a reuse of the last bit in the node type. This is possible by using bit 1 to * indicate if bit 2 is part of the type or the slot. * * Note types: * 0x??1 = Root * 0x?00 = 16 bit nodes * 0x010 = 32 bit nodes * 0x110 = 64 bit nodes * * Slot size and alignment * 0b??1 : Root * 0b?00 : 16 bit values, type in 0-1, slot in 2-7 * 0b010 : 32 bit values, type in 0-2, slot in 3-7 * 0b110 : 64 bit values, type in 0-2, slot in 3-7 */ #define MAPLE_PARENT_ROOT 0x01 #define MAPLE_PARENT_SLOT_SHIFT 0x03 #define MAPLE_PARENT_SLOT_MASK 0xF8 #define MAPLE_PARENT_16B_SLOT_SHIFT 0x02 #define MAPLE_PARENT_16B_SLOT_MASK 0xFC #define MAPLE_PARENT_RANGE64 0x06 #define MAPLE_PARENT_RANGE32 0x04 #define MAPLE_PARENT_NOT_RANGE16 0x02 /* * mte_parent_shift() - Get the parent shift for the slot storage. * @parent: The parent pointer cast as an unsigned long * Return: The shift into that pointer to the star to of the slot */ static inline unsigned long mte_parent_shift(unsigned long parent) { /* Note bit 1 == 0 means 16B */ if (likely(parent & MAPLE_PARENT_NOT_RANGE16)) return MAPLE_PARENT_SLOT_SHIFT; return MAPLE_PARENT_16B_SLOT_SHIFT; } /* * mte_parent_slot_mask() - Get the slot mask for the parent. * @parent: The parent pointer cast as an unsigned long. * Return: The slot mask for that parent. */ static inline unsigned long mte_parent_slot_mask(unsigned long parent) { /* Note bit 1 == 0 means 16B */ if (likely(parent & MAPLE_PARENT_NOT_RANGE16)) return MAPLE_PARENT_SLOT_MASK; return MAPLE_PARENT_16B_SLOT_MASK; } /* * mas_parent_type() - Return the maple_type of the parent from the stored * parent type. * @mas: The maple state * @enode: The maple_enode to extract the parent's enum * Return: The node->parent maple_type */ static inline enum maple_type mas_parent_type(struct ma_state *mas, struct maple_enode *enode) { unsigned long p_type; p_type = (unsigned long)mte_to_node(enode)->parent; if (WARN_ON(p_type & MAPLE_PARENT_ROOT)) return 0; p_type &= MAPLE_NODE_MASK; p_type &= ~mte_parent_slot_mask(p_type); switch (p_type) { case MAPLE_PARENT_RANGE64: /* or MAPLE_PARENT_ARANGE64 */ if (mt_is_alloc(mas->tree)) return maple_arange_64; return maple_range_64; } return 0; } /* * mas_set_parent() - Set the parent node and encode the slot * @mas: The maple state * @enode: The encoded maple node. * @parent: The encoded maple node that is the parent of @enode. * @slot: The slot that @enode resides in @parent. * * Slot number is encoded in the enode->parent bit 3-6 or 2-6, depending on the * parent type. */ static inline void mas_set_parent(struct ma_state *mas, struct maple_enode *enode, const struct maple_enode *parent, unsigned char slot) { unsigned long val = (unsigned long)parent; unsigned long shift; unsigned long type; enum maple_type p_type = mte_node_type(parent); MAS_BUG_ON(mas, p_type == maple_dense); MAS_BUG_ON(mas, p_type == maple_leaf_64); switch (p_type) { case maple_range_64: case maple_arange_64: shift = MAPLE_PARENT_SLOT_SHIFT; type = MAPLE_PARENT_RANGE64; break; default: case maple_dense: case maple_leaf_64: shift = type = 0; break; } val &= ~MAPLE_NODE_MASK; /* Clear all node metadata in parent */ val |= (slot << shift) | type; mte_to_node(enode)->parent = ma_parent_ptr(val); } /* * mte_parent_slot() - get the parent slot of @enode. * @enode: The encoded maple node. * * Return: The slot in the parent node where @enode resides. */ static __always_inline unsigned int mte_parent_slot(const struct maple_enode *enode) { unsigned long val = (unsigned long)mte_to_node(enode)->parent; if (unlikely(val & MA_ROOT_PARENT)) return 0; /* * Okay to use MAPLE_PARENT_16B_SLOT_MASK as the last bit will be lost * by shift if the parent shift is MAPLE_PARENT_SLOT_SHIFT */ return (val & MAPLE_PARENT_16B_SLOT_MASK) >> mte_parent_shift(val); } /* * mte_parent() - Get the parent of @node. * @enode: The encoded maple node. * * Return: The parent maple node. */ static __always_inline struct maple_node *mte_parent(const struct maple_enode *enode) { return (void *)((unsigned long) (mte_to_node(enode)->parent) & ~MAPLE_NODE_MASK); } /* * ma_dead_node() - check if the @enode is dead. * @enode: The encoded maple node * * Return: true if dead, false otherwise. */ static __always_inline bool ma_dead_node(const struct maple_node *node) { struct maple_node *parent; /* Do not reorder reads from the node prior to the parent check */ smp_rmb(); parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK); return (parent == node); } /* * mte_dead_node() - check if the @enode is dead. * @enode: The encoded maple node * * Return: true if dead, false otherwise. */ static __always_inline bool mte_dead_node(const struct maple_enode *enode) { struct maple_node *parent, *node; node = mte_to_node(enode); /* Do not reorder reads from the node prior to the parent check */ smp_rmb(); parent = mte_parent(enode); return (parent == node); } /* * mas_allocated() - Get the number of nodes allocated in a maple state. * @mas: The maple state * * The ma_state alloc member is overloaded to hold a pointer to the first * allocated node or to the number of requested nodes to allocate. If bit 0 is * set, then the alloc contains the number of requested nodes. If there is an * allocated node, then the total allocated nodes is in that node. * * Return: The total number of nodes allocated */ static inline unsigned long mas_allocated(const struct ma_state *mas) { if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) return 0; return mas->alloc->total; } /* * mas_set_alloc_req() - Set the requested number of allocations. * @mas: the maple state * @count: the number of allocations. * * The requested number of allocations is either in the first allocated node, * located in @mas->alloc->request_count, or directly in @mas->alloc if there is * no allocated node. Set the request either in the node or do the necessary * encoding to store in @mas->alloc directly. */ static inline void mas_set_alloc_req(struct ma_state *mas, unsigned long count) { if (!mas->alloc || ((unsigned long)mas->alloc & 0x1)) { if (!count) mas->alloc = NULL; else mas->alloc = (struct maple_alloc *)(((count) << 1U) | 1U); return; } mas->alloc->request_count = count; } /* * mas_alloc_req() - get the requested number of allocations. * @mas: The maple state * * The alloc count is either stored directly in @mas, or in * @mas->alloc->request_count if there is at least one node allocated. Decode * the request count if it's stored directly in @mas->alloc. * * Return: The allocation request count. */ static inline unsigned int mas_alloc_req(const struct ma_state *mas) { if ((unsigned long)mas->alloc & 0x1) return (unsigned long)(mas->alloc) >> 1; else if (mas->alloc) return mas->alloc->request_count; return 0; } /* * ma_pivots() - Get a pointer to the maple node pivots. * @node: the maple node * @type: the node type * * In the event of a dead node, this array may be %NULL * * Return: A pointer to the maple node pivots */ static inline unsigned long *ma_pivots(struct maple_node *node, enum maple_type type) { switch (type) { case maple_arange_64: return node->ma64.pivot; case maple_range_64: case maple_leaf_64: return node->mr64.pivot; case maple_dense: return NULL; } return NULL; } /* * ma_gaps() - Get a pointer to the maple node gaps. * @node: the maple node * @type: the node type * * Return: A pointer to the maple node gaps */ static inline unsigned long *ma_gaps(struct maple_node *node, enum maple_type type) { switch (type) { case maple_arange_64: return node->ma64.gap; case maple_range_64: case maple_leaf_64: case maple_dense: return NULL; } return NULL; } /* * mas_safe_pivot() - get the pivot at @piv or mas->max. * @mas: The maple state * @pivots: The pointer to the maple node pivots * @piv: The pivot to fetch * @type: The maple node type * * Return: The pivot at @piv within the limit of the @pivots array, @mas->max * otherwise. */ static __always_inline unsigned long mas_safe_pivot(const struct ma_state *mas, unsigned long *pivots, unsigned char piv, enum maple_type type) { if (piv >= mt_pivots[type]) return mas->max; return pivots[piv]; } /* * mas_safe_min() - Return the minimum for a given offset. * @mas: The maple state * @pivots: The pointer to the maple node pivots * @offset: The offset into the pivot array * * Return: The minimum range value that is contained in @offset. */ static inline unsigned long mas_safe_min(struct ma_state *mas, unsigned long *pivots, unsigned char offset) { if (likely(offset)) return pivots[offset - 1] + 1; return mas->min; } /* * mte_set_pivot() - Set a pivot to a value in an encoded maple node. * @mn: The encoded maple node * @piv: The pivot offset * @val: The value of the pivot */ static inline void mte_set_pivot(struct maple_enode *mn, unsigned char piv, unsigned long val) { struct maple_node *node = mte_to_node(mn); enum maple_type type = mte_node_type(mn); BUG_ON(piv >= mt_pivots[type]); switch (type) { case maple_range_64: case maple_leaf_64: node->mr64.pivot[piv] = val; break; case maple_arange_64: node->ma64.pivot[piv] = val; break; case maple_dense: break; } } /* * ma_slots() - Get a pointer to the maple node slots. * @mn: The maple node * @mt: The maple node type * * Return: A pointer to the maple node slots */ static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt) { switch (mt) { case maple_arange_64: return mn->ma64.slot; case maple_range_64: case maple_leaf_64: return mn->mr64.slot; case maple_dense: return mn->slot; } return NULL; } static inline bool mt_write_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? mt_write_lock_is_held(mt) : lockdep_is_held(&mt->ma_lock); } static __always_inline bool mt_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? mt_lock_is_held(mt) : lockdep_is_held(&mt->ma_lock); } static __always_inline void *mt_slot(const struct maple_tree *mt, void __rcu **slots, unsigned char offset) { return rcu_dereference_check(slots[offset], mt_locked(mt)); } static __always_inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots, unsigned char offset) { return rcu_dereference_protected(slots[offset], mt_write_locked(mt)); } /* * mas_slot_locked() - Get the slot value when holding the maple tree lock. * @mas: The maple state * @slots: The pointer to the slots * @offset: The offset into the slots array to fetch * * Return: The entry stored in @slots at the @offset. */ static __always_inline void *mas_slot_locked(struct ma_state *mas, void __rcu **slots, unsigned char offset) { return mt_slot_locked(mas->tree, slots, offset); } /* * mas_slot() - Get the slot value when not holding the maple tree lock. * @mas: The maple state * @slots: The pointer to the slots * @offset: The offset into the slots array to fetch * * Return: The entry stored in @slots at the @offset */ static __always_inline void *mas_slot(struct ma_state *mas, void __rcu **slots, unsigned char offset) { return mt_slot(mas->tree, slots, offset); } /* * mas_root() - Get the maple tree root. * @mas: The maple state. * * Return: The pointer to the root of the tree */ static __always_inline void *mas_root(struct ma_state *mas) { return rcu_dereference_check(mas->tree->ma_root, mt_locked(mas->tree)); } static inline void *mt_root_locked(struct maple_tree *mt) { return rcu_dereference_protected(mt->ma_root, mt_write_locked(mt)); } /* * mas_root_locked() - Get the maple tree root when holding the maple tree lock. * @mas: The maple state. * * Return: The pointer to the root of the tree */ static inline void *mas_root_locked(struct ma_state *mas) { return mt_root_locked(mas->tree); } static inline struct maple_metadata *ma_meta(struct maple_node *mn, enum maple_type mt) { switch (mt) { case maple_arange_64: return &mn->ma64.meta; default: return &mn->mr64.meta; } } /* * ma_set_meta() - Set the metadata information of a node. * @mn: The maple node * @mt: The maple node type * @offset: The offset of the highest sub-gap in this node. * @end: The end of the data in this node. */ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt, unsigned char offset, unsigned char end) { struct maple_metadata *meta = ma_meta(mn, mt); meta->gap = offset; meta->end = end; } /* * mt_clear_meta() - clear the metadata information of a node, if it exists * @mt: The maple tree * @mn: The maple node * @type: The maple node type */ static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn, enum maple_type type) { struct maple_metadata *meta; unsigned long *pivots; void __rcu **slots; void *next; switch (type) { case maple_range_64: pivots = mn->mr64.pivot; if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) { slots = mn->mr64.slot; next = mt_slot_locked(mt, slots, MAPLE_RANGE64_SLOTS - 1); if (unlikely((mte_to_node(next) && mte_node_type(next)))) return; /* no metadata, could be node */ } fallthrough; case maple_arange_64: meta = ma_meta(mn, type); break; default: return; } meta->gap = 0; meta->end = 0; } /* * ma_meta_end() - Get the data end of a node from the metadata * @mn: The maple node * @mt: The maple node type */ static inline unsigned char ma_meta_end(struct maple_node *mn, enum maple_type mt) { struct maple_metadata *meta = ma_meta(mn, mt); return meta->end; } /* * ma_meta_gap() - Get the largest gap location of a node from the metadata * @mn: The maple node */ static inline unsigned char ma_meta_gap(struct maple_node *mn) { return mn->ma64.meta.gap; } /* * ma_set_meta_gap() - Set the largest gap location in a nodes metadata * @mn: The maple node * @mt: The maple node type * @offset: The location of the largest gap. */ static inline void ma_set_meta_gap(struct maple_node *mn, enum maple_type mt, unsigned char offset) { struct maple_metadata *meta = ma_meta(mn, mt); meta->gap = offset; } /* * mat_add() - Add a @dead_enode to the ma_topiary of a list of dead nodes. * @mat: the ma_topiary, a linked list of dead nodes. * @dead_enode: the node to be marked as dead and added to the tail of the list * * Add the @dead_enode to the linked list in @mat. */ static inline void mat_add(struct ma_topiary *mat, struct maple_enode *dead_enode) { mte_set_node_dead(dead_enode); mte_to_mat(dead_enode)->next = NULL; if (!mat->tail) { mat->tail = mat->head = dead_enode; return; } mte_to_mat(mat->tail)->next = dead_enode; mat->tail = dead_enode; } static void mt_free_walk(struct rcu_head *head); static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free); /* * mas_mat_destroy() - Free all nodes and subtrees in a dead list. * @mas: the maple state * @mat: the ma_topiary linked list of dead nodes to free. * * Destroy walk a dead list. */ static void mas_mat_destroy(struct ma_state *mas, struct ma_topiary *mat) { struct maple_enode *next; struct maple_node *node; bool in_rcu = mt_in_rcu(mas->tree); while (mat->head) { next = mte_to_mat(mat->head)->next; node = mte_to_node(mat->head); mt_destroy_walk(mat->head, mas->tree, !in_rcu); if (in_rcu) call_rcu(&node->rcu, mt_free_walk); mat->head = next; } } /* * mas_descend() - Descend into the slot stored in the ma_state. * @mas: the maple state. * * Note: Not RCU safe, only use in write side or debug code. */ static inline void mas_descend(struct ma_state *mas) { enum maple_type type; unsigned long *pivots; struct maple_node *node; void __rcu **slots; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); slots = ma_slots(node, type); if (mas->offset) mas->min = pivots[mas->offset - 1] + 1; mas->max = mas_safe_pivot(mas, pivots, mas->offset, type); mas->node = mas_slot(mas, slots, mas->offset); } /* * mte_set_gap() - Set a maple node gap. * @mn: The encoded maple node * @gap: The offset of the gap to set * @val: The gap value */ static inline void mte_set_gap(const struct maple_enode *mn, unsigned char gap, unsigned long val) { switch (mte_node_type(mn)) { default: break; case maple_arange_64: mte_to_node(mn)->ma64.gap[gap] = val; break; } } /* * mas_ascend() - Walk up a level of the tree. * @mas: The maple state * * Sets the @mas->max and @mas->min to the correct values when walking up. This * may cause several levels of walking up to find the correct min and max. * May find a dead node which will cause a premature return. * Return: 1 on dead node, 0 otherwise */ static int mas_ascend(struct ma_state *mas) { struct maple_enode *p_enode; /* parent enode. */ struct maple_enode *a_enode; /* ancestor enode. */ struct maple_node *a_node; /* ancestor node. */ struct maple_node *p_node; /* parent node. */ unsigned char a_slot; enum maple_type a_type; unsigned long min, max; unsigned long *pivots; bool set_max = false, set_min = false; a_node = mas_mn(mas); if (ma_is_root(a_node)) { mas->offset = 0; return 0; } p_node = mte_parent(mas->node); if (unlikely(a_node == p_node)) return 1; a_type = mas_parent_type(mas, mas->node); mas->offset = mte_parent_slot(mas->node); a_enode = mt_mk_node(p_node, a_type); /* Check to make sure all parent information is still accurate */ if (p_node != mte_parent(mas->node)) return 1; mas->node = a_enode; if (mte_is_root(a_enode)) { mas->max = ULONG_MAX; mas->min = 0; return 0; } min = 0; max = ULONG_MAX; if (!mas->offset) { min = mas->min; set_min = true; } if (mas->max == ULONG_MAX) set_max = true; do { p_enode = a_enode; a_type = mas_parent_type(mas, p_enode); a_node = mte_parent(p_enode); a_slot = mte_parent_slot(p_enode); a_enode = mt_mk_node(a_node, a_type); pivots = ma_pivots(a_node, a_type); if (unlikely(ma_dead_node(a_node))) return 1; if (!set_min && a_slot) { set_min = true; min = pivots[a_slot - 1] + 1; } if (!set_max && a_slot < mt_pivots[a_type]) { set_max = true; max = pivots[a_slot]; } if (unlikely(ma_dead_node(a_node))) return 1; if (unlikely(ma_is_root(a_node))) break; } while (!set_min || !set_max); mas->max = max; mas->min = min; return 0; } /* * mas_pop_node() - Get a previously allocated maple node from the maple state. * @mas: The maple state * * Return: A pointer to a maple node. */ static inline struct maple_node *mas_pop_node(struct ma_state *mas) { struct maple_alloc *ret, *node = mas->alloc; unsigned long total = mas_allocated(mas); unsigned int req = mas_alloc_req(mas); /* nothing or a request pending. */ if (WARN_ON(!total)) return NULL; if (total == 1) { /* single allocation in this ma_state */ mas->alloc = NULL; ret = node; goto single_node; } if (node->node_count == 1) { /* Single allocation in this node. */ mas->alloc = node->slot[0]; mas->alloc->total = node->total - 1; ret = node; goto new_head; } node->total--; ret = node->slot[--node->node_count]; node->slot[node->node_count] = NULL; single_node: new_head: if (req) { req++; mas_set_alloc_req(mas, req); } memset(ret, 0, sizeof(*ret)); return (struct maple_node *)ret; } /* * mas_push_node() - Push a node back on the maple state allocation. * @mas: The maple state * @used: The used maple node * * Stores the maple node back into @mas->alloc for reuse. Updates allocated and * requested node count as necessary. */ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) { struct maple_alloc *reuse = (struct maple_alloc *)used; struct maple_alloc *head = mas->alloc; unsigned long count; unsigned int requested = mas_alloc_req(mas); count = mas_allocated(mas); reuse->request_count = 0; reuse->node_count = 0; if (count) { if (head->node_count < MAPLE_ALLOC_SLOTS) { head->slot[head->node_count++] = reuse; head->total++; goto done; } reuse->slot[0] = head; reuse->node_count = 1; } reuse->total = count + 1; mas->alloc = reuse; done: if (requested > 1) mas_set_alloc_req(mas, requested - 1); } /* * mas_alloc_nodes() - Allocate nodes into a maple state * @mas: The maple state * @gfp: The GFP Flags */ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { struct maple_alloc *node; unsigned long allocated = mas_allocated(mas); unsigned int requested = mas_alloc_req(mas); unsigned int count; void **slots = NULL; unsigned int max_req = 0; if (!requested) return; mas_set_alloc_req(mas, 0); if (mas->mas_flags & MA_STATE_PREALLOC) { if (allocated) return; BUG_ON(!allocated); WARN_ON(!allocated); } if (!allocated || mas->alloc->node_count == MAPLE_ALLOC_SLOTS) { node = (struct maple_alloc *)mt_alloc_one(gfp); if (!node) goto nomem_one; if (allocated) { node->slot[0] = mas->alloc; node->node_count = 1; } else { node->node_count = 0; } mas->alloc = node; node->total = ++allocated; node->request_count = 0; requested--; } node = mas->alloc; while (requested) { max_req = MAPLE_ALLOC_SLOTS - node->node_count; slots = (void **)&node->slot[node->node_count]; max_req = min(requested, max_req); count = mt_alloc_bulk(gfp, max_req, slots); if (!count) goto nomem_bulk; if (node->node_count == 0) { node->slot[0]->node_count = 0; node->slot[0]->request_count = 0; } node->node_count += count; allocated += count; /* find a non-full node*/ do { node = node->slot[0]; } while (unlikely(node->node_count == MAPLE_ALLOC_SLOTS)); requested -= count; } mas->alloc->total = allocated; return; nomem_bulk: /* Clean up potential freed allocations on bulk failure */ memset(slots, 0, max_req * sizeof(unsigned long)); mas->alloc->total = allocated; nomem_one: mas_set_alloc_req(mas, requested); mas_set_err(mas, -ENOMEM); } /* * mas_free() - Free an encoded maple node * @mas: The maple state * @used: The encoded maple node to free. * * Uses rcu free if necessary, pushes @used back on the maple state allocations * otherwise. */ static inline void mas_free(struct ma_state *mas, struct maple_enode *used) { struct maple_node *tmp = mte_to_node(used); if (mt_in_rcu(mas->tree)) ma_free_rcu(tmp); else mas_push_node(mas, tmp); } /* * mas_node_count_gfp() - Check if enough nodes are allocated and request more * if there is not enough nodes. * @mas: The maple state * @count: The number of nodes needed * @gfp: the gfp flags */ static void mas_node_count_gfp(struct ma_state *mas, int count, gfp_t gfp) { unsigned long allocated = mas_allocated(mas); if (allocated < count) { mas_set_alloc_req(mas, count - allocated); mas_alloc_nodes(mas, gfp); } } /* * mas_node_count() - Check if enough nodes are allocated and request more if * there is not enough nodes. * @mas: The maple state * @count: The number of nodes needed * * Note: Uses GFP_NOWAIT | __GFP_NOWARN for gfp flags. */ static void mas_node_count(struct ma_state *mas, int count) { return mas_node_count_gfp(mas, count, GFP_NOWAIT | __GFP_NOWARN); } /* * mas_start() - Sets up maple state for operations. * @mas: The maple state. * * If mas->status == mas_start, then set the min, max and depth to * defaults. * * Return: * - If mas->node is an error or not mas_start, return NULL. * - If it's an empty tree: NULL & mas->status == ma_none * - If it's a single entry: The entry & mas->status == ma_root * - If it's a tree: NULL & mas->status == ma_active */ static inline struct maple_enode *mas_start(struct ma_state *mas) { if (likely(mas_is_start(mas))) { struct maple_enode *root; mas->min = 0; mas->max = ULONG_MAX; retry: mas->depth = 0; root = mas_root(mas); /* Tree with nodes */ if (likely(xa_is_node(root))) { mas->depth = 1; mas->status = ma_active; mas->node = mte_safe_root(root); mas->offset = 0; if (mte_dead_node(mas->node)) goto retry; return NULL; } mas->node = NULL; /* empty tree */ if (unlikely(!root)) { mas->status = ma_none; mas->offset = MAPLE_NODE_SLOTS; return NULL; } /* Single entry tree */ mas->status = ma_root; mas->offset = MAPLE_NODE_SLOTS; /* Single entry tree. */ if (mas->index > 0) return NULL; return root; } return NULL; } /* * ma_data_end() - Find the end of the data in a node. * @node: The maple node * @type: The maple node type * @pivots: The array of pivots in the node * @max: The maximum value in the node * * Uses metadata to find the end of the data when possible. * Return: The zero indexed last slot with data (may be null). */ static __always_inline unsigned char ma_data_end(struct maple_node *node, enum maple_type type, unsigned long *pivots, unsigned long max) { unsigned char offset; if (!pivots) return 0; if (type == maple_arange_64) return ma_meta_end(node, type); offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); if (likely(pivots[offset] == max)) return offset; return mt_pivots[type]; } /* * mas_data_end() - Find the end of the data (slot). * @mas: the maple state * * This method is optimized to check the metadata of a node if the node type * supports data end metadata. * * Return: The zero indexed last slot with data (may be null). */ static inline unsigned char mas_data_end(struct ma_state *mas) { enum maple_type type; struct maple_node *node; unsigned char offset; unsigned long *pivots; type = mte_node_type(mas->node); node = mas_mn(mas); if (type == maple_arange_64) return ma_meta_end(node, type); pivots = ma_pivots(node, type); if (unlikely(ma_dead_node(node))) return 0; offset = mt_pivots[type] - 1; if (likely(!pivots[offset])) return ma_meta_end(node, type); if (likely(pivots[offset] == mas->max)) return offset; return mt_pivots[type]; } /* * mas_leaf_max_gap() - Returns the largest gap in a leaf node * @mas: the maple state * * Return: The maximum gap in the leaf. */ static unsigned long mas_leaf_max_gap(struct ma_state *mas) { enum maple_type mt; unsigned long pstart, gap, max_gap; struct maple_node *mn; unsigned long *pivots; void __rcu **slots; unsigned char i; unsigned char max_piv; mt = mte_node_type(mas->node); mn = mas_mn(mas); slots = ma_slots(mn, mt); max_gap = 0; if (unlikely(ma_is_dense(mt))) { gap = 0; for (i = 0; i < mt_slots[mt]; i++) { if (slots[i]) { if (gap > max_gap) max_gap = gap; gap = 0; } else { gap++; } } if (gap > max_gap) max_gap = gap; return max_gap; } /* * Check the first implied pivot optimizes the loop below and slot 1 may * be skipped if there is a gap in slot 0. */ pivots = ma_pivots(mn, mt); if (likely(!slots[0])) { max_gap = pivots[0] - mas->min + 1; i = 2; } else { i = 1; } /* reduce max_piv as the special case is checked before the loop */ max_piv = ma_data_end(mn, mt, pivots, mas->max) - 1; /* * Check end implied pivot which can only be a gap on the right most * node. */ if (unlikely(mas->max == ULONG_MAX) && !slots[max_piv + 1]) { gap = ULONG_MAX - pivots[max_piv]; if (gap > max_gap) max_gap = gap; if (max_gap > pivots[max_piv] - mas->min) return max_gap; } for (; i <= max_piv; i++) { /* data == no gap. */ if (likely(slots[i])) continue; pstart = pivots[i - 1]; gap = pivots[i] - pstart; if (gap > max_gap) max_gap = gap; /* There cannot be two gaps in a row. */ i++; } return max_gap; } /* * ma_max_gap() - Get the maximum gap in a maple node (non-leaf) * @node: The maple node * @gaps: The pointer to the gaps * @mt: The maple node type * @off: Pointer to store the offset location of the gap. * * Uses the metadata data end to scan backwards across set gaps. * * Return: The maximum gap value */ static inline unsigned long ma_max_gap(struct maple_node *node, unsigned long *gaps, enum maple_type mt, unsigned char *off) { unsigned char offset, i; unsigned long max_gap = 0; i = offset = ma_meta_end(node, mt); do { if (gaps[i] > max_gap) { max_gap = gaps[i]; offset = i; } } while (i--); *off = offset; return max_gap; } /* * mas_max_gap() - find the largest gap in a non-leaf node and set the slot. * @mas: The maple state. * * Return: The gap value. */ static inline unsigned long mas_max_gap(struct ma_state *mas) { unsigned long *gaps; unsigned char offset; enum maple_type mt; struct maple_node *node; mt = mte_node_type(mas->node); if (ma_is_leaf(mt)) return mas_leaf_max_gap(mas); node = mas_mn(mas); MAS_BUG_ON(mas, mt != maple_arange_64); offset = ma_meta_gap(node); gaps = ma_gaps(node, mt); return gaps[offset]; } /* * mas_parent_gap() - Set the parent gap and any gaps above, as needed * @mas: The maple state * @offset: The gap offset in the parent to set * @new: The new gap value. * * Set the parent gap then continue to set the gap upwards, using the metadata * of the parent to see if it is necessary to check the node above. */ static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset, unsigned long new) { unsigned long meta_gap = 0; struct maple_node *pnode; struct maple_enode *penode; unsigned long *pgaps; unsigned char meta_offset; enum maple_type pmt; pnode = mte_parent(mas->node); pmt = mas_parent_type(mas, mas->node); penode = mt_mk_node(pnode, pmt); pgaps = ma_gaps(pnode, pmt); ascend: MAS_BUG_ON(mas, pmt != maple_arange_64); meta_offset = ma_meta_gap(pnode); meta_gap = pgaps[meta_offset]; pgaps[offset] = new; if (meta_gap == new) return; if (offset != meta_offset) { if (meta_gap > new) return; ma_set_meta_gap(pnode, pmt, offset); } else if (new < meta_gap) { new = ma_max_gap(pnode, pgaps, pmt, &meta_offset); ma_set_meta_gap(pnode, pmt, meta_offset); } if (ma_is_root(pnode)) return; /* Go to the parent node. */ pnode = mte_parent(penode); pmt = mas_parent_type(mas, penode); pgaps = ma_gaps(pnode, pmt); offset = mte_parent_slot(penode); penode = mt_mk_node(pnode, pmt); goto ascend; } /* * mas_update_gap() - Update a nodes gaps and propagate up if necessary. * @mas: the maple state. */ static inline void mas_update_gap(struct ma_state *mas) { unsigned char pslot; unsigned long p_gap; unsigned long max_gap; if (!mt_is_alloc(mas->tree)) return; if (mte_is_root(mas->node)) return; max_gap = mas_max_gap(mas); pslot = mte_parent_slot(mas->node); p_gap = ma_gaps(mte_parent(mas->node), mas_parent_type(mas, mas->node))[pslot]; if (p_gap != max_gap) mas_parent_gap(mas, pslot, max_gap); } /* * mas_adopt_children() - Set the parent pointer of all nodes in @parent to * @parent with the slot encoded. * @mas: the maple state (for the tree) * @parent: the maple encoded node containing the children. */ static inline void mas_adopt_children(struct ma_state *mas, struct maple_enode *parent) { enum maple_type type = mte_node_type(parent); struct maple_node *node = mte_to_node(parent); void __rcu **slots = ma_slots(node, type); unsigned long *pivots = ma_pivots(node, type); struct maple_enode *child; unsigned char offset; offset = ma_data_end(node, type, pivots, mas->max); do { child = mas_slot_locked(mas, slots, offset); mas_set_parent(mas, child, parent, offset); } while (offset--); } /* * mas_put_in_tree() - Put a new node in the tree, smp_wmb(), and mark the old * node as dead. * @mas: the maple state with the new node * @old_enode: The old maple encoded node to replace. */ static inline void mas_put_in_tree(struct ma_state *mas, struct maple_enode *old_enode) __must_hold(mas->tree->ma_lock) { unsigned char offset; void __rcu **slots; if (mte_is_root(mas->node)) { mas_mn(mas)->parent = ma_parent_ptr(mas_tree_parent(mas)); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); mas_set_height(mas); } else { offset = mte_parent_slot(mas->node); slots = ma_slots(mte_parent(mas->node), mas_parent_type(mas, mas->node)); rcu_assign_pointer(slots[offset], mas->node); } mte_set_node_dead(old_enode); } /* * mas_replace_node() - Replace a node by putting it in the tree, marking it * dead, and freeing it. * the parent encoding to locate the maple node in the tree. * @mas: the ma_state with @mas->node pointing to the new node. * @old_enode: The old maple encoded node. */ static inline void mas_replace_node(struct ma_state *mas, struct maple_enode *old_enode) __must_hold(mas->tree->ma_lock) { mas_put_in_tree(mas, old_enode); mas_free(mas, old_enode); } /* * mas_find_child() - Find a child who has the parent @mas->node. * @mas: the maple state with the parent. * @child: the maple state to store the child. */ static inline bool mas_find_child(struct ma_state *mas, struct ma_state *child) __must_hold(mas->tree->ma_lock) { enum maple_type mt; unsigned char offset; unsigned char end; unsigned long *pivots; struct maple_enode *entry; struct maple_node *node; void __rcu **slots; mt = mte_node_type(mas->node); node = mas_mn(mas); slots = ma_slots(node, mt); pivots = ma_pivots(node, mt); end = ma_data_end(node, mt, pivots, mas->max); for (offset = mas->offset; offset <= end; offset++) { entry = mas_slot_locked(mas, slots, offset); if (mte_parent(entry) == node) { *child = *mas; mas->offset = offset + 1; child->offset = offset; mas_descend(child); child->offset = 0; return true; } } return false; } /* * mab_shift_right() - Shift the data in mab right. Note, does not clean out the * old data or set b_node->b_end. * @b_node: the maple_big_node * @shift: the shift count */ static inline void mab_shift_right(struct maple_big_node *b_node, unsigned char shift) { unsigned long size = b_node->b_end * sizeof(unsigned long); memmove(b_node->pivot + shift, b_node->pivot, size); memmove(b_node->slot + shift, b_node->slot, size); if (b_node->type == maple_arange_64) memmove(b_node->gap + shift, b_node->gap, size); } /* * mab_middle_node() - Check if a middle node is needed (unlikely) * @b_node: the maple_big_node that contains the data. * @split: the potential split location * @slot_count: the size that can be stored in a single node being considered. * * Return: true if a middle node is required. */ static inline bool mab_middle_node(struct maple_big_node *b_node, int split, unsigned char slot_count) { unsigned char size = b_node->b_end; if (size >= 2 * slot_count) return true; if (!b_node->slot[split] && (size >= 2 * slot_count - 1)) return true; return false; } /* * mab_no_null_split() - ensure the split doesn't fall on a NULL * @b_node: the maple_big_node with the data * @split: the suggested split location * @slot_count: the number of slots in the node being considered. * * Return: the split location. */ static inline int mab_no_null_split(struct maple_big_node *b_node, unsigned char split, unsigned char slot_count) { if (!b_node->slot[split]) { /* * If the split is less than the max slot && the right side will * still be sufficient, then increment the split on NULL. */ if ((split < slot_count - 1) && (b_node->b_end - split) > (mt_min_slots[b_node->type])) split++; else split--; } return split; } /* * mab_calc_split() - Calculate the split location and if there needs to be two * splits. * @mas: The maple state * @bn: The maple_big_node with the data * @mid_split: The second split, if required. 0 otherwise. * * Return: The first split location. The middle split is set in @mid_split. */ static inline int mab_calc_split(struct ma_state *mas, struct maple_big_node *bn, unsigned char *mid_split) { unsigned char b_end = bn->b_end; int split = b_end / 2; /* Assume equal split. */ unsigned char slot_count = mt_slots[bn->type]; /* * To support gap tracking, all NULL entries are kept together and a node cannot * end on a NULL entry, with the exception of the left-most leaf. The * limitation means that the split of a node must be checked for this condition * and be able to put more data in one direction or the other. */ if (unlikely((mas->mas_flags & MA_STATE_BULK))) { *mid_split = 0; split = b_end - mt_min_slots[bn->type]; if (!ma_is_leaf(bn->type)) return split; mas->mas_flags |= MA_STATE_REBALANCE; if (!bn->slot[split]) split--; return split; } /* * Although extremely rare, it is possible to enter what is known as the 3-way * split scenario. The 3-way split comes about by means of a store of a range * that overwrites the end and beginning of two full nodes. The result is a set * of entries that cannot be stored in 2 nodes. Sometimes, these two nodes can * also be located in different parent nodes which are also full. This can * carry upwards all the way to the root in the worst case. */ if (unlikely(mab_middle_node(bn, split, slot_count))) { split = b_end / 3; *mid_split = split * 2; } else { *mid_split = 0; } /* Avoid ending a node on a NULL entry */ split = mab_no_null_split(bn, split, slot_count); if (unlikely(*mid_split)) *mid_split = mab_no_null_split(bn, *mid_split, slot_count); return split; } /* * mas_mab_cp() - Copy data from a maple state inclusively to a maple_big_node * and set @b_node->b_end to the next free slot. * @mas: The maple state * @mas_start: The starting slot to copy * @mas_end: The end slot to copy (inclusively) * @b_node: The maple_big_node to place the data * @mab_start: The starting location in maple_big_node to store the data. */ static inline void mas_mab_cp(struct ma_state *mas, unsigned char mas_start, unsigned char mas_end, struct maple_big_node *b_node, unsigned char mab_start) { enum maple_type mt; struct maple_node *node; void __rcu **slots; unsigned long *pivots, *gaps; int i = mas_start, j = mab_start; unsigned char piv_end; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); if (!i) { b_node->pivot[j] = pivots[i++]; if (unlikely(i > mas_end)) goto complete; j++; } piv_end = min(mas_end, mt_pivots[mt]); for (; i < piv_end; i++, j++) { b_node->pivot[j] = pivots[i]; if (unlikely(!b_node->pivot[j])) goto complete; if (unlikely(mas->max == b_node->pivot[j])) goto complete; } b_node->pivot[j] = mas_safe_pivot(mas, pivots, i, mt); complete: b_node->b_end = ++j; j -= mab_start; slots = ma_slots(node, mt); memcpy(b_node->slot + mab_start, slots + mas_start, sizeof(void *) * j); if (!ma_is_leaf(mt) && mt_is_alloc(mas->tree)) { gaps = ma_gaps(node, mt); memcpy(b_node->gap + mab_start, gaps + mas_start, sizeof(unsigned long) * j); } } /* * mas_leaf_set_meta() - Set the metadata of a leaf if possible. * @node: The maple node * @mt: The maple type * @end: The node end */ static inline void mas_leaf_set_meta(struct maple_node *node, enum maple_type mt, unsigned char end) { if (end < mt_slots[mt] - 1) ma_set_meta(node, mt, 0, end); } /* * mab_mas_cp() - Copy data from maple_big_node to a maple encoded node. * @b_node: the maple_big_node that has the data * @mab_start: the start location in @b_node. * @mab_end: The end location in @b_node (inclusively) * @mas: The maple state with the maple encoded node. */ static inline void mab_mas_cp(struct maple_big_node *b_node, unsigned char mab_start, unsigned char mab_end, struct ma_state *mas, bool new_max) { int i, j = 0; enum maple_type mt = mte_node_type(mas->node); struct maple_node *node = mte_to_node(mas->node); void __rcu **slots = ma_slots(node, mt); unsigned long *pivots = ma_pivots(node, mt); unsigned long *gaps = NULL; unsigned char end; if (mab_end - mab_start > mt_pivots[mt]) mab_end--; if (!pivots[mt_pivots[mt] - 1]) slots[mt_pivots[mt]] = NULL; i = mab_start; do { pivots[j++] = b_node->pivot[i++]; } while (i <= mab_end && likely(b_node->pivot[i])); memcpy(slots, b_node->slot + mab_start, sizeof(void *) * (i - mab_start)); if (new_max) mas->max = b_node->pivot[i - 1]; end = j - 1; if (likely(!ma_is_leaf(mt) && mt_is_alloc(mas->tree))) { unsigned long max_gap = 0; unsigned char offset = 0; gaps = ma_gaps(node, mt); do { gaps[--j] = b_node->gap[--i]; if (gaps[j] > max_gap) { offset = j; max_gap = gaps[j]; } } while (j); ma_set_meta(node, mt, offset, end); } else { mas_leaf_set_meta(node, mt, end); } } /* * mas_bulk_rebalance() - Rebalance the end of a tree after a bulk insert. * @mas: The maple state * @end: The maple node end * @mt: The maple node type */ static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end, enum maple_type mt) { if (!(mas->mas_flags & MA_STATE_BULK)) return; if (mte_is_root(mas->node)) return; if (end > mt_min_slots[mt]) { mas->mas_flags &= ~MA_STATE_REBALANCE; return; } } /* * mas_store_b_node() - Store an @entry into the b_node while also copying the * data from a maple encoded node. * @wr_mas: the maple write state * @b_node: the maple_big_node to fill with data * @offset_end: the offset to end copying * * Return: The actual end of the data stored in @b_node */ static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node, unsigned char offset_end) { unsigned char slot; unsigned char b_end; /* Possible underflow of piv will wrap back to 0 before use. */ unsigned long piv; struct ma_state *mas = wr_mas->mas; b_node->type = wr_mas->type; b_end = 0; slot = mas->offset; if (slot) { /* Copy start data up to insert. */ mas_mab_cp(mas, 0, slot - 1, b_node, 0); b_end = b_node->b_end; piv = b_node->pivot[b_end - 1]; } else piv = mas->min - 1; if (piv + 1 < mas->index) { /* Handle range starting after old range */ b_node->slot[b_end] = wr_mas->content; if (!wr_mas->content) b_node->gap[b_end] = mas->index - 1 - piv; b_node->pivot[b_end++] = mas->index - 1; } /* Store the new entry. */ mas->offset = b_end; b_node->slot[b_end] = wr_mas->entry; b_node->pivot[b_end] = mas->last; /* Appended. */ if (mas->last >= mas->max) goto b_end; /* Handle new range ending before old range ends */ piv = mas_safe_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type); if (piv > mas->last) { if (piv == ULONG_MAX) mas_bulk_rebalance(mas, b_node->b_end, wr_mas->type); if (offset_end != slot) wr_mas->content = mas_slot_locked(mas, wr_mas->slots, offset_end); b_node->slot[++b_end] = wr_mas->content; if (!wr_mas->content) b_node->gap[b_end] = piv - mas->last + 1; b_node->pivot[b_end] = piv; } slot = offset_end + 1; if (slot > mas->end) goto b_end; /* Copy end data to the end of the node. */ mas_mab_cp(mas, slot, mas->end + 1, b_node, ++b_end); b_node->b_end--; return; b_end: b_node->b_end = b_end; } /* * mas_prev_sibling() - Find the previous node with the same parent. * @mas: the maple state * * Return: True if there is a previous sibling, false otherwise. */ static inline bool mas_prev_sibling(struct ma_state *mas) { unsigned int p_slot = mte_parent_slot(mas->node); /* For root node, p_slot is set to 0 by mte_parent_slot(). */ if (!p_slot) return false; mas_ascend(mas); mas->offset = p_slot - 1; mas_descend(mas); return true; } /* * mas_next_sibling() - Find the next node with the same parent. * @mas: the maple state * * Return: true if there is a next sibling, false otherwise. */ static inline bool mas_next_sibling(struct ma_state *mas) { MA_STATE(parent, mas->tree, mas->index, mas->last); if (mte_is_root(mas->node)) return false; parent = *mas; mas_ascend(&parent); parent.offset = mte_parent_slot(mas->node) + 1; if (parent.offset > mas_data_end(&parent)) return false; *mas = parent; mas_descend(mas); return true; } /* * mas_node_or_none() - Set the enode and state. * @mas: the maple state * @enode: The encoded maple node. * * Set the node to the enode and the status. */ static inline void mas_node_or_none(struct ma_state *mas, struct maple_enode *enode) { if (enode) { mas->node = enode; mas->status = ma_active; } else { mas->node = NULL; mas->status = ma_none; } } /* * mas_wr_node_walk() - Find the correct offset for the index in the @mas. * If @mas->index cannot be found within the containing * node, we traverse to the last entry in the node. * @wr_mas: The maple write state * * Uses mas_slot_locked() and does not need to worry about dead nodes. */ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char count, offset; if (unlikely(ma_is_dense(wr_mas->type))) { wr_mas->r_max = wr_mas->r_min = mas->index; mas->offset = mas->index = mas->min; return; } wr_mas->node = mas_mn(wr_mas->mas); wr_mas->pivots = ma_pivots(wr_mas->node, wr_mas->type); count = mas->end = ma_data_end(wr_mas->node, wr_mas->type, wr_mas->pivots, mas->max); offset = mas->offset; while (offset < count && mas->index > wr_mas->pivots[offset]) offset++; wr_mas->r_max = offset < count ? wr_mas->pivots[offset] : mas->max; wr_mas->r_min = mas_safe_min(mas, wr_mas->pivots, offset); wr_mas->offset_end = mas->offset = offset; } /* * mast_rebalance_next() - Rebalance against the next node * @mast: The maple subtree state */ static inline void mast_rebalance_next(struct maple_subtree_state *mast) { unsigned char b_end = mast->bn->b_end; mas_mab_cp(mast->orig_r, 0, mt_slot_count(mast->orig_r->node), mast->bn, b_end); mast->orig_r->last = mast->orig_r->max; } /* * mast_rebalance_prev() - Rebalance against the previous node * @mast: The maple subtree state */ static inline void mast_rebalance_prev(struct maple_subtree_state *mast) { unsigned char end = mas_data_end(mast->orig_l) + 1; unsigned char b_end = mast->bn->b_end; mab_shift_right(mast->bn, end); mas_mab_cp(mast->orig_l, 0, end - 1, mast->bn, 0); mast->l->min = mast->orig_l->min; mast->orig_l->index = mast->orig_l->min; mast->bn->b_end = end + b_end; mast->l->offset += end; } /* * mast_spanning_rebalance() - Rebalance nodes with nearest neighbour favouring * the node to the right. Checking the nodes to the right then the left at each * level upwards until root is reached. * Data is copied into the @mast->bn. * @mast: The maple_subtree_state. */ static inline bool mast_spanning_rebalance(struct maple_subtree_state *mast) { struct ma_state r_tmp = *mast->orig_r; struct ma_state l_tmp = *mast->orig_l; unsigned char depth = 0; do { mas_ascend(mast->orig_r); mas_ascend(mast->orig_l); depth++; if (mast->orig_r->offset < mas_data_end(mast->orig_r)) { mast->orig_r->offset++; do { mas_descend(mast->orig_r); mast->orig_r->offset = 0; } while (--depth); mast_rebalance_next(mast); *mast->orig_l = l_tmp; return true; } else if (mast->orig_l->offset != 0) { mast->orig_l->offset--; do { mas_descend(mast->orig_l); mast->orig_l->offset = mas_data_end(mast->orig_l); } while (--depth); mast_rebalance_prev(mast); *mast->orig_r = r_tmp; return true; } } while (!mte_is_root(mast->orig_r->node)); *mast->orig_r = r_tmp; *mast->orig_l = l_tmp; return false; } /* * mast_ascend() - Ascend the original left and right maple states. * @mast: the maple subtree state. * * Ascend the original left and right sides. Set the offsets to point to the * data already in the new tree (@mast->l and @mast->r). */ static inline void mast_ascend(struct maple_subtree_state *mast) { MA_WR_STATE(wr_mas, mast->orig_r, NULL); mas_ascend(mast->orig_l); mas_ascend(mast->orig_r); mast->orig_r->offset = 0; mast->orig_r->index = mast->r->max; /* last should be larger than or equal to index */ if (mast->orig_r->last < mast->orig_r->index) mast->orig_r->last = mast->orig_r->index; wr_mas.type = mte_node_type(mast->orig_r->node); mas_wr_node_walk(&wr_mas); /* Set up the left side of things */ mast->orig_l->offset = 0; mast->orig_l->index = mast->l->min; wr_mas.mas = mast->orig_l; wr_mas.type = mte_node_type(mast->orig_l->node); mas_wr_node_walk(&wr_mas); mast->bn->type = wr_mas.type; } /* * mas_new_ma_node() - Create and return a new maple node. Helper function. * @mas: the maple state with the allocations. * @b_node: the maple_big_node with the type encoding. * * Use the node type from the maple_big_node to allocate a new node from the * ma_state. This function exists mainly for code readability. * * Return: A new maple encoded node */ static inline struct maple_enode *mas_new_ma_node(struct ma_state *mas, struct maple_big_node *b_node) { return mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), b_node->type); } /* * mas_mab_to_node() - Set up right and middle nodes * * @mas: the maple state that contains the allocations. * @b_node: the node which contains the data. * @left: The pointer which will have the left node * @right: The pointer which may have the right node * @middle: the pointer which may have the middle node (rare) * @mid_split: the split location for the middle node * * Return: the split of left. */ static inline unsigned char mas_mab_to_node(struct ma_state *mas, struct maple_big_node *b_node, struct maple_enode **left, struct maple_enode **right, struct maple_enode **middle, unsigned char *mid_split) { unsigned char split = 0; unsigned char slot_count = mt_slots[b_node->type]; *left = mas_new_ma_node(mas, b_node); *right = NULL; *middle = NULL; *mid_split = 0; if (b_node->b_end < slot_count) { split = b_node->b_end; } else { split = mab_calc_split(mas, b_node, mid_split); *right = mas_new_ma_node(mas, b_node); } if (*mid_split) *middle = mas_new_ma_node(mas, b_node); return split; } /* * mab_set_b_end() - Add entry to b_node at b_node->b_end and increment the end * pointer. * @b_node: the big node to add the entry * @mas: the maple state to get the pivot (mas->max) * @entry: the entry to add, if NULL nothing happens. */ static inline void mab_set_b_end(struct maple_big_node *b_node, struct ma_state *mas, void *entry) { if (!entry) return; b_node->slot[b_node->b_end] = entry; if (mt_is_alloc(mas->tree)) b_node->gap[b_node->b_end] = mas_max_gap(mas); b_node->pivot[b_node->b_end++] = mas->max; } /* * mas_set_split_parent() - combine_then_separate helper function. Sets the parent * of @mas->node to either @left or @right, depending on @slot and @split * * @mas: the maple state with the node that needs a parent * @left: possible parent 1 * @right: possible parent 2 * @slot: the slot the mas->node was placed * @split: the split location between @left and @right */ static inline void mas_set_split_parent(struct ma_state *mas, struct maple_enode *left, struct maple_enode *right, unsigned char *slot, unsigned char split) { if (mas_is_none(mas)) return; if ((*slot) <= split) mas_set_parent(mas, mas->node, left, *slot); else if (right) mas_set_parent(mas, mas->node, right, (*slot) - split - 1); (*slot)++; } /* * mte_mid_split_check() - Check if the next node passes the mid-split * @l: Pointer to left encoded maple node. * @m: Pointer to middle encoded maple node. * @r: Pointer to right encoded maple node. * @slot: The offset * @split: The split location. * @mid_split: The middle split. */ static inline void mte_mid_split_check(struct maple_enode **l, struct maple_enode **r, struct maple_enode *right, unsigned char slot, unsigned char *split, unsigned char mid_split) { if (*r == right) return; if (slot < mid_split) return; *l = *r; *r = right; *split = mid_split; } /* * mast_set_split_parents() - Helper function to set three nodes parents. Slot * is taken from @mast->l. * @mast: the maple subtree state * @left: the left node * @right: the right node * @split: the split location. */ static inline void mast_set_split_parents(struct maple_subtree_state *mast, struct maple_enode *left, struct maple_enode *middle, struct maple_enode *right, unsigned char split, unsigned char mid_split) { unsigned char slot; struct maple_enode *l = left; struct maple_enode *r = right; if (mas_is_none(mast->l)) return; if (middle) r = middle; slot = mast->l->offset; mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->l, l, r, &slot, split); mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->m, l, r, &slot, split); mte_mid_split_check(&l, &r, right, slot, &split, mid_split); mas_set_split_parent(mast->r, l, r, &slot, split); } /* * mas_topiary_node() - Dispose of a single node * @mas: The maple state for pushing nodes * @in_rcu: If the tree is in rcu mode * * The node will either be RCU freed or pushed back on the maple state. */ static inline void mas_topiary_node(struct ma_state *mas, struct ma_state *tmp_mas, bool in_rcu) { struct maple_node *tmp; struct maple_enode *enode; if (mas_is_none(tmp_mas)) return; enode = tmp_mas->node; tmp = mte_to_node(enode); mte_set_node_dead(enode); if (in_rcu) ma_free_rcu(tmp); else mas_push_node(mas, tmp); } /* * mas_topiary_replace() - Replace the data with new data, then repair the * parent links within the new tree. Iterate over the dead sub-tree and collect * the dead subtrees and topiary the nodes that are no longer of use. * * The new tree will have up to three children with the correct parent. Keep * track of the new entries as they need to be followed to find the next level * of new entries. * * The old tree will have up to three children with the old parent. Keep track * of the old entries as they may have more nodes below replaced. Nodes within * [index, last] are dead subtrees, others need to be freed and followed. * * @mas: The maple state pointing at the new data * @old_enode: The maple encoded node being replaced * */ static inline void mas_topiary_replace(struct ma_state *mas, struct maple_enode *old_enode) { struct ma_state tmp[3], tmp_next[3]; MA_TOPIARY(subtrees, mas->tree); bool in_rcu; int i, n; /* Place data in tree & then mark node as old */ mas_put_in_tree(mas, old_enode); /* Update the parent pointers in the tree */ tmp[0] = *mas; tmp[0].offset = 0; tmp[1].status = ma_none; tmp[2].status = ma_none; while (!mte_is_leaf(tmp[0].node)) { n = 0; for (i = 0; i < 3; i++) { if (mas_is_none(&tmp[i])) continue; while (n < 3) { if (!mas_find_child(&tmp[i], &tmp_next[n])) break; n++; } mas_adopt_children(&tmp[i], tmp[i].node); } if (MAS_WARN_ON(mas, n == 0)) break; while (n < 3) tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) tmp[i] = tmp_next[i]; } /* Collect the old nodes that need to be discarded */ if (mte_is_leaf(old_enode)) return mas_free(mas, old_enode); tmp[0] = *mas; tmp[0].offset = 0; tmp[0].node = old_enode; tmp[1].status = ma_none; tmp[2].status = ma_none; in_rcu = mt_in_rcu(mas->tree); do { n = 0; for (i = 0; i < 3; i++) { if (mas_is_none(&tmp[i])) continue; while (n < 3) { if (!mas_find_child(&tmp[i], &tmp_next[n])) break; if ((tmp_next[n].min >= tmp_next->index) && (tmp_next[n].max <= tmp_next->last)) { mat_add(&subtrees, tmp_next[n].node); tmp_next[n].status = ma_none; } else { n++; } } } if (MAS_WARN_ON(mas, n == 0)) break; while (n < 3) tmp_next[n++].status = ma_none; for (i = 0; i < 3; i++) { mas_topiary_node(mas, &tmp[i], in_rcu); tmp[i] = tmp_next[i]; } } while (!mte_is_leaf(tmp[0].node)); for (i = 0; i < 3; i++) mas_topiary_node(mas, &tmp[i], in_rcu); mas_mat_destroy(mas, &subtrees); } /* * mas_wmb_replace() - Write memory barrier and replace * @mas: The maple state * @old_enode: The old maple encoded node that is being replaced. * * Updates gap as necessary. */ static inline void mas_wmb_replace(struct ma_state *mas, struct maple_enode *old_enode) { /* Insert the new data in the tree */ mas_topiary_replace(mas, old_enode); if (mte_is_leaf(mas->node)) return; mas_update_gap(mas); } /* * mast_cp_to_nodes() - Copy data out to nodes. * @mast: The maple subtree state * @left: The left encoded maple node * @middle: The middle encoded maple node * @right: The right encoded maple node * @split: The location to split between left and (middle ? middle : right) * @mid_split: The location to split between middle and right. */ static inline void mast_cp_to_nodes(struct maple_subtree_state *mast, struct maple_enode *left, struct maple_enode *middle, struct maple_enode *right, unsigned char split, unsigned char mid_split) { bool new_lmax = true; mas_node_or_none(mast->l, left); mas_node_or_none(mast->m, middle); mas_node_or_none(mast->r, right); mast->l->min = mast->orig_l->min; if (split == mast->bn->b_end) { mast->l->max = mast->orig_r->max; new_lmax = false; } mab_mas_cp(mast->bn, 0, split, mast->l, new_lmax); if (middle) { mab_mas_cp(mast->bn, 1 + split, mid_split, mast->m, true); mast->m->min = mast->bn->pivot[split] + 1; split = mid_split; } mast->r->max = mast->orig_r->max; if (right) { mab_mas_cp(mast->bn, 1 + split, mast->bn->b_end, mast->r, false); mast->r->min = mast->bn->pivot[split] + 1; } } /* * mast_combine_cp_left - Copy in the original left side of the tree into the * combined data set in the maple subtree state big node. * @mast: The maple subtree state */ static inline void mast_combine_cp_left(struct maple_subtree_state *mast) { unsigned char l_slot = mast->orig_l->offset; if (!l_slot) return; mas_mab_cp(mast->orig_l, 0, l_slot - 1, mast->bn, 0); } /* * mast_combine_cp_right: Copy in the original right side of the tree into the * combined data set in the maple subtree state big node. * @mast: The maple subtree state */ static inline void mast_combine_cp_right(struct maple_subtree_state *mast) { if (mast->bn->pivot[mast->bn->b_end - 1] >= mast->orig_r->max) return; mas_mab_cp(mast->orig_r, mast->orig_r->offset + 1, mt_slot_count(mast->orig_r->node), mast->bn, mast->bn->b_end); mast->orig_r->last = mast->orig_r->max; } /* * mast_sufficient: Check if the maple subtree state has enough data in the big * node to create at least one sufficient node * @mast: the maple subtree state */ static inline bool mast_sufficient(struct maple_subtree_state *mast) { if (mast->bn->b_end > mt_min_slot_count(mast->orig_l->node)) return true; return false; } /* * mast_overflow: Check if there is too much data in the subtree state for a * single node. * @mast: The maple subtree state */ static inline bool mast_overflow(struct maple_subtree_state *mast) { if (mast->bn->b_end >= mt_slot_count(mast->orig_l->node)) return true; return false; } static inline void *mtree_range_walk(struct ma_state *mas) { unsigned long *pivots; unsigned char offset; struct maple_node *node; struct maple_enode *next, *last; enum maple_type type; void __rcu **slots; unsigned char end; unsigned long max, min; unsigned long prev_max, prev_min; next = mas->node; min = mas->min; max = mas->max; do { last = next; node = mte_to_node(next); type = mte_node_type(next); pivots = ma_pivots(node, type); end = ma_data_end(node, type, pivots, max); prev_min = min; prev_max = max; if (pivots[0] >= mas->index) { offset = 0; max = pivots[0]; goto next; } offset = 1; while (offset < end) { if (pivots[offset] >= mas->index) { max = pivots[offset]; break; } offset++; } min = pivots[offset - 1] + 1; next: slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); if (unlikely(ma_dead_node(node))) goto dead_node; } while (!ma_is_leaf(type)); mas->end = end; mas->offset = offset; mas->index = min; mas->last = max; mas->min = prev_min; mas->max = prev_max; mas->node = last; return (void *)next; dead_node: mas_reset(mas); return NULL; } /* * mas_spanning_rebalance() - Rebalance across two nodes which may not be peers. * @mas: The starting maple state * @mast: The maple_subtree_state, keeps track of 4 maple states. * @count: The estimated count of iterations needed. * * Follow the tree upwards from @l_mas and @r_mas for @count, or until the root * is hit. First @b_node is split into two entries which are inserted into the * next iteration of the loop. @b_node is returned populated with the final * iteration. @mas is used to obtain allocations. orig_l_mas keeps track of the * nodes that will remain active by using orig_l_mas->index and orig_l_mas->last * to account of what has been copied into the new sub-tree. The update of * orig_l_mas->last is used in mas_consume to find the slots that will need to * be either freed or destroyed. orig_l_mas->depth keeps track of the height of * the new sub-tree in case the sub-tree becomes the full tree. */ static void mas_spanning_rebalance(struct ma_state *mas, struct maple_subtree_state *mast, unsigned char count) { unsigned char split, mid_split; unsigned char slot = 0; struct maple_enode *left = NULL, *middle = NULL, *right = NULL; struct maple_enode *old_enode; MA_STATE(l_mas, mas->tree, mas->index, mas->index); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(m_mas, mas->tree, mas->index, mas->index); /* * The tree needs to be rebalanced and leaves need to be kept at the same level. * Rebalancing is done by use of the ``struct maple_topiary``. */ mast->l = &l_mas; mast->m = &m_mas; mast->r = &r_mas; l_mas.status = r_mas.status = m_mas.status = ma_none; /* Check if this is not root and has sufficient data. */ if (((mast->orig_l->min != 0) || (mast->orig_r->max != ULONG_MAX)) && unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) mast_spanning_rebalance(mast); l_mas.depth = 0; /* * Each level of the tree is examined and balanced, pushing data to the left or * right, or rebalancing against left or right nodes is employed to avoid * rippling up the tree to limit the amount of churn. Once a new sub-section of * the tree is created, there may be a mix of new and old nodes. The old nodes * will have the incorrect parent pointers and currently be in two trees: the * original tree and the partially new tree. To remedy the parent pointers in * the old tree, the new data is swapped into the active tree and a walk down * the tree is performed and the parent pointers are updated. * See mas_topiary_replace() for more information. */ while (count--) { mast->bn->b_end--; mast->bn->type = mte_node_type(mast->orig_l->node); split = mas_mab_to_node(mas, mast->bn, &left, &right, &middle, &mid_split); mast_set_split_parents(mast, left, middle, right, split, mid_split); mast_cp_to_nodes(mast, left, middle, right, split, mid_split); /* * Copy data from next level in the tree to mast->bn from next * iteration */ memset(mast->bn, 0, sizeof(struct maple_big_node)); mast->bn->type = mte_node_type(left); l_mas.depth++; /* Root already stored in l->node. */ if (mas_is_root_limits(mast->l)) goto new_root; mast_ascend(mast); mast_combine_cp_left(mast); l_mas.offset = mast->bn->b_end; mab_set_b_end(mast->bn, &l_mas, left); mab_set_b_end(mast->bn, &m_mas, middle); mab_set_b_end(mast->bn, &r_mas, right); /* Copy anything necessary out of the right node. */ mast_combine_cp_right(mast); mast->orig_l->last = mast->orig_l->max; if (mast_sufficient(mast)) continue; if (mast_overflow(mast)) continue; /* May be a new root stored in mast->bn */ if (mas_is_root_limits(mast->orig_l)) break; mast_spanning_rebalance(mast); /* rebalancing from other nodes may require another loop. */ if (!count) count++; } l_mas.node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), mte_node_type(mast->orig_l->node)); l_mas.depth++; mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true); mas_set_parent(mas, left, l_mas.node, slot); if (middle) mas_set_parent(mas, middle, l_mas.node, ++slot); if (right) mas_set_parent(mas, right, l_mas.node, ++slot); if (mas_is_root_limits(mast->l)) { new_root: mas_mn(mast->l)->parent = ma_parent_ptr(mas_tree_parent(mas)); while (!mte_is_root(mast->orig_l->node)) mast_ascend(mast); } else { mas_mn(&l_mas)->parent = mas_mn(mast->orig_l)->parent; } old_enode = mast->orig_l->node; mas->depth = l_mas.depth; mas->node = l_mas.node; mas->min = l_mas.min; mas->max = l_mas.max; mas->offset = l_mas.offset; mas_wmb_replace(mas, old_enode); mtree_range_walk(mas); return; } /* * mas_rebalance() - Rebalance a given node. * @mas: The maple state * @b_node: The big maple node. * * Rebalance two nodes into a single node or two new nodes that are sufficient. * Continue upwards until tree is sufficient. */ static inline void mas_rebalance(struct ma_state *mas, struct maple_big_node *b_node) { char empty_count = mas_mt_height(mas); struct maple_subtree_state mast; unsigned char shift, b_end = ++b_node->b_end; MA_STATE(l_mas, mas->tree, mas->index, mas->last); MA_STATE(r_mas, mas->tree, mas->index, mas->last); trace_ma_op(__func__, mas); /* * Rebalancing occurs if a node is insufficient. Data is rebalanced * against the node to the right if it exists, otherwise the node to the * left of this node is rebalanced against this node. If rebalancing * causes just one node to be produced instead of two, then the parent * is also examined and rebalanced if it is insufficient. Every level * tries to combine the data in the same way. If one node contains the * entire range of the tree, then that node is used as a new root node. */ mast.orig_l = &l_mas; mast.orig_r = &r_mas; mast.bn = b_node; mast.bn->type = mte_node_type(mas->node); l_mas = r_mas = *mas; if (mas_next_sibling(&r_mas)) { mas_mab_cp(&r_mas, 0, mt_slot_count(r_mas.node), b_node, b_end); r_mas.last = r_mas.index = r_mas.max; } else { mas_prev_sibling(&l_mas); shift = mas_data_end(&l_mas) + 1; mab_shift_right(b_node, shift); mas->offset += shift; mas_mab_cp(&l_mas, 0, shift - 1, b_node, 0); b_node->b_end = shift + b_end; l_mas.index = l_mas.last = l_mas.min; } return mas_spanning_rebalance(mas, &mast, empty_count); } /* * mas_destroy_rebalance() - Rebalance left-most node while destroying the maple * state. * @mas: The maple state * @end: The end of the left-most node. * * During a mass-insert event (such as forking), it may be necessary to * rebalance the left-most node when it is not sufficient. */ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end) { enum maple_type mt = mte_node_type(mas->node); struct maple_node reuse, *newnode, *parent, *new_left, *left, *node; struct maple_enode *eparent, *old_eparent; unsigned char offset, tmp, split = mt_slots[mt] / 2; void __rcu **l_slots, **slots; unsigned long *l_pivs, *pivs, gap; bool in_rcu = mt_in_rcu(mas->tree); MA_STATE(l_mas, mas->tree, mas->index, mas->last); l_mas = *mas; mas_prev_sibling(&l_mas); /* set up node. */ if (in_rcu) { newnode = mas_pop_node(mas); } else { newnode = &reuse; } node = mas_mn(mas); newnode->parent = node->parent; slots = ma_slots(newnode, mt); pivs = ma_pivots(newnode, mt); left = mas_mn(&l_mas); l_slots = ma_slots(left, mt); l_pivs = ma_pivots(left, mt); if (!l_slots[split]) split++; tmp = mas_data_end(&l_mas) - split; memcpy(slots, l_slots + split + 1, sizeof(void *) * tmp); memcpy(pivs, l_pivs + split + 1, sizeof(unsigned long) * tmp); pivs[tmp] = l_mas.max; memcpy(slots + tmp, ma_slots(node, mt), sizeof(void *) * end); memcpy(pivs + tmp, ma_pivots(node, mt), sizeof(unsigned long) * end); l_mas.max = l_pivs[split]; mas->min = l_mas.max + 1; old_eparent = mt_mk_node(mte_parent(l_mas.node), mas_parent_type(&l_mas, l_mas.node)); tmp += end; if (!in_rcu) { unsigned char max_p = mt_pivots[mt]; unsigned char max_s = mt_slots[mt]; if (tmp < max_p) memset(pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp)); if (tmp < mt_slots[mt]) memset(slots + tmp, 0, sizeof(void *) * (max_s - tmp)); memcpy(node, newnode, sizeof(struct maple_node)); ma_set_meta(node, mt, 0, tmp - 1); mte_set_pivot(old_eparent, mte_parent_slot(l_mas.node), l_pivs[split]); /* Remove data from l_pivs. */ tmp = split + 1; memset(l_pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp)); memset(l_slots + tmp, 0, sizeof(void *) * (max_s - tmp)); ma_set_meta(left, mt, 0, split); eparent = old_eparent; goto done; } /* RCU requires replacing both l_mas, mas, and parent. */ mas->node = mt_mk_node(newnode, mt); ma_set_meta(newnode, mt, 0, tmp); new_left = mas_pop_node(mas); new_left->parent = left->parent; mt = mte_node_type(l_mas.node); slots = ma_slots(new_left, mt); pivs = ma_pivots(new_left, mt); memcpy(slots, l_slots, sizeof(void *) * split); memcpy(pivs, l_pivs, sizeof(unsigned long) * split); ma_set_meta(new_left, mt, 0, split); l_mas.node = mt_mk_node(new_left, mt); /* replace parent. */ offset = mte_parent_slot(mas->node); mt = mas_parent_type(&l_mas, l_mas.node); parent = mas_pop_node(mas); slots = ma_slots(parent, mt); pivs = ma_pivots(parent, mt); memcpy(parent, mte_to_node(old_eparent), sizeof(struct maple_node)); rcu_assign_pointer(slots[offset], mas->node); rcu_assign_pointer(slots[offset - 1], l_mas.node); pivs[offset - 1] = l_mas.max; eparent = mt_mk_node(parent, mt); done: gap = mas_leaf_max_gap(mas); mte_set_gap(eparent, mte_parent_slot(mas->node), gap); gap = mas_leaf_max_gap(&l_mas); mte_set_gap(eparent, mte_parent_slot(l_mas.node), gap); mas_ascend(mas); if (in_rcu) { mas_replace_node(mas, old_eparent); mas_adopt_children(mas, mas->node); } mas_update_gap(mas); } /* * mas_split_final_node() - Split the final node in a subtree operation. * @mast: the maple subtree state * @mas: The maple state * @height: The height of the tree in case it's a new root. */ static inline void mas_split_final_node(struct maple_subtree_state *mast, struct ma_state *mas, int height) { struct maple_enode *ancestor; if (mte_is_root(mas->node)) { if (mt_is_alloc(mas->tree)) mast->bn->type = maple_arange_64; else mast->bn->type = maple_range_64; mas->depth = height; } /* * Only a single node is used here, could be root. * The Big_node data should just fit in a single node. */ ancestor = mas_new_ma_node(mas, mast->bn); mas_set_parent(mas, mast->l->node, ancestor, mast->l->offset); mas_set_parent(mas, mast->r->node, ancestor, mast->r->offset); mte_to_node(ancestor)->parent = mas_mn(mas)->parent; mast->l->node = ancestor; mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, mast->l, true); mas->offset = mast->bn->b_end - 1; } /* * mast_fill_bnode() - Copy data into the big node in the subtree state * @mast: The maple subtree state * @mas: the maple state * @skip: The number of entries to skip for new nodes insertion. */ static inline void mast_fill_bnode(struct maple_subtree_state *mast, struct ma_state *mas, unsigned char skip) { bool cp = true; unsigned char split; memset(mast->bn, 0, sizeof(struct maple_big_node)); if (mte_is_root(mas->node)) { cp = false; } else { mas_ascend(mas); mas->offset = mte_parent_slot(mas->node); } if (cp && mast->l->offset) mas_mab_cp(mas, 0, mast->l->offset - 1, mast->bn, 0); split = mast->bn->b_end; mab_set_b_end(mast->bn, mast->l, mast->l->node); mast->r->offset = mast->bn->b_end; mab_set_b_end(mast->bn, mast->r, mast->r->node); if (mast->bn->pivot[mast->bn->b_end - 1] == mas->max) cp = false; if (cp) mas_mab_cp(mas, split + skip, mt_slot_count(mas->node) - 1, mast->bn, mast->bn->b_end); mast->bn->b_end--; mast->bn->type = mte_node_type(mas->node); } /* * mast_split_data() - Split the data in the subtree state big node into regular * nodes. * @mast: The maple subtree state * @mas: The maple state * @split: The location to split the big node */ static inline void mast_split_data(struct maple_subtree_state *mast, struct ma_state *mas, unsigned char split) { unsigned char p_slot; mab_mas_cp(mast->bn, 0, split, mast->l, true); mte_set_pivot(mast->r->node, 0, mast->r->max); mab_mas_cp(mast->bn, split + 1, mast->bn->b_end, mast->r, false); mast->l->offset = mte_parent_slot(mas->node); mast->l->max = mast->bn->pivot[split]; mast->r->min = mast->l->max + 1; if (mte_is_leaf(mas->node)) return; p_slot = mast->orig_l->offset; mas_set_split_parent(mast->orig_l, mast->l->node, mast->r->node, &p_slot, split); mas_set_split_parent(mast->orig_r, mast->l->node, mast->r->node, &p_slot, split); } /* * mas_push_data() - Instead of splitting a node, it is beneficial to push the * data to the right or left node if there is room. * @mas: The maple state * @height: The current height of the maple state * @mast: The maple subtree state * @left: Push left or not. * * Keeping the height of the tree low means faster lookups. * * Return: True if pushed, false otherwise. */ static inline bool mas_push_data(struct ma_state *mas, int height, struct maple_subtree_state *mast, bool left) { unsigned char slot_total = mast->bn->b_end; unsigned char end, space, split; MA_STATE(tmp_mas, mas->tree, mas->index, mas->last); tmp_mas = *mas; tmp_mas.depth = mast->l->depth; if (left && !mas_prev_sibling(&tmp_mas)) return false; else if (!left && !mas_next_sibling(&tmp_mas)) return false; end = mas_data_end(&tmp_mas); slot_total += end; space = 2 * mt_slot_count(mas->node) - 2; /* -2 instead of -1 to ensure there isn't a triple split */ if (ma_is_leaf(mast->bn->type)) space--; if (mas->max == ULONG_MAX) space--; if (slot_total >= space) return false; /* Get the data; Fill mast->bn */ mast->bn->b_end++; if (left) { mab_shift_right(mast->bn, end + 1); mas_mab_cp(&tmp_mas, 0, end, mast->bn, 0); mast->bn->b_end = slot_total + 1; } else { mas_mab_cp(&tmp_mas, 0, end, mast->bn, mast->bn->b_end); } /* Configure mast for splitting of mast->bn */ split = mt_slots[mast->bn->type] - 2; if (left) { /* Switch mas to prev node */ *mas = tmp_mas; /* Start using mast->l for the left side. */ tmp_mas.node = mast->l->node; *mast->l = tmp_mas; } else { tmp_mas.node = mast->r->node; *mast->r = tmp_mas; split = slot_total - split; } split = mab_no_null_split(mast->bn, split, mt_slots[mast->bn->type]); /* Update parent slot for split calculation. */ if (left) mast->orig_l->offset += end + 1; mast_split_data(mast, mas, split); mast_fill_bnode(mast, mas, 2); mas_split_final_node(mast, mas, height + 1); return true; } /* * mas_split() - Split data that is too big for one node into two. * @mas: The maple state * @b_node: The maple big node */ static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) { struct maple_subtree_state mast; int height = 0; unsigned char mid_split, split = 0; struct maple_enode *old; /* * Splitting is handled differently from any other B-tree; the Maple * Tree splits upwards. Splitting up means that the split operation * occurs when the walk of the tree hits the leaves and not on the way * down. The reason for splitting up is that it is impossible to know * how much space will be needed until the leaf is (or leaves are) * reached. Since overwriting data is allowed and a range could * overwrite more than one range or result in changing one entry into 3 * entries, it is impossible to know if a split is required until the * data is examined. * * Splitting is a balancing act between keeping allocations to a minimum * and avoiding a 'jitter' event where a tree is expanded to make room * for an entry followed by a contraction when the entry is removed. To * accomplish the balance, there are empty slots remaining in both left * and right nodes after a split. */ MA_STATE(l_mas, mas->tree, mas->index, mas->last); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last); trace_ma_op(__func__, mas); mas->depth = mas_mt_height(mas); mast.l = &l_mas; mast.r = &r_mas; mast.orig_l = &prev_l_mas; mast.orig_r = &prev_r_mas; mast.bn = b_node; while (height++ <= mas->depth) { if (mt_slots[b_node->type] > b_node->b_end) { mas_split_final_node(&mast, mas, height); break; } l_mas = r_mas = *mas; l_mas.node = mas_new_ma_node(mas, b_node); r_mas.node = mas_new_ma_node(mas, b_node); /* * Another way that 'jitter' is avoided is to terminate a split up early if the * left or right node has space to spare. This is referred to as "pushing left" * or "pushing right" and is similar to the B* tree, except the nodes left or * right can rarely be reused due to RCU, but the ripple upwards is halted which * is a significant savings. */ /* Try to push left. */ if (mas_push_data(mas, height, &mast, true)) break; /* Try to push right. */ if (mas_push_data(mas, height, &mast, false)) break; split = mab_calc_split(mas, b_node, &mid_split); mast_split_data(&mast, mas, split); /* * Usually correct, mab_mas_cp in the above call overwrites * r->max. */ mast.r->max = mas->max; mast_fill_bnode(&mast, mas, 1); prev_l_mas = *mast.l; prev_r_mas = *mast.r; } /* Set the original node as dead */ old = mas->node; mas->node = l_mas.node; mas_wmb_replace(mas, old); mtree_range_walk(mas); return; } /* * mas_commit_b_node() - Commit the big node into the tree. * @wr_mas: The maple write state * @b_node: The maple big node */ static noinline_for_kasan void mas_commit_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node) { enum store_type type = wr_mas->mas->store_type; WARN_ON_ONCE(type != wr_rebalance && type != wr_split_store); if (type == wr_rebalance) return mas_rebalance(wr_mas->mas, b_node); return mas_split(wr_mas->mas, b_node); } /* * mas_root_expand() - Expand a root to a node * @mas: The maple state * @entry: The entry to store into the tree */ static inline void mas_root_expand(struct ma_state *mas, void *entry) { void *contents = mas_root_locked(mas); enum maple_type type = maple_leaf_64; struct maple_node *node; void __rcu **slots; unsigned long *pivots; int slot = 0; node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); mas->status = ma_active; if (mas->index) { if (contents) { rcu_assign_pointer(slots[slot], contents); if (likely(mas->index > 1)) slot++; } pivots[slot++] = mas->index - 1; } rcu_assign_pointer(slots[slot], entry); mas->offset = slot; pivots[slot] = mas->last; if (mas->last != ULONG_MAX) pivots[++slot] = ULONG_MAX; mas->depth = 1; mas_set_height(mas); ma_set_meta(node, maple_leaf_64, 0, slot); /* swap the new root into the tree */ rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); return; } /* * mas_store_root() - Storing value into root. * @mas: The maple state * @entry: The entry to store. * * There is no root node now and we are storing a value into the root - this * function either assigns the pointer or expands into a node. */ static inline void mas_store_root(struct ma_state *mas, void *entry) { if (!entry) { if (!mas->index) rcu_assign_pointer(mas->tree->ma_root, NULL); } else if (likely((mas->last != 0) || (mas->index != 0))) mas_root_expand(mas, entry); else if (((unsigned long) (entry) & 3) == 2) mas_root_expand(mas, entry); else { rcu_assign_pointer(mas->tree->ma_root, entry); mas->status = ma_start; } } /* * mas_is_span_wr() - Check if the write needs to be treated as a write that * spans the node. * @wr_mas: The maple write state * * Spanning writes are writes that start in one node and end in another OR if * the write of a %NULL will cause the node to end with a %NULL. * * Return: True if this is a spanning write, false otherwise. */ static bool mas_is_span_wr(struct ma_wr_state *wr_mas) { unsigned long max = wr_mas->r_max; unsigned long last = wr_mas->mas->last; enum maple_type type = wr_mas->type; void *entry = wr_mas->entry; /* Contained in this pivot, fast path */ if (last < max) return false; if (ma_is_leaf(type)) { max = wr_mas->mas->max; if (last < max) return false; } if (last == max) { /* * The last entry of leaf node cannot be NULL unless it is the * rightmost node (writing ULONG_MAX), otherwise it spans slots. */ if (entry || last == ULONG_MAX) return false; } trace_ma_write(__func__, wr_mas->mas, wr_mas->r_max, entry); return true; } static inline void mas_wr_walk_descend(struct ma_wr_state *wr_mas) { wr_mas->type = mte_node_type(wr_mas->mas->node); mas_wr_node_walk(wr_mas); wr_mas->slots = ma_slots(wr_mas->node, wr_mas->type); } static inline void mas_wr_walk_traverse(struct ma_wr_state *wr_mas) { wr_mas->mas->max = wr_mas->r_max; wr_mas->mas->min = wr_mas->r_min; wr_mas->mas->node = wr_mas->content; wr_mas->mas->offset = 0; wr_mas->mas->depth++; } /* * mas_wr_walk() - Walk the tree for a write. * @wr_mas: The maple write state * * Uses mas_slot_locked() and does not need to worry about dead nodes. * * Return: True if it's contained in a node, false on spanning write. */ static bool mas_wr_walk(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; while (true) { mas_wr_walk_descend(wr_mas); if (unlikely(mas_is_span_wr(wr_mas))) return false; wr_mas->content = mas_slot_locked(mas, wr_mas->slots, mas->offset); if (ma_is_leaf(wr_mas->type)) return true; mas_wr_walk_traverse(wr_mas); } return true; } static void mas_wr_walk_index(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; while (true) { mas_wr_walk_descend(wr_mas); wr_mas->content = mas_slot_locked(mas, wr_mas->slots, mas->offset); if (ma_is_leaf(wr_mas->type)) return; mas_wr_walk_traverse(wr_mas); } } /* * mas_extend_spanning_null() - Extend a store of a %NULL to include surrounding %NULLs. * @l_wr_mas: The left maple write state * @r_wr_mas: The right maple write state */ static inline void mas_extend_spanning_null(struct ma_wr_state *l_wr_mas, struct ma_wr_state *r_wr_mas) { struct ma_state *r_mas = r_wr_mas->mas; struct ma_state *l_mas = l_wr_mas->mas; unsigned char l_slot; l_slot = l_mas->offset; if (!l_wr_mas->content) l_mas->index = l_wr_mas->r_min; if ((l_mas->index == l_wr_mas->r_min) && (l_slot && !mas_slot_locked(l_mas, l_wr_mas->slots, l_slot - 1))) { if (l_slot > 1) l_mas->index = l_wr_mas->pivots[l_slot - 2] + 1; else l_mas->index = l_mas->min; l_mas->offset = l_slot - 1; } if (!r_wr_mas->content) { if (r_mas->last < r_wr_mas->r_max) r_mas->last = r_wr_mas->r_max; r_mas->offset++; } else if ((r_mas->last == r_wr_mas->r_max) && (r_mas->last < r_mas->max) && !mas_slot_locked(r_mas, r_wr_mas->slots, r_mas->offset + 1)) { r_mas->last = mas_safe_pivot(r_mas, r_wr_mas->pivots, r_wr_mas->type, r_mas->offset + 1); r_mas->offset++; } } static inline void *mas_state_walk(struct ma_state *mas) { void *entry; entry = mas_start(mas); if (mas_is_none(mas)) return NULL; if (mas_is_ptr(mas)) return entry; return mtree_range_walk(mas); } /* * mtree_lookup_walk() - Internal quick lookup that does not keep maple state up * to date. * * @mas: The maple state. * * Note: Leaves mas in undesirable state. * Return: The entry for @mas->index or %NULL on dead node. */ static inline void *mtree_lookup_walk(struct ma_state *mas) { unsigned long *pivots; unsigned char offset; struct maple_node *node; struct maple_enode *next; enum maple_type type; void __rcu **slots; unsigned char end; next = mas->node; do { node = mte_to_node(next); type = mte_node_type(next); pivots = ma_pivots(node, type); end = mt_pivots[type]; offset = 0; do { if (pivots[offset] >= mas->index) break; } while (++offset < end); slots = ma_slots(node, type); next = mt_slot(mas->tree, slots, offset); if (unlikely(ma_dead_node(node))) goto dead_node; } while (!ma_is_leaf(type)); return (void *)next; dead_node: mas_reset(mas); return NULL; } static void mte_destroy_walk(struct maple_enode *, struct maple_tree *); /* * mas_new_root() - Create a new root node that only contains the entry passed * in. * @mas: The maple state * @entry: The entry to store. * * Only valid when the index == 0 and the last == ULONG_MAX */ static inline void mas_new_root(struct ma_state *mas, void *entry) { struct maple_enode *root = mas_root_locked(mas); enum maple_type type = maple_leaf_64; struct maple_node *node; void __rcu **slots; unsigned long *pivots; WARN_ON_ONCE(mas->index || mas->last != ULONG_MAX); if (!entry) { mas->depth = 0; mas_set_height(mas); rcu_assign_pointer(mas->tree->ma_root, entry); mas->status = ma_start; goto done; } node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); mas->status = ma_active; rcu_assign_pointer(slots[0], entry); pivots[0] = mas->last; mas->depth = 1; mas_set_height(mas); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); done: if (xa_is_node(root)) mte_destroy_walk(root, mas->tree); return; } /* * mas_wr_spanning_store() - Create a subtree with the store operation completed * and new nodes where necessary, then place the sub-tree in the actual tree. * Note that mas is expected to point to the node which caused the store to * span. * @wr_mas: The maple write state */ static noinline void mas_wr_spanning_store(struct ma_wr_state *wr_mas) { struct maple_subtree_state mast; struct maple_big_node b_node; struct ma_state *mas; unsigned char height; /* Left and Right side of spanning store */ MA_STATE(l_mas, NULL, 0, 0); MA_STATE(r_mas, NULL, 0, 0); MA_WR_STATE(r_wr_mas, &r_mas, wr_mas->entry); MA_WR_STATE(l_wr_mas, &l_mas, wr_mas->entry); /* * A store operation that spans multiple nodes is called a spanning * store and is handled early in the store call stack by the function * mas_is_span_wr(). When a spanning store is identified, the maple * state is duplicated. The first maple state walks the left tree path * to ``index``, the duplicate walks the right tree path to ``last``. * The data in the two nodes are combined into a single node, two nodes, * or possibly three nodes (see the 3-way split above). A ``NULL`` * written to the last entry of a node is considered a spanning store as * a rebalance is required for the operation to complete and an overflow * of data may happen. */ mas = wr_mas->mas; trace_ma_op(__func__, mas); if (unlikely(!mas->index && mas->last == ULONG_MAX)) return mas_new_root(mas, wr_mas->entry); /* * Node rebalancing may occur due to this store, so there may be three new * entries per level plus a new root. */ height = mas_mt_height(mas); /* * Set up right side. Need to get to the next offset after the spanning * store to ensure it's not NULL and to combine both the next node and * the node with the start together. */ r_mas = *mas; /* Avoid overflow, walk to next slot in the tree. */ if (r_mas.last + 1) r_mas.last++; r_mas.index = r_mas.last; mas_wr_walk_index(&r_wr_mas); r_mas.last = r_mas.index = mas->last; /* Set up left side. */ l_mas = *mas; mas_wr_walk_index(&l_wr_mas); if (!wr_mas->entry) { mas_extend_spanning_null(&l_wr_mas, &r_wr_mas); mas->offset = l_mas.offset; mas->index = l_mas.index; mas->last = l_mas.last = r_mas.last; } /* expanding NULLs may make this cover the entire range */ if (!l_mas.index && r_mas.last == ULONG_MAX) { mas_set_range(mas, 0, ULONG_MAX); return mas_new_root(mas, wr_mas->entry); } memset(&b_node, 0, sizeof(struct maple_big_node)); /* Copy l_mas and store the value in b_node. */ mas_store_b_node(&l_wr_mas, &b_node, l_mas.end); /* Copy r_mas into b_node if there is anything to copy. */ if (r_mas.max > r_mas.last) mas_mab_cp(&r_mas, r_mas.offset, r_mas.end, &b_node, b_node.b_end + 1); else b_node.b_end++; /* Stop spanning searches by searching for just index. */ l_mas.index = l_mas.last = mas->index; mast.bn = &b_node; mast.orig_l = &l_mas; mast.orig_r = &r_mas; /* Combine l_mas and r_mas and split them up evenly again. */ return mas_spanning_rebalance(mas, &mast, height + 1); } /* * mas_wr_node_store() - Attempt to store the value in a node * @wr_mas: The maple write state * * Attempts to reuse the node, but may allocate. */ static inline void mas_wr_node_store(struct ma_wr_state *wr_mas, unsigned char new_end) { struct ma_state *mas = wr_mas->mas; void __rcu **dst_slots; unsigned long *dst_pivots; unsigned char dst_offset, offset_end = wr_mas->offset_end; struct maple_node reuse, *newnode; unsigned char copy_size, node_pivots = mt_pivots[wr_mas->type]; bool in_rcu = mt_in_rcu(mas->tree); if (mas->last == wr_mas->end_piv) offset_end++; /* don't copy this offset */ else if (unlikely(wr_mas->r_max == ULONG_MAX)) mas_bulk_rebalance(mas, mas->end, wr_mas->type); /* set up node. */ if (in_rcu) { newnode = mas_pop_node(mas); } else { memset(&reuse, 0, sizeof(struct maple_node)); newnode = &reuse; } newnode->parent = mas_mn(mas)->parent; dst_pivots = ma_pivots(newnode, wr_mas->type); dst_slots = ma_slots(newnode, wr_mas->type); /* Copy from start to insert point */ memcpy(dst_pivots, wr_mas->pivots, sizeof(unsigned long) * mas->offset); memcpy(dst_slots, wr_mas->slots, sizeof(void *) * mas->offset); /* Handle insert of new range starting after old range */ if (wr_mas->r_min < mas->index) { rcu_assign_pointer(dst_slots[mas->offset], wr_mas->content); dst_pivots[mas->offset++] = mas->index - 1; } /* Store the new entry and range end. */ if (mas->offset < node_pivots) dst_pivots[mas->offset] = mas->last; rcu_assign_pointer(dst_slots[mas->offset], wr_mas->entry); /* * this range wrote to the end of the node or it overwrote the rest of * the data */ if (offset_end > mas->end) goto done; dst_offset = mas->offset + 1; /* Copy to the end of node if necessary. */ copy_size = mas->end - offset_end + 1; memcpy(dst_slots + dst_offset, wr_mas->slots + offset_end, sizeof(void *) * copy_size); memcpy(dst_pivots + dst_offset, wr_mas->pivots + offset_end, sizeof(unsigned long) * (copy_size - 1)); if (new_end < node_pivots) dst_pivots[new_end] = mas->max; done: mas_leaf_set_meta(newnode, maple_leaf_64, new_end); if (in_rcu) { struct maple_enode *old_enode = mas->node; mas->node = mt_mk_node(newnode, wr_mas->type); mas_replace_node(mas, old_enode); } else { memcpy(wr_mas->node, newnode, sizeof(struct maple_node)); } trace_ma_write(__func__, mas, 0, wr_mas->entry); mas_update_gap(mas); mas->end = new_end; return; } /* * mas_wr_slot_store: Attempt to store a value in a slot. * @wr_mas: the maple write state */ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char offset = mas->offset; void __rcu **slots = wr_mas->slots; bool gap = false; gap |= !mt_slot_locked(mas->tree, slots, offset); gap |= !mt_slot_locked(mas->tree, slots, offset + 1); if (wr_mas->offset_end - offset == 1) { if (mas->index == wr_mas->r_min) { /* Overwriting the range and a part of the next one */ rcu_assign_pointer(slots[offset], wr_mas->entry); wr_mas->pivots[offset] = mas->last; } else { /* Overwriting a part of the range and the next one */ rcu_assign_pointer(slots[offset + 1], wr_mas->entry); wr_mas->pivots[offset] = mas->index - 1; mas->offset++; /* Keep mas accurate. */ } } else { WARN_ON_ONCE(mt_in_rcu(mas->tree)); /* * Expand the range, only partially overwriting the previous and * next ranges */ gap |= !mt_slot_locked(mas->tree, slots, offset + 2); rcu_assign_pointer(slots[offset + 1], wr_mas->entry); wr_mas->pivots[offset] = mas->index - 1; wr_mas->pivots[offset + 1] = mas->last; mas->offset++; /* Keep mas accurate. */ } trace_ma_write(__func__, mas, 0, wr_mas->entry); /* * Only update gap when the new entry is empty or there is an empty * entry in the original two ranges. */ if (!wr_mas->entry || gap) mas_update_gap(mas); return; } static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; if (!wr_mas->slots[wr_mas->offset_end]) { /* If this one is null, the next and prev are not */ mas->last = wr_mas->end_piv; } else { /* Check next slot(s) if we are overwriting the end */ if ((mas->last == wr_mas->end_piv) && (mas->end != wr_mas->offset_end) && !wr_mas->slots[wr_mas->offset_end + 1]) { wr_mas->offset_end++; if (wr_mas->offset_end == mas->end) mas->last = mas->max; else mas->last = wr_mas->pivots[wr_mas->offset_end]; wr_mas->end_piv = mas->last; } } if (!wr_mas->content) { /* If this one is null, the next and prev are not */ mas->index = wr_mas->r_min; } else { /* Check prev slot if we are overwriting the start */ if (mas->index == wr_mas->r_min && mas->offset && !wr_mas->slots[mas->offset - 1]) { mas->offset--; wr_mas->r_min = mas->index = mas_safe_min(mas, wr_mas->pivots, mas->offset); wr_mas->r_max = wr_mas->pivots[mas->offset]; } } } static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) { while ((wr_mas->offset_end < wr_mas->mas->end) && (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end])) wr_mas->offset_end++; if (wr_mas->offset_end < wr_mas->mas->end) wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; else wr_mas->end_piv = wr_mas->mas->max; } static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end = mas->end + 2; new_end -= wr_mas->offset_end - mas->offset; if (wr_mas->r_min == mas->index) new_end--; if (wr_mas->end_piv == mas->last) new_end--; return new_end; } /* * mas_wr_append: Attempt to append * @wr_mas: the maple write state * @new_end: The end of the node after the modification * * This is currently unsafe in rcu mode since the end of the node may be cached * by readers while the node contents may be updated which could result in * inaccurate information. */ static inline void mas_wr_append(struct ma_wr_state *wr_mas, unsigned char new_end) { struct ma_state *mas = wr_mas->mas; void __rcu **slots; unsigned char end = mas->end; if (new_end < mt_pivots[wr_mas->type]) { wr_mas->pivots[new_end] = wr_mas->pivots[end]; ma_set_meta(wr_mas->node, wr_mas->type, 0, new_end); } slots = wr_mas->slots; if (new_end == end + 1) { if (mas->last == wr_mas->r_max) { /* Append to end of range */ rcu_assign_pointer(slots[new_end], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = new_end; } else { /* Append to start of range */ rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end] = mas->last; rcu_assign_pointer(slots[end], wr_mas->entry); } } else { /* Append to the range without touching any boundaries. */ rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end + 1] = mas->last; rcu_assign_pointer(slots[end + 1], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = end + 1; } if (!wr_mas->content || !wr_mas->entry) mas_update_gap(mas); mas->end = new_end; trace_ma_write(__func__, mas, new_end, wr_mas->entry); return; } /* * mas_wr_bnode() - Slow path for a modification. * @wr_mas: The write maple state * * This is where split, rebalance end up. */ static void mas_wr_bnode(struct ma_wr_state *wr_mas) { struct maple_big_node b_node; trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry); memset(&b_node, 0, sizeof(struct maple_big_node)); mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); mas_commit_b_node(wr_mas, &b_node); } /* * mas_wr_store_entry() - Internal call to store a value * @wr_mas: The maple write state */ static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end = mas_wr_new_end(wr_mas); switch (mas->store_type) { case wr_invalid: MT_BUG_ON(mas->tree, 1); return; case wr_new_root: mas_new_root(mas, wr_mas->entry); break; case wr_store_root: mas_store_root(mas, wr_mas->entry); break; case wr_exact_fit: rcu_assign_pointer(wr_mas->slots[mas->offset], wr_mas->entry); if (!!wr_mas->entry ^ !!wr_mas->content) mas_update_gap(mas); break; case wr_append: mas_wr_append(wr_mas, new_end); break; case wr_slot_store: mas_wr_slot_store(wr_mas); break; case wr_node_store: mas_wr_node_store(wr_mas, new_end); break; case wr_spanning_store: mas_wr_spanning_store(wr_mas); break; case wr_split_store: case wr_rebalance: mas_wr_bnode(wr_mas); break; } return; } static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; if (!mas_is_active(mas)) { if (mas_is_start(mas)) goto set_content; if (unlikely(mas_is_paused(mas))) goto reset; if (unlikely(mas_is_none(mas))) goto reset; if (unlikely(mas_is_overflow(mas))) goto reset; if (unlikely(mas_is_underflow(mas))) goto reset; } /* * A less strict version of mas_is_span_wr() where we allow spanning * writes within this node. This is to stop partial walks in * mas_prealloc() from being reset. */ if (mas->last > mas->max) goto reset; if (wr_mas->entry) goto set_content; if (mte_is_leaf(mas->node) && mas->last == mas->max) goto reset; goto set_content; reset: mas_reset(mas); set_content: wr_mas->content = mas_start(mas); } /** * mas_prealloc_calc() - Calculate number of nodes needed for a * given store oepration * @mas: The maple state * @entry: The entry to store into the tree * * Return: Number of nodes required for preallocation. */ static inline int mas_prealloc_calc(struct ma_state *mas, void *entry) { int ret = mas_mt_height(mas) * 3 + 1; switch (mas->store_type) { case wr_invalid: WARN_ON_ONCE(1); break; case wr_new_root: ret = 1; break; case wr_store_root: if (likely((mas->last != 0) || (mas->index != 0))) ret = 1; else if (((unsigned long) (entry) & 3) == 2) ret = 1; else ret = 0; break; case wr_spanning_store: ret = mas_mt_height(mas) * 3 + 1; break; case wr_split_store: ret = mas_mt_height(mas) * 2 + 1; break; case wr_rebalance: ret = mas_mt_height(mas) * 2 - 1; break; case wr_node_store: ret = mt_in_rcu(mas->tree) ? 1 : 0; break; case wr_append: case wr_exact_fit: case wr_slot_store: ret = 0; } return ret; } /* * mas_wr_store_type() - Determine the store type for a given * store operation. * @wr_mas: The maple write state * * Return: the type of store needed for the operation */ static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end; if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) return wr_store_root; if (unlikely(!mas_wr_walk(wr_mas))) return wr_spanning_store; /* At this point, we are at the leaf node that needs to be altered. */ mas_wr_end_piv(wr_mas); if (!wr_mas->entry) mas_wr_extend_null(wr_mas); if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) return wr_exact_fit; if (unlikely(!mas->index && mas->last == ULONG_MAX)) return wr_new_root; new_end = mas_wr_new_end(wr_mas); /* Potential spanning rebalance collapsing a node */ if (new_end < mt_min_slots[wr_mas->type]) { if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK)) return wr_rebalance; return wr_node_store; } if (new_end >= mt_slots[wr_mas->type]) return wr_split_store; if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end)) return wr_append; if ((new_end == mas->end) && (!mt_in_rcu(mas->tree) || (wr_mas->offset_end - mas->offset == 1))) return wr_slot_store; return wr_node_store; } /** * mas_wr_preallocate() - Preallocate enough nodes for a store operation * @wr_mas: The maple write state * @entry: The entry that will be stored * */ static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry) { struct ma_state *mas = wr_mas->mas; int request; mas_wr_prealloc_setup(wr_mas); mas->store_type = mas_wr_store_type(wr_mas); request = mas_prealloc_calc(mas, entry); if (!request) return; mas_node_count(mas, request); } /** * mas_insert() - Internal call to insert a value * @mas: The maple state * @entry: The entry to store * * Return: %NULL or the contents that already exists at the requested index * otherwise. The maple state needs to be checked for error conditions. */ static inline void *mas_insert(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); /* * Inserting a new range inserts either 0, 1, or 2 pivots within the * tree. If the insert fits exactly into an existing gap with a value * of NULL, then the slot only needs to be written with the new value. * If the range being inserted is adjacent to another range, then only a * single pivot needs to be inserted (as well as writing the entry). If * the new range is within a gap but does not touch any other ranges, * then two pivots need to be inserted: the start - 1, and the end. As * usual, the entry must be written. Most operations require a new node * to be allocated and replace an existing node to ensure RCU safety, * when in RCU mode. The exception to requiring a newly allocated node * is when inserting at the end of a node (appending). When done * carefully, appending can reuse the node in place. */ wr_mas.content = mas_start(mas); if (wr_mas.content) goto exists; mas_wr_preallocate(&wr_mas, entry); if (mas_is_err(mas)) return NULL; /* spanning writes always overwrite something */ if (mas->store_type == wr_spanning_store) goto exists; /* At this point, we are at the leaf node that needs to be altered. */ if (mas->store_type != wr_new_root && mas->store_type != wr_store_root) { wr_mas.offset_end = mas->offset; wr_mas.end_piv = wr_mas.r_max; if (wr_mas.content || (mas->last > wr_mas.r_max)) goto exists; } mas_wr_store_entry(&wr_mas); return wr_mas.content; exists: mas_set_err(mas, -EEXIST); return wr_mas.content; } /** * mas_alloc_cyclic() - Internal call to find somewhere to store an entry * @mas: The maple state. * @startp: Pointer to ID. * @range_lo: Lower bound of range to search. * @range_hi: Upper bound of range to search. * @entry: The entry to store. * @next: Pointer to next ID to allocate. * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 if the allocation succeeded without wrapping, 1 if the * allocation succeeded after wrapping, or -EBUSY if there are no * free entries. */ int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp) { unsigned long min = range_lo; int ret = 0; range_lo = max(min, *next); ret = mas_empty_area(mas, range_lo, range_hi, 1); if ((mas->tree->ma_flags & MT_FLAGS_ALLOC_WRAPPED) && ret == 0) { mas->tree->ma_flags &= ~MT_FLAGS_ALLOC_WRAPPED; ret = 1; } if (ret < 0 && range_lo > min) { mas_reset(mas); ret = mas_empty_area(mas, min, range_hi, 1); if (ret == 0) ret = 1; } if (ret < 0) return ret; do { mas_insert(mas, entry); } while (mas_nomem(mas, gfp)); if (mas_is_err(mas)) return xa_err(mas->node); *startp = mas->index; *next = *startp + 1; if (*next == 0) mas->tree->ma_flags |= MT_FLAGS_ALLOC_WRAPPED; mas_destroy(mas); return ret; } EXPORT_SYMBOL(mas_alloc_cyclic); static __always_inline void mas_rewalk(struct ma_state *mas, unsigned long index) { retry: mas_set(mas, index); mas_state_walk(mas); if (mas_is_start(mas)) goto retry; } static __always_inline bool mas_rewalk_if_dead(struct ma_state *mas, struct maple_node *node, const unsigned long index) { if (unlikely(ma_dead_node(node))) { mas_rewalk(mas, index); return true; } return false; } /* * mas_prev_node() - Find the prev non-null entry at the same level in the * tree. The prev value will be mas->node[mas->offset] or the status will be * ma_none. * @mas: The maple state * @min: The lower limit to search * * The prev node value will be mas->node[mas->offset] or the status will be * ma_none. * Return: 1 if the node is dead, 0 otherwise. */ static int mas_prev_node(struct ma_state *mas, unsigned long min) { enum maple_type mt; int offset, level; void __rcu **slots; struct maple_node *node; unsigned long *pivots; unsigned long max; node = mas_mn(mas); if (!mas->min) goto no_entry; max = mas->min - 1; if (max < min) goto no_entry; level = 0; do { if (ma_is_root(node)) goto no_entry; /* Walk up. */ if (unlikely(mas_ascend(mas))) return 1; offset = mas->offset; level++; node = mas_mn(mas); } while (!offset); offset--; mt = mte_node_type(mas->node); while (level > 1) { level--; slots = ma_slots(node, mt); mas->node = mas_slot(mas, slots, offset); if (unlikely(ma_dead_node(node))) return 1; mt = mte_node_type(mas->node); node = mas_mn(mas); pivots = ma_pivots(node, mt); offset = ma_data_end(node, mt, pivots, max); if (unlikely(ma_dead_node(node))) return 1; } slots = ma_slots(node, mt); mas->node = mas_slot(mas, slots, offset); pivots = ma_pivots(node, mt); if (unlikely(ma_dead_node(node))) return 1; if (likely(offset)) mas->min = pivots[offset - 1] + 1; mas->max = max; mas->offset = mas_data_end(mas); if (unlikely(mte_dead_node(mas->node))) return 1; mas->end = mas->offset; return 0; no_entry: if (unlikely(ma_dead_node(node))) return 1; mas->status = ma_underflow; return 0; } /* * mas_prev_slot() - Get the entry in the previous slot * * @mas: The maple state * @min: The minimum starting range * @empty: Can be empty * * Return: The entry in the previous slot which is possibly NULL */ static void *mas_prev_slot(struct ma_state *mas, unsigned long min, bool empty) { void *entry; void __rcu **slots; unsigned long pivot; enum maple_type type; unsigned long *pivots; struct maple_node *node; unsigned long save_point = mas->index; retry: node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (mas->min <= min) { pivot = mas_safe_min(mas, pivots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (pivot <= min) goto underflow; } again: if (likely(mas->offset)) { mas->offset--; mas->last = mas->index - 1; mas->index = mas_safe_min(mas, pivots, mas->offset); } else { if (mas->index <= min) goto underflow; if (mas_prev_node(mas, min)) { mas_rewalk(mas, save_point); goto retry; } if (WARN_ON_ONCE(mas_is_underflow(mas))) return NULL; mas->last = mas->max; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); mas->index = pivots[mas->offset - 1] + 1; } slots = ma_slots(node, type); entry = mas_slot(mas, slots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (likely(entry)) return entry; if (!empty) { if (mas->index <= min) { mas->status = ma_underflow; return NULL; } goto again; } return entry; underflow: mas->status = ma_underflow; return NULL; } /* * mas_next_node() - Get the next node at the same level in the tree. * @mas: The maple state * @node: The maple node * @max: The maximum pivot value to check. * * The next value will be mas->node[mas->offset] or the status will have * overflowed. * Return: 1 on dead node, 0 otherwise. */ static int mas_next_node(struct ma_state *mas, struct maple_node *node, unsigned long max) { unsigned long min; unsigned long *pivots; struct maple_enode *enode; struct maple_node *tmp; int level = 0; unsigned char node_end; enum maple_type mt; void __rcu **slots; if (mas->max >= max) goto overflow; min = mas->max + 1; level = 0; do { if (ma_is_root(node)) goto overflow; /* Walk up. */ if (unlikely(mas_ascend(mas))) return 1; level++; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); node_end = ma_data_end(node, mt, pivots, mas->max); if (unlikely(ma_dead_node(node))) return 1; } while (unlikely(mas->offset == node_end)); slots = ma_slots(node, mt); mas->offset++; enode = mas_slot(mas, slots, mas->offset); if (unlikely(ma_dead_node(node))) return 1; if (level > 1) mas->offset = 0; while (unlikely(level > 1)) { level--; mas->node = enode; node = mas_mn(mas); mt = mte_node_type(mas->node); slots = ma_slots(node, mt); enode = mas_slot(mas, slots, 0); if (unlikely(ma_dead_node(node))) return 1; } if (!mas->offset) pivots = ma_pivots(node, mt); mas->max = mas_safe_pivot(mas, pivots, mas->offset, mt); tmp = mte_to_node(enode); mt = mte_node_type(enode); pivots = ma_pivots(tmp, mt); mas->end = ma_data_end(tmp, mt, pivots, mas->max); if (unlikely(ma_dead_node(node))) return 1; mas->node = enode; mas->min = min; return 0; overflow: if (unlikely(ma_dead_node(node))) return 1; mas->status = ma_overflow; return 0; } /* * mas_next_slot() - Get the entry in the next slot * * @mas: The maple state * @max: The maximum starting range * @empty: Can be empty * * Return: The entry in the next slot which is possibly NULL */ static void *mas_next_slot(struct ma_state *mas, unsigned long max, bool empty) { void __rcu **slots; unsigned long *pivots; unsigned long pivot; enum maple_type type; struct maple_node *node; unsigned long save_point = mas->last; void *entry; retry: node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (mas->max >= max) { if (likely(mas->offset < mas->end)) pivot = pivots[mas->offset]; else pivot = mas->max; if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (pivot >= max) { /* Was at the limit, next will extend beyond */ mas->status = ma_overflow; return NULL; } } if (likely(mas->offset < mas->end)) { mas->index = pivots[mas->offset] + 1; again: mas->offset++; if (likely(mas->offset < mas->end)) mas->last = pivots[mas->offset]; else mas->last = mas->max; } else { if (mas->last >= max) { mas->status = ma_overflow; return NULL; } if (mas_next_node(mas, node, max)) { mas_rewalk(mas, save_point); goto retry; } if (WARN_ON_ONCE(mas_is_overflow(mas))) return NULL; mas->offset = 0; mas->index = mas->min; node = mas_mn(mas); type = mte_node_type(mas->node); pivots = ma_pivots(node, type); mas->last = pivots[0]; } slots = ma_slots(node, type); entry = mt_slot(mas->tree, slots, mas->offset); if (unlikely(mas_rewalk_if_dead(mas, node, save_point))) goto retry; if (entry) return entry; if (!empty) { if (mas->last >= max) { mas->status = ma_overflow; return NULL; } mas->index = mas->last + 1; goto again; } return entry; } /* * mas_rev_awalk() - Internal function. Reverse allocation walk. Find the * highest gap address of a given size in a given node and descend. * @mas: The maple state * @size: The needed size. * * Return: True if found in a leaf, false otherwise. * */ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size, unsigned long *gap_min, unsigned long *gap_max) { enum maple_type type = mte_node_type(mas->node); struct maple_node *node = mas_mn(mas); unsigned long *pivots, *gaps; void __rcu **slots; unsigned long gap = 0; unsigned long max, min; unsigned char offset; if (unlikely(mas_is_err(mas))) return true; if (ma_is_dense(type)) { /* dense nodes. */ mas->offset = (unsigned char)(mas->index - mas->min); return true; } pivots = ma_pivots(node, type); slots = ma_slots(node, type); gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); /* Skip out of bounds. */ while (mas->last < min) min = mas_safe_min(mas, pivots, --offset); max = mas_safe_pivot(mas, pivots, offset, type); while (mas->index <= max) { gap = 0; if (gaps) gap = gaps[offset]; else if (!mas_slot(mas, slots, offset)) gap = max - min + 1; if (gap) { if ((size <= gap) && (size <= mas->last - min + 1)) break; if (!gaps) { /* Skip the next slot, it cannot be a gap. */ if (offset < 2) goto ascend; offset -= 2; max = pivots[offset]; min = mas_safe_min(mas, pivots, offset); continue; } } if (!offset) goto ascend; offset--; max = min - 1; min = mas_safe_min(mas, pivots, offset); } if (unlikely((mas->index > max) || (size - 1 > max - mas->index))) goto no_space; if (unlikely(ma_is_leaf(type))) { mas->offset = offset; *gap_min = min; *gap_max = min + gap - 1; return true; } /* descend, only happens under lock. */ mas->node = mas_slot(mas, slots, offset); mas->min = min; mas->max = max; mas->offset = mas_data_end(mas); return false; ascend: if (!mte_is_root(mas->node)) return false; no_space: mas_set_err(mas, -EBUSY); return false; } static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) { enum maple_type type = mte_node_type(mas->node); unsigned long pivot, min, gap = 0; unsigned char offset, data_end; unsigned long *gaps, *pivots; void __rcu **slots; struct maple_node *node; bool found = false; if (ma_is_dense(type)) { mas->offset = (unsigned char)(mas->index - mas->min); return true; } node = mas_mn(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); gaps = ma_gaps(node, type); offset = mas->offset; min = mas_safe_min(mas, pivots, offset); data_end = ma_data_end(node, type, pivots, mas->max); for (; offset <= data_end; offset++) { pivot = mas_safe_pivot(mas, pivots, offset, type); /* Not within lower bounds */ if (mas->index > pivot) goto next_slot; if (gaps) gap = gaps[offset]; else if (!mas_slot(mas, slots, offset)) gap = min(pivot, mas->last) - max(mas->index, min) + 1; else goto next_slot; if (gap >= size) { if (ma_is_leaf(type)) { found = true; break; } mas->node = mas_slot(mas, slots, offset); mas->min = min; mas->max = pivot; offset = 0; break; } next_slot: min = pivot + 1; if (mas->last <= pivot) { mas_set_err(mas, -EBUSY); return true; } } mas->offset = offset; return found; } /** * mas_walk() - Search for @mas->index in the tree. * @mas: The maple state. * * mas->index and mas->last will be set to the range if there is a value. If * mas->status is ma_none, reset to ma_start * * Return: the entry at the location or %NULL. */ void *mas_walk(struct ma_state *mas) { void *entry; if (!mas_is_active(mas) || !mas_is_start(mas)) mas->status = ma_start; retry: entry = mas_state_walk(mas); if (mas_is_start(mas)) { goto retry; } else if (mas_is_none(mas)) { mas->index = 0; mas->last = ULONG_MAX; } else if (mas_is_ptr(mas)) { if (!mas->index) { mas->last = 0; return entry; } mas->index = 1; mas->last = ULONG_MAX; mas->status = ma_none; return NULL; } return entry; } EXPORT_SYMBOL_GPL(mas_walk); static inline bool mas_rewind_node(struct ma_state *mas) { unsigned char slot; do { if (mte_is_root(mas->node)) { slot = mas->offset; if (!slot) return false; } else { mas_ascend(mas); slot = mas->offset; } } while (!slot); mas->offset = --slot; return true; } /* * mas_skip_node() - Internal function. Skip over a node. * @mas: The maple state. * * Return: true if there is another node, false otherwise. */ static inline bool mas_skip_node(struct ma_state *mas) { if (mas_is_err(mas)) return false; do { if (mte_is_root(mas->node)) { if (mas->offset >= mas_data_end(mas)) { mas_set_err(mas, -EBUSY); return false; } } else { mas_ascend(mas); } } while (mas->offset >= mas_data_end(mas)); mas->offset++; return true; } /* * mas_awalk() - Allocation walk. Search from low address to high, for a gap of * @size * @mas: The maple state * @size: The size of the gap required * * Search between @mas->index and @mas->last for a gap of @size. */ static inline void mas_awalk(struct ma_state *mas, unsigned long size) { struct maple_enode *last = NULL; /* * There are 4 options: * go to child (descend) * go back to parent (ascend) * no gap found. (return, error == -EBUSY) * found the gap. (return) */ while (!mas_is_err(mas) && !mas_anode_descend(mas, size)) { if (last == mas->node) mas_skip_node(mas); else last = mas->node; } } /* * mas_sparse_area() - Internal function. Return upper or lower limit when * searching for a gap in an empty tree. * @mas: The maple state * @min: the minimum range * @max: The maximum range * @size: The size of the gap * @fwd: Searching forward or back */ static inline int mas_sparse_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size, bool fwd) { if (!unlikely(mas_is_none(mas)) && min == 0) { min++; /* * At this time, min is increased, we need to recheck whether * the size is satisfied. */ if (min > max || max - min + 1 < size) return -EBUSY; } /* mas_is_ptr */ if (fwd) { mas->index = min; mas->last = min + size - 1; } else { mas->last = max; mas->index = max - size + 1; } return 0; } /* * mas_empty_area() - Get the lowest address within the range that is * sufficient for the size requested. * @mas: The maple state * @min: The lowest value of the range * @max: The highest value of the range * @size: The size needed */ int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size) { unsigned char offset; unsigned long *pivots; enum maple_type mt; struct maple_node *node; if (min > max) return -EINVAL; if (size == 0 || max - min < size - 1) return -EINVAL; if (mas_is_start(mas)) mas_start(mas); else if (mas->offset >= 2) mas->offset -= 2; else if (!mas_skip_node(mas)) return -EBUSY; /* Empty set */ if (mas_is_none(mas) || mas_is_ptr(mas)) return mas_sparse_area(mas, min, max, size, true); /* The start of the window can only be within these values */ mas->index = min; mas->last = max; mas_awalk(mas, size); if (unlikely(mas_is_err(mas))) return xa_err(mas->node); offset = mas->offset; node = mas_mn(mas); mt = mte_node_type(mas->node); pivots = ma_pivots(node, mt); min = mas_safe_min(mas, pivots, offset); if (mas->index < min) mas->index = min; mas->last = mas->index + size - 1; mas->end = ma_data_end(node, mt, pivots, mas->max); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area); /* * mas_empty_area_rev() - Get the highest address within the range that is * sufficient for the size requested. * @mas: The maple state * @min: The lowest value of the range * @max: The highest value of the range * @size: The size needed */ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size) { struct maple_enode *last = mas->node; if (min > max) return -EINVAL; if (size == 0 || max - min < size - 1) return -EINVAL; if (mas_is_start(mas)) mas_start(mas); else if ((mas->offset < 2) && (!mas_rewind_node(mas))) return -EBUSY; if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) return mas_sparse_area(mas, min, max, size, false); else if (mas->offset >= 2) mas->offset -= 2; else mas->offset = mas_data_end(mas); /* The start of the window can only be within these values. */ mas->index = min; mas->last = max; while (!mas_rev_awalk(mas, size, &min, &max)) { if (last == mas->node) { if (!mas_rewind_node(mas)) return -EBUSY; } else { last = mas->node; } } if (mas_is_err(mas)) return xa_err(mas->node); if (unlikely(mas->offset == MAPLE_NODE_SLOTS)) return -EBUSY; /* Trim the upper limit to the max. */ if (max < mas->last) mas->last = max; mas->index = mas->last - size + 1; mas->end = mas_data_end(mas); return 0; } EXPORT_SYMBOL_GPL(mas_empty_area_rev); /* * mte_dead_leaves() - Mark all leaves of a node as dead. * @enode: the encoded node * @mt: the maple tree * @slots: Pointer to the slot array * * Must hold the write lock. * * Return: The number of leaves marked as dead. */ static inline unsigned char mte_dead_leaves(struct maple_enode *enode, struct maple_tree *mt, void __rcu **slots) { struct maple_node *node; enum maple_type type; void *entry; int offset; for (offset = 0; offset < mt_slot_count(enode); offset++) { entry = mt_slot(mt, slots, offset); type = mte_node_type(entry); node = mte_to_node(entry); /* Use both node and type to catch LE & BE metadata */ if (!node || !type) break; mte_set_node_dead(entry); node->type = type; rcu_assign_pointer(slots[offset], node); } return offset; } /** * mte_dead_walk() - Walk down a dead tree to just before the leaves * @enode: The maple encoded node * @offset: The starting offset * * Note: This can only be used from the RCU callback context. */ static void __rcu **mte_dead_walk(struct maple_enode **enode, unsigned char offset) { struct maple_node *node, *next; void __rcu **slots = NULL; next = mte_to_node(*enode); do { *enode = ma_enode_ptr(next); node = mte_to_node(*enode); slots = ma_slots(node, node->type); next = rcu_dereference_protected(slots[offset], lock_is_held(&rcu_callback_map)); offset = 0; } while (!ma_is_leaf(next->type)); return slots; } /** * mt_free_walk() - Walk & free a tree in the RCU callback context * @head: The RCU head that's within the node. * * Note: This can only be used from the RCU callback context. */ static void mt_free_walk(struct rcu_head *head) { void __rcu **slots; struct maple_node *node, *start; struct maple_enode *enode; unsigned char offset; enum maple_type type; node = container_of(head, struct maple_node, rcu); if (ma_is_leaf(node->type)) goto free_leaf; start = node; enode = mt_mk_node(node, node->type); slots = mte_dead_walk(&enode, 0); node = mte_to_node(enode); do { mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; enode = node->piv_parent; if (mte_to_node(enode) == node) goto free_leaf; type = mte_node_type(enode); slots = ma_slots(mte_to_node(enode), type); if ((offset < mt_slots[type]) && rcu_dereference_protected(slots[offset], lock_is_held(&rcu_callback_map))) slots = mte_dead_walk(&enode, offset); node = mte_to_node(enode); } while ((node != start) || (node->slot_len < offset)); slots = ma_slots(node, node->type); mt_free_bulk(node->slot_len, slots); free_leaf: mt_free_rcu(&node->rcu); } static inline void __rcu **mte_destroy_descend(struct maple_enode **enode, struct maple_tree *mt, struct maple_enode *prev, unsigned char offset) { struct maple_node *node; struct maple_enode *next = *enode; void __rcu **slots = NULL; enum maple_type type; unsigned char next_offset = 0; do { *enode = next; node = mte_to_node(*enode); type = mte_node_type(*enode); slots = ma_slots(node, type); next = mt_slot_locked(mt, slots, next_offset); if ((mte_dead_node(next))) next = mt_slot_locked(mt, slots, ++next_offset); mte_set_node_dead(*enode); node->type = type; node->piv_parent = prev; node->parent_slot = offset; offset = next_offset; next_offset = 0; prev = *enode; } while (!mte_is_leaf(next)); return slots; } static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free) { void __rcu **slots; struct maple_node *node = mte_to_node(enode); struct maple_enode *start; if (mte_is_leaf(enode)) { node->type = mte_node_type(enode); goto free_leaf; } start = enode; slots = mte_destroy_descend(&enode, mt, start, 0); node = mte_to_node(enode); // Updated in the above call. do { enum maple_type type; unsigned char offset; struct maple_enode *parent, *tmp; node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); offset = node->parent_slot + 1; enode = node->piv_parent; if (mte_to_node(enode) == node) goto free_leaf; type = mte_node_type(enode); slots = ma_slots(mte_to_node(enode), type); if (offset >= mt_slots[type]) goto next; tmp = mt_slot_locked(mt, slots, offset); if (mte_node_type(tmp) && mte_to_node(tmp)) { parent = enode; enode = tmp; slots = mte_destroy_descend(&enode, mt, parent, offset); } next: node = mte_to_node(enode); } while (start != enode); node = mte_to_node(enode); node->slot_len = mte_dead_leaves(enode, mt, slots); if (free) mt_free_bulk(node->slot_len, slots); free_leaf: if (free) mt_free_rcu(&node->rcu); else mt_clear_meta(mt, node, node->type); } /* * mte_destroy_walk() - Free a tree or sub-tree. * @enode: the encoded maple node (maple_enode) to start * @mt: the tree to free - needed for node types. * * Must hold the write lock. */ static inline void mte_destroy_walk(struct maple_enode *enode, struct maple_tree *mt) { struct maple_node *node = mte_to_node(enode); if (mt_in_rcu(mt)) { mt_destroy_walk(enode, mt, false); call_rcu(&node->rcu, mt_free_walk); } else { mt_destroy_walk(enode, mt, true); } } /* Interface */ /** * mas_store() - Store an @entry. * @mas: The maple state. * @entry: The entry to store. * * The @mas->index and @mas->last is used to set the range for the @entry. * * Return: the first entry between mas->index and mas->last or %NULL. */ void *mas_store(struct ma_state *mas, void *entry) { int request; MA_WR_STATE(wr_mas, mas, entry); trace_ma_write(__func__, mas, 0, entry); #ifdef CONFIG_DEBUG_MAPLE_TREE if (MAS_WARN_ON(mas, mas->index > mas->last)) pr_err("Error %lX > %lX " PTR_FMT "\n", mas->index, mas->last, entry); if (mas->index > mas->last) { mas_set_err(mas, -EINVAL); return NULL; } #endif /* * Storing is the same operation as insert with the added caveat that it * can overwrite entries. Although this seems simple enough, one may * want to examine what happens if a single store operation was to * overwrite multiple entries within a self-balancing B-Tree. */ mas_wr_prealloc_setup(&wr_mas); mas->store_type = mas_wr_store_type(&wr_mas); if (mas->mas_flags & MA_STATE_PREALLOC) { mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); return wr_mas.content; } request = mas_prealloc_calc(mas, entry); if (!request) goto store; mas_node_count(mas, request); if (mas_is_err(mas)) return NULL; store: mas_wr_store_entry(&wr_mas); mas_destroy(mas); return wr_mas.content; } EXPORT_SYMBOL_GPL(mas_store); /** * mas_store_gfp() - Store a value into the tree. * @mas: The maple state * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations if necessary. * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp) { unsigned long index = mas->index; unsigned long last = mas->last; MA_WR_STATE(wr_mas, mas, entry); int ret = 0; retry: mas_wr_preallocate(&wr_mas, entry); if (unlikely(mas_nomem(mas, gfp))) { if (!entry) __mas_set_range(mas, index, last); goto retry; } if (mas_is_err(mas)) { ret = xa_err(mas->node); goto out; } mas_wr_store_entry(&wr_mas); out: mas_destroy(mas); return ret; } EXPORT_SYMBOL_GPL(mas_store_gfp); /** * mas_store_prealloc() - Store a value into the tree using memory * preallocated in the maple state. * @mas: The maple state * @entry: The entry to store. */ void mas_store_prealloc(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); if (mas->store_type == wr_store_root) { mas_wr_prealloc_setup(&wr_mas); goto store; } mas_wr_walk_descend(&wr_mas); if (mas->store_type != wr_spanning_store) { /* set wr_mas->content to current slot */ wr_mas.content = mas_slot_locked(mas, wr_mas.slots, mas->offset); mas_wr_end_piv(&wr_mas); } store: trace_ma_write(__func__, mas, 0, entry); mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); mas_destroy(mas); } EXPORT_SYMBOL_GPL(mas_store_prealloc); /** * mas_preallocate() - Preallocate enough nodes for a store operation * @mas: The maple state * @entry: The entry that will be stored * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { MA_WR_STATE(wr_mas, mas, entry); int ret = 0; int request; mas_wr_prealloc_setup(&wr_mas); mas->store_type = mas_wr_store_type(&wr_mas); request = mas_prealloc_calc(mas, entry); if (!request) return ret; mas_node_count_gfp(mas, request, gfp); if (mas_is_err(mas)) { mas_set_alloc_req(mas, 0); ret = xa_err(mas->node); mas_destroy(mas); mas_reset(mas); return ret; } mas->mas_flags |= MA_STATE_PREALLOC; return ret; } EXPORT_SYMBOL_GPL(mas_preallocate); /* * mas_destroy() - destroy a maple state. * @mas: The maple state * * Upon completion, check the left-most node and rebalance against the node to * the right if necessary. Frees any allocated nodes associated with this maple * state. */ void mas_destroy(struct ma_state *mas) { struct maple_alloc *node; unsigned long total; /* * When using mas_for_each() to insert an expected number of elements, * it is possible that the number inserted is less than the expected * number. To fix an invalid final node, a check is performed here to * rebalance the previous node with the final node. */ if (mas->mas_flags & MA_STATE_REBALANCE) { unsigned char end; if (mas_is_err(mas)) mas_reset(mas); mas_start(mas); mtree_range_walk(mas); end = mas->end + 1; if (end < mt_min_slot_count(mas->node) - 1) mas_destroy_rebalance(mas, end); mas->mas_flags &= ~MA_STATE_REBALANCE; } mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC); total = mas_allocated(mas); while (total) { node = mas->alloc; mas->alloc = node->slot[0]; if (node->node_count > 1) { size_t count = node->node_count - 1; mt_free_bulk(count, (void __rcu **)&node->slot[1]); total -= count; } mt_free_one(ma_mnode_ptr(node)); total--; } mas->alloc = NULL; } EXPORT_SYMBOL_GPL(mas_destroy); /* * mas_expected_entries() - Set the expected number of entries that will be inserted. * @mas: The maple state * @nr_entries: The number of expected entries. * * This will attempt to pre-allocate enough nodes to store the expected number * of entries. The allocations will occur using the bulk allocator interface * for speed. Please call mas_destroy() on the @mas after inserting the entries * to ensure any unused nodes are freed. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries) { int nonleaf_cap = MAPLE_ARANGE64_SLOTS - 2; struct maple_enode *enode = mas->node; int nr_nodes; int ret; /* * Sometimes it is necessary to duplicate a tree to a new tree, such as * forking a process and duplicating the VMAs from one tree to a new * tree. When such a situation arises, it is known that the new tree is * not going to be used until the entire tree is populated. For * performance reasons, it is best to use a bulk load with RCU disabled. * This allows for optimistic splitting that favours the left and reuse * of nodes during the operation. */ /* Optimize splitting for bulk insert in-order */ mas->mas_flags |= MA_STATE_BULK; /* * Avoid overflow, assume a gap between each entry and a trailing null. * If this is wrong, it just means allocation can happen during * insertion of entries. */ nr_nodes = max(nr_entries, nr_entries * 2 + 1); if (!mt_is_alloc(mas->tree)) nonleaf_cap = MAPLE_RANGE64_SLOTS - 2; /* Leaves; reduce slots to keep space for expansion */ nr_nodes = DIV_ROUND_UP(nr_nodes, MAPLE_RANGE64_SLOTS - 2); /* Internal nodes */ nr_nodes += DIV_ROUND_UP(nr_nodes, nonleaf_cap); /* Add working room for split (2 nodes) + new parents */ mas_node_count_gfp(mas, nr_nodes + 3, GFP_KERNEL); /* Detect if allocations run out */ mas->mas_flags |= MA_STATE_PREALLOC; if (!mas_is_err(mas)) return 0; ret = xa_err(mas->node); mas->node = enode; mas_destroy(mas); return ret; } EXPORT_SYMBOL_GPL(mas_expected_entries); static bool mas_next_setup(struct ma_state *mas, unsigned long max, void **entry) { bool was_none = mas_is_none(mas); if (unlikely(mas->last >= max)) { mas->status = ma_overflow; return true; } switch (mas->status) { case ma_active: return false; case ma_none: fallthrough; case ma_pause: mas->status = ma_start; fallthrough; case ma_start: mas_walk(mas); /* Retries on dead nodes handled by mas_walk */ break; case ma_overflow: /* Overflowed before, but the max changed */ mas->status = ma_active; break; case ma_underflow: /* The user expects the mas to be one before where it is */ mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (likely(mas_is_active(mas))) /* Fast path */ return false; if (mas_is_ptr(mas)) { *entry = NULL; if (was_none && mas->index == 0) { mas->index = mas->last = 0; return true; } mas->index = 1; mas->last = ULONG_MAX; mas->status = ma_none; return true; } if (mas_is_none(mas)) return true; return false; } /** * mas_next() - Get the next entry. * @mas: The maple state * @max: The maximum index to check. * * Returns the next entry after @mas->index. * Must hold rcu_read_lock or the write lock. * Can return the zero entry. * * Return: The next entry or %NULL */ void *mas_next(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_next_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, false); } EXPORT_SYMBOL_GPL(mas_next); /** * mas_next_range() - Advance the maple state to the next range * @mas: The maple state * @max: The maximum index to check. * * Sets @mas->index and @mas->last to the range. * Must hold rcu_read_lock or the write lock. * Can return the zero entry. * * Return: The next entry or %NULL */ void *mas_next_range(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_next_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_next_range); /** * mt_next() - get the next value in the maple tree * @mt: The maple tree * @index: The start index * @max: The maximum index to check * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * Return: The entry higher than @index or %NULL if nothing is found. */ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max) { void *entry = NULL; MA_STATE(mas, mt, index, index); rcu_read_lock(); entry = mas_next(&mas, max); rcu_read_unlock(); return entry; } EXPORT_SYMBOL_GPL(mt_next); static bool mas_prev_setup(struct ma_state *mas, unsigned long min, void **entry) { if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } switch (mas->status) { case ma_active: return false; case ma_start: break; case ma_none: fallthrough; case ma_pause: mas->status = ma_start; break; case ma_underflow: /* underflowed before but the min changed */ mas->status = ma_active; break; case ma_overflow: /* User expects mas to be one after where it is */ mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) mas_walk(mas); if (unlikely(mas_is_ptr(mas))) { if (!mas->index) { mas->status = ma_none; return true; } mas->index = mas->last = 0; *entry = mas_root(mas); return true; } if (mas_is_none(mas)) { if (mas->index) { /* Walked to out-of-range pointer? */ mas->index = mas->last = 0; mas->status = ma_root; *entry = mas_root(mas); return true; } return true; } return false; } /** * mas_prev() - Get the previous entry * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * Will reset mas to ma_start if the status is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. */ void *mas_prev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_prev_setup(mas, min, &entry)) return entry; return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_prev); /** * mas_prev_range() - Advance to the previous range * @mas: The maple state * @min: The minimum value to check. * * Sets @mas->index and @mas->last to the range. * Must hold rcu_read_lock or the write lock. * Will reset mas to ma_start if the node is ma_none. Will stop on not * searchable nodes. * * Return: the previous value or %NULL. */ void *mas_prev_range(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_prev_setup(mas, min, &entry)) return entry; return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_prev_range); /** * mt_prev() - get the previous value in the maple tree * @mt: The maple tree * @index: The start index * @min: The minimum index to check * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * Return: The entry before @index or %NULL if nothing is found. */ void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min) { void *entry = NULL; MA_STATE(mas, mt, index, index); rcu_read_lock(); entry = mas_prev(&mas, min); rcu_read_unlock(); return entry; } EXPORT_SYMBOL_GPL(mt_prev); /** * mas_pause() - Pause a mas_find/mas_for_each to drop the lock. * @mas: The maple state to pause * * Some users need to pause a walk and drop the lock they're holding in * order to yield to a higher priority thread or carry out an operation * on an entry. Those users should call this function before they drop * the lock. It resets the @mas to be suitable for the next iteration * of the loop after the user has reacquired the lock. If most entries * found during a walk require you to call mas_pause(), the mt_for_each() * iterator may be more appropriate. * */ void mas_pause(struct ma_state *mas) { mas->status = ma_pause; mas->node = NULL; } EXPORT_SYMBOL_GPL(mas_pause); /** * mas_find_setup() - Internal function to set up mas_find*(). * @mas: The maple state * @max: The maximum index * @entry: Pointer to the entry * * Returns: True if entry is the answer, false otherwise. */ static __always_inline bool mas_find_setup(struct ma_state *mas, unsigned long max, void **entry) { switch (mas->status) { case ma_active: if (mas->last < max) return false; return true; case ma_start: break; case ma_pause: if (unlikely(mas->last >= max)) return true; mas->index = ++mas->last; mas->status = ma_start; break; case ma_none: if (unlikely(mas->last >= max)) return true; mas->index = mas->last; mas->status = ma_start; break; case ma_underflow: /* mas is pointing at entry before unable to go lower */ if (unlikely(mas->index >= max)) { mas->status = ma_overflow; return true; } mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_overflow: if (unlikely(mas->last >= max)) return true; mas->status = ma_active; *entry = mas_walk(mas); if (*entry) return true; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) { /* First run or continue */ if (mas->index > max) return true; *entry = mas_walk(mas); if (*entry) return true; } if (unlikely(mas_is_ptr(mas))) goto ptr_out_of_range; if (unlikely(mas_is_none(mas))) return true; if (mas->index == max) return true; return false; ptr_out_of_range: mas->status = ma_none; mas->index = 1; mas->last = ULONG_MAX; return true; } /** * mas_find() - On the first call, find the entry at or after mas->index up to * %max. Otherwise, find the entry after mas->index. * @mas: The maple state * @max: The maximum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ void *mas_find(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_find_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ entry = mas_next_slot(mas, max, false); /* Ignore overflow */ mas->status = ma_active; return entry; } EXPORT_SYMBOL_GPL(mas_find); /** * mas_find_range() - On the first call, find the entry at or after * mas->index up to %max. Otherwise, advance to the next slot mas->index. * @mas: The maple state * @max: The maximum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_overflow. * * Return: The entry or %NULL. */ void *mas_find_range(struct ma_state *mas, unsigned long max) { void *entry = NULL; if (mas_find_setup(mas, max, &entry)) return entry; /* Retries on dead nodes handled by mas_next_slot */ return mas_next_slot(mas, max, true); } EXPORT_SYMBOL_GPL(mas_find_range); /** * mas_find_rev_setup() - Internal function to set up mas_find_*_rev() * @mas: The maple state * @min: The minimum index * @entry: Pointer to the entry * * Returns: True if entry is the answer, false otherwise. */ static bool mas_find_rev_setup(struct ma_state *mas, unsigned long min, void **entry) { switch (mas->status) { case ma_active: goto active; case ma_start: break; case ma_pause: if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } mas->last = --mas->index; mas->status = ma_start; break; case ma_none: if (mas->index <= min) goto none; mas->last = mas->index; mas->status = ma_start; break; case ma_overflow: /* user expects the mas to be one after where it is */ if (unlikely(mas->index <= min)) { mas->status = ma_underflow; return true; } mas->status = ma_active; break; case ma_underflow: /* user expects the mas to be one before where it is */ if (unlikely(mas->index <= min)) return true; mas->status = ma_active; break; case ma_root: break; case ma_error: return true; } if (mas_is_start(mas)) { /* First run or continue */ if (mas->index < min) return true; *entry = mas_walk(mas); if (*entry) return true; } if (unlikely(mas_is_ptr(mas))) goto none; if (unlikely(mas_is_none(mas))) { /* * Walked to the location, and there was nothing so the previous * location is 0. */ mas->last = mas->index = 0; mas->status = ma_root; *entry = mas_root(mas); return true; } active: if (mas->index < min) return true; return false; none: mas->status = ma_none; return true; } /** * mas_find_rev: On the first call, find the first non-null entry at or below * mas->index down to %min. Otherwise find the first non-null entry below * mas->index down to %min. * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ void *mas_find_rev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_find_rev_setup(mas, min, &entry)) return entry; /* Retries on dead nodes handled by mas_prev_slot */ return mas_prev_slot(mas, min, false); } EXPORT_SYMBOL_GPL(mas_find_rev); /** * mas_find_range_rev: On the first call, find the first non-null entry at or * below mas->index down to %min. Otherwise advance to the previous slot after * mas->index down to %min. * @mas: The maple state * @min: The minimum value to check. * * Must hold rcu_read_lock or the write lock. * If an entry exists, last and index are updated accordingly. * May set @mas->status to ma_underflow. * * Return: The entry or %NULL. */ void *mas_find_range_rev(struct ma_state *mas, unsigned long min) { void *entry = NULL; if (mas_find_rev_setup(mas, min, &entry)) return entry; /* Retries on dead nodes handled by mas_prev_slot */ return mas_prev_slot(mas, min, true); } EXPORT_SYMBOL_GPL(mas_find_range_rev); /** * mas_erase() - Find the range in which index resides and erase the entire * range. * @mas: The maple state * * Must hold the write lock. * Searches for @mas->index, sets @mas->index and @mas->last to the range and * erases that range. * * Return: the entry that was erased or %NULL, @mas->index and @mas->last are updated. */ void *mas_erase(struct ma_state *mas) { void *entry; unsigned long index = mas->index; MA_WR_STATE(wr_mas, mas, NULL); if (!mas_is_active(mas) || !mas_is_start(mas)) mas->status = ma_start; write_retry: entry = mas_state_walk(mas); if (!entry) return NULL; /* Must reset to ensure spanning writes of last slot are detected */ mas_reset(mas); mas_wr_preallocate(&wr_mas, NULL); if (mas_nomem(mas, GFP_KERNEL)) { /* in case the range of entry changed when unlocked */ mas->index = mas->last = index; goto write_retry; } if (mas_is_err(mas)) goto out; mas_wr_store_entry(&wr_mas); out: mas_destroy(mas); return entry; } EXPORT_SYMBOL_GPL(mas_erase); /** * mas_nomem() - Check if there was an error allocating and do the allocation * if necessary If there are allocations, then free them. * @mas: The maple state * @gfp: The GFP_FLAGS to use for allocations * Return: true on allocation, false otherwise. */ bool mas_nomem(struct ma_state *mas, gfp_t gfp) __must_hold(mas->tree->ma_lock) { if (likely(mas->node != MA_ERROR(-ENOMEM))) return false; if (gfpflags_allow_blocking(gfp) && !mt_external_lock(mas->tree)) { mtree_unlock(mas->tree); mas_alloc_nodes(mas, gfp); mtree_lock(mas->tree); } else { mas_alloc_nodes(mas, gfp); } if (!mas_allocated(mas)) return false; mas->status = ma_start; return true; } void __init maple_tree_init(void) { maple_node_cache = kmem_cache_create("maple_node", sizeof(struct maple_node), sizeof(struct maple_node), SLAB_PANIC, NULL); } /** * mtree_load() - Load a value stored in a maple tree * @mt: The maple tree * @index: The index to load * * Return: the entry or %NULL */ void *mtree_load(struct maple_tree *mt, unsigned long index) { MA_STATE(mas, mt, index, index); void *entry; trace_ma_read(__func__, &mas); rcu_read_lock(); retry: entry = mas_start(&mas); if (unlikely(mas_is_none(&mas))) goto unlock; if (unlikely(mas_is_ptr(&mas))) { if (index) entry = NULL; goto unlock; } entry = mtree_lookup_walk(&mas); if (!entry && unlikely(mas_is_start(&mas))) goto retry; unlock: rcu_read_unlock(); if (xa_is_zero(entry)) return NULL; return entry; } EXPORT_SYMBOL(mtree_load); /** * mtree_store_range() - Store an entry at a given range. * @mt: The maple tree * @index: The start of the range * @last: The end of the range * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mtree_store_range(struct maple_tree *mt, unsigned long index, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(mas, mt, index, last); int ret = 0; trace_ma_write(__func__, &mas, 0, entry); if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (index > last) return -EINVAL; mtree_lock(mt); ret = mas_store_gfp(&mas, entry, gfp); mtree_unlock(mt); return ret; } EXPORT_SYMBOL(mtree_store_range); /** * mtree_store() - Store an entry at a given index. * @mt: The maple tree * @index: The index to store the value * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations * * Return: 0 on success, -EINVAL on invalid request, -ENOMEM if memory could not * be allocated. */ int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp) { return mtree_store_range(mt, index, index, entry, gfp); } EXPORT_SYMBOL(mtree_store); /** * mtree_insert_range() - Insert an entry at a given range if there is no value. * @mt: The maple tree * @first: The start of the range * @last: The end of the range * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. */ int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(ms, mt, first, last); int ret = 0; if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; if (first > last) return -EINVAL; mtree_lock(mt); retry: mas_insert(&ms, entry); if (mas_nomem(&ms, gfp)) goto retry; mtree_unlock(mt); if (mas_is_err(&ms)) ret = xa_err(ms.node); mas_destroy(&ms); return ret; } EXPORT_SYMBOL(mtree_insert_range); /** * mtree_insert() - Insert an entry at a given index if there is no value. * @mt: The maple tree * @index : The index to store the value * @entry: The entry to store * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. */ int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp) { return mtree_insert_range(mt, index, index, entry, gfp); } EXPORT_SYMBOL(mtree_insert); int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); retry: ret = mas_empty_area(&mas, min, max, size); if (ret) goto unlock; mas_insert(&mas, entry); /* * mas_nomem() may release the lock, causing the allocated area * to be unavailable, so try to allocate a free area again. */ if (mas_nomem(&mas, gfp)) goto retry; if (mas_is_err(&mas)) ret = xa_err(mas.node); else *startp = mas.index; unlock: mtree_unlock(mt); mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_range); /** * mtree_alloc_cyclic() - Find somewhere to store this entry in the tree. * @mt: The maple tree. * @startp: Pointer to ID. * @range_lo: Lower bound of range to search. * @range_hi: Upper bound of range to search. * @entry: The entry to store. * @next: Pointer to next ID to allocate. * @gfp: The GFP_FLAGS to use for allocations. * * Finds an empty entry in @mt after @next, stores the new index into * the @id pointer, stores the entry at that index, then updates @next. * * @mt must be initialized with the MT_FLAGS_ALLOC_RANGE flag. * * Context: Any context. Takes and releases the mt.lock. May sleep if * the @gfp flags permit. * * Return: 0 if the allocation succeeded without wrapping, 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated, -EINVAL if @mt cannot be used, or -EBUSY if there are no * free entries. */ int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp) { int ret; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); ret = mas_alloc_cyclic(&mas, startp, entry, range_lo, range_hi, next, gfp); mtree_unlock(mt); return ret; } EXPORT_SYMBOL(mtree_alloc_cyclic); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); if (!mt_is_alloc(mt)) return -EINVAL; if (WARN_ON_ONCE(mt_is_reserved(entry))) return -EINVAL; mtree_lock(mt); retry: ret = mas_empty_area_rev(&mas, min, max, size); if (ret) goto unlock; mas_insert(&mas, entry); /* * mas_nomem() may release the lock, causing the allocated area * to be unavailable, so try to allocate a free area again. */ if (mas_nomem(&mas, gfp)) goto retry; if (mas_is_err(&mas)) ret = xa_err(mas.node); else *startp = mas.index; unlock: mtree_unlock(mt); mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_rrange); /** * mtree_erase() - Find an index and erase the entire range. * @mt: The maple tree * @index: The index to erase * * Erasing is the same as a walk to an entry then a store of a NULL to that * ENTIRE range. In fact, it is implemented as such using the advanced API. * * Return: The entry stored at the @index or %NULL */ void *mtree_erase(struct maple_tree *mt, unsigned long index) { void *entry = NULL; MA_STATE(mas, mt, index, index); trace_ma_op(__func__, &mas); mtree_lock(mt); entry = mas_erase(&mas); mtree_unlock(mt); return entry; } EXPORT_SYMBOL(mtree_erase); /* * mas_dup_free() - Free an incomplete duplication of a tree. * @mas: The maple state of a incomplete tree. * * The parameter @mas->node passed in indicates that the allocation failed on * this node. This function frees all nodes starting from @mas->node in the * reverse order of mas_dup_build(). There is no need to hold the source tree * lock at this time. */ static void mas_dup_free(struct ma_state *mas) { struct maple_node *node; enum maple_type type; void __rcu **slots; unsigned char count, i; /* Maybe the first node allocation failed. */ if (mas_is_none(mas)) return; while (!mte_is_root(mas->node)) { mas_ascend(mas); if (mas->offset) { mas->offset--; do { mas_descend(mas); mas->offset = mas_data_end(mas); } while (!mte_is_leaf(mas->node)); mas_ascend(mas); } node = mte_to_node(mas->node); type = mte_node_type(mas->node); slots = ma_slots(node, type); count = mas_data_end(mas) + 1; for (i = 0; i < count; i++) ((unsigned long *)slots)[i] &= ~MAPLE_NODE_MASK; mt_free_bulk(count, slots); } node = mte_to_node(mas->node); mt_free_one(node); } /* * mas_copy_node() - Copy a maple node and replace the parent. * @mas: The maple state of source tree. * @new_mas: The maple state of new tree. * @parent: The parent of the new node. * * Copy @mas->node to @new_mas->node, set @parent to be the parent of * @new_mas->node. If memory allocation fails, @mas is set to -ENOMEM. */ static inline void mas_copy_node(struct ma_state *mas, struct ma_state *new_mas, struct maple_pnode *parent) { struct maple_node *node = mte_to_node(mas->node); struct maple_node *new_node = mte_to_node(new_mas->node); unsigned long val; /* Copy the node completely. */ memcpy(new_node, node, sizeof(struct maple_node)); /* Update the parent node pointer. */ val = (unsigned long)node->parent & MAPLE_NODE_MASK; new_node->parent = ma_parent_ptr(val | (unsigned long)parent); } /* * mas_dup_alloc() - Allocate child nodes for a maple node. * @mas: The maple state of source tree. * @new_mas: The maple state of new tree. * @gfp: The GFP_FLAGS to use for allocations. * * This function allocates child nodes for @new_mas->node during the duplication * process. If memory allocation fails, @mas is set to -ENOMEM. */ static inline void mas_dup_alloc(struct ma_state *mas, struct ma_state *new_mas, gfp_t gfp) { struct maple_node *node = mte_to_node(mas->node); struct maple_node *new_node = mte_to_node(new_mas->node); enum maple_type type; unsigned char request, count, i; void __rcu **slots; void __rcu **new_slots; unsigned long val; /* Allocate memory for child nodes. */ type = mte_node_type(mas->node); new_slots = ma_slots(new_node, type); request = mas_data_end(mas) + 1; count = mt_alloc_bulk(gfp, request, (void **)new_slots); if (unlikely(count < request)) { memset(new_slots, 0, request * sizeof(void *)); mas_set_err(mas, -ENOMEM); return; } /* Restore node type information in slots. */ slots = ma_slots(node, type); for (i = 0; i < count; i++) { val = (unsigned long)mt_slot_locked(mas->tree, slots, i); val &= MAPLE_NODE_MASK; ((unsigned long *)new_slots)[i] |= val; } } /* * mas_dup_build() - Build a new maple tree from a source tree * @mas: The maple state of source tree, need to be in MAS_START state. * @new_mas: The maple state of new tree, need to be in MAS_START state. * @gfp: The GFP_FLAGS to use for allocations. * * This function builds a new tree in DFS preorder. If the memory allocation * fails, the error code -ENOMEM will be set in @mas, and @new_mas points to the * last node. mas_dup_free() will free the incomplete duplication of a tree. * * Note that the attributes of the two trees need to be exactly the same, and the * new tree needs to be empty, otherwise -EINVAL will be set in @mas. */ static inline void mas_dup_build(struct ma_state *mas, struct ma_state *new_mas, gfp_t gfp) { struct maple_node *node; struct maple_pnode *parent = NULL; struct maple_enode *root; enum maple_type type; if (unlikely(mt_attr(mas->tree) != mt_attr(new_mas->tree)) || unlikely(!mtree_empty(new_mas->tree))) { mas_set_err(mas, -EINVAL); return; } root = mas_start(mas); if (mas_is_ptr(mas) || mas_is_none(mas)) goto set_new_tree; node = mt_alloc_one(gfp); if (!node) { new_mas->status = ma_none; mas_set_err(mas, -ENOMEM); return; } type = mte_node_type(mas->node); root = mt_mk_node(node, type); new_mas->node = root; new_mas->min = 0; new_mas->max = ULONG_MAX; root = mte_mk_root(root); while (1) { mas_copy_node(mas, new_mas, parent); if (!mte_is_leaf(mas->node)) { /* Only allocate child nodes for non-leaf nodes. */ mas_dup_alloc(mas, new_mas, gfp); if (unlikely(mas_is_err(mas))) return; } else { /* * This is the last leaf node and duplication is * completed. */ if (mas->max == ULONG_MAX) goto done; /* This is not the last leaf node and needs to go up. */ do { mas_ascend(mas); mas_ascend(new_mas); } while (mas->offset == mas_data_end(mas)); /* Move to the next subtree. */ mas->offset++; new_mas->offset++; } mas_descend(mas); parent = ma_parent_ptr(mte_to_node(new_mas->node)); mas_descend(new_mas); mas->offset = 0; new_mas->offset = 0; } done: /* Specially handle the parent of the root node. */ mte_to_node(root)->parent = ma_parent_ptr(mas_tree_parent(new_mas)); set_new_tree: /* Make them the same height */ new_mas->tree->ma_flags = mas->tree->ma_flags; rcu_assign_pointer(new_mas->tree->ma_root, root); } /** * __mt_dup(): Duplicate an entire maple tree * @mt: The source maple tree * @new: The new maple tree * @gfp: The GFP_FLAGS to use for allocations * * This function duplicates a maple tree in Depth-First Search (DFS) pre-order * traversal. It uses memcpy() to copy nodes in the source tree and allocate * new child nodes in non-leaf nodes. The new node is exactly the same as the * source node except for all the addresses stored in it. It will be faster than * traversing all elements in the source tree and inserting them one by one into * the new tree. * The user needs to ensure that the attributes of the source tree and the new * tree are the same, and the new tree needs to be an empty tree, otherwise * -EINVAL will be returned. * Note that the user needs to manually lock the source tree and the new tree. * * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If * the attributes of the two trees are different or the new tree is not an empty * tree. */ int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); MA_STATE(new_mas, new, 0, 0); mas_dup_build(&mas, &new_mas, gfp); if (unlikely(mas_is_err(&mas))) { ret = xa_err(mas.node); if (ret == -ENOMEM) mas_dup_free(&new_mas); } return ret; } EXPORT_SYMBOL(__mt_dup); /** * mtree_dup(): Duplicate an entire maple tree * @mt: The source maple tree * @new: The new maple tree * @gfp: The GFP_FLAGS to use for allocations * * This function duplicates a maple tree in Depth-First Search (DFS) pre-order * traversal. It uses memcpy() to copy nodes in the source tree and allocate * new child nodes in non-leaf nodes. The new node is exactly the same as the * source node except for all the addresses stored in it. It will be faster than * traversing all elements in the source tree and inserting them one by one into * the new tree. * The user needs to ensure that the attributes of the source tree and the new * tree are the same, and the new tree needs to be an empty tree, otherwise * -EINVAL will be returned. * * Return: 0 on success, -ENOMEM if memory could not be allocated, -EINVAL If * the attributes of the two trees are different or the new tree is not an empty * tree. */ int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp) { int ret = 0; MA_STATE(mas, mt, 0, 0); MA_STATE(new_mas, new, 0, 0); mas_lock(&new_mas); mas_lock_nested(&mas, SINGLE_DEPTH_NESTING); mas_dup_build(&mas, &new_mas, gfp); mas_unlock(&mas); if (unlikely(mas_is_err(&mas))) { ret = xa_err(mas.node); if (ret == -ENOMEM) mas_dup_free(&new_mas); } mas_unlock(&new_mas); return ret; } EXPORT_SYMBOL(mtree_dup); /** * __mt_destroy() - Walk and free all nodes of a locked maple tree. * @mt: The maple tree * * Note: Does not handle locking. */ void __mt_destroy(struct maple_tree *mt) { void *root = mt_root_locked(mt); rcu_assign_pointer(mt->ma_root, NULL); if (xa_is_node(root)) mte_destroy_walk(root, mt); mt->ma_flags = mt_attr(mt); } EXPORT_SYMBOL_GPL(__mt_destroy); /** * mtree_destroy() - Destroy a maple tree * @mt: The maple tree * * Frees all resources used by the tree. Handles locking. */ void mtree_destroy(struct maple_tree *mt) { mtree_lock(mt); __mt_destroy(mt); mtree_unlock(mt); } EXPORT_SYMBOL(mtree_destroy); /** * mt_find() - Search from the start up until an entry is found. * @mt: The maple tree * @index: Pointer which contains the start location of the search * @max: The maximum value of the search range * * Takes RCU read lock internally to protect the search, which does not * protect the returned pointer after dropping RCU read lock. * See also: Documentation/core-api/maple_tree.rst * * In case that an entry is found @index is updated to point to the next * possible entry independent whether the found entry is occupying a * single index or a range if indices. * * Return: The entry at or after the @index or %NULL */ void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max) { MA_STATE(mas, mt, *index, *index); void *entry; #ifdef CONFIG_DEBUG_MAPLE_TREE unsigned long copy = *index; #endif trace_ma_read(__func__, &mas); if ((*index) > max) return NULL; rcu_read_lock(); retry: entry = mas_state_walk(&mas); if (mas_is_start(&mas)) goto retry; if (unlikely(xa_is_zero(entry))) entry = NULL; if (entry) goto unlock; while (mas_is_active(&mas) && (mas.last < max)) { entry = mas_next_slot(&mas, max, false); if (likely(entry && !xa_is_zero(entry))) break; } if (unlikely(xa_is_zero(entry))) entry = NULL; unlock: rcu_read_unlock(); if (likely(entry)) { *index = mas.last + 1; #ifdef CONFIG_DEBUG_MAPLE_TREE if (MT_WARN_ON(mt, (*index) && ((*index) <= copy))) pr_err("index not increased! %lx <= %lx\n", *index, copy); #endif } return entry; } EXPORT_SYMBOL(mt_find); /** * mt_find_after() - Search from the start up until an entry is found. * @mt: The maple tree * @index: Pointer which contains the start location of the search * @max: The maximum value to check * * Same as mt_find() except that it checks @index for 0 before * searching. If @index == 0, the search is aborted. This covers a wrap * around of @index to 0 in an iterator loop. * * Return: The entry at or after the @index or %NULL */ void *mt_find_after(struct maple_tree *mt, unsigned long *index, unsigned long max) { if (!(*index)) return NULL; return mt_find(mt, index, max); } EXPORT_SYMBOL(mt_find_after); #ifdef CONFIG_DEBUG_MAPLE_TREE atomic_t maple_tree_tests_run; EXPORT_SYMBOL_GPL(maple_tree_tests_run); atomic_t maple_tree_tests_passed; EXPORT_SYMBOL_GPL(maple_tree_tests_passed); #ifndef __KERNEL__ extern void kmem_cache_set_non_kernel(struct kmem_cache *, unsigned int); void mt_set_non_kernel(unsigned int val) { kmem_cache_set_non_kernel(maple_node_cache, val); } extern void kmem_cache_set_callback(struct kmem_cache *cachep, void (*callback)(void *)); void mt_set_callback(void (*callback)(void *)) { kmem_cache_set_callback(maple_node_cache, callback); } extern void kmem_cache_set_private(struct kmem_cache *cachep, void *private); void mt_set_private(void *private) { kmem_cache_set_private(maple_node_cache, private); } extern unsigned long kmem_cache_get_alloc(struct kmem_cache *); unsigned long mt_get_alloc_size(void) { return kmem_cache_get_alloc(maple_node_cache); } extern void kmem_cache_zero_nr_tallocated(struct kmem_cache *); void mt_zero_nr_tallocated(void) { kmem_cache_zero_nr_tallocated(maple_node_cache); } extern unsigned int kmem_cache_nr_tallocated(struct kmem_cache *); unsigned int mt_nr_tallocated(void) { return kmem_cache_nr_tallocated(maple_node_cache); } extern unsigned int kmem_cache_nr_allocated(struct kmem_cache *); unsigned int mt_nr_allocated(void) { return kmem_cache_nr_allocated(maple_node_cache); } void mt_cache_shrink(void) { } #else /* * mt_cache_shrink() - For testing, don't use this. * * Certain testcases can trigger an OOM when combined with other memory * debugging configuration options. This function is used to reduce the * possibility of an out of memory even due to kmem_cache objects remaining * around for longer than usual. */ void mt_cache_shrink(void) { kmem_cache_shrink(maple_node_cache); } EXPORT_SYMBOL_GPL(mt_cache_shrink); #endif /* not defined __KERNEL__ */ /* * mas_get_slot() - Get the entry in the maple state node stored at @offset. * @mas: The maple state * @offset: The offset into the slot array to fetch. * * Return: The entry stored at @offset. */ static inline struct maple_enode *mas_get_slot(struct ma_state *mas, unsigned char offset) { return mas_slot(mas, ma_slots(mas_mn(mas), mte_node_type(mas->node)), offset); } /* Depth first search, post-order */ static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) { struct maple_enode *p, *mn = mas->node; unsigned long p_min, p_max; mas_next_node(mas, mas_mn(mas), max); if (!mas_is_overflow(mas)) return; if (mte_is_root(mn)) return; mas->node = mn; mas_ascend(mas); do { p = mas->node; p_min = mas->min; p_max = mas->max; mas_prev_node(mas, 0); } while (!mas_is_underflow(mas)); mas->node = p; mas->max = p_max; mas->min = p_min; } /* Tree validations */ static void mt_dump_node(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format); static void mt_dump_range(unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { static const char spaces[] = " "; switch(format) { case mt_dump_hex: if (min == max) pr_info("%.*s%lx: ", depth * 2, spaces, min); else pr_info("%.*s%lx-%lx: ", depth * 2, spaces, min, max); break; case mt_dump_dec: if (min == max) pr_info("%.*s%lu: ", depth * 2, spaces, min); else pr_info("%.*s%lu-%lu: ", depth * 2, spaces, min, max); } } static void mt_dump_entry(void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { mt_dump_range(min, max, depth, format); if (xa_is_value(entry)) pr_cont("value %ld (0x%lx) [" PTR_FMT "]\n", xa_to_value(entry), xa_to_value(entry), entry); else if (xa_is_zero(entry)) pr_cont("zero (%ld)\n", xa_to_internal(entry)); else if (mt_is_reserved(entry)) pr_cont("UNKNOWN ENTRY (" PTR_FMT ")\n", entry); else pr_cont(PTR_FMT "\n", entry); } static void mt_dump_range64(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_range_64 *node = &mte_to_node(entry)->mr64; bool leaf = mte_is_leaf(entry); unsigned long first = min; int i; pr_cont(" contents: "); for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) { switch(format) { case mt_dump_hex: pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]); } } pr_cont(PTR_FMT "\n", node->slot[i]); for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) { unsigned long last = max; if (i < (MAPLE_RANGE64_SLOTS - 1)) last = node->pivot[i]; else if (!node->slot[i] && max != mt_node_max(entry)) break; if (last == 0 && i > 0) break; if (leaf) mt_dump_entry(mt_slot(mt, node->slot, i), first, last, depth + 1, format); else if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), first, last, depth + 1, format); if (last == max) break; if (last > max) { switch(format) { case mt_dump_hex: pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; case mt_dump_dec: pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); } } first = last + 1; } } static void mt_dump_arange64(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_arange_64 *node = &mte_to_node(entry)->ma64; unsigned long first = min; int i; pr_cont(" contents: "); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { switch (format) { case mt_dump_hex: pr_cont("%lx ", node->gap[i]); break; case mt_dump_dec: pr_cont("%lu ", node->gap[i]); } } pr_cont("| %02X %02X| ", node->meta.end, node->meta.gap); for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) { switch (format) { case mt_dump_hex: pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]); } } pr_cont(PTR_FMT "\n", node->slot[i]); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { unsigned long last = max; if (i < (MAPLE_ARANGE64_SLOTS - 1)) last = node->pivot[i]; else if (!node->slot[i]) break; if (last == 0 && i > 0) break; if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), first, last, depth + 1, format); if (last == max) break; if (last > max) { switch(format) { case mt_dump_hex: pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; case mt_dump_dec: pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); } } first = last + 1; } } static void mt_dump_node(const struct maple_tree *mt, void *entry, unsigned long min, unsigned long max, unsigned int depth, enum mt_dump_format format) { struct maple_node *node = mte_to_node(entry); unsigned int type = mte_node_type(entry); unsigned int i; mt_dump_range(min, max, depth, format); pr_cont("node " PTR_FMT " depth %d type %d parent " PTR_FMT, node, depth, type, node ? node->parent : NULL); switch (type) { case maple_dense: pr_cont("\n"); for (i = 0; i < MAPLE_NODE_SLOTS; i++) { if (min + i > max) pr_cont("OUT OF RANGE: "); mt_dump_entry(mt_slot(mt, node->slot, i), min + i, min + i, depth, format); } break; case maple_leaf_64: case maple_range_64: mt_dump_range64(mt, entry, min, max, depth, format); break; case maple_arange_64: mt_dump_arange64(mt, entry, min, max, depth, format); break; default: pr_cont(" UNKNOWN TYPE\n"); } } void mt_dump(const struct maple_tree *mt, enum mt_dump_format format) { void *entry = rcu_dereference_check(mt->ma_root, mt_locked(mt)); pr_info("maple_tree(" PTR_FMT ") flags %X, height %u root " PTR_FMT "\n", mt, mt->ma_flags, mt_height(mt), entry); if (xa_is_node(entry)) mt_dump_node(mt, entry, 0, mt_node_max(entry), 0, format); else if (entry) mt_dump_entry(entry, 0, 0, 0, format); else pr_info("(empty)\n"); } EXPORT_SYMBOL_GPL(mt_dump); /* * Calculate the maximum gap in a node and check if that's what is reported in * the parent (unless root). */ static void mas_validate_gaps(struct ma_state *mas) { struct maple_enode *mte = mas->node; struct maple_node *p_mn, *node = mte_to_node(mte); enum maple_type mt = mte_node_type(mas->node); unsigned long gap = 0, max_gap = 0; unsigned long p_end, p_start = mas->min; unsigned char p_slot, offset; unsigned long *gaps = NULL; unsigned long *pivots = ma_pivots(node, mt); unsigned int i; if (ma_is_dense(mt)) { for (i = 0; i < mt_slot_count(mte); i++) { if (mas_get_slot(mas, i)) { if (gap > max_gap) max_gap = gap; gap = 0; continue; } gap++; } goto counted; } gaps = ma_gaps(node, mt); for (i = 0; i < mt_slot_count(mte); i++) { p_end = mas_safe_pivot(mas, pivots, i, mt); if (!gaps) { if (!mas_get_slot(mas, i)) gap = p_end - p_start + 1; } else { void *entry = mas_get_slot(mas, i); gap = gaps[i]; MT_BUG_ON(mas->tree, !entry); if (gap > p_end - p_start + 1) { pr_err(PTR_FMT "[%u] %lu >= %lu - %lu + 1 (%lu)\n", mas_mn(mas), i, gap, p_end, p_start, p_end - p_start + 1); MT_BUG_ON(mas->tree, gap > p_end - p_start + 1); } } if (gap > max_gap) max_gap = gap; p_start = p_end + 1; if (p_end >= mas->max) break; } counted: if (mt == maple_arange_64) { MT_BUG_ON(mas->tree, !gaps); offset = ma_meta_gap(node); if (offset > i) { pr_err("gap offset " PTR_FMT "[%u] is invalid\n", node, offset); MT_BUG_ON(mas->tree, 1); } if (gaps[offset] != max_gap) { pr_err("gap " PTR_FMT "[%u] is not the largest gap %lu\n", node, offset, max_gap); MT_BUG_ON(mas->tree, 1); } for (i++ ; i < mt_slot_count(mte); i++) { if (gaps[i] != 0) { pr_err("gap " PTR_FMT "[%u] beyond node limit != 0\n", node, i); MT_BUG_ON(mas->tree, 1); } } } if (mte_is_root(mte)) return; p_slot = mte_parent_slot(mas->node); p_mn = mte_parent(mte); MT_BUG_ON(mas->tree, max_gap > mas->max); if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) { pr_err("gap " PTR_FMT "[%u] != %lu\n", p_mn, p_slot, max_gap); mt_dump(mas->tree, mt_dump_hex); MT_BUG_ON(mas->tree, 1); } } static void mas_validate_parent_slot(struct ma_state *mas) { struct maple_node *parent; struct maple_enode *node; enum maple_type p_type; unsigned char p_slot; void __rcu **slots; int i; if (mte_is_root(mas->node)) return; p_slot = mte_parent_slot(mas->node); p_type = mas_parent_type(mas, mas->node); parent = mte_parent(mas->node); slots = ma_slots(parent, p_type); MT_BUG_ON(mas->tree, mas_mn(mas) == parent); /* Check prev/next parent slot for duplicate node entry */ for (i = 0; i < mt_slots[p_type]; i++) { node = mas_slot(mas, slots, i); if (i == p_slot) { if (node != mas->node) pr_err("parent " PTR_FMT "[%u] does not have " PTR_FMT "\n", parent, i, mas_mn(mas)); MT_BUG_ON(mas->tree, node != mas->node); } else if (node == mas->node) { pr_err("Invalid child " PTR_FMT " at parent " PTR_FMT "[%u] p_slot %u\n", mas_mn(mas), parent, i, p_slot); MT_BUG_ON(mas->tree, node == mas->node); } } } static void mas_validate_child_slot(struct ma_state *mas) { enum maple_type type = mte_node_type(mas->node); void __rcu **slots = ma_slots(mte_to_node(mas->node), type); unsigned long *pivots = ma_pivots(mte_to_node(mas->node), type); struct maple_enode *child; unsigned char i; if (mte_is_leaf(mas->node)) return; for (i = 0; i < mt_slots[type]; i++) { child = mas_slot(mas, slots, i); if (!child) { pr_err("Non-leaf node lacks child at " PTR_FMT "[%u]\n", mas_mn(mas), i); MT_BUG_ON(mas->tree, 1); } if (mte_parent_slot(child) != i) { pr_err("Slot error at " PTR_FMT "[%u]: child " PTR_FMT " has pslot %u\n", mas_mn(mas), i, mte_to_node(child), mte_parent_slot(child)); MT_BUG_ON(mas->tree, 1); } if (mte_parent(child) != mte_to_node(mas->node)) { pr_err("child " PTR_FMT " has parent " PTR_FMT " not " PTR_FMT "\n", mte_to_node(child), mte_parent(child), mte_to_node(mas->node)); MT_BUG_ON(mas->tree, 1); } if (i < mt_pivots[type] && pivots[i] == mas->max) break; } } /* * Validate all pivots are within mas->min and mas->max, check metadata ends * where the maximum ends and ensure there is no slots or pivots set outside of * the end of the data. */ static void mas_validate_limits(struct ma_state *mas) { int i; unsigned long prev_piv = 0; enum maple_type type = mte_node_type(mas->node); void __rcu **slots = ma_slots(mte_to_node(mas->node), type); unsigned long *pivots = ma_pivots(mas_mn(mas), type); for (i = 0; i < mt_slots[type]; i++) { unsigned long piv; piv = mas_safe_pivot(mas, pivots, i, type); if (!piv && (i != 0)) { pr_err("Missing node limit pivot at " PTR_FMT "[%u]", mas_mn(mas), i); MAS_WARN_ON(mas, 1); } if (prev_piv > piv) { pr_err(PTR_FMT "[%u] piv %lu < prev_piv %lu\n", mas_mn(mas), i, piv, prev_piv); MAS_WARN_ON(mas, piv < prev_piv); } if (piv < mas->min) { pr_err(PTR_FMT "[%u] %lu < %lu\n", mas_mn(mas), i, piv, mas->min); MAS_WARN_ON(mas, piv < mas->min); } if (piv > mas->max) { pr_err(PTR_FMT "[%u] %lu > %lu\n", mas_mn(mas), i, piv, mas->max); MAS_WARN_ON(mas, piv > mas->max); } prev_piv = piv; if (piv == mas->max) break; } if (mas_data_end(mas) != i) { pr_err("node" PTR_FMT ": data_end %u != the last slot offset %u\n", mas_mn(mas), mas_data_end(mas), i); MT_BUG_ON(mas->tree, 1); } for (i += 1; i < mt_slots[type]; i++) { void *entry = mas_slot(mas, slots, i); if (entry && (i != mt_slots[type] - 1)) { pr_err(PTR_FMT "[%u] should not have entry " PTR_FMT "\n", mas_mn(mas), i, entry); MT_BUG_ON(mas->tree, entry != NULL); } if (i < mt_pivots[type]) { unsigned long piv = pivots[i]; if (!piv) continue; pr_err(PTR_FMT "[%u] should not have piv %lu\n", mas_mn(mas), i, piv); MAS_WARN_ON(mas, i < mt_pivots[type] - 1); } } } static void mt_validate_nulls(struct maple_tree *mt) { void *entry, *last = (void *)1; unsigned char offset = 0; void __rcu **slots; MA_STATE(mas, mt, 0, 0); mas_start(&mas); if (mas_is_none(&mas) || (mas_is_ptr(&mas))) return; while (!mte_is_leaf(mas.node)) mas_descend(&mas); slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node)); do { entry = mas_slot(&mas, slots, offset); if (!last && !entry) { pr_err("Sequential nulls end at " PTR_FMT "[%u]\n", mas_mn(&mas), offset); } MT_BUG_ON(mt, !last && !entry); last = entry; if (offset == mas_data_end(&mas)) { mas_next_node(&mas, mas_mn(&mas), ULONG_MAX); if (mas_is_overflow(&mas)) return; offset = 0; slots = ma_slots(mte_to_node(mas.node), mte_node_type(mas.node)); } else { offset++; } } while (!mas_is_overflow(&mas)); } /* * validate a maple tree by checking: * 1. The limits (pivots are within mas->min to mas->max) * 2. The gap is correctly set in the parents */ void mt_validate(struct maple_tree *mt) __must_hold(mas->tree->ma_lock) { unsigned char end; MA_STATE(mas, mt, 0, 0); mas_start(&mas); if (!mas_is_active(&mas)) return; while (!mte_is_leaf(mas.node)) mas_descend(&mas); while (!mas_is_overflow(&mas)) { MAS_WARN_ON(&mas, mte_dead_node(mas.node)); end = mas_data_end(&mas); if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) && (!mte_is_root(mas.node)))) { pr_err("Invalid size %u of " PTR_FMT "\n", end, mas_mn(&mas)); } mas_validate_parent_slot(&mas); mas_validate_limits(&mas); mas_validate_child_slot(&mas); if (mt_is_alloc(mt)) mas_validate_gaps(&mas); mas_dfs_postorder(&mas, ULONG_MAX); } mt_validate_nulls(mt); } EXPORT_SYMBOL_GPL(mt_validate); void mas_dump(const struct ma_state *mas) { pr_err("MAS: tree=" PTR_FMT " enode=" PTR_FMT " ", mas->tree, mas->node); switch (mas->status) { case ma_active: pr_err("(ma_active)"); break; case ma_none: pr_err("(ma_none)"); break; case ma_root: pr_err("(ma_root)"); break; case ma_start: pr_err("(ma_start) "); break; case ma_pause: pr_err("(ma_pause) "); break; case ma_overflow: pr_err("(ma_overflow) "); break; case ma_underflow: pr_err("(ma_underflow) "); break; case ma_error: pr_err("(ma_error) "); break; } pr_err("Store Type: "); switch (mas->store_type) { case wr_invalid: pr_err("invalid store type\n"); break; case wr_new_root: pr_err("new_root\n"); break; case wr_store_root: pr_err("store_root\n"); break; case wr_exact_fit: pr_err("exact_fit\n"); break; case wr_split_store: pr_err("split_store\n"); break; case wr_slot_store: pr_err("slot_store\n"); break; case wr_append: pr_err("append\n"); break; case wr_node_store: pr_err("node_store\n"); break; case wr_spanning_store: pr_err("spanning_store\n"); break; case wr_rebalance: pr_err("rebalance\n"); break; } pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end, mas->index, mas->last); pr_err(" min=%lx max=%lx alloc=" PTR_FMT ", depth=%u, flags=%x\n", mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags); if (mas->index > mas->last) pr_err("Check index & last\n"); } EXPORT_SYMBOL_GPL(mas_dump); void mas_wr_dump(const struct ma_wr_state *wr_mas) { pr_err("WR_MAS: node=" PTR_FMT " r_min=%lx r_max=%lx\n", wr_mas->node, wr_mas->r_min, wr_mas->r_max); pr_err(" type=%u off_end=%u, node_end=%u, end_piv=%lx\n", wr_mas->type, wr_mas->offset_end, wr_mas->mas->end, wr_mas->end_piv); } EXPORT_SYMBOL_GPL(mas_wr_dump); #endif /* CONFIG_DEBUG_MAPLE_TREE */
222 221 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 // SPDX-License-Identifier: GPL-2.0-only /* PIPAPO: PIle PAcket POlicies: AVX2 packet lookup routines * * Copyright (c) 2019-2020 Red Hat GmbH * * Author: Stefano Brivio <sbrivio@redhat.com> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <uapi/linux/netfilter/nf_tables.h> #include <linux/bitmap.h> #include <linux/bitops.h> #include <linux/compiler.h> #include <asm/fpu/api.h> #include "nft_set_pipapo_avx2.h" #include "nft_set_pipapo.h" #define NFT_PIPAPO_LONGS_PER_M256 (XSAVE_YMM_SIZE / BITS_PER_LONG) /* Load from memory into YMM register with non-temporal hint ("stream load"), * that is, don't fetch lines from memory into the cache. This avoids pushing * precious packet data out of the cache hierarchy, and is appropriate when: * * - loading buckets from lookup tables, as they are not going to be used * again before packets are entirely classified * * - loading the result bitmap from the previous field, as it's never used * again */ #define NFT_PIPAPO_AVX2_LOAD(reg, loc) \ asm volatile("vmovntdqa %0, %%ymm" #reg : : "m" (loc)) /* Stream a single lookup table bucket into YMM register given lookup table, * group index, value of packet bits, bucket size. */ #define NFT_PIPAPO_AVX2_BUCKET_LOAD4(reg, lt, group, v, bsize) \ NFT_PIPAPO_AVX2_LOAD(reg, \ lt[((group) * NFT_PIPAPO_BUCKETS(4) + \ (v)) * (bsize)]) #define NFT_PIPAPO_AVX2_BUCKET_LOAD8(reg, lt, group, v, bsize) \ NFT_PIPAPO_AVX2_LOAD(reg, \ lt[((group) * NFT_PIPAPO_BUCKETS(8) + \ (v)) * (bsize)]) /* Bitwise AND: the staple operation of this algorithm */ #define NFT_PIPAPO_AVX2_AND(dst, a, b) \ asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst) /* Jump to label if @reg is zero */ #define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \ asm goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \ "je %l[" #label "]" : : : : label) /* Store 256 bits from YMM register into memory. Contrary to bucket load * operation, we don't bypass the cache here, as stored matching results * are always used shortly after. */ #define NFT_PIPAPO_AVX2_STORE(loc, reg) \ asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc)) /* Zero out a complete YMM register, @reg */ #define NFT_PIPAPO_AVX2_ZERO(reg) \ asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg) /** * nft_pipapo_avx2_prepare() - Prepare before main algorithm body * * This zeroes out ymm15, which is later used whenever we need to clear a * memory location, by storing its content into memory. */ static void nft_pipapo_avx2_prepare(void) { NFT_PIPAPO_AVX2_ZERO(15); } /** * nft_pipapo_avx2_fill() - Fill a bitmap region with ones * @data: Base memory area * @start: First bit to set * @len: Count of bits to fill * * This is nothing else than a version of bitmap_set(), as used e.g. by * pipapo_refill(), tailored for the microarchitectures using it and better * suited for the specific usage: it's very likely that we'll set a small number * of bits, not crossing a word boundary, and correct branch prediction is * critical here. * * This function doesn't actually use any AVX2 instruction. */ static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len) { int offset = start % BITS_PER_LONG; unsigned long mask; data += start / BITS_PER_LONG; if (likely(len == 1)) { *data |= BIT(offset); return; } if (likely(len < BITS_PER_LONG || offset)) { if (likely(len + offset <= BITS_PER_LONG)) { *data |= GENMASK(len - 1 + offset, offset); return; } *data |= ~0UL << offset; len -= BITS_PER_LONG - offset; data++; if (len <= BITS_PER_LONG) { mask = ~0UL >> (BITS_PER_LONG - len); *data |= mask; return; } } memset(data, 0xff, len / BITS_PER_BYTE); data += len / BITS_PER_LONG; len %= BITS_PER_LONG; if (len) *data |= ~0UL >> (BITS_PER_LONG - len); } /** * nft_pipapo_avx2_refill() - Scan bitmap, select mapping table item, set bits * @offset: Start from given bitmap (equivalent to bucket) offset, in longs * @map: Bitmap to be scanned for set bits * @dst: Destination bitmap * @mt: Mapping table containing bit set specifiers * @last: Return index of first set bit, if this is the last field * * This is an alternative implementation of pipapo_refill() suitable for usage * with AVX2 lookup routines: we know there are four words to be scanned, at * a given offset inside the map, for each matching iteration. * * This function doesn't actually use any AVX2 instruction. * * Return: first set bit index if @last, index of first filled word otherwise. */ static int nft_pipapo_avx2_refill(int offset, unsigned long *map, unsigned long *dst, union nft_pipapo_map_bucket *mt, bool last) { int ret = -1; #define NFT_PIPAPO_AVX2_REFILL_ONE_WORD(x) \ do { \ while (map[(x)]) { \ int r = __builtin_ctzl(map[(x)]); \ int i = (offset + (x)) * BITS_PER_LONG + r; \ \ if (last) \ return i; \ \ nft_pipapo_avx2_fill(dst, mt[i].to, mt[i].n); \ \ if (ret == -1) \ ret = mt[i].to; \ \ map[(x)] &= ~(1UL << r); \ } \ } while (0) NFT_PIPAPO_AVX2_REFILL_ONE_WORD(0); NFT_PIPAPO_AVX2_REFILL_ONE_WORD(1); NFT_PIPAPO_AVX2_REFILL_ONE_WORD(2); NFT_PIPAPO_AVX2_REFILL_ONE_WORD(3); #undef NFT_PIPAPO_AVX2_REFILL_ONE_WORD return ret; } /** * nft_pipapo_avx2_lookup_4b_2() - AVX2-based lookup for 2 four-bit groups * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * Load buckets from lookup table corresponding to the values of each 4-bit * group of packet bytes, and perform a bitwise intersection between them. If * this is the first field in the set, simply AND the buckets together * (equivalent to using an all-ones starting bitmap), use the provided starting * bitmap otherwise. Then call nft_pipapo_avx2_refill() to generate the next * working bitmap, @fill. * * This is used for 8-bit fields (i.e. protocol numbers). * * Out-of-order (and superscalar) execution is vital here, so it's critical to * avoid false data dependencies. CPU and compiler could (mostly) take care of * this on their own, but the operation ordering is explicitly given here with * a likely execution order in mind, to highlight possible stalls. That's why * a number of logically distinct operations (i.e. loading buckets, intersecting * buckets) are interleaved. * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf }; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (first) { NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); NFT_PIPAPO_AVX2_AND(4, 0, 1); } else { NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); NFT_PIPAPO_AVX2_LOAD(2, map[i_ul]); NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nothing); NFT_PIPAPO_AVX2_AND(3, 0, 1); NFT_PIPAPO_AVX2_AND(4, 2, 3); } NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 4); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_4b_4() - AVX2-based lookup for 4 four-bit groups * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * See nft_pipapo_avx2_lookup_4b_2(). * * This is used for 16-bit fields (i.e. ports). * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf }; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (first) { NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize); NFT_PIPAPO_AVX2_AND(4, 0, 1); NFT_PIPAPO_AVX2_AND(5, 2, 3); NFT_PIPAPO_AVX2_AND(7, 4, 5); } else { NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); NFT_PIPAPO_AVX2_AND(5, 0, 1); NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); NFT_PIPAPO_AVX2_AND(6, 2, 3); NFT_PIPAPO_AVX2_AND(7, 4, 5); /* Stall */ NFT_PIPAPO_AVX2_AND(7, 6, 7); } /* Stall */ NFT_PIPAPO_AVX2_NOMATCH_GOTO(7, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 7); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_4b_8() - AVX2-based lookup for 8 four-bit groups * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * See nft_pipapo_avx2_lookup_4b_2(). * * This is used for 32-bit fields (i.e. IPv4 addresses). * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { u8 pg[8] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, }; int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (first) { NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 4, pg[4], bsize); NFT_PIPAPO_AVX2_AND(5, 0, 1); NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 5, pg[5], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 6, pg[6], bsize); NFT_PIPAPO_AVX2_AND(8, 2, 3); NFT_PIPAPO_AVX2_AND(9, 4, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize); NFT_PIPAPO_AVX2_AND(11, 6, 7); NFT_PIPAPO_AVX2_AND(12, 8, 9); NFT_PIPAPO_AVX2_AND(13, 10, 11); /* Stall */ NFT_PIPAPO_AVX2_AND(1, 12, 13); } else { NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize); NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); NFT_PIPAPO_AVX2_AND(5, 0, 1); NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize); NFT_PIPAPO_AVX2_AND(8, 2, 3); NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize); NFT_PIPAPO_AVX2_AND(10, 4, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize); NFT_PIPAPO_AVX2_AND(12, 6, 7); NFT_PIPAPO_AVX2_AND(13, 8, 9); NFT_PIPAPO_AVX2_AND(14, 10, 11); /* Stall */ NFT_PIPAPO_AVX2_AND(1, 12, 13); NFT_PIPAPO_AVX2_AND(1, 1, 14); } NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 1); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_4b_12() - AVX2-based lookup for 12 four-bit groups * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * See nft_pipapo_avx2_lookup_4b_2(). * * This is used for 48-bit fields (i.e. MAC addresses/EUI-48). * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { u8 pg[12] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf, }; int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (!first) NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]); NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); if (!first) { NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing); NFT_PIPAPO_AVX2_AND(1, 1, 0); } NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 4, pg[4], bsize); NFT_PIPAPO_AVX2_AND(6, 2, 3); NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 6, pg[6], bsize); NFT_PIPAPO_AVX2_AND(9, 1, 4); NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize); NFT_PIPAPO_AVX2_AND(11, 5, 6); NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 8, pg[8], bsize); NFT_PIPAPO_AVX2_AND(13, 7, 8); NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 9, pg[9], bsize); NFT_PIPAPO_AVX2_AND(0, 9, 10); NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 10, pg[10], bsize); NFT_PIPAPO_AVX2_AND(2, 11, 12); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize); NFT_PIPAPO_AVX2_AND(4, 13, 14); NFT_PIPAPO_AVX2_AND(5, 0, 1); NFT_PIPAPO_AVX2_AND(6, 2, 3); /* Stalls */ NFT_PIPAPO_AVX2_AND(7, 4, 5); NFT_PIPAPO_AVX2_AND(8, 6, 7); NFT_PIPAPO_AVX2_NOMATCH_GOTO(8, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 8); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_4b_32() - AVX2-based lookup for 32 four-bit groups * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * See nft_pipapo_avx2_lookup_4b_2(). * * This is used for 128-bit fields (i.e. IPv6 addresses). * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { u8 pg[32] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf, pkt[2] >> 4, pkt[2] & 0xf, pkt[3] >> 4, pkt[3] & 0xf, pkt[4] >> 4, pkt[4] & 0xf, pkt[5] >> 4, pkt[5] & 0xf, pkt[6] >> 4, pkt[6] & 0xf, pkt[7] >> 4, pkt[7] & 0xf, pkt[8] >> 4, pkt[8] & 0xf, pkt[9] >> 4, pkt[9] & 0xf, pkt[10] >> 4, pkt[10] & 0xf, pkt[11] >> 4, pkt[11] & 0xf, pkt[12] >> 4, pkt[12] & 0xf, pkt[13] >> 4, pkt[13] & 0xf, pkt[14] >> 4, pkt[14] & 0xf, pkt[15] >> 4, pkt[15] & 0xf, }; int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (!first) NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]); NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 0, pg[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize); if (!first) { NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing); NFT_PIPAPO_AVX2_AND(1, 1, 0); } NFT_PIPAPO_AVX2_AND(5, 2, 3); NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 4, pg[4], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 5, pg[5], bsize); NFT_PIPAPO_AVX2_AND(8, 1, 4); NFT_PIPAPO_AVX2_BUCKET_LOAD4(9, lt, 6, pg[6], bsize); NFT_PIPAPO_AVX2_AND(10, 5, 6); NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize); NFT_PIPAPO_AVX2_AND(12, 7, 8); NFT_PIPAPO_AVX2_BUCKET_LOAD4(13, lt, 8, pg[8], bsize); NFT_PIPAPO_AVX2_AND(14, 9, 10); NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 9, pg[9], bsize); NFT_PIPAPO_AVX2_AND(1, 11, 12); NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 10, pg[10], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 11, pg[11], bsize); NFT_PIPAPO_AVX2_AND(4, 13, 14); NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 12, pg[12], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(6, lt, 13, pg[13], bsize); NFT_PIPAPO_AVX2_AND(7, 0, 1); NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 14, pg[14], bsize); NFT_PIPAPO_AVX2_AND(9, 2, 3); NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 15, pg[15], bsize); NFT_PIPAPO_AVX2_AND(11, 4, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 16, pg[16], bsize); NFT_PIPAPO_AVX2_AND(13, 6, 7); NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 17, pg[17], bsize); NFT_PIPAPO_AVX2_AND(0, 8, 9); NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 18, pg[18], bsize); NFT_PIPAPO_AVX2_AND(2, 10, 11); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 19, pg[19], bsize); NFT_PIPAPO_AVX2_AND(4, 12, 13); NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 20, pg[20], bsize); NFT_PIPAPO_AVX2_AND(6, 14, 0); NFT_PIPAPO_AVX2_AND(7, 1, 2); NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 21, pg[21], bsize); NFT_PIPAPO_AVX2_AND(9, 3, 4); NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 22, pg[22], bsize); NFT_PIPAPO_AVX2_AND(11, 5, 6); NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 23, pg[23], bsize); NFT_PIPAPO_AVX2_AND(13, 7, 8); NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 24, pg[24], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 25, pg[25], bsize); NFT_PIPAPO_AVX2_AND(1, 9, 10); NFT_PIPAPO_AVX2_AND(2, 11, 12); NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 26, pg[26], bsize); NFT_PIPAPO_AVX2_AND(4, 13, 14); NFT_PIPAPO_AVX2_BUCKET_LOAD4(5, lt, 27, pg[27], bsize); NFT_PIPAPO_AVX2_AND(6, 0, 1); NFT_PIPAPO_AVX2_BUCKET_LOAD4(7, lt, 28, pg[28], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD4(8, lt, 29, pg[29], bsize); NFT_PIPAPO_AVX2_AND(9, 2, 3); NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 30, pg[30], bsize); NFT_PIPAPO_AVX2_AND(11, 4, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 31, pg[31], bsize); NFT_PIPAPO_AVX2_AND(0, 6, 7); NFT_PIPAPO_AVX2_AND(1, 8, 9); NFT_PIPAPO_AVX2_AND(2, 10, 11); NFT_PIPAPO_AVX2_AND(3, 12, 0); /* Stalls */ NFT_PIPAPO_AVX2_AND(4, 1, 2); NFT_PIPAPO_AVX2_AND(5, 3, 4); NFT_PIPAPO_AVX2_NOMATCH_GOTO(5, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 5); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_8b_1() - AVX2-based lookup for one eight-bit group * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * See nft_pipapo_avx2_lookup_4b_2(). * * This is used for 8-bit fields (i.e. protocol numbers). * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (first) { NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 0, pkt[0], bsize); } else { NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); NFT_PIPAPO_AVX2_AND(2, 0, 1); NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); } NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 2); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_8b_2() - AVX2-based lookup for 2 eight-bit groups * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * See nft_pipapo_avx2_lookup_4b_2(). * * This is used for 16-bit fields (i.e. ports). * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (first) { NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize); NFT_PIPAPO_AVX2_AND(4, 0, 1); } else { NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize); /* Stall */ NFT_PIPAPO_AVX2_AND(3, 0, 1); NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing); NFT_PIPAPO_AVX2_AND(4, 3, 2); } /* Stall */ NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 4); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_8b_4() - AVX2-based lookup for 4 eight-bit groups * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * See nft_pipapo_avx2_lookup_4b_2(). * * This is used for 32-bit fields (i.e. IPv4 addresses). * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (first) { NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize); /* Stall */ NFT_PIPAPO_AVX2_AND(4, 0, 1); NFT_PIPAPO_AVX2_AND(5, 2, 3); NFT_PIPAPO_AVX2_AND(0, 4, 5); } else { NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize); NFT_PIPAPO_AVX2_AND(5, 0, 1); NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); NFT_PIPAPO_AVX2_AND(6, 2, 3); /* Stall */ NFT_PIPAPO_AVX2_AND(7, 4, 5); NFT_PIPAPO_AVX2_AND(0, 6, 7); } NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 0); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_8b_6() - AVX2-based lookup for 6 eight-bit groups * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * See nft_pipapo_avx2_lookup_4b_2(). * * This is used for 48-bit fields (i.e. MAC addresses/EUI-48). * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (first) { NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 2, pkt[2], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 3, pkt[3], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 4, pkt[4], bsize); NFT_PIPAPO_AVX2_AND(5, 0, 1); NFT_PIPAPO_AVX2_BUCKET_LOAD8(6, lt, 5, pkt[5], bsize); NFT_PIPAPO_AVX2_AND(7, 2, 3); /* Stall */ NFT_PIPAPO_AVX2_AND(0, 4, 5); NFT_PIPAPO_AVX2_AND(1, 6, 7); NFT_PIPAPO_AVX2_AND(4, 0, 1); } else { NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize); NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]); NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize); NFT_PIPAPO_AVX2_AND(5, 0, 1); NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing); NFT_PIPAPO_AVX2_AND(6, 2, 3); NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 4, pkt[4], bsize); NFT_PIPAPO_AVX2_AND(0, 4, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 5, pkt[5], bsize); NFT_PIPAPO_AVX2_AND(2, 6, 7); /* Stall */ NFT_PIPAPO_AVX2_AND(3, 0, 1); NFT_PIPAPO_AVX2_AND(4, 2, 3); } NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 4); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_8b_16() - AVX2-based lookup for 16 eight-bit groups * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * See nft_pipapo_avx2_lookup_4b_2(). * * This is used for 128-bit fields (i.e. IPv6 addresses). * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b; unsigned long *lt = f->lt, bsize = f->bsize; lt += offset * NFT_PIPAPO_LONGS_PER_M256; for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) { int i_ul = i * NFT_PIPAPO_LONGS_PER_M256; if (!first) NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 2, pkt[2], bsize); if (!first) { NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing); NFT_PIPAPO_AVX2_AND(1, 1, 0); } NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt, 3, pkt[3], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 4, pkt[4], bsize); NFT_PIPAPO_AVX2_AND(6, 1, 2); NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 5, pkt[5], bsize); NFT_PIPAPO_AVX2_AND(0, 3, 4); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 6, pkt[6], bsize); NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 7, pkt[7], bsize); NFT_PIPAPO_AVX2_AND(3, 5, 6); NFT_PIPAPO_AVX2_AND(4, 0, 1); NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize); NFT_PIPAPO_AVX2_AND(6, 2, 3); NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize); NFT_PIPAPO_AVX2_AND(0, 4, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize); NFT_PIPAPO_AVX2_AND(2, 6, 7); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize); NFT_PIPAPO_AVX2_AND(4, 0, 1); NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 12, pkt[12], bsize); NFT_PIPAPO_AVX2_AND(6, 2, 3); NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 13, pkt[13], bsize); NFT_PIPAPO_AVX2_AND(0, 4, 5); NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 14, pkt[14], bsize); NFT_PIPAPO_AVX2_AND(2, 6, 7); NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 15, pkt[15], bsize); NFT_PIPAPO_AVX2_AND(4, 0, 1); /* Stall */ NFT_PIPAPO_AVX2_AND(5, 2, 3); NFT_PIPAPO_AVX2_AND(6, 4, 5); NFT_PIPAPO_AVX2_NOMATCH_GOTO(6, nomatch); NFT_PIPAPO_AVX2_STORE(map[i_ul], 6); b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last); if (last) return b; if (unlikely(ret == -1)) ret = b / XSAVE_YMM_SIZE; continue; nomatch: NFT_PIPAPO_AVX2_STORE(map[i_ul], 15); nothing: ; } return ret; } /** * nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes * @mdata: Matching data, including mapping table * @map: Previous match result, used as initial bitmap * @fill: Destination bitmap to be filled with current match result * @f: Field, containing lookup and mapping tables * @offset: Ignore buckets before the given index, no bits are filled there * @pkt: Packet data, pointer to input nftables register * @first: If this is the first field, don't source previous result * @last: Last field: stop at the first match and return bit index * * This function should never be called, but is provided for the case the field * size doesn't match any of the known data types. Matching rate is * substantially lower than AVX2 routines. * * Return: -1 on no match, rule index of match if @last, otherwise first long * word index to be checked next (i.e. first filled word). */ static int nft_pipapo_avx2_lookup_slow(const struct nft_pipapo_match *mdata, unsigned long *map, unsigned long *fill, const struct nft_pipapo_field *f, int offset, const u8 *pkt, bool first, bool last) { unsigned long bsize = f->bsize; int i, ret = -1, b; if (first) pipapo_resmap_init(mdata, map); for (i = offset; i < bsize; i++) { if (f->bb == 8) pipapo_and_field_buckets_8bit(f, map, pkt); else pipapo_and_field_buckets_4bit(f, map, pkt); NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; b = pipapo_refill(map, bsize, f->rules, fill, f->mt, last); if (last) return b; if (ret == -1) ret = b / XSAVE_YMM_SIZE; } return ret; } /** * nft_pipapo_avx2_estimate() - Set size, space and lookup complexity * @desc: Set description, element count and field description used * @features: Flags: NFT_SET_INTERVAL needs to be there * @est: Storage for estimation data * * Return: true if set is compatible and AVX2 available, false otherwise. */ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est) { if (!(features & NFT_SET_INTERVAL) || desc->field_count < NFT_PIPAPO_MIN_FIELDS) return false; if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX)) return false; est->size = pipapo_estimate_size(desc); if (!est->size) return false; est->lookup = NFT_SET_CLASS_O_LOG_N; est->space = NFT_SET_CLASS_O_N; return true; } /** * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation * @net: Network namespace * @set: nftables API set representation * @key: nftables API element representation containing key data * @ext: nftables API extension pointer, filled with matching reference * * For more details, see DOC: Theory of Operation in nft_set_pipapo.c. * * This implementation exploits the repetitive characteristic of the algorithm * to provide a fast, vectorised version using the AVX2 SIMD instruction set. * * Return: true on match, false otherwise. */ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_scratch *scratch; u8 genmask = nft_genmask_cur(net); const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; const u8 *rp = (const u8 *)key; unsigned long *res, *fill; bool map_index; int i, ret = 0; local_bh_disable(); if (unlikely(!irq_fpu_usable())) { bool fallback_res = nft_pipapo_lookup(net, set, key, ext); local_bh_enable(); return fallback_res; } m = rcu_dereference(priv->match); /* This also protects access to all data related to scratch maps. * * Note that we don't need a valid MXCSR state for any of the * operations we use here, so pass 0 as mask and spare a LDMXCSR * instruction. */ kernel_fpu_begin_mask(0); scratch = *raw_cpu_ptr(m->scratch); if (unlikely(!scratch)) { kernel_fpu_end(); local_bh_enable(); return false; } map_index = scratch->map_index; res = scratch->map + (map_index ? m->bsize_max : 0); fill = scratch->map + (map_index ? 0 : m->bsize_max); /* Starting map doesn't need to be set for this implementation */ nft_pipapo_avx2_prepare(); next_match: nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1, first = !i; #define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \ (ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \ ret, rp, \ first, last)) if (likely(f->bb == 8)) { if (f->groups == 1) { NFT_SET_PIPAPO_AVX2_LOOKUP(8, 1); } else if (f->groups == 2) { NFT_SET_PIPAPO_AVX2_LOOKUP(8, 2); } else if (f->groups == 4) { NFT_SET_PIPAPO_AVX2_LOOKUP(8, 4); } else if (f->groups == 6) { NFT_SET_PIPAPO_AVX2_LOOKUP(8, 6); } else if (f->groups == 16) { NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16); } else { ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f, ret, rp, first, last); } } else { if (f->groups == 2) { NFT_SET_PIPAPO_AVX2_LOOKUP(4, 2); } else if (f->groups == 4) { NFT_SET_PIPAPO_AVX2_LOOKUP(4, 4); } else if (f->groups == 8) { NFT_SET_PIPAPO_AVX2_LOOKUP(4, 8); } else if (f->groups == 12) { NFT_SET_PIPAPO_AVX2_LOOKUP(4, 12); } else if (f->groups == 32) { NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32); } else { ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f, ret, rp, first, last); } } NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4; #undef NFT_SET_PIPAPO_AVX2_LOOKUP if (ret < 0) goto out; if (last) { *ext = &f->mt[ret].e->ext; if (unlikely(nft_set_elem_expired(*ext) || !nft_set_elem_active(*ext, genmask))) { ret = 0; goto next_match; } goto out; } swap(res, fill); rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); } out: if (i % 2) scratch->map_index = !map_index; kernel_fpu_end(); local_bh_enable(); return ret >= 0; }
27 13 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Linux NET3: Internet Group Management Protocol [IGMP] * * Authors: * Alan Cox <alan@lxorguk.ukuu.org.uk> * * Extended to talk the BSD extended IGMP protocol of mrouted 3.6 */ #ifndef _LINUX_IGMP_H #define _LINUX_IGMP_H #include <linux/skbuff.h> #include <linux/timer.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/refcount.h> #include <linux/sockptr.h> #include <uapi/linux/igmp.h> static inline struct igmphdr *igmp_hdr(const struct sk_buff *skb) { return (struct igmphdr *)skb_transport_header(skb); } static inline struct igmpv3_report * igmpv3_report_hdr(const struct sk_buff *skb) { return (struct igmpv3_report *)skb_transport_header(skb); } static inline struct igmpv3_query * igmpv3_query_hdr(const struct sk_buff *skb) { return (struct igmpv3_query *)skb_transport_header(skb); } struct ip_sf_socklist { unsigned int sl_max; unsigned int sl_count; struct rcu_head rcu; __be32 sl_addr[] __counted_by(sl_max); }; #define IP_SFBLOCK 10 /* allocate this many at once */ /* ip_mc_socklist is real list now. Speed is not argument; this list never used in fast path code */ struct ip_mc_socklist { struct ip_mc_socklist __rcu *next_rcu; struct ip_mreqn multi; unsigned int sfmode; /* MCAST_{INCLUDE,EXCLUDE} */ struct ip_sf_socklist __rcu *sflist; struct rcu_head rcu; }; struct ip_sf_list { struct ip_sf_list *sf_next; unsigned long sf_count[2]; /* include/exclude counts */ __be32 sf_inaddr; unsigned char sf_gsresp; /* include in g & s response? */ unsigned char sf_oldin; /* change state */ unsigned char sf_crcount; /* retrans. left to send */ }; struct ip_mc_list { struct in_device *interface; __be32 multiaddr; unsigned int sfmode; struct ip_sf_list *sources; struct ip_sf_list *tomb; unsigned long sfcount[2]; union { struct ip_mc_list *next; struct ip_mc_list __rcu *next_rcu; }; struct ip_mc_list __rcu *next_hash; struct timer_list timer; int users; refcount_t refcnt; spinlock_t lock; char tm_running; char reporter; char unsolicit_count; char loaded; unsigned char gsquery; /* check source marks? */ unsigned char crcount; unsigned long mca_cstamp; unsigned long mca_tstamp; struct rcu_head rcu; }; /* V3 exponential field decoding */ #define IGMPV3_MASK(value, nb) ((nb)>=32 ? (value) : ((1<<(nb))-1) & (value)) #define IGMPV3_EXP(thresh, nbmant, nbexp, value) \ ((value) < (thresh) ? (value) : \ ((IGMPV3_MASK(value, nbmant) | (1<<(nbmant))) << \ (IGMPV3_MASK((value) >> (nbmant), nbexp) + (nbexp)))) #define IGMPV3_QQIC(value) IGMPV3_EXP(0x80, 4, 3, value) #define IGMPV3_MRC(value) IGMPV3_EXP(0x80, 4, 3, value) static inline int ip_mc_may_pull(struct sk_buff *skb, unsigned int len) { if (skb_transport_offset(skb) + ip_transport_len(skb) < len) return 0; return pskb_may_pull(skb, len); } extern int ip_check_mc_rcu(struct in_device *dev, __be32 mc_addr, __be32 src_addr, u8 proto); extern int igmp_rcv(struct sk_buff *); extern int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr); extern int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr, unsigned int mode); extern int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr); extern void ip_mc_drop_socket(struct sock *sk); extern int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mreq_source *mreqs, int ifindex); extern int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf,int ifindex); extern int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, sockptr_t optval, sockptr_t optlen); extern int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, sockptr_t optval, size_t offset); extern int ip_mc_sf_allow(const struct sock *sk, __be32 local, __be32 rmt, int dif, int sdif); extern void ip_mc_init_dev(struct in_device *); extern void ip_mc_destroy_dev(struct in_device *); extern void ip_mc_up(struct in_device *); extern void ip_mc_down(struct in_device *); extern void ip_mc_unmap(struct in_device *); extern void ip_mc_remap(struct in_device *); extern void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp); static inline void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) { return __ip_mc_dec_group(in_dev, addr, GFP_KERNEL); } extern void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, gfp_t gfp); extern void ip_mc_inc_group(struct in_device *in_dev, __be32 addr); int ip_mc_check_igmp(struct sk_buff *skb); #endif
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM udp #if !defined(_TRACE_UDP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_UDP_H #include <linux/udp.h> #include <linux/tracepoint.h> #include <trace/events/net_probe_common.h> TRACE_EVENT(udp_fail_queue_rcv_skb, TP_PROTO(int rc, struct sock *sk, struct sk_buff *skb), TP_ARGS(rc, sk, skb), TP_STRUCT__entry( __field(int, rc) __field(__u16, sport) __field(__u16, dport) __field(__u16, family) __array(__u8, saddr, sizeof(struct sockaddr_in6)) __array(__u8, daddr, sizeof(struct sockaddr_in6)) ), TP_fast_assign( const struct udphdr *uh = (const struct udphdr *)udp_hdr(skb); __entry->rc = rc; /* for filtering use */ __entry->sport = ntohs(uh->source); __entry->dport = ntohs(uh->dest); __entry->family = sk->sk_family; memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); TP_STORE_ADDR_PORTS_SKB(skb, uh, __entry->saddr, __entry->daddr); ), TP_printk("rc=%d family=%s src=%pISpc dest=%pISpc", __entry->rc, show_family_name(__entry->family), __entry->saddr, __entry->daddr) ); #endif /* _TRACE_UDP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
237 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1994 Linus Torvalds * * Pentium III FXSR, SSE support * General FPU state handling cleanups * Gareth Hughes <gareth@valinux.com>, May 2000 * x86-64 work by Andi Kleen 2002 */ #ifndef _ASM_X86_FPU_API_H #define _ASM_X86_FPU_API_H #include <linux/bottom_half.h> #include <asm/fpu/types.h> /* * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It * disables preemption so be careful if you intend to use it for long periods * of time. * If you intend to use the FPU in irq/softirq you need to check first with * irq_fpu_usable() if it is possible. */ /* Kernel FPU states to initialize in kernel_fpu_begin_mask() */ #define KFPU_387 _BITUL(0) /* 387 state will be initialized */ #define KFPU_MXCSR _BITUL(1) /* MXCSR will be initialized */ extern void kernel_fpu_begin_mask(unsigned int kfpu_mask); extern void kernel_fpu_end(void); extern bool irq_fpu_usable(void); extern void fpregs_mark_activate(void); /* Code that is unaware of kernel_fpu_begin_mask() can use this */ static inline void kernel_fpu_begin(void) { #ifdef CONFIG_X86_64 /* * Any 64-bit code that uses 387 instructions must explicitly request * KFPU_387. */ kernel_fpu_begin_mask(KFPU_MXCSR); #else /* * 32-bit kernel code may use 387 operations as well as SSE2, etc, * as long as it checks that the CPU has the required capability. */ kernel_fpu_begin_mask(KFPU_387 | KFPU_MXCSR); #endif } /* * Use fpregs_lock() while editing CPU's FPU registers or fpu->fpstate. * A context switch will (and softirq might) save CPU's FPU registers to * fpu->fpstate.regs and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in * a random state. * * local_bh_disable() protects against both preemption and soft interrupts * on !RT kernels. * * On RT kernels local_bh_disable() is not sufficient because it only * serializes soft interrupt related sections via a local lock, but stays * preemptible. Disabling preemption is the right choice here as bottom * half processing is always in thread context on RT kernels so it * implicitly prevents bottom half processing as well. * * Disabling preemption also serializes against kernel_fpu_begin(). */ static inline void fpregs_lock(void) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_bh_disable(); else preempt_disable(); } static inline void fpregs_unlock(void) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) local_bh_enable(); else preempt_enable(); } /* * FPU state gets lazily restored before returning to userspace. So when in the * kernel, the valid FPU state may be kept in the buffer. This function will force * restore all the fpu state to the registers early if needed, and lock them from * being automatically saved/restored. Then FPU state can be modified safely in the * registers, before unlocking with fpregs_unlock(). */ void fpregs_lock_and_load(void); #ifdef CONFIG_X86_DEBUG_FPU extern void fpregs_assert_state_consistent(void); #else static inline void fpregs_assert_state_consistent(void) { } #endif /* * Load the task FPU state before returning to userspace. */ extern void switch_fpu_return(void); /* * Query the presence of one or more xfeatures. Works on any legacy CPU as well. * * If 'feature_name' is set then put a human-readable description of * the feature there as well - this can be used to print error (or success) * messages. */ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name); /* Trap handling */ extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern void fpu_sync_fpstate(struct fpu *fpu); extern void fpu_reset_from_exception_fixup(void); /* Boot, hotplug and resume */ extern void fpu__init_cpu(void); extern void fpu__init_system(void); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); #ifdef CONFIG_MATH_EMULATION extern void fpstate_init_soft(struct swregs_state *soft); #else static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif /* State tracking */ DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* Process cleanup */ #ifdef CONFIG_X86_64 extern void fpstate_free(struct fpu *fpu); #else static inline void fpstate_free(struct fpu *fpu) { } #endif /* fpstate-related functions which are exported to KVM */ extern void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature); extern u64 xstate_get_guest_group_perm(void); extern void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); /* KVM specific functions */ extern bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu); extern void fpu_free_guest_fpstate(struct fpu_guest *gfpu); extern int fpu_swap_kvm_fpstate(struct fpu_guest *gfpu, bool enter_guest); extern int fpu_enable_guest_xfd_features(struct fpu_guest *guest_fpu, u64 xfeatures); #ifdef CONFIG_X86_64 extern void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd); extern void fpu_sync_guest_vmexit_xfd_state(void); #else static inline void fpu_update_guest_xfd(struct fpu_guest *guest_fpu, u64 xfd) { } static inline void fpu_sync_guest_vmexit_xfd_state(void) { } #endif extern void fpu_copy_guest_fpstate_to_uabi(struct fpu_guest *gfpu, void *buf, unsigned int size, u64 xfeatures, u32 pkru); extern int fpu_copy_uabi_to_guest_fpstate(struct fpu_guest *gfpu, const void *buf, u64 xcr0, u32 *vpkru); static inline void fpstate_set_confidential(struct fpu_guest *gfpu) { gfpu->fpstate->is_confidential = true; } static inline bool fpstate_is_confidential(struct fpu_guest *gfpu) { return gfpu->fpstate->is_confidential; } /* prctl */ extern long fpu_xstate_prctl(int option, unsigned long arg2); extern void fpu_idle_fpregs(void); #endif /* _ASM_X86_FPU_API_H */
3 3 3 3 3 2 1 1 2 1 1 1 1 8 8 8 8 8 8 8 11 11 1 1 1 1 1 1 1 1 1 1 1 2 3 3 3 10 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 // SPDX-License-Identifier: GPL-2.0-or-later /* Manage a process's keyrings * * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/user.h> #include <linux/keyctl.h> #include <linux/fs.h> #include <linux/err.h> #include <linux/mutex.h> #include <linux/security.h> #include <linux/user_namespace.h> #include <linux/uaccess.h> #include <linux/init_task.h> #include <keys/request_key_auth-type.h> #include "internal.h" /* Session keyring create vs join semaphore */ static DEFINE_MUTEX(key_session_mutex); /* The root user's tracking struct */ struct key_user root_key_user = { .usage = REFCOUNT_INIT(3), .cons_lock = __MUTEX_INITIALIZER(root_key_user.cons_lock), .lock = __SPIN_LOCK_UNLOCKED(root_key_user.lock), .nkeys = ATOMIC_INIT(2), .nikeys = ATOMIC_INIT(2), .uid = GLOBAL_ROOT_UID, }; /* * Get or create a user register keyring. */ static struct key *get_user_register(struct user_namespace *user_ns) { struct key *reg_keyring = READ_ONCE(user_ns->user_keyring_register); if (reg_keyring) return reg_keyring; down_write(&user_ns->keyring_sem); /* Make sure there's a register keyring. It gets owned by the * user_namespace's owner. */ reg_keyring = user_ns->user_keyring_register; if (!reg_keyring) { reg_keyring = keyring_alloc(".user_reg", user_ns->owner, INVALID_GID, &init_cred, KEY_POS_WRITE | KEY_POS_SEARCH | KEY_USR_VIEW | KEY_USR_READ, 0, NULL, NULL); if (!IS_ERR(reg_keyring)) smp_store_release(&user_ns->user_keyring_register, reg_keyring); } up_write(&user_ns->keyring_sem); /* We don't return a ref since the keyring is pinned by the user_ns */ return reg_keyring; } /* * Look up the user and user session keyrings for the current process's UID, * creating them if they don't exist. */ int look_up_user_keyrings(struct key **_user_keyring, struct key **_user_session_keyring) { const struct cred *cred = current_cred(); struct user_namespace *user_ns = current_user_ns(); struct key *reg_keyring, *uid_keyring, *session_keyring; key_perm_t user_keyring_perm; key_ref_t uid_keyring_r, session_keyring_r; uid_t uid = from_kuid(user_ns, cred->user->uid); char buf[20]; int ret; user_keyring_perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL; kenter("%u", uid); reg_keyring = get_user_register(user_ns); if (IS_ERR(reg_keyring)) return PTR_ERR(reg_keyring); down_write(&user_ns->keyring_sem); ret = 0; /* Get the user keyring. Note that there may be one in existence * already as it may have been pinned by a session, but the user_struct * pointing to it may have been destroyed by setuid. */ snprintf(buf, sizeof(buf), "_uid.%u", uid); uid_keyring_r = keyring_search(make_key_ref(reg_keyring, true), &key_type_keyring, buf, false); kdebug("_uid %p", uid_keyring_r); if (uid_keyring_r == ERR_PTR(-EAGAIN)) { uid_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, reg_keyring); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); goto error; } } else if (IS_ERR(uid_keyring_r)) { ret = PTR_ERR(uid_keyring_r); goto error; } else { uid_keyring = key_ref_to_ptr(uid_keyring_r); } /* Get a default session keyring (which might also exist already) */ snprintf(buf, sizeof(buf), "_uid_ses.%u", uid); session_keyring_r = keyring_search(make_key_ref(reg_keyring, true), &key_type_keyring, buf, false); kdebug("_uid_ses %p", session_keyring_r); if (session_keyring_r == ERR_PTR(-EAGAIN)) { session_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error_release; } /* We install a link from the user session keyring to * the user keyring. */ ret = key_link(session_keyring, uid_keyring); if (ret < 0) goto error_release_session; /* And only then link the user-session keyring to the * register. */ ret = key_link(reg_keyring, session_keyring); if (ret < 0) goto error_release_session; } else if (IS_ERR(session_keyring_r)) { ret = PTR_ERR(session_keyring_r); goto error_release; } else { session_keyring = key_ref_to_ptr(session_keyring_r); } up_write(&user_ns->keyring_sem); if (_user_session_keyring) *_user_session_keyring = session_keyring; else key_put(session_keyring); if (_user_keyring) *_user_keyring = uid_keyring; else key_put(uid_keyring); kleave(" = 0"); return 0; error_release_session: key_put(session_keyring); error_release: key_put(uid_keyring); error: up_write(&user_ns->keyring_sem); kleave(" = %d", ret); return ret; } /* * Get the user session keyring if it exists, but don't create it if it * doesn't. */ struct key *get_user_session_keyring_rcu(const struct cred *cred) { struct key *reg_keyring = READ_ONCE(cred->user_ns->user_keyring_register); key_ref_t session_keyring_r; char buf[20]; struct keyring_search_context ctx = { .index_key.type = &key_type_keyring, .index_key.description = buf, .cred = cred, .match_data.cmp = key_default_cmp, .match_data.raw_data = buf, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = KEYRING_SEARCH_DO_STATE_CHECK, }; if (!reg_keyring) return NULL; ctx.index_key.desc_len = snprintf(buf, sizeof(buf), "_uid_ses.%u", from_kuid(cred->user_ns, cred->user->uid)); session_keyring_r = keyring_search_rcu(make_key_ref(reg_keyring, true), &ctx); if (IS_ERR(session_keyring_r)) return NULL; return key_ref_to_ptr(session_keyring_r); } /* * Install a thread keyring to the given credentials struct if it didn't have * one already. This is allowed to overrun the quota. * * Return: 0 if a thread keyring is now present; -errno on failure. */ int install_thread_keyring_to_cred(struct cred *new) { struct key *keyring; if (new->thread_keyring) return 0; keyring = keyring_alloc("_tid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); new->thread_keyring = keyring; return 0; } /* * Install a thread keyring to the current task if it didn't have one already. * * Return: 0 if a thread keyring is now present; -errno on failure. */ static int install_thread_keyring(void) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_thread_keyring_to_cred(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Install a process keyring to the given credentials struct if it didn't have * one already. This is allowed to overrun the quota. * * Return: 0 if a process keyring is now present; -errno on failure. */ int install_process_keyring_to_cred(struct cred *new) { struct key *keyring; if (new->process_keyring) return 0; keyring = keyring_alloc("_pid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); new->process_keyring = keyring; return 0; } /* * Install a process keyring to the current task if it didn't have one already. * * Return: 0 if a process keyring is now present; -errno on failure. */ static int install_process_keyring(void) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_process_keyring_to_cred(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Install the given keyring as the session keyring of the given credentials * struct, replacing the existing one if any. If the given keyring is NULL, * then install a new anonymous session keyring. * @cred can not be in use by any task yet. * * Return: 0 on success; -errno on failure. */ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) { unsigned long flags; struct key *old; might_sleep(); /* create an empty session keyring */ if (!keyring) { flags = KEY_ALLOC_QUOTA_OVERRUN; if (cred->session_keyring) flags = KEY_ALLOC_IN_QUOTA; keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, flags, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } else { __key_get(keyring); } /* install the keyring */ old = cred->session_keyring; cred->session_keyring = keyring; if (old) key_put(old); return 0; } /* * Install the given keyring as the session keyring of the current task, * replacing the existing one if any. If the given keyring is NULL, then * install a new anonymous session keyring. * * Return: 0 on success; -errno on failure. */ static int install_session_keyring(struct key *keyring) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Handle the fsuid changing. */ void key_fsuid_changed(struct cred *new_cred) { /* update the ownership of the thread keyring */ if (new_cred->thread_keyring) { down_write(&new_cred->thread_keyring->sem); new_cred->thread_keyring->uid = new_cred->fsuid; up_write(&new_cred->thread_keyring->sem); } } /* * Handle the fsgid changing. */ void key_fsgid_changed(struct cred *new_cred) { /* update the ownership of the thread keyring */ if (new_cred->thread_keyring) { down_write(&new_cred->thread_keyring->sem); new_cred->thread_keyring->gid = new_cred->fsgid; up_write(&new_cred->thread_keyring->sem); } } /* * Search the process keyrings attached to the supplied cred for the first * matching key under RCU conditions (the caller must be holding the RCU read * lock). * * The search criteria are the type and the match function. The description is * given to the match function as a parameter, but doesn't otherwise influence * the search. Typically the match function will compare the description * parameter to the key's description. * * This can only search keyrings that grant Search permission to the supplied * credentials. Keyrings linked to searched keyrings will also be searched if * they grant Search permission too. Keys can only be found if they grant * Search permission to the credentials. * * Returns a pointer to the key with the key usage count incremented if * successful, -EAGAIN if we didn't find any matching key or -ENOKEY if we only * matched negative keys. * * In the case of a successful return, the possession attribute is set on the * returned key reference. */ key_ref_t search_cred_keyrings_rcu(struct keyring_search_context *ctx) { struct key *user_session; key_ref_t key_ref, ret, err; const struct cred *cred = ctx->cred; /* we want to return -EAGAIN or -ENOKEY if any of the keyrings were * searchable, but we failed to find a key or we found a negative key; * otherwise we want to return a sample error (probably -EACCES) if * none of the keyrings were searchable * * in terms of priority: success > -ENOKEY > -EAGAIN > other error */ key_ref = NULL; ret = NULL; err = ERR_PTR(-EAGAIN); /* search the thread keyring first */ if (cred->thread_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->thread_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* search the process keyring second */ if (cred->process_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->process_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* search the session keyring */ if (cred->session_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->session_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* or search the user-session keyring */ else if ((user_session = get_user_session_keyring_rcu(cred))) { key_ref = keyring_search_rcu(make_key_ref(user_session, 1), ctx); key_put(user_session); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* no key - decide on the error we're going to go for */ key_ref = ret ? ret : err; found: return key_ref; } /* * Search the process keyrings attached to the supplied cred for the first * matching key in the manner of search_my_process_keyrings(), but also search * the keys attached to the assumed authorisation key using its credentials if * one is available. * * The caller must be holding the RCU read lock. * * Return same as search_cred_keyrings_rcu(). */ key_ref_t search_process_keyrings_rcu(struct keyring_search_context *ctx) { struct request_key_auth *rka; key_ref_t key_ref, ret = ERR_PTR(-EACCES), err; key_ref = search_cred_keyrings_rcu(ctx); if (!IS_ERR(key_ref)) goto found; err = key_ref; /* if this process has an instantiation authorisation key, then we also * search the keyrings of the process mentioned there * - we don't permit access to request_key auth keys via this method */ if (ctx->cred->request_key_auth && ctx->cred == current_cred() && ctx->index_key.type != &key_type_request_key_auth ) { const struct cred *cred = ctx->cred; if (key_validate(cred->request_key_auth) == 0) { rka = ctx->cred->request_key_auth->payload.data[0]; //// was search_process_keyrings() [ie. recursive] ctx->cred = rka->cred; key_ref = search_cred_keyrings_rcu(ctx); ctx->cred = cred; if (!IS_ERR(key_ref)) goto found; ret = key_ref; } } /* no key - decide on the error we're going to go for */ if (err == ERR_PTR(-ENOKEY) || ret == ERR_PTR(-ENOKEY)) key_ref = ERR_PTR(-ENOKEY); else if (err == ERR_PTR(-EACCES)) key_ref = ret; else key_ref = err; found: return key_ref; } /* * See if the key we're looking at is the target key. */ bool lookup_user_key_possessed(const struct key *key, const struct key_match_data *match_data) { return key == match_data->raw_data; } /* * Look up a key ID given us by userspace with a given permissions mask to get * the key it refers to. * * Flags can be passed to request that special keyrings be created if referred * to directly, to permit partially constructed keys to be found and to skip * validity and permission checks on the found key. * * Returns a pointer to the key with an incremented usage count if successful; * -EINVAL if the key ID is invalid; -ENOKEY if the key ID does not correspond * to a key or the best found key was a negative key; -EKEYREVOKED or * -EKEYEXPIRED if the best found key was revoked or expired; -EACCES if the * found key doesn't grant the requested permit or the LSM denied access to it; * or -ENOMEM if a special keyring couldn't be created. * * In the case of a successful return, the possession attribute is set on the * returned key reference. */ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, enum key_need_perm need_perm) { struct keyring_search_context ctx = { .match_data.cmp = lookup_user_key_possessed, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_RECURSE), }; struct request_key_auth *rka; struct key *key, *user_session; key_ref_t key_ref, skey_ref; int ret; try_again: ctx.cred = get_current_cred(); key_ref = ERR_PTR(-ENOKEY); switch (id) { case KEY_SPEC_THREAD_KEYRING: if (!ctx.cred->thread_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_thread_keyring(); if (ret < 0) { key_ref = ERR_PTR(ret); goto error; } goto reget_creds; } key = ctx.cred->thread_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_PROCESS_KEYRING: if (!ctx.cred->process_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_process_keyring(); if (ret < 0) { key_ref = ERR_PTR(ret); goto error; } goto reget_creds; } key = ctx.cred->process_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_SESSION_KEYRING: if (!ctx.cred->session_keyring) { /* always install a session keyring upon access if one * doesn't exist yet */ ret = look_up_user_keyrings(NULL, &user_session); if (ret < 0) goto error; if (lflags & KEY_LOOKUP_CREATE) ret = join_session_keyring(NULL); else ret = install_session_keyring(user_session); key_put(user_session); if (ret < 0) goto error; goto reget_creds; } else if (test_bit(KEY_FLAG_UID_KEYRING, &ctx.cred->session_keyring->flags) && lflags & KEY_LOOKUP_CREATE) { ret = join_session_keyring(NULL); if (ret < 0) goto error; goto reget_creds; } key = ctx.cred->session_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_KEYRING: ret = look_up_user_keyrings(&key, NULL); if (ret < 0) goto error; key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_SESSION_KEYRING: ret = look_up_user_keyrings(NULL, &key); if (ret < 0) goto error; key_ref = make_key_ref(key, 1); break; case KEY_SPEC_GROUP_KEYRING: /* group keyrings are not yet supported */ key_ref = ERR_PTR(-EINVAL); goto error; case KEY_SPEC_REQKEY_AUTH_KEY: key = ctx.cred->request_key_auth; if (!key) goto error; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_REQUESTOR_KEYRING: if (!ctx.cred->request_key_auth) goto error; down_read(&ctx.cred->request_key_auth->sem); if (test_bit(KEY_FLAG_REVOKED, &ctx.cred->request_key_auth->flags)) { key_ref = ERR_PTR(-EKEYREVOKED); key = NULL; } else { rka = ctx.cred->request_key_auth->payload.data[0]; key = rka->dest_keyring; __key_get(key); } up_read(&ctx.cred->request_key_auth->sem); if (!key) goto error; key_ref = make_key_ref(key, 1); break; default: key_ref = ERR_PTR(-EINVAL); if (id < 1) goto error; key = key_lookup(id); if (IS_ERR(key)) { key_ref = ERR_CAST(key); goto error; } key_ref = make_key_ref(key, 0); /* check to see if we possess the key */ ctx.index_key = key->index_key; ctx.match_data.raw_data = key; kdebug("check possessed"); rcu_read_lock(); skey_ref = search_process_keyrings_rcu(&ctx); rcu_read_unlock(); kdebug("possessed=%p", skey_ref); if (!IS_ERR(skey_ref)) { key_put(key); key_ref = skey_ref; } break; } /* unlink does not use the nominated key in any way, so can skip all * the permission checks as it is only concerned with the keyring */ if (need_perm != KEY_NEED_UNLINK) { if (!(lflags & KEY_LOOKUP_PARTIAL)) { ret = wait_for_key_construction(key, true); switch (ret) { case -ERESTARTSYS: goto invalid_key; default: if (need_perm != KEY_AUTHTOKEN_OVERRIDE && need_perm != KEY_DEFER_PERM_CHECK) goto invalid_key; break; case 0: break; } } else if (need_perm != KEY_DEFER_PERM_CHECK) { ret = key_validate(key); if (ret < 0) goto invalid_key; } ret = -EIO; if (!(lflags & KEY_LOOKUP_PARTIAL) && key_read_state(key) == KEY_IS_UNINSTANTIATED) goto invalid_key; } /* check the permissions */ ret = key_task_permission(key_ref, ctx.cred, need_perm); if (ret < 0) goto invalid_key; key->last_used_at = ktime_get_real_seconds(); error: put_cred(ctx.cred); return key_ref; invalid_key: key_ref_put(key_ref); key_ref = ERR_PTR(ret); goto error; /* if we attempted to install a keyring, then it may have caused new * creds to be installed */ reget_creds: put_cred(ctx.cred); goto try_again; } EXPORT_SYMBOL(lookup_user_key); /* * Join the named keyring as the session keyring if possible else attempt to * create a new one of that name and join that. * * If the name is NULL, an empty anonymous keyring will be installed as the * session keyring. * * Named session keyrings are joined with a semaphore held to prevent the * keyrings from going away whilst the attempt is made to going them and also * to prevent a race in creating compatible session keyrings. */ long join_session_keyring(const char *name) { const struct cred *old; struct cred *new; struct key *keyring; long ret, serial; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); /* if no name is provided, install an anonymous keyring */ if (!name) { ret = install_session_keyring_to_cred(new, NULL); if (ret < 0) goto error; serial = new->session_keyring->serial; ret = commit_creds(new); if (ret == 0) ret = serial; goto okay; } /* allow the user to join or create a named keyring */ mutex_lock(&key_session_mutex); /* look for an existing keyring of this name */ keyring = find_keyring_by_name(name, false); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ keyring = keyring_alloc( name, old->uid, old->gid, old, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK, KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } } else if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } else if (keyring == new->session_keyring) { ret = 0; goto error3; } /* we've got a keyring - now to install it */ ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) goto error3; commit_creds(new); mutex_unlock(&key_session_mutex); ret = keyring->serial; key_put(keyring); okay: return ret; error3: key_put(keyring); error2: mutex_unlock(&key_session_mutex); error: abort_creds(new); return ret; } /* * Replace a process's session keyring on behalf of one of its children when * the target process is about to resume userspace execution. */ void key_change_session_keyring(struct callback_head *twork) { const struct cred *old = current_cred(); struct cred *new = container_of(twork, struct cred, rcu); if (unlikely(current->flags & PF_EXITING)) { put_cred(new); return; } /* If get_ucounts fails more bits are needed in the refcount */ if (unlikely(!get_ucounts(old->ucounts))) { WARN_ONCE(1, "In %s get_ucounts failed\n", __func__); put_cred(new); return; } new-> uid = old-> uid; new-> euid = old-> euid; new-> suid = old-> suid; new->fsuid = old->fsuid; new-> gid = old-> gid; new-> egid = old-> egid; new-> sgid = old-> sgid; new->fsgid = old->fsgid; new->user = get_uid(old->user); new->ucounts = old->ucounts; new->user_ns = get_user_ns(old->user_ns); new->group_info = get_group_info(old->group_info); new->securebits = old->securebits; new->cap_inheritable = old->cap_inheritable; new->cap_permitted = old->cap_permitted; new->cap_effective = old->cap_effective; new->cap_ambient = old->cap_ambient; new->cap_bset = old->cap_bset; new->jit_keyring = old->jit_keyring; new->thread_keyring = key_get(old->thread_keyring); new->process_keyring = key_get(old->process_keyring); security_transfer_creds(new, old); commit_creds(new); } /* * Make sure that root's user and user-session keyrings exist. */ static int __init init_root_keyring(void) { return look_up_user_keyrings(NULL, NULL); } late_initcall(init_root_keyring);
5 5 4 5 5 5 5 5 4 27 28 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/cfm_bridge.h> #include <uapi/linux/cfm_bridge.h> #include "br_private_cfm.h" static struct br_cfm_mep *br_mep_find(struct net_bridge *br, u32 instance) { struct br_cfm_mep *mep; hlist_for_each_entry(mep, &br->mep_list, head) if (mep->instance == instance) return mep; return NULL; } static struct br_cfm_mep *br_mep_find_ifindex(struct net_bridge *br, u32 ifindex) { struct br_cfm_mep *mep; hlist_for_each_entry_rcu(mep, &br->mep_list, head, lockdep_rtnl_is_held()) if (mep->create.ifindex == ifindex) return mep; return NULL; } static struct br_cfm_peer_mep *br_peer_mep_find(struct br_cfm_mep *mep, u32 mepid) { struct br_cfm_peer_mep *peer_mep; hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head, lockdep_rtnl_is_held()) if (peer_mep->mepid == mepid) return peer_mep; return NULL; } static struct net_bridge_port *br_mep_get_port(struct net_bridge *br, u32 ifindex) { struct net_bridge_port *port; list_for_each_entry(port, &br->port_list, list) if (port->dev->ifindex == ifindex) return port; return NULL; } /* Calculate the CCM interval in us. */ static u32 interval_to_us(enum br_cfm_ccm_interval interval) { switch (interval) { case BR_CFM_CCM_INTERVAL_NONE: return 0; case BR_CFM_CCM_INTERVAL_3_3_MS: return 3300; case BR_CFM_CCM_INTERVAL_10_MS: return 10 * 1000; case BR_CFM_CCM_INTERVAL_100_MS: return 100 * 1000; case BR_CFM_CCM_INTERVAL_1_SEC: return 1000 * 1000; case BR_CFM_CCM_INTERVAL_10_SEC: return 10 * 1000 * 1000; case BR_CFM_CCM_INTERVAL_1_MIN: return 60 * 1000 * 1000; case BR_CFM_CCM_INTERVAL_10_MIN: return 10 * 60 * 1000 * 1000; } return 0; } /* Convert the interface interval to CCM PDU value. */ static u32 interval_to_pdu(enum br_cfm_ccm_interval interval) { switch (interval) { case BR_CFM_CCM_INTERVAL_NONE: return 0; case BR_CFM_CCM_INTERVAL_3_3_MS: return 1; case BR_CFM_CCM_INTERVAL_10_MS: return 2; case BR_CFM_CCM_INTERVAL_100_MS: return 3; case BR_CFM_CCM_INTERVAL_1_SEC: return 4; case BR_CFM_CCM_INTERVAL_10_SEC: return 5; case BR_CFM_CCM_INTERVAL_1_MIN: return 6; case BR_CFM_CCM_INTERVAL_10_MIN: return 7; } return 0; } /* Convert the CCM PDU value to interval on interface. */ static u32 pdu_to_interval(u32 value) { switch (value) { case 0: return BR_CFM_CCM_INTERVAL_NONE; case 1: return BR_CFM_CCM_INTERVAL_3_3_MS; case 2: return BR_CFM_CCM_INTERVAL_10_MS; case 3: return BR_CFM_CCM_INTERVAL_100_MS; case 4: return BR_CFM_CCM_INTERVAL_1_SEC; case 5: return BR_CFM_CCM_INTERVAL_10_SEC; case 6: return BR_CFM_CCM_INTERVAL_1_MIN; case 7: return BR_CFM_CCM_INTERVAL_10_MIN; } return BR_CFM_CCM_INTERVAL_NONE; } static void ccm_rx_timer_start(struct br_cfm_peer_mep *peer_mep) { u32 interval_us; interval_us = interval_to_us(peer_mep->mep->cc_config.exp_interval); /* Function ccm_rx_dwork must be called with 1/4 * of the configured CC 'expected_interval' * in order to detect CCM defect after 3.25 interval. */ queue_delayed_work(system_wq, &peer_mep->ccm_rx_dwork, usecs_to_jiffies(interval_us / 4)); } static void br_cfm_notify(int event, const struct net_bridge_port *port) { u32 filter = RTEXT_FILTER_CFM_STATUS; br_info_notify(event, port->br, NULL, filter); } static void cc_peer_enable(struct br_cfm_peer_mep *peer_mep) { memset(&peer_mep->cc_status, 0, sizeof(peer_mep->cc_status)); peer_mep->ccm_rx_count_miss = 0; ccm_rx_timer_start(peer_mep); } static void cc_peer_disable(struct br_cfm_peer_mep *peer_mep) { cancel_delayed_work_sync(&peer_mep->ccm_rx_dwork); } static struct sk_buff *ccm_frame_build(struct br_cfm_mep *mep, const struct br_cfm_cc_ccm_tx_info *const tx_info) { struct br_cfm_common_hdr *common_hdr; struct net_bridge_port *b_port; struct br_cfm_maid *maid; u8 *itu_reserved, *e_tlv; struct ethhdr *eth_hdr; struct sk_buff *skb; __be32 *status_tlv; __be32 *snumber; __be16 *mepid; skb = dev_alloc_skb(CFM_CCM_MAX_FRAME_LENGTH); if (!skb) return NULL; rcu_read_lock(); b_port = rcu_dereference(mep->b_port); if (!b_port) { kfree_skb(skb); rcu_read_unlock(); return NULL; } skb->dev = b_port->dev; rcu_read_unlock(); /* The device cannot be deleted until the work_queue functions has * completed. This function is called from ccm_tx_work_expired() * that is a work_queue functions. */ skb->protocol = htons(ETH_P_CFM); skb->priority = CFM_FRAME_PRIO; /* Ethernet header */ eth_hdr = skb_put(skb, sizeof(*eth_hdr)); ether_addr_copy(eth_hdr->h_dest, tx_info->dmac.addr); ether_addr_copy(eth_hdr->h_source, mep->config.unicast_mac.addr); eth_hdr->h_proto = htons(ETH_P_CFM); /* Common CFM Header */ common_hdr = skb_put(skb, sizeof(*common_hdr)); common_hdr->mdlevel_version = mep->config.mdlevel << 5; common_hdr->opcode = BR_CFM_OPCODE_CCM; common_hdr->flags = (mep->rdi << 7) | interval_to_pdu(mep->cc_config.exp_interval); common_hdr->tlv_offset = CFM_CCM_TLV_OFFSET; /* Sequence number */ snumber = skb_put(skb, sizeof(*snumber)); if (tx_info->seq_no_update) { *snumber = cpu_to_be32(mep->ccm_tx_snumber); mep->ccm_tx_snumber += 1; } else { *snumber = 0; } mepid = skb_put(skb, sizeof(*mepid)); *mepid = cpu_to_be16((u16)mep->config.mepid); maid = skb_put(skb, sizeof(*maid)); memcpy(maid->data, mep->cc_config.exp_maid.data, sizeof(maid->data)); /* ITU reserved (CFM_CCM_ITU_RESERVED_SIZE octets) */ itu_reserved = skb_put(skb, CFM_CCM_ITU_RESERVED_SIZE); memset(itu_reserved, 0, CFM_CCM_ITU_RESERVED_SIZE); /* Generel CFM TLV format: * TLV type: one byte * TLV value length: two bytes * TLV value: 'TLV value length' bytes */ /* Port status TLV. The value length is 1. Total of 4 bytes. */ if (tx_info->port_tlv) { status_tlv = skb_put(skb, sizeof(*status_tlv)); *status_tlv = cpu_to_be32((CFM_PORT_STATUS_TLV_TYPE << 24) | (1 << 8) | /* Value length */ (tx_info->port_tlv_value & 0xFF)); } /* Interface status TLV. The value length is 1. Total of 4 bytes. */ if (tx_info->if_tlv) { status_tlv = skb_put(skb, sizeof(*status_tlv)); *status_tlv = cpu_to_be32((CFM_IF_STATUS_TLV_TYPE << 24) | (1 << 8) | /* Value length */ (tx_info->if_tlv_value & 0xFF)); } /* End TLV */ e_tlv = skb_put(skb, sizeof(*e_tlv)); *e_tlv = CFM_ENDE_TLV_TYPE; return skb; } static void ccm_frame_tx(struct sk_buff *skb) { skb_reset_network_header(skb); dev_queue_xmit(skb); } /* This function is called with the configured CC 'expected_interval' * in order to drive CCM transmission when enabled. */ static void ccm_tx_work_expired(struct work_struct *work) { struct delayed_work *del_work; struct br_cfm_mep *mep; struct sk_buff *skb; u32 interval_us; del_work = to_delayed_work(work); mep = container_of(del_work, struct br_cfm_mep, ccm_tx_dwork); if (time_before_eq(mep->ccm_tx_end, jiffies)) { /* Transmission period has ended */ mep->cc_ccm_tx_info.period = 0; return; } skb = ccm_frame_build(mep, &mep->cc_ccm_tx_info); if (skb) ccm_frame_tx(skb); interval_us = interval_to_us(mep->cc_config.exp_interval); queue_delayed_work(system_wq, &mep->ccm_tx_dwork, usecs_to_jiffies(interval_us)); } /* This function is called with 1/4 of the configured CC 'expected_interval' * in order to detect CCM defect after 3.25 interval. */ static void ccm_rx_work_expired(struct work_struct *work) { struct br_cfm_peer_mep *peer_mep; struct net_bridge_port *b_port; struct delayed_work *del_work; del_work = to_delayed_work(work); peer_mep = container_of(del_work, struct br_cfm_peer_mep, ccm_rx_dwork); /* After 13 counts (4 * 3,25) then 3.25 intervals are expired */ if (peer_mep->ccm_rx_count_miss < 13) { /* 3.25 intervals are NOT expired without CCM reception */ peer_mep->ccm_rx_count_miss++; /* Start timer again */ ccm_rx_timer_start(peer_mep); } else { /* 3.25 intervals are expired without CCM reception. * CCM defect detected */ peer_mep->cc_status.ccm_defect = true; /* Change in CCM defect status - notify */ rcu_read_lock(); b_port = rcu_dereference(peer_mep->mep->b_port); if (b_port) br_cfm_notify(RTM_NEWLINK, b_port); rcu_read_unlock(); } } static u32 ccm_tlv_extract(struct sk_buff *skb, u32 index, struct br_cfm_peer_mep *peer_mep) { __be32 *s_tlv; __be32 _s_tlv; u32 h_s_tlv; u8 *e_tlv; u8 _e_tlv; e_tlv = skb_header_pointer(skb, index, sizeof(_e_tlv), &_e_tlv); if (!e_tlv) return 0; /* TLV is present - get the status TLV */ s_tlv = skb_header_pointer(skb, index, sizeof(_s_tlv), &_s_tlv); if (!s_tlv) return 0; h_s_tlv = ntohl(*s_tlv); if ((h_s_tlv >> 24) == CFM_IF_STATUS_TLV_TYPE) { /* Interface status TLV */ peer_mep->cc_status.tlv_seen = true; peer_mep->cc_status.if_tlv_value = (h_s_tlv & 0xFF); } if ((h_s_tlv >> 24) == CFM_PORT_STATUS_TLV_TYPE) { /* Port status TLV */ peer_mep->cc_status.tlv_seen = true; peer_mep->cc_status.port_tlv_value = (h_s_tlv & 0xFF); } /* The Sender ID TLV is not handled */ /* The Organization-Specific TLV is not handled */ /* Return the length of this tlv. * This is the length of the value field plus 3 bytes for size of type * field and length field */ return ((h_s_tlv >> 8) & 0xFFFF) + 3; } /* note: already called with rcu_read_lock */ static int br_cfm_frame_rx(struct net_bridge_port *port, struct sk_buff *skb) { u32 mdlevel, interval, size, index, max; const struct br_cfm_common_hdr *hdr; struct br_cfm_peer_mep *peer_mep; const struct br_cfm_maid *maid; struct br_cfm_common_hdr _hdr; struct br_cfm_maid _maid; struct br_cfm_mep *mep; struct net_bridge *br; __be32 *snumber; __be32 _snumber; __be16 *mepid; __be16 _mepid; if (port->state == BR_STATE_DISABLED) return 0; hdr = skb_header_pointer(skb, 0, sizeof(_hdr), &_hdr); if (!hdr) return 1; br = port->br; mep = br_mep_find_ifindex(br, port->dev->ifindex); if (unlikely(!mep)) /* No MEP on this port - must be forwarded */ return 0; mdlevel = hdr->mdlevel_version >> 5; if (mdlevel > mep->config.mdlevel) /* The level is above this MEP level - must be forwarded */ return 0; if ((hdr->mdlevel_version & 0x1F) != 0) { /* Invalid version */ mep->status.version_unexp_seen = true; return 1; } if (mdlevel < mep->config.mdlevel) { /* The level is below this MEP level */ mep->status.rx_level_low_seen = true; return 1; } if (hdr->opcode == BR_CFM_OPCODE_CCM) { /* CCM PDU received. */ /* MA ID is after common header + sequence number + MEP ID */ maid = skb_header_pointer(skb, CFM_CCM_PDU_MAID_OFFSET, sizeof(_maid), &_maid); if (!maid) return 1; if (memcmp(maid->data, mep->cc_config.exp_maid.data, sizeof(maid->data))) /* MA ID not as expected */ return 1; /* MEP ID is after common header + sequence number */ mepid = skb_header_pointer(skb, CFM_CCM_PDU_MEPID_OFFSET, sizeof(_mepid), &_mepid); if (!mepid) return 1; peer_mep = br_peer_mep_find(mep, (u32)ntohs(*mepid)); if (!peer_mep) return 1; /* Interval is in common header flags */ interval = hdr->flags & 0x07; if (mep->cc_config.exp_interval != pdu_to_interval(interval)) /* Interval not as expected */ return 1; /* A valid CCM frame is received */ if (peer_mep->cc_status.ccm_defect) { peer_mep->cc_status.ccm_defect = false; /* Change in CCM defect status - notify */ br_cfm_notify(RTM_NEWLINK, port); /* Start CCM RX timer */ ccm_rx_timer_start(peer_mep); } peer_mep->cc_status.seen = true; peer_mep->ccm_rx_count_miss = 0; /* RDI is in common header flags */ peer_mep->cc_status.rdi = (hdr->flags & 0x80) ? true : false; /* Sequence number is after common header */ snumber = skb_header_pointer(skb, CFM_CCM_PDU_SEQNR_OFFSET, sizeof(_snumber), &_snumber); if (!snumber) return 1; if (ntohl(*snumber) != (mep->ccm_rx_snumber + 1)) /* Unexpected sequence number */ peer_mep->cc_status.seq_unexp_seen = true; mep->ccm_rx_snumber = ntohl(*snumber); /* TLV end is after common header + sequence number + MEP ID + * MA ID + ITU reserved */ index = CFM_CCM_PDU_TLV_OFFSET; max = 0; do { /* Handle all TLVs */ size = ccm_tlv_extract(skb, index, peer_mep); index += size; max += 1; } while (size != 0 && max < 4); /* Max four TLVs possible */ return 1; } mep->status.opcode_unexp_seen = true; return 1; } static struct br_frame_type cfm_frame_type __read_mostly = { .type = cpu_to_be16(ETH_P_CFM), .frame_handler = br_cfm_frame_rx, }; int br_cfm_mep_create(struct net_bridge *br, const u32 instance, struct br_cfm_mep_create *const create, struct netlink_ext_ack *extack) { struct net_bridge_port *p; struct br_cfm_mep *mep; ASSERT_RTNL(); if (create->domain == BR_CFM_VLAN) { NL_SET_ERR_MSG_MOD(extack, "VLAN domain not supported"); return -EINVAL; } if (create->domain != BR_CFM_PORT) { NL_SET_ERR_MSG_MOD(extack, "Invalid domain value"); return -EINVAL; } if (create->direction == BR_CFM_MEP_DIRECTION_UP) { NL_SET_ERR_MSG_MOD(extack, "Up-MEP not supported"); return -EINVAL; } if (create->direction != BR_CFM_MEP_DIRECTION_DOWN) { NL_SET_ERR_MSG_MOD(extack, "Invalid direction value"); return -EINVAL; } p = br_mep_get_port(br, create->ifindex); if (!p) { NL_SET_ERR_MSG_MOD(extack, "Port is not related to bridge"); return -EINVAL; } mep = br_mep_find(br, instance); if (mep) { NL_SET_ERR_MSG_MOD(extack, "MEP instance already exists"); return -EEXIST; } /* In PORT domain only one instance can be created per port */ if (create->domain == BR_CFM_PORT) { mep = br_mep_find_ifindex(br, create->ifindex); if (mep) { NL_SET_ERR_MSG_MOD(extack, "Only one Port MEP on a port allowed"); return -EINVAL; } } mep = kzalloc(sizeof(*mep), GFP_KERNEL); if (!mep) return -ENOMEM; mep->create = *create; mep->instance = instance; rcu_assign_pointer(mep->b_port, p); INIT_HLIST_HEAD(&mep->peer_mep_list); INIT_DELAYED_WORK(&mep->ccm_tx_dwork, ccm_tx_work_expired); if (hlist_empty(&br->mep_list)) br_add_frame(br, &cfm_frame_type); hlist_add_tail_rcu(&mep->head, &br->mep_list); return 0; } static void mep_delete_implementation(struct net_bridge *br, struct br_cfm_mep *mep) { struct br_cfm_peer_mep *peer_mep; struct hlist_node *n_store; ASSERT_RTNL(); /* Empty and free peer MEP list */ hlist_for_each_entry_safe(peer_mep, n_store, &mep->peer_mep_list, head) { cancel_delayed_work_sync(&peer_mep->ccm_rx_dwork); hlist_del_rcu(&peer_mep->head); kfree_rcu(peer_mep, rcu); } cancel_delayed_work_sync(&mep->ccm_tx_dwork); RCU_INIT_POINTER(mep->b_port, NULL); hlist_del_rcu(&mep->head); kfree_rcu(mep, rcu); if (hlist_empty(&br->mep_list)) br_del_frame(br, &cfm_frame_type); } int br_cfm_mep_delete(struct net_bridge *br, const u32 instance, struct netlink_ext_ack *extack) { struct br_cfm_mep *mep; ASSERT_RTNL(); mep = br_mep_find(br, instance); if (!mep) { NL_SET_ERR_MSG_MOD(extack, "MEP instance does not exists"); return -ENOENT; } mep_delete_implementation(br, mep); return 0; } int br_cfm_mep_config_set(struct net_bridge *br, const u32 instance, const struct br_cfm_mep_config *const config, struct netlink_ext_ack *extack) { struct br_cfm_mep *mep; ASSERT_RTNL(); mep = br_mep_find(br, instance); if (!mep) { NL_SET_ERR_MSG_MOD(extack, "MEP instance does not exists"); return -ENOENT; } mep->config = *config; return 0; } int br_cfm_cc_config_set(struct net_bridge *br, const u32 instance, const struct br_cfm_cc_config *const config, struct netlink_ext_ack *extack) { struct br_cfm_peer_mep *peer_mep; struct br_cfm_mep *mep; ASSERT_RTNL(); mep = br_mep_find(br, instance); if (!mep) { NL_SET_ERR_MSG_MOD(extack, "MEP instance does not exists"); return -ENOENT; } /* Check for no change in configuration */ if (memcmp(config, &mep->cc_config, sizeof(*config)) == 0) return 0; if (config->enable && !mep->cc_config.enable) /* CC is enabled */ hlist_for_each_entry(peer_mep, &mep->peer_mep_list, head) cc_peer_enable(peer_mep); if (!config->enable && mep->cc_config.enable) /* CC is disabled */ hlist_for_each_entry(peer_mep, &mep->peer_mep_list, head) cc_peer_disable(peer_mep); mep->cc_config = *config; mep->ccm_rx_snumber = 0; mep->ccm_tx_snumber = 1; return 0; } int br_cfm_cc_peer_mep_add(struct net_bridge *br, const u32 instance, u32 mepid, struct netlink_ext_ack *extack) { struct br_cfm_peer_mep *peer_mep; struct br_cfm_mep *mep; ASSERT_RTNL(); mep = br_mep_find(br, instance); if (!mep) { NL_SET_ERR_MSG_MOD(extack, "MEP instance does not exists"); return -ENOENT; } peer_mep = br_peer_mep_find(mep, mepid); if (peer_mep) { NL_SET_ERR_MSG_MOD(extack, "Peer MEP-ID already exists"); return -EEXIST; } peer_mep = kzalloc(sizeof(*peer_mep), GFP_KERNEL); if (!peer_mep) return -ENOMEM; peer_mep->mepid = mepid; peer_mep->mep = mep; INIT_DELAYED_WORK(&peer_mep->ccm_rx_dwork, ccm_rx_work_expired); if (mep->cc_config.enable) cc_peer_enable(peer_mep); hlist_add_tail_rcu(&peer_mep->head, &mep->peer_mep_list); return 0; } int br_cfm_cc_peer_mep_remove(struct net_bridge *br, const u32 instance, u32 mepid, struct netlink_ext_ack *extack) { struct br_cfm_peer_mep *peer_mep; struct br_cfm_mep *mep; ASSERT_RTNL(); mep = br_mep_find(br, instance); if (!mep) { NL_SET_ERR_MSG_MOD(extack, "MEP instance does not exists"); return -ENOENT; } peer_mep = br_peer_mep_find(mep, mepid); if (!peer_mep) { NL_SET_ERR_MSG_MOD(extack, "Peer MEP-ID does not exists"); return -ENOENT; } cc_peer_disable(peer_mep); hlist_del_rcu(&peer_mep->head); kfree_rcu(peer_mep, rcu); return 0; } int br_cfm_cc_rdi_set(struct net_bridge *br, const u32 instance, const bool rdi, struct netlink_ext_ack *extack) { struct br_cfm_mep *mep; ASSERT_RTNL(); mep = br_mep_find(br, instance); if (!mep) { NL_SET_ERR_MSG_MOD(extack, "MEP instance does not exists"); return -ENOENT; } mep->rdi = rdi; return 0; } int br_cfm_cc_ccm_tx(struct net_bridge *br, const u32 instance, const struct br_cfm_cc_ccm_tx_info *const tx_info, struct netlink_ext_ack *extack) { struct br_cfm_mep *mep; ASSERT_RTNL(); mep = br_mep_find(br, instance); if (!mep) { NL_SET_ERR_MSG_MOD(extack, "MEP instance does not exists"); return -ENOENT; } if (memcmp(tx_info, &mep->cc_ccm_tx_info, sizeof(*tx_info)) == 0) { /* No change in tx_info. */ if (mep->cc_ccm_tx_info.period == 0) /* Transmission is not enabled - just return */ return 0; /* Transmission is ongoing, the end time is recalculated */ mep->ccm_tx_end = jiffies + usecs_to_jiffies(tx_info->period * 1000000); return 0; } if (tx_info->period == 0 && mep->cc_ccm_tx_info.period == 0) /* Some change in info and transmission is not ongoing */ goto save; if (tx_info->period != 0 && mep->cc_ccm_tx_info.period != 0) { /* Some change in info and transmission is ongoing * The end time is recalculated */ mep->ccm_tx_end = jiffies + usecs_to_jiffies(tx_info->period * 1000000); goto save; } if (tx_info->period == 0 && mep->cc_ccm_tx_info.period != 0) { cancel_delayed_work_sync(&mep->ccm_tx_dwork); goto save; } /* Start delayed work to transmit CCM frames. It is done with zero delay * to send first frame immediately */ mep->ccm_tx_end = jiffies + usecs_to_jiffies(tx_info->period * 1000000); queue_delayed_work(system_wq, &mep->ccm_tx_dwork, 0); save: mep->cc_ccm_tx_info = *tx_info; return 0; } int br_cfm_mep_count(struct net_bridge *br, u32 *count) { struct br_cfm_mep *mep; *count = 0; rcu_read_lock(); hlist_for_each_entry_rcu(mep, &br->mep_list, head) *count += 1; rcu_read_unlock(); return 0; } int br_cfm_peer_mep_count(struct net_bridge *br, u32 *count) { struct br_cfm_peer_mep *peer_mep; struct br_cfm_mep *mep; *count = 0; rcu_read_lock(); hlist_for_each_entry_rcu(mep, &br->mep_list, head) hlist_for_each_entry_rcu(peer_mep, &mep->peer_mep_list, head) *count += 1; rcu_read_unlock(); return 0; } bool br_cfm_created(struct net_bridge *br) { return !hlist_empty(&br->mep_list); } /* Deletes the CFM instances on a specific bridge port */ void br_cfm_port_del(struct net_bridge *br, struct net_bridge_port *port) { struct hlist_node *n_store; struct br_cfm_mep *mep; ASSERT_RTNL(); hlist_for_each_entry_safe(mep, n_store, &br->mep_list, head) if (mep->create.ifindex == port->dev->ifindex) mep_delete_implementation(br, mep); }
4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _CRYPTO_XTS_H #define _CRYPTO_XTS_H #include <crypto/b128ops.h> #include <crypto/internal/skcipher.h> #include <linux/fips.h> #define XTS_BLOCK_SIZE 16 static inline int xts_verify_key(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen) { /* * key consists of keys of equal size concatenated, therefore * the length must be even. */ if (keylen % 2) return -EINVAL; /* * In FIPS mode only a combined key length of either 256 or * 512 bits is allowed, c.f. FIPS 140-3 IG C.I. */ if (fips_enabled && keylen != 32 && keylen != 64) return -EINVAL; /* * Ensure that the AES and tweak key are not identical when * in FIPS mode or the FORBID_WEAK_KEYS flag is set. */ if ((fips_enabled || (crypto_skcipher_get_flags(tfm) & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS)) && !crypto_memneq(key, key + (keylen / 2), keylen / 2)) return -EINVAL; return 0; } #endif /* _CRYPTO_XTS_H */
3 3 3 3 3 1 1 1 1 1 1 68 68 68 68 68 68 1 26 2 2 37 37 2 37 2 37 37 25 12 36 9 2 26 26 9 26 3 2 3 3 3 3 3 3 36 37 3 3 3 1 1 2 69 69 1 68 67 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright 2017 Intel Deutschland GmbH * Copyright (C) 2019, 2022-2024 Intel Corporation */ #include <linux/kernel.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/slab.h> #include "rate.h" #include "ieee80211_i.h" #include "debugfs.h" struct rate_control_alg { struct list_head list; const struct rate_control_ops *ops; }; static LIST_HEAD(rate_ctrl_algs); static DEFINE_MUTEX(rate_ctrl_mutex); static char *ieee80211_default_rc_algo = CONFIG_MAC80211_RC_DEFAULT; module_param(ieee80211_default_rc_algo, charp, 0644); MODULE_PARM_DESC(ieee80211_default_rc_algo, "Default rate control algorithm for mac80211 to use"); void rate_control_rate_init(struct link_sta_info *link_sta) { struct sta_info *sta = link_sta->sta; struct ieee80211_local *local = sta->sdata->local; struct rate_control_ref *ref = sta->rate_ctrl; struct ieee80211_sta *ista = &sta->sta; void *priv_sta = sta->rate_ctrl_priv; struct ieee80211_supported_band *sband; struct ieee80211_chanctx_conf *chanctx_conf; ieee80211_sta_init_nss(link_sta); if (!ref) return; /* SW rate control isn't supported with MLO right now */ if (WARN_ON(ieee80211_vif_is_mld(&sta->sdata->vif))) return; rcu_read_lock(); chanctx_conf = rcu_dereference(sta->sdata->vif.bss_conf.chanctx_conf); if (WARN_ON(!chanctx_conf)) { rcu_read_unlock(); return; } sband = local->hw.wiphy->bands[chanctx_conf->def.chan->band]; /* TODO: check for minstrel_s1g ? */ if (sband->band == NL80211_BAND_S1GHZ) { ieee80211_s1g_sta_rate_init(sta); rcu_read_unlock(); return; } spin_lock_bh(&sta->rate_ctrl_lock); ref->ops->rate_init(ref->priv, sband, &chanctx_conf->def, ista, priv_sta); spin_unlock_bh(&sta->rate_ctrl_lock); rcu_read_unlock(); set_sta_flag(sta, WLAN_STA_RATE_CONTROL); } void rate_control_rate_init_all_links(struct sta_info *sta) { int link_id; for (link_id = 0; link_id < ARRAY_SIZE(sta->link); link_id++) { struct link_sta_info *link_sta; link_sta = sdata_dereference(sta->link[link_id], sta->sdata); if (!link_sta) continue; rate_control_rate_init(link_sta); } } void rate_control_tx_status(struct ieee80211_local *local, struct ieee80211_tx_status *st) { struct rate_control_ref *ref = local->rate_ctrl; struct sta_info *sta = container_of(st->sta, struct sta_info, sta); void *priv_sta = sta->rate_ctrl_priv; struct ieee80211_supported_band *sband; if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) return; sband = local->hw.wiphy->bands[st->info->band]; spin_lock_bh(&sta->rate_ctrl_lock); if (ref->ops->tx_status_ext) ref->ops->tx_status_ext(ref->priv, sband, priv_sta, st); else if (st->skb) ref->ops->tx_status(ref->priv, sband, st->sta, priv_sta, st->skb); else WARN_ON_ONCE(1); spin_unlock_bh(&sta->rate_ctrl_lock); } void rate_control_rate_update(struct ieee80211_local *local, struct ieee80211_supported_band *sband, struct link_sta_info *link_sta, u32 changed) { struct rate_control_ref *ref = local->rate_ctrl; struct sta_info *sta = link_sta->sta; struct ieee80211_sta *ista = &sta->sta; void *priv_sta = sta->rate_ctrl_priv; struct ieee80211_chanctx_conf *chanctx_conf; if (ref && ref->ops->rate_update) { rcu_read_lock(); chanctx_conf = rcu_dereference(sta->sdata->vif.bss_conf.chanctx_conf); if (WARN_ON(!chanctx_conf)) { rcu_read_unlock(); return; } spin_lock_bh(&sta->rate_ctrl_lock); ref->ops->rate_update(ref->priv, sband, &chanctx_conf->def, ista, priv_sta, changed); spin_unlock_bh(&sta->rate_ctrl_lock); rcu_read_unlock(); } if (sta->uploaded) drv_link_sta_rc_update(local, sta->sdata, link_sta->pub, changed); } int ieee80211_rate_control_register(const struct rate_control_ops *ops) { struct rate_control_alg *alg; if (!ops->name) return -EINVAL; mutex_lock(&rate_ctrl_mutex); list_for_each_entry(alg, &rate_ctrl_algs, list) { if (!strcmp(alg->ops->name, ops->name)) { /* don't register an algorithm twice */ WARN_ON(1); mutex_unlock(&rate_ctrl_mutex); return -EALREADY; } } alg = kzalloc(sizeof(*alg), GFP_KERNEL); if (alg == NULL) { mutex_unlock(&rate_ctrl_mutex); return -ENOMEM; } alg->ops = ops; list_add_tail(&alg->list, &rate_ctrl_algs); mutex_unlock(&rate_ctrl_mutex); return 0; } EXPORT_SYMBOL(ieee80211_rate_control_register); void ieee80211_rate_control_unregister(const struct rate_control_ops *ops) { struct rate_control_alg *alg; mutex_lock(&rate_ctrl_mutex); list_for_each_entry(alg, &rate_ctrl_algs, list) { if (alg->ops == ops) { list_del(&alg->list); kfree(alg); break; } } mutex_unlock(&rate_ctrl_mutex); } EXPORT_SYMBOL(ieee80211_rate_control_unregister); static const struct rate_control_ops * ieee80211_try_rate_control_ops_get(const char *name) { struct rate_control_alg *alg; const struct rate_control_ops *ops = NULL; if (!name) return NULL; mutex_lock(&rate_ctrl_mutex); list_for_each_entry(alg, &rate_ctrl_algs, list) { if (!strcmp(alg->ops->name, name)) { ops = alg->ops; break; } } mutex_unlock(&rate_ctrl_mutex); return ops; } /* Get the rate control algorithm. */ static const struct rate_control_ops * ieee80211_rate_control_ops_get(const char *name) { const struct rate_control_ops *ops; const char *alg_name; kernel_param_lock(THIS_MODULE); if (!name) alg_name = ieee80211_default_rc_algo; else alg_name = name; ops = ieee80211_try_rate_control_ops_get(alg_name); if (!ops && name) /* try default if specific alg requested but not found */ ops = ieee80211_try_rate_control_ops_get(ieee80211_default_rc_algo); /* Note: check for > 0 is intentional to avoid clang warning */ if (!ops && (strlen(CONFIG_MAC80211_RC_DEFAULT) > 0)) /* try built-in one if specific alg requested but not found */ ops = ieee80211_try_rate_control_ops_get(CONFIG_MAC80211_RC_DEFAULT); kernel_param_unlock(THIS_MODULE); return ops; } #ifdef CONFIG_MAC80211_DEBUGFS static ssize_t rcname_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { struct rate_control_ref *ref = file->private_data; int len = strlen(ref->ops->name); return simple_read_from_buffer(userbuf, count, ppos, ref->ops->name, len); } const struct debugfs_short_fops rcname_ops = { .read = rcname_read, .llseek = default_llseek, }; #endif static struct rate_control_ref * rate_control_alloc(const char *name, struct ieee80211_local *local) { struct rate_control_ref *ref; ref = kmalloc(sizeof(struct rate_control_ref), GFP_KERNEL); if (!ref) return NULL; ref->ops = ieee80211_rate_control_ops_get(name); if (!ref->ops) goto free; ref->priv = ref->ops->alloc(&local->hw); if (!ref->priv) goto free; return ref; free: kfree(ref); return NULL; } static void rate_control_free(struct ieee80211_local *local, struct rate_control_ref *ctrl_ref) { ctrl_ref->ops->free(ctrl_ref->priv); #ifdef CONFIG_MAC80211_DEBUGFS debugfs_remove_recursive(local->debugfs.rcdir); local->debugfs.rcdir = NULL; #endif kfree(ctrl_ref); } void ieee80211_check_rate_mask(struct ieee80211_link_data *link) { struct ieee80211_sub_if_data *sdata = link->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; u32 user_mask, basic_rates = link->conf->basic_rates; enum nl80211_band band; if (WARN_ON(!link->conf->chanreq.oper.chan)) return; band = link->conf->chanreq.oper.chan->band; if (band == NL80211_BAND_S1GHZ) { /* TODO */ return; } if (WARN_ON_ONCE(!basic_rates)) return; user_mask = sdata->rc_rateidx_mask[band]; sband = local->hw.wiphy->bands[band]; if (user_mask & basic_rates) return; sdata_dbg(sdata, "no overlap between basic rates (0x%x) and user mask (0x%x on band %d) - clearing the latter", basic_rates, user_mask, band); sdata->rc_rateidx_mask[band] = (1 << sband->n_bitrates) - 1; } static bool rc_no_data_or_no_ack_use_min(struct ieee80211_tx_rate_control *txrc) { struct sk_buff *skb = txrc->skb; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); return (info->flags & (IEEE80211_TX_CTL_NO_ACK | IEEE80211_TX_CTL_USE_MINRATE)) || !ieee80211_is_tx_data(skb); } static void rc_send_low_basicrate(struct ieee80211_tx_rate *rate, u32 basic_rates, struct ieee80211_supported_band *sband) { u8 i; if (sband->band == NL80211_BAND_S1GHZ) { /* TODO */ rate->flags |= IEEE80211_TX_RC_S1G_MCS; rate->idx = 0; return; } if (basic_rates == 0) return; /* assume basic rates unknown and accept rate */ if (rate->idx < 0) return; if (basic_rates & (1 << rate->idx)) return; /* selected rate is a basic rate */ for (i = rate->idx + 1; i <= sband->n_bitrates; i++) { if (basic_rates & (1 << i)) { rate->idx = i; return; } } /* could not find a basic rate; use original selection */ } static void __rate_control_send_low(struct ieee80211_hw *hw, struct ieee80211_supported_band *sband, struct ieee80211_sta *sta, struct ieee80211_tx_info *info, u32 rate_mask) { int i; u32 rate_flags = ieee80211_chandef_rate_flags(&hw->conf.chandef); if (sband->band == NL80211_BAND_S1GHZ) { info->control.rates[0].flags |= IEEE80211_TX_RC_S1G_MCS; info->control.rates[0].idx = 0; return; } if ((sband->band == NL80211_BAND_2GHZ) && (info->flags & IEEE80211_TX_CTL_NO_CCK_RATE)) rate_flags |= IEEE80211_RATE_ERP_G; info->control.rates[0].idx = 0; for (i = 0; i < sband->n_bitrates; i++) { if (!(rate_mask & BIT(i))) continue; if ((rate_flags & sband->bitrates[i].flags) != rate_flags) continue; if (!rate_supported(sta, sband->band, i)) continue; info->control.rates[0].idx = i; break; } WARN_ONCE(i == sband->n_bitrates, "no supported rates for sta %pM (0x%x, band %d) in rate_mask 0x%x with flags 0x%x\n", sta ? sta->addr : NULL, sta ? sta->deflink.supp_rates[sband->band] : -1, sband->band, rate_mask, rate_flags); info->control.rates[0].count = (info->flags & IEEE80211_TX_CTL_NO_ACK) ? 1 : hw->max_rate_tries; info->control.skip_table = 1; } static bool rate_control_send_low(struct ieee80211_sta *pubsta, struct ieee80211_tx_rate_control *txrc) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); struct ieee80211_supported_band *sband = txrc->sband; struct sta_info *sta; int mcast_rate; bool use_basicrate = false; if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) { __rate_control_send_low(txrc->hw, sband, pubsta, info, txrc->rate_idx_mask); if (!pubsta && txrc->bss) { mcast_rate = txrc->bss_conf->mcast_rate[sband->band]; if (mcast_rate > 0) { info->control.rates[0].idx = mcast_rate - 1; return true; } use_basicrate = true; } else if (pubsta) { sta = container_of(pubsta, struct sta_info, sta); if (ieee80211_vif_is_mesh(&sta->sdata->vif)) use_basicrate = true; } if (use_basicrate) rc_send_low_basicrate(&info->control.rates[0], txrc->bss_conf->basic_rates, sband); return true; } return false; } static bool rate_idx_match_legacy_mask(s8 *rate_idx, int n_bitrates, u32 mask) { int j; /* See whether the selected rate or anything below it is allowed. */ for (j = *rate_idx; j >= 0; j--) { if (mask & (1 << j)) { /* Okay, found a suitable rate. Use it. */ *rate_idx = j; return true; } } /* Try to find a higher rate that would be allowed */ for (j = *rate_idx + 1; j < n_bitrates; j++) { if (mask & (1 << j)) { /* Okay, found a suitable rate. Use it. */ *rate_idx = j; return true; } } return false; } static bool rate_idx_match_mcs_mask(s8 *rate_idx, u8 *mcs_mask) { int i, j; int ridx, rbit; ridx = *rate_idx / 8; rbit = *rate_idx % 8; /* sanity check */ if (ridx < 0 || ridx >= IEEE80211_HT_MCS_MASK_LEN) return false; /* See whether the selected rate or anything below it is allowed. */ for (i = ridx; i >= 0; i--) { for (j = rbit; j >= 0; j--) if (mcs_mask[i] & BIT(j)) { *rate_idx = i * 8 + j; return true; } rbit = 7; } /* Try to find a higher rate that would be allowed */ ridx = (*rate_idx + 1) / 8; rbit = (*rate_idx + 1) % 8; for (i = ridx; i < IEEE80211_HT_MCS_MASK_LEN; i++) { for (j = rbit; j < 8; j++) if (mcs_mask[i] & BIT(j)) { *rate_idx = i * 8 + j; return true; } rbit = 0; } return false; } static bool rate_idx_match_vht_mcs_mask(s8 *rate_idx, u16 *vht_mask) { int i, j; int ridx, rbit; ridx = *rate_idx >> 4; rbit = *rate_idx & 0xf; if (ridx < 0 || ridx >= NL80211_VHT_NSS_MAX) return false; /* See whether the selected rate or anything below it is allowed. */ for (i = ridx; i >= 0; i--) { for (j = rbit; j >= 0; j--) { if (vht_mask[i] & BIT(j)) { *rate_idx = (i << 4) | j; return true; } } rbit = 15; } /* Try to find a higher rate that would be allowed */ ridx = (*rate_idx + 1) >> 4; rbit = (*rate_idx + 1) & 0xf; for (i = ridx; i < NL80211_VHT_NSS_MAX; i++) { for (j = rbit; j < 16; j++) { if (vht_mask[i] & BIT(j)) { *rate_idx = (i << 4) | j; return true; } } rbit = 0; } return false; } static void rate_idx_match_mask(s8 *rate_idx, u16 *rate_flags, struct ieee80211_supported_band *sband, enum nl80211_chan_width chan_width, u32 mask, u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN], u16 vht_mask[NL80211_VHT_NSS_MAX]) { if (*rate_flags & IEEE80211_TX_RC_VHT_MCS) { /* handle VHT rates */ if (rate_idx_match_vht_mcs_mask(rate_idx, vht_mask)) return; *rate_idx = 0; /* keep protection flags */ *rate_flags &= (IEEE80211_TX_RC_USE_RTS_CTS | IEEE80211_TX_RC_USE_CTS_PROTECT | IEEE80211_TX_RC_USE_SHORT_PREAMBLE); *rate_flags |= IEEE80211_TX_RC_MCS; if (chan_width == NL80211_CHAN_WIDTH_40) *rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; if (rate_idx_match_mcs_mask(rate_idx, mcs_mask)) return; /* also try the legacy rates. */ *rate_flags &= ~(IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_40_MHZ_WIDTH); if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates, mask)) return; } else if (*rate_flags & IEEE80211_TX_RC_MCS) { /* handle HT rates */ if (rate_idx_match_mcs_mask(rate_idx, mcs_mask)) return; /* also try the legacy rates. */ *rate_idx = 0; /* keep protection flags */ *rate_flags &= (IEEE80211_TX_RC_USE_RTS_CTS | IEEE80211_TX_RC_USE_CTS_PROTECT | IEEE80211_TX_RC_USE_SHORT_PREAMBLE); if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates, mask)) return; } else { /* handle legacy rates */ if (rate_idx_match_legacy_mask(rate_idx, sband->n_bitrates, mask)) return; /* if HT BSS, and we handle a data frame, also try HT rates */ switch (chan_width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: return; default: break; } *rate_idx = 0; /* keep protection flags */ *rate_flags &= (IEEE80211_TX_RC_USE_RTS_CTS | IEEE80211_TX_RC_USE_CTS_PROTECT | IEEE80211_TX_RC_USE_SHORT_PREAMBLE); *rate_flags |= IEEE80211_TX_RC_MCS; if (chan_width == NL80211_CHAN_WIDTH_40) *rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH; if (rate_idx_match_mcs_mask(rate_idx, mcs_mask)) return; } /* * Uh.. No suitable rate exists. This should not really happen with * sane TX rate mask configurations. However, should someone manage to * configure supported rates and TX rate mask in incompatible way, * allow the frame to be transmitted with whatever the rate control * selected. */ } static void rate_fixup_ratelist(struct ieee80211_vif *vif, struct ieee80211_supported_band *sband, struct ieee80211_tx_info *info, struct ieee80211_tx_rate *rates, int max_rates) { struct ieee80211_rate *rate; bool inval = false; int i; /* * Set up the RTS/CTS rate as the fastest basic rate * that is not faster than the data rate unless there * is no basic rate slower than the data rate, in which * case we pick the slowest basic rate * * XXX: Should this check all retry rates? */ if (!(rates[0].flags & (IEEE80211_TX_RC_MCS | IEEE80211_TX_RC_VHT_MCS))) { u32 basic_rates = vif->bss_conf.basic_rates; s8 baserate = basic_rates ? ffs(basic_rates) - 1 : 0; rate = &sband->bitrates[rates[0].idx]; for (i = 0; i < sband->n_bitrates; i++) { /* must be a basic rate */ if (!(basic_rates & BIT(i))) continue; /* must not be faster than the data rate */ if (sband->bitrates[i].bitrate > rate->bitrate) continue; /* maximum */ if (sband->bitrates[baserate].bitrate < sband->bitrates[i].bitrate) baserate = i; } info->control.rts_cts_rate_idx = baserate; } for (i = 0; i < max_rates; i++) { /* * make sure there's no valid rate following * an invalid one, just in case drivers don't * take the API seriously to stop at -1. */ if (inval) { rates[i].idx = -1; continue; } if (rates[i].idx < 0) { inval = true; continue; } /* * For now assume MCS is already set up correctly, this * needs to be fixed. */ if (rates[i].flags & IEEE80211_TX_RC_MCS) { WARN_ON(rates[i].idx > 76); if (!(rates[i].flags & IEEE80211_TX_RC_USE_RTS_CTS) && info->control.use_cts_prot) rates[i].flags |= IEEE80211_TX_RC_USE_CTS_PROTECT; continue; } if (rates[i].flags & IEEE80211_TX_RC_VHT_MCS) { WARN_ON(ieee80211_rate_get_vht_mcs(&rates[i]) > 9); continue; } /* set up RTS protection if desired */ if (info->control.use_rts) { rates[i].flags |= IEEE80211_TX_RC_USE_RTS_CTS; info->control.use_cts_prot = false; } /* RC is busted */ if (WARN_ON_ONCE(rates[i].idx >= sband->n_bitrates)) { rates[i].idx = -1; continue; } rate = &sband->bitrates[rates[i].idx]; /* set up short preamble */ if (info->control.short_preamble && rate->flags & IEEE80211_RATE_SHORT_PREAMBLE) rates[i].flags |= IEEE80211_TX_RC_USE_SHORT_PREAMBLE; /* set up G protection */ if (!(rates[i].flags & IEEE80211_TX_RC_USE_RTS_CTS) && info->control.use_cts_prot && rate->flags & IEEE80211_RATE_ERP_G) rates[i].flags |= IEEE80211_TX_RC_USE_CTS_PROTECT; } } static void rate_control_fill_sta_table(struct ieee80211_sta *sta, struct ieee80211_tx_info *info, struct ieee80211_tx_rate *rates, int max_rates) { struct ieee80211_sta_rates *ratetbl = NULL; int i; if (sta && !info->control.skip_table) ratetbl = rcu_dereference(sta->rates); /* Fill remaining rate slots with data from the sta rate table. */ max_rates = min_t(int, max_rates, IEEE80211_TX_RATE_TABLE_SIZE); for (i = 0; i < max_rates; i++) { if (i < ARRAY_SIZE(info->control.rates) && info->control.rates[i].idx >= 0 && info->control.rates[i].count) { if (rates != info->control.rates) rates[i] = info->control.rates[i]; } else if (ratetbl) { rates[i].idx = ratetbl->rate[i].idx; rates[i].flags = ratetbl->rate[i].flags; if (info->control.use_rts) rates[i].count = ratetbl->rate[i].count_rts; else if (info->control.use_cts_prot) rates[i].count = ratetbl->rate[i].count_cts; else rates[i].count = ratetbl->rate[i].count; } else { rates[i].idx = -1; rates[i].count = 0; } if (rates[i].idx < 0 || !rates[i].count) break; } } static bool rate_control_cap_mask(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, struct ieee80211_sta *sta, u32 *mask, u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN], u16 vht_mask[NL80211_VHT_NSS_MAX]) { u32 i, flags; *mask = sdata->rc_rateidx_mask[sband->band]; flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chanreq.oper); for (i = 0; i < sband->n_bitrates; i++) { if ((flags & sband->bitrates[i].flags) != flags) *mask &= ~BIT(i); } if (*mask == (1 << sband->n_bitrates) - 1 && !sdata->rc_has_mcs_mask[sband->band] && !sdata->rc_has_vht_mcs_mask[sband->band]) return false; if (sdata->rc_has_mcs_mask[sband->band]) memcpy(mcs_mask, sdata->rc_rateidx_mcs_mask[sband->band], IEEE80211_HT_MCS_MASK_LEN); else memset(mcs_mask, 0xff, IEEE80211_HT_MCS_MASK_LEN); if (sdata->rc_has_vht_mcs_mask[sband->band]) memcpy(vht_mask, sdata->rc_rateidx_vht_mcs_mask[sband->band], sizeof(u16) * NL80211_VHT_NSS_MAX); else memset(vht_mask, 0xff, sizeof(u16) * NL80211_VHT_NSS_MAX); if (sta) { __le16 sta_vht_cap; u16 sta_vht_mask[NL80211_VHT_NSS_MAX]; /* Filter out rates that the STA does not support */ *mask &= sta->deflink.supp_rates[sband->band]; for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) mcs_mask[i] &= sta->deflink.ht_cap.mcs.rx_mask[i]; sta_vht_cap = sta->deflink.vht_cap.vht_mcs.rx_mcs_map; ieee80211_get_vht_mask_from_cap(sta_vht_cap, sta_vht_mask); for (i = 0; i < NL80211_VHT_NSS_MAX; i++) vht_mask[i] &= sta_vht_mask[i]; } return true; } static void rate_control_apply_mask_ratetbl(struct sta_info *sta, struct ieee80211_supported_band *sband, struct ieee80211_sta_rates *rates) { int i; u32 mask; u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]; u16 vht_mask[NL80211_VHT_NSS_MAX]; enum nl80211_chan_width chan_width; if (!rate_control_cap_mask(sta->sdata, sband, &sta->sta, &mask, mcs_mask, vht_mask)) return; chan_width = sta->sdata->vif.bss_conf.chanreq.oper.width; for (i = 0; i < IEEE80211_TX_RATE_TABLE_SIZE; i++) { if (rates->rate[i].idx < 0) break; rate_idx_match_mask(&rates->rate[i].idx, &rates->rate[i].flags, sband, chan_width, mask, mcs_mask, vht_mask); } } static void rate_control_apply_mask(struct ieee80211_sub_if_data *sdata, struct ieee80211_sta *sta, struct ieee80211_supported_band *sband, struct ieee80211_tx_rate *rates, int max_rates) { enum nl80211_chan_width chan_width; u8 mcs_mask[IEEE80211_HT_MCS_MASK_LEN]; u32 mask; u16 rate_flags, vht_mask[NL80211_VHT_NSS_MAX]; int i; /* * Try to enforce the rateidx mask the user wanted. skip this if the * default mask (allow all rates) is used to save some processing for * the common case. */ if (!rate_control_cap_mask(sdata, sband, sta, &mask, mcs_mask, vht_mask)) return; /* * Make sure the rate index selected for each TX rate is * included in the configured mask and change the rate indexes * if needed. */ chan_width = sdata->vif.bss_conf.chanreq.oper.width; for (i = 0; i < max_rates; i++) { /* Skip invalid rates */ if (rates[i].idx < 0) break; rate_flags = rates[i].flags; rate_idx_match_mask(&rates[i].idx, &rate_flags, sband, chan_width, mask, mcs_mask, vht_mask); rates[i].flags = rate_flags; } } void ieee80211_get_tx_rates(struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct sk_buff *skb, struct ieee80211_tx_rate *dest, int max_rates) { struct ieee80211_sub_if_data *sdata; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_supported_band *sband; u32 mask = ~0; rate_control_fill_sta_table(sta, info, dest, max_rates); if (!vif) return; sdata = vif_to_sdata(vif); sband = sdata->local->hw.wiphy->bands[info->band]; if (ieee80211_is_tx_data(skb)) rate_control_apply_mask(sdata, sta, sband, dest, max_rates); if (!(info->control.flags & IEEE80211_TX_CTRL_DONT_USE_RATE_MASK)) mask = sdata->rc_rateidx_mask[info->band]; if (dest[0].idx < 0) __rate_control_send_low(&sdata->local->hw, sband, sta, info, mask); if (sta) rate_fixup_ratelist(vif, sband, info, dest, max_rates); } EXPORT_SYMBOL(ieee80211_get_tx_rates); void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_tx_rate_control *txrc) { struct rate_control_ref *ref = sdata->local->rate_ctrl; void *priv_sta = NULL; struct ieee80211_sta *ista = NULL; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); int i; for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { info->control.rates[i].idx = -1; info->control.rates[i].flags = 0; info->control.rates[i].count = 0; } if (rate_control_send_low(sta ? &sta->sta : NULL, txrc)) return; if (ieee80211_hw_check(&sdata->local->hw, HAS_RATE_CONTROL)) return; if (sta && test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) { ista = &sta->sta; priv_sta = sta->rate_ctrl_priv; } if (ista) { spin_lock_bh(&sta->rate_ctrl_lock); ref->ops->get_rate(ref->priv, ista, priv_sta, txrc); spin_unlock_bh(&sta->rate_ctrl_lock); } else { rate_control_send_low(NULL, txrc); } if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_RC_TABLE)) return; ieee80211_get_tx_rates(&sdata->vif, ista, txrc->skb, info->control.rates, ARRAY_SIZE(info->control.rates)); } int rate_control_set_rates(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, struct ieee80211_sta_rates *rates) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_sta_rates *old; struct ieee80211_supported_band *sband; sband = ieee80211_get_sband(sta->sdata); if (!sband) return -EINVAL; rate_control_apply_mask_ratetbl(sta, sband, rates); /* * mac80211 guarantees that this function will not be called * concurrently, so the following RCU access is safe, even without * extra locking. This can not be checked easily, so we just set * the condition to true. */ old = rcu_dereference_protected(pubsta->rates, true); rcu_assign_pointer(pubsta->rates, rates); if (old) kfree_rcu(old, rcu_head); if (sta->uploaded) drv_sta_rate_tbl_update(hw_to_local(hw), sta->sdata, pubsta); ieee80211_sta_set_expected_throughput(pubsta, sta_get_expected_throughput(sta)); return 0; } EXPORT_SYMBOL(rate_control_set_rates); int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local, const char *name) { struct rate_control_ref *ref; ASSERT_RTNL(); if (local->open_count) return -EBUSY; if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) { if (WARN_ON(!local->ops->set_rts_threshold)) return -EINVAL; return 0; } ref = rate_control_alloc(name, local); if (!ref) { wiphy_warn(local->hw.wiphy, "Failed to select rate control algorithm\n"); return -ENOENT; } WARN_ON(local->rate_ctrl); local->rate_ctrl = ref; wiphy_debug(local->hw.wiphy, "Selected rate control algorithm '%s'\n", ref->ops->name); return 0; } void rate_control_deinitialize(struct ieee80211_local *local) { struct rate_control_ref *ref; ref = local->rate_ctrl; if (!ref) return; local->rate_ctrl = NULL; rate_control_free(local, ref); }
6720 3963 2262 2577 5289 4967 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM timer #if !defined(_TRACE_TIMER_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TIMER_H #include <linux/tracepoint.h> #include <linux/hrtimer.h> #include <linux/timer.h> DECLARE_EVENT_CLASS(timer_class, TP_PROTO(struct timer_list *timer), TP_ARGS(timer), TP_STRUCT__entry( __field( void *, timer ) ), TP_fast_assign( __entry->timer = timer; ), TP_printk("timer=%p", __entry->timer) ); /** * timer_init - called when the timer is initialized * @timer: pointer to struct timer_list */ DEFINE_EVENT(timer_class, timer_init, TP_PROTO(struct timer_list *timer), TP_ARGS(timer) ); #define decode_timer_flags(flags) \ __print_flags(flags, "|", \ { TIMER_MIGRATING, "M" }, \ { TIMER_DEFERRABLE, "D" }, \ { TIMER_PINNED, "P" }, \ { TIMER_IRQSAFE, "I" }) /** * timer_start - called when the timer is started * @timer: pointer to struct timer_list * @bucket_expiry: the bucket expiry time */ TRACE_EVENT(timer_start, TP_PROTO(struct timer_list *timer, unsigned long bucket_expiry), TP_ARGS(timer, bucket_expiry), TP_STRUCT__entry( __field( void *, timer ) __field( void *, function ) __field( unsigned long, expires ) __field( unsigned long, bucket_expiry ) __field( unsigned long, now ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->timer = timer; __entry->function = timer->function; __entry->expires = timer->expires; __entry->bucket_expiry = bucket_expiry; __entry->now = jiffies; __entry->flags = timer->flags; ), TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] bucket_expiry=%lu cpu=%u idx=%u flags=%s", __entry->timer, __entry->function, __entry->expires, (long)__entry->expires - __entry->now, __entry->bucket_expiry, __entry->flags & TIMER_CPUMASK, __entry->flags >> TIMER_ARRAYSHIFT, decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK)) ); /** * timer_expire_entry - called immediately before the timer callback * @timer: pointer to struct timer_list * @baseclk: value of timer_base::clk when timer expires * * Allows to determine the timer latency. */ TRACE_EVENT(timer_expire_entry, TP_PROTO(struct timer_list *timer, unsigned long baseclk), TP_ARGS(timer, baseclk), TP_STRUCT__entry( __field( void *, timer ) __field( unsigned long, now ) __field( void *, function) __field( unsigned long, baseclk ) ), TP_fast_assign( __entry->timer = timer; __entry->now = jiffies; __entry->function = timer->function; __entry->baseclk = baseclk; ), TP_printk("timer=%p function=%ps now=%lu baseclk=%lu", __entry->timer, __entry->function, __entry->now, __entry->baseclk) ); /** * timer_expire_exit - called immediately after the timer callback returns * @timer: pointer to struct timer_list * * When used in combination with the timer_expire_entry tracepoint we can * determine the runtime of the timer callback function. * * NOTE: Do NOT dereference timer in TP_fast_assign. The pointer might * be invalid. We solely track the pointer. */ DEFINE_EVENT(timer_class, timer_expire_exit, TP_PROTO(struct timer_list *timer), TP_ARGS(timer) ); /** * timer_cancel - called when the timer is canceled * @timer: pointer to struct timer_list */ DEFINE_EVENT(timer_class, timer_cancel, TP_PROTO(struct timer_list *timer), TP_ARGS(timer) ); TRACE_EVENT(timer_base_idle, TP_PROTO(bool is_idle, unsigned int cpu), TP_ARGS(is_idle, cpu), TP_STRUCT__entry( __field( bool, is_idle ) __field( unsigned int, cpu ) ), TP_fast_assign( __entry->is_idle = is_idle; __entry->cpu = cpu; ), TP_printk("is_idle=%d cpu=%d", __entry->is_idle, __entry->cpu) ); #define decode_clockid(type) \ __print_symbolic(type, \ { CLOCK_REALTIME, "CLOCK_REALTIME" }, \ { CLOCK_MONOTONIC, "CLOCK_MONOTONIC" }, \ { CLOCK_BOOTTIME, "CLOCK_BOOTTIME" }, \ { CLOCK_TAI, "CLOCK_TAI" }) #define decode_hrtimer_mode(mode) \ __print_symbolic(mode, \ { HRTIMER_MODE_ABS, "ABS" }, \ { HRTIMER_MODE_REL, "REL" }, \ { HRTIMER_MODE_ABS_PINNED, "ABS|PINNED" }, \ { HRTIMER_MODE_REL_PINNED, "REL|PINNED" }, \ { HRTIMER_MODE_ABS_SOFT, "ABS|SOFT" }, \ { HRTIMER_MODE_REL_SOFT, "REL|SOFT" }, \ { HRTIMER_MODE_ABS_PINNED_SOFT, "ABS|PINNED|SOFT" }, \ { HRTIMER_MODE_REL_PINNED_SOFT, "REL|PINNED|SOFT" }, \ { HRTIMER_MODE_ABS_HARD, "ABS|HARD" }, \ { HRTIMER_MODE_REL_HARD, "REL|HARD" }, \ { HRTIMER_MODE_ABS_PINNED_HARD, "ABS|PINNED|HARD" }, \ { HRTIMER_MODE_REL_PINNED_HARD, "REL|PINNED|HARD" }) /** * hrtimer_init - called when the hrtimer is initialized * @hrtimer: pointer to struct hrtimer * @clockid: the hrtimers clock * @mode: the hrtimers mode */ TRACE_EVENT(hrtimer_init, TP_PROTO(struct hrtimer *hrtimer, clockid_t clockid, enum hrtimer_mode mode), TP_ARGS(hrtimer, clockid, mode), TP_STRUCT__entry( __field( void *, hrtimer ) __field( clockid_t, clockid ) __field( enum hrtimer_mode, mode ) ), TP_fast_assign( __entry->hrtimer = hrtimer; __entry->clockid = clockid; __entry->mode = mode; ), TP_printk("hrtimer=%p clockid=%s mode=%s", __entry->hrtimer, decode_clockid(__entry->clockid), decode_hrtimer_mode(__entry->mode)) ); /** * hrtimer_start - called when the hrtimer is started * @hrtimer: pointer to struct hrtimer * @mode: the hrtimers mode */ TRACE_EVENT(hrtimer_start, TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode), TP_ARGS(hrtimer, mode), TP_STRUCT__entry( __field( void *, hrtimer ) __field( void *, function ) __field( s64, expires ) __field( s64, softexpires ) __field( enum hrtimer_mode, mode ) ), TP_fast_assign( __entry->hrtimer = hrtimer; __entry->function = hrtimer->function; __entry->expires = hrtimer_get_expires(hrtimer); __entry->softexpires = hrtimer_get_softexpires(hrtimer); __entry->mode = mode; ), TP_printk("hrtimer=%p function=%ps expires=%llu softexpires=%llu " "mode=%s", __entry->hrtimer, __entry->function, (unsigned long long) __entry->expires, (unsigned long long) __entry->softexpires, decode_hrtimer_mode(__entry->mode)) ); /** * hrtimer_expire_entry - called immediately before the hrtimer callback * @hrtimer: pointer to struct hrtimer * @now: pointer to variable which contains current time of the * timers base. * * Allows to determine the timer latency. */ TRACE_EVENT(hrtimer_expire_entry, TP_PROTO(struct hrtimer *hrtimer, ktime_t *now), TP_ARGS(hrtimer, now), TP_STRUCT__entry( __field( void *, hrtimer ) __field( s64, now ) __field( void *, function) ), TP_fast_assign( __entry->hrtimer = hrtimer; __entry->now = *now; __entry->function = hrtimer->function; ), TP_printk("hrtimer=%p function=%ps now=%llu", __entry->hrtimer, __entry->function, (unsigned long long) __entry->now) ); DECLARE_EVENT_CLASS(hrtimer_class, TP_PROTO(struct hrtimer *hrtimer), TP_ARGS(hrtimer), TP_STRUCT__entry( __field( void *, hrtimer ) ), TP_fast_assign( __entry->hrtimer = hrtimer; ), TP_printk("hrtimer=%p", __entry->hrtimer) ); /** * hrtimer_expire_exit - called immediately after the hrtimer callback returns * @hrtimer: pointer to struct hrtimer * * When used in combination with the hrtimer_expire_entry tracepoint we can * determine the runtime of the callback function. */ DEFINE_EVENT(hrtimer_class, hrtimer_expire_exit, TP_PROTO(struct hrtimer *hrtimer), TP_ARGS(hrtimer) ); /** * hrtimer_cancel - called when the hrtimer is canceled * @hrtimer: pointer to struct hrtimer */ DEFINE_EVENT(hrtimer_class, hrtimer_cancel, TP_PROTO(struct hrtimer *hrtimer), TP_ARGS(hrtimer) ); /** * itimer_state - called when itimer is started or canceled * @which: name of the interval timer * @value: the itimers value, itimer is canceled if value->it_value is * zero, otherwise it is started * @expires: the itimers expiry time */ TRACE_EVENT(itimer_state, TP_PROTO(int which, const struct itimerspec64 *const value, unsigned long long expires), TP_ARGS(which, value, expires), TP_STRUCT__entry( __field( int, which ) __field( unsigned long long, expires ) __field( long, value_sec ) __field( long, value_nsec ) __field( long, interval_sec ) __field( long, interval_nsec ) ), TP_fast_assign( __entry->which = which; __entry->expires = expires; __entry->value_sec = value->it_value.tv_sec; __entry->value_nsec = value->it_value.tv_nsec; __entry->interval_sec = value->it_interval.tv_sec; __entry->interval_nsec = value->it_interval.tv_nsec; ), TP_printk("which=%d expires=%llu it_value=%ld.%06ld it_interval=%ld.%06ld", __entry->which, __entry->expires, __entry->value_sec, __entry->value_nsec / NSEC_PER_USEC, __entry->interval_sec, __entry->interval_nsec / NSEC_PER_USEC) ); /** * itimer_expire - called when itimer expires * @which: type of the interval timer * @pid: pid of the process which owns the timer * @now: current time, used to calculate the latency of itimer */ TRACE_EVENT(itimer_expire, TP_PROTO(int which, struct pid *pid, unsigned long long now), TP_ARGS(which, pid, now), TP_STRUCT__entry( __field( int , which ) __field( pid_t, pid ) __field( unsigned long long, now ) ), TP_fast_assign( __entry->which = which; __entry->now = now; __entry->pid = pid_nr(pid); ), TP_printk("which=%d pid=%d now=%llu", __entry->which, (int) __entry->pid, __entry->now) ); #ifdef CONFIG_NO_HZ_COMMON #define TICK_DEP_NAMES \ tick_dep_mask_name(NONE) \ tick_dep_name(POSIX_TIMER) \ tick_dep_name(PERF_EVENTS) \ tick_dep_name(SCHED) \ tick_dep_name(CLOCK_UNSTABLE) \ tick_dep_name(RCU) \ tick_dep_name_end(RCU_EXP) #undef tick_dep_name #undef tick_dep_mask_name #undef tick_dep_name_end /* The MASK will convert to their bits and they need to be processed too */ #define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \ TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); #define tick_dep_name_end(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \ TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); /* NONE only has a mask defined for it */ #define tick_dep_mask_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); TICK_DEP_NAMES #undef tick_dep_name #undef tick_dep_mask_name #undef tick_dep_name_end #define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep }, #define tick_dep_mask_name(sdep) { TICK_DEP_MASK_##sdep, #sdep }, #define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep } #define show_tick_dep_name(val) \ __print_symbolic(val, TICK_DEP_NAMES) TRACE_EVENT(tick_stop, TP_PROTO(int success, int dependency), TP_ARGS(success, dependency), TP_STRUCT__entry( __field( int , success ) __field( int , dependency ) ), TP_fast_assign( __entry->success = success; __entry->dependency = dependency; ), TP_printk("success=%d dependency=%s", __entry->success, \ show_tick_dep_name(__entry->dependency)) ); #endif #endif /* _TRACE_TIMER_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (c) 2005 Andrea Bittau <a.bittau@cs.ucl.ac.uk> */ #ifndef _DCCP_CCID2_H_ #define _DCCP_CCID2_H_ #include <linux/timer.h> #include <linux/types.h> #include "../ccid.h" #include "../dccp.h" /* * CCID-2 timestamping faces the same issues as TCP timestamping. * Hence we reuse/share as much of the code as possible. */ #define ccid2_jiffies32 ((u32)jiffies) /* NUMDUPACK parameter from RFC 4341, p. 6 */ #define NUMDUPACK 3 struct ccid2_seq { u64 ccid2s_seq; u32 ccid2s_sent; int ccid2s_acked; struct ccid2_seq *ccid2s_prev; struct ccid2_seq *ccid2s_next; }; #define CCID2_SEQBUF_LEN 1024 #define CCID2_SEQBUF_MAX 128 /* * Multiple of congestion window to keep the sequence window at * (RFC 4340 7.5.2) */ #define CCID2_WIN_CHANGE_FACTOR 5 /** * struct ccid2_hc_tx_sock - CCID2 TX half connection * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465) * @tx_srtt: smoothed RTT estimate, scaled by 2^3 * @tx_mdev: smoothed RTT variation, scaled by 2^2 * @tx_mdev_max: maximum of @mdev during one flight * @tx_rttvar: moving average/maximum of @mdev_max * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) * @tx_rtt_seq: to decay RTTVAR at most once per flight * @tx_cwnd_used: actually used cwnd, W_used of RFC 2861 * @tx_expected_wnd: moving average of @tx_cwnd_used * @tx_cwnd_stamp: to track idle periods in CWV * @tx_lsndtime: last time (in jiffies) a data packet was sent * @tx_rpseq: last consecutive seqno * @tx_rpdupack: dupacks since rpseq * @tx_av_chunks: list of Ack Vectors received on current skb */ struct ccid2_hc_tx_sock { u32 tx_cwnd; u32 tx_ssthresh; u32 tx_pipe; u32 tx_packets_acked; struct ccid2_seq *tx_seqbuf[CCID2_SEQBUF_MAX]; int tx_seqbufc; struct ccid2_seq *tx_seqh; struct ccid2_seq *tx_seqt; /* RTT measurement: variables/principles are the same as in TCP */ u32 tx_srtt, tx_mdev, tx_mdev_max, tx_rttvar, tx_rto; u64 tx_rtt_seq:48; struct timer_list tx_rtotimer; struct sock *sk; /* Congestion Window validation (optional, RFC 2861) */ u32 tx_cwnd_used, tx_expected_wnd, tx_cwnd_stamp, tx_lsndtime; u64 tx_rpseq; int tx_rpdupack; u32 tx_last_cong; u64 tx_high_ack; struct list_head tx_av_chunks; }; static inline bool ccid2_cwnd_network_limited(struct ccid2_hc_tx_sock *hc) { return hc->tx_pipe >= hc->tx_cwnd; } /* * Convert RFC 3390 larger initial window into an equivalent number of packets. * This is based on the numbers specified in RFC 5681, 3.1. */ static inline u32 rfc3390_bytes_to_packets(const u32 smss) { return smss <= 1095 ? 4 : (smss > 2190 ? 2 : 3); } /** * struct ccid2_hc_rx_sock - Receiving end of CCID-2 half-connection * @rx_num_data_pkts: number of data packets received since last feedback */ struct ccid2_hc_rx_sock { u32 rx_num_data_pkts; }; static inline struct ccid2_hc_tx_sock *ccid2_hc_tx_sk(const struct sock *sk) { return ccid_priv(dccp_sk(sk)->dccps_hc_tx_ccid); } static inline struct ccid2_hc_rx_sock *ccid2_hc_rx_sk(const struct sock *sk) { return ccid_priv(dccp_sk(sk)->dccps_hc_rx_ccid); } #endif /* _DCCP_CCID2_H_ */
136 136 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #ifndef _CORE_PRIV_H #define _CORE_PRIV_H #include <linux/list.h> #include <linux/spinlock.h> #include <linux/cgroup_rdma.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <rdma/ib_verbs.h> #include <rdma/opa_addr.h> #include <rdma/ib_mad.h> #include <rdma/restrack.h> #include "mad_priv.h" #include "restrack.h" /* Total number of ports combined across all struct ib_devices's */ #define RDMA_MAX_PORTS 8192 struct pkey_index_qp_list { struct list_head pkey_index_list; u16 pkey_index; /* Lock to hold while iterating the qp_list. */ spinlock_t qp_list_lock; struct list_head qp_list; }; /** * struct rdma_dev_net - rdma net namespace metadata for a net * @nl_sock: Pointer to netlink socket * @net: Pointer to owner net namespace * @id: xarray id to identify the net namespace. */ struct rdma_dev_net { struct sock *nl_sock; possible_net_t net; u32 id; }; extern const struct attribute_group ib_dev_attr_group; extern bool ib_devices_shared_netns; extern unsigned int rdma_dev_net_id; static inline struct rdma_dev_net *rdma_net_to_dev_net(struct net *net) { return net_generic(net, rdma_dev_net_id); } int ib_device_rename(struct ib_device *ibdev, const char *name); int ib_device_set_dim(struct ib_device *ibdev, u8 use_dim); typedef void (*roce_netdev_callback)(struct ib_device *device, u32 port, struct net_device *idev, void *cookie); typedef bool (*roce_netdev_filter)(struct ib_device *device, u32 port, struct net_device *idev, void *cookie); struct net_device *ib_device_get_netdev(struct ib_device *ib_dev, u32 port); void ib_enum_roce_netdev(struct ib_device *ib_dev, roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie); void ib_enum_all_roce_netdevs(roce_netdev_filter filter, void *filter_cookie, roce_netdev_callback cb, void *cookie); typedef int (*nldev_callback)(struct ib_device *device, struct sk_buff *skb, struct netlink_callback *cb, unsigned int idx); int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb, struct netlink_callback *cb); struct ib_client_nl_info { struct sk_buff *nl_msg; struct device *cdev; u32 port; u64 abi; }; int ib_get_client_nl_info(struct ib_device *ibdev, const char *client_name, struct ib_client_nl_info *res); enum ib_cache_gid_default_mode { IB_CACHE_GID_DEFAULT_MODE_SET, IB_CACHE_GID_DEFAULT_MODE_DELETE }; int ib_cache_gid_parse_type_str(const char *buf); const char *ib_cache_gid_type_str(enum ib_gid_type gid_type); void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u32 port, struct net_device *ndev, unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode); int ib_cache_gid_add(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr); int ib_cache_gid_del(struct ib_device *ib_dev, u32 port, union ib_gid *gid, struct ib_gid_attr *attr); int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u32 port, struct net_device *ndev); int roce_gid_mgmt_init(void); void roce_gid_mgmt_cleanup(void); unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u32 port); int ib_cache_setup_one(struct ib_device *device); void ib_cache_cleanup_one(struct ib_device *device); void ib_cache_release_one(struct ib_device *device); void ib_dispatch_event_clients(struct ib_event *event); #ifdef CONFIG_CGROUP_RDMA void ib_device_register_rdmacg(struct ib_device *device); void ib_device_unregister_rdmacg(struct ib_device *device); int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index); void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index); #else static inline void ib_device_register_rdmacg(struct ib_device *device) { } static inline void ib_device_unregister_rdmacg(struct ib_device *device) { } static inline int ib_rdmacg_try_charge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index) { return 0; } static inline void ib_rdmacg_uncharge(struct ib_rdmacg_object *cg_obj, struct ib_device *device, enum rdmacg_resource_type resource_index) { } #endif static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, struct net_device *upper) { return netdev_has_upper_dev_all_rcu(dev, upper); } int addr_init(void); void addr_cleanup(void); int ib_mad_init(void); void ib_mad_cleanup(void); int ib_sa_init(void); void ib_sa_cleanup(void); void rdma_nl_init(void); void rdma_nl_exit(void); int ib_nl_handle_resolve_resp(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); int ib_nl_handle_set_timeout(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); int ib_nl_handle_ip_res_resp(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); void ib_get_cached_subnet_prefix(struct ib_device *device, u32 port_num, u64 *sn_pfx); #ifdef CONFIG_SECURITY_INFINIBAND void ib_security_release_port_pkey_list(struct ib_device *device); void ib_security_cache_change(struct ib_device *device, u32 port_num, u64 subnet_prefix); int ib_security_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_udata *udata); int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev); void ib_destroy_qp_security_begin(struct ib_qp_security *sec); void ib_destroy_qp_security_abort(struct ib_qp_security *sec); void ib_destroy_qp_security_end(struct ib_qp_security *sec); int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev); void ib_close_shared_qp_security(struct ib_qp_security *sec); int ib_mad_agent_security_setup(struct ib_mad_agent *agent, enum ib_qp_type qp_type); void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent); int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index); void ib_mad_agent_security_change(void); #else static inline void ib_security_release_port_pkey_list(struct ib_device *device) { } static inline void ib_security_cache_change(struct ib_device *device, u32 port_num, u64 subnet_prefix) { } static inline int ib_security_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_udata *udata) { return qp->device->ops.modify_qp(qp->real_qp, qp_attr, qp_attr_mask, udata); } static inline int ib_create_qp_security(struct ib_qp *qp, struct ib_device *dev) { return 0; } static inline void ib_destroy_qp_security_begin(struct ib_qp_security *sec) { } static inline void ib_destroy_qp_security_abort(struct ib_qp_security *sec) { } static inline void ib_destroy_qp_security_end(struct ib_qp_security *sec) { } static inline int ib_open_shared_qp_security(struct ib_qp *qp, struct ib_device *dev) { return 0; } static inline void ib_close_shared_qp_security(struct ib_qp_security *sec) { } static inline int ib_mad_agent_security_setup(struct ib_mad_agent *agent, enum ib_qp_type qp_type) { return 0; } static inline void ib_mad_agent_security_cleanup(struct ib_mad_agent *agent) { } static inline int ib_mad_enforce_security(struct ib_mad_agent_private *map, u16 pkey_index) { return 0; } static inline void ib_mad_agent_security_change(void) { } #endif struct ib_device *ib_device_get_by_index(const struct net *net, u32 index); /* RDMA device netlink */ void nldev_init(void); void nldev_exit(void); struct ib_qp *ib_create_qp_user(struct ib_device *dev, struct ib_pd *pd, struct ib_qp_init_attr *attr, struct ib_udata *udata, struct ib_uqp_object *uobj, const char *caller); void ib_qp_usecnt_inc(struct ib_qp *qp); void ib_qp_usecnt_dec(struct ib_qp *qp); struct rdma_dev_addr; int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, u8 *dmac, const struct ib_gid_attr *sgid_attr, int *hoplimit); void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, const struct net_device *dev); struct sa_path_rec; int roce_resolve_route_from_path(struct sa_path_rec *rec, const struct ib_gid_attr *attr); struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr); void ib_free_port_attrs(struct ib_core_device *coredev); int ib_setup_port_attrs(struct ib_core_device *coredev); struct rdma_hw_stats *ib_get_hw_stats_port(struct ib_device *ibdev, u32 port_num); void ib_device_release_hw_stats(struct hw_stats_device_data *data); int ib_setup_device_attrs(struct ib_device *ibdev); int rdma_compatdev_set(u8 enable); int ib_port_register_client_groups(struct ib_device *ibdev, u32 port_num, const struct attribute_group **groups); void ib_port_unregister_client_groups(struct ib_device *ibdev, u32 port_num, const struct attribute_group **groups); int ib_device_set_netns_put(struct sk_buff *skb, struct ib_device *dev, u32 ns_fd); int rdma_nl_net_init(struct rdma_dev_net *rnet); void rdma_nl_net_exit(struct rdma_dev_net *rnet); struct rdma_umap_priv { struct vm_area_struct *vma; struct list_head list; struct rdma_user_mmap_entry *entry; }; void rdma_umap_priv_init(struct rdma_umap_priv *priv, struct vm_area_struct *vma, struct rdma_user_mmap_entry *entry); void ib_cq_pool_cleanup(struct ib_device *dev); bool rdma_nl_get_privileged_qkey(void); #endif /* _CORE_PRIV_H */
12 12 12 12 12 12 12 12 2 2 2 2 40 8 40 8 40 40 40 8 40 40 8 40 39 12 12 12 12 12 10 12 12 99 97 13 12 12 1 10 88 87 5 5 59 36 39 40 59 59 40 9 9 1 8 8 9 1 8 8 8 3 2 11 11 11 31 31 14 19 12 5 11 11 2 2 2 2 15 15 15 12 3 7 1 1 1 3 3 4 4 2 1 1 3 1 2 32 32 32 12 12 12 11 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 /* BlueZ - Bluetooth protocol stack for Linux Copyright (c) 2000-2001, 2010, Code Aurora Forum. All rights reserved. Copyright 2023-2024 NXP Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* Bluetooth HCI connection handling. */ #include <linux/export.h> #include <linux/debugfs.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> #include <net/bluetooth/iso.h> #include <net/bluetooth/mgmt.h> #include "smp.h" #include "eir.h" struct sco_param { u16 pkt_type; u16 max_latency; u8 retrans_effort; }; struct conn_handle_t { struct hci_conn *conn; __u16 handle; }; static const struct sco_param esco_param_cvsd[] = { { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000a, 0x01 }, /* S3 */ { EDR_ESCO_MASK & ~ESCO_2EV3, 0x0007, 0x01 }, /* S2 */ { EDR_ESCO_MASK | ESCO_EV3, 0x0007, 0x01 }, /* S1 */ { EDR_ESCO_MASK | ESCO_HV3, 0xffff, 0x01 }, /* D1 */ { EDR_ESCO_MASK | ESCO_HV1, 0xffff, 0x01 }, /* D0 */ }; static const struct sco_param sco_param_cvsd[] = { { EDR_ESCO_MASK | ESCO_HV3, 0xffff, 0xff }, /* D1 */ { EDR_ESCO_MASK | ESCO_HV1, 0xffff, 0xff }, /* D0 */ }; static const struct sco_param esco_param_msbc[] = { { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000d, 0x02 }, /* T2 */ { EDR_ESCO_MASK | ESCO_EV3, 0x0008, 0x02 }, /* T1 */ }; /* This function requires the caller holds hdev->lock */ void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status) { struct hci_conn_params *params; struct hci_dev *hdev = conn->hdev; struct smp_irk *irk; bdaddr_t *bdaddr; u8 bdaddr_type; bdaddr = &conn->dst; bdaddr_type = conn->dst_type; /* Check if we need to convert to identity address */ irk = hci_get_irk(hdev, bdaddr, bdaddr_type); if (irk) { bdaddr = &irk->bdaddr; bdaddr_type = irk->addr_type; } params = hci_pend_le_action_lookup(&hdev->pend_le_conns, bdaddr, bdaddr_type); if (!params) return; if (params->conn) { hci_conn_drop(params->conn); hci_conn_put(params->conn); params->conn = NULL; } if (!params->explicit_connect) return; /* If the status indicates successful cancellation of * the attempt (i.e. Unknown Connection Id) there's no point of * notifying failure since we'll go back to keep trying to * connect. The only exception is explicit connect requests * where a timeout + cancel does indicate an actual failure. */ if (status && status != HCI_ERROR_UNKNOWN_CONN_ID) mgmt_connect_failed(hdev, conn, status); /* The connection attempt was doing scan for new RPA, and is * in scan phase. If params are not associated with any other * autoconnect action, remove them completely. If they are, just unmark * them as waiting for connection, by clearing explicit_connect field. */ params->explicit_connect = false; hci_pend_le_list_del_init(params); switch (params->auto_connect) { case HCI_AUTO_CONN_EXPLICIT: hci_conn_params_del(hdev, bdaddr, bdaddr_type); /* return instead of break to avoid duplicate scan update */ return; case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: hci_pend_le_list_add(params, &hdev->pend_le_conns); break; case HCI_AUTO_CONN_REPORT: hci_pend_le_list_add(params, &hdev->pend_le_reports); break; default: break; } hci_update_passive_scan(hdev); } static void hci_conn_cleanup(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; if (test_bit(HCI_CONN_PARAM_REMOVAL_PEND, &conn->flags)) hci_conn_params_del(conn->hdev, &conn->dst, conn->dst_type); if (test_and_clear_bit(HCI_CONN_FLUSH_KEY, &conn->flags)) hci_remove_link_key(hdev, &conn->dst); hci_chan_list_flush(conn); hci_conn_hash_del(hdev, conn); if (HCI_CONN_HANDLE_UNSET(conn->handle)) ida_free(&hdev->unset_handle_ida, conn->handle); if (conn->cleanup) conn->cleanup(conn); if (conn->type == SCO_LINK || conn->type == ESCO_LINK) { switch (conn->setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_CVSD: case SCO_AIRMODE_TRANSP: if (hdev->notify) hdev->notify(hdev, HCI_NOTIFY_DISABLE_SCO); break; } } else { if (hdev->notify) hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); } debugfs_remove_recursive(conn->debugfs); hci_conn_del_sysfs(conn); hci_dev_put(hdev); } int hci_disconnect(struct hci_conn *conn, __u8 reason) { BT_DBG("hcon %p", conn); /* When we are central of an established connection and it enters * the disconnect timeout, then go ahead and try to read the * current clock offset. Processing of the result is done * within the event handling and hci_clock_offset_evt function. */ if (conn->type == ACL_LINK && conn->role == HCI_ROLE_MASTER && (conn->state == BT_CONNECTED || conn->state == BT_CONFIG)) { struct hci_dev *hdev = conn->hdev; struct hci_cp_read_clock_offset clkoff_cp; clkoff_cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(hdev, HCI_OP_READ_CLOCK_OFFSET, sizeof(clkoff_cp), &clkoff_cp); } return hci_abort_conn(conn, reason); } static void hci_add_sco(struct hci_conn *conn, __u16 handle) { struct hci_dev *hdev = conn->hdev; struct hci_cp_add_sco cp; BT_DBG("hcon %p", conn); conn->state = BT_CONNECT; conn->out = true; conn->attempt++; cp.handle = cpu_to_le16(handle); cp.pkt_type = cpu_to_le16(conn->pkt_type); hci_send_cmd(hdev, HCI_OP_ADD_SCO, sizeof(cp), &cp); } static bool find_next_esco_param(struct hci_conn *conn, const struct sco_param *esco_param, int size) { if (!conn->parent) return false; for (; conn->attempt <= size; conn->attempt++) { if (lmp_esco_2m_capable(conn->parent) || (esco_param[conn->attempt - 1].pkt_type & ESCO_2EV3)) break; BT_DBG("hcon %p skipped attempt %d, eSCO 2M not supported", conn, conn->attempt); } return conn->attempt <= size; } static int configure_datapath_sync(struct hci_dev *hdev, struct bt_codec *codec) { int err; __u8 vnd_len, *vnd_data = NULL; struct hci_op_configure_data_path *cmd = NULL; /* Do not take below 2 checks as error since the 1st means user do not * want to use HFP offload mode and the 2nd means the vendor controller * do not need to send below HCI command for offload mode. */ if (!codec->data_path || !hdev->get_codec_config_data) return 0; err = hdev->get_codec_config_data(hdev, ESCO_LINK, codec, &vnd_len, &vnd_data); if (err < 0) goto error; cmd = kzalloc(sizeof(*cmd) + vnd_len, GFP_KERNEL); if (!cmd) { err = -ENOMEM; goto error; } err = hdev->get_data_path_id(hdev, &cmd->data_path_id); if (err < 0) goto error; cmd->vnd_len = vnd_len; memcpy(cmd->vnd_data, vnd_data, vnd_len); cmd->direction = 0x00; __hci_cmd_sync_status(hdev, HCI_CONFIGURE_DATA_PATH, sizeof(*cmd) + vnd_len, cmd, HCI_CMD_TIMEOUT); cmd->direction = 0x01; err = __hci_cmd_sync_status(hdev, HCI_CONFIGURE_DATA_PATH, sizeof(*cmd) + vnd_len, cmd, HCI_CMD_TIMEOUT); error: kfree(cmd); kfree(vnd_data); return err; } static int hci_enhanced_setup_sync(struct hci_dev *hdev, void *data) { struct conn_handle_t *conn_handle = data; struct hci_conn *conn = conn_handle->conn; __u16 handle = conn_handle->handle; struct hci_cp_enhanced_setup_sync_conn cp; const struct sco_param *param; kfree(conn_handle); if (!hci_conn_valid(hdev, conn)) return -ECANCELED; bt_dev_dbg(hdev, "hcon %p", conn); configure_datapath_sync(hdev, &conn->codec); conn->state = BT_CONNECT; conn->out = true; conn->attempt++; memset(&cp, 0x00, sizeof(cp)); cp.handle = cpu_to_le16(handle); cp.tx_bandwidth = cpu_to_le32(0x00001f40); cp.rx_bandwidth = cpu_to_le32(0x00001f40); switch (conn->codec.id) { case BT_CODEC_MSBC: if (!find_next_esco_param(conn, esco_param_msbc, ARRAY_SIZE(esco_param_msbc))) return -EINVAL; param = &esco_param_msbc[conn->attempt - 1]; cp.tx_coding_format.id = 0x05; cp.rx_coding_format.id = 0x05; cp.tx_codec_frame_size = __cpu_to_le16(60); cp.rx_codec_frame_size = __cpu_to_le16(60); cp.in_bandwidth = __cpu_to_le32(32000); cp.out_bandwidth = __cpu_to_le32(32000); cp.in_coding_format.id = 0x04; cp.out_coding_format.id = 0x04; cp.in_coded_data_size = __cpu_to_le16(16); cp.out_coded_data_size = __cpu_to_le16(16); cp.in_pcm_data_format = 2; cp.out_pcm_data_format = 2; cp.in_pcm_sample_payload_msb_pos = 0; cp.out_pcm_sample_payload_msb_pos = 0; cp.in_data_path = conn->codec.data_path; cp.out_data_path = conn->codec.data_path; cp.in_transport_unit_size = 1; cp.out_transport_unit_size = 1; break; case BT_CODEC_TRANSPARENT: if (!find_next_esco_param(conn, esco_param_msbc, ARRAY_SIZE(esco_param_msbc))) return false; param = &esco_param_msbc[conn->attempt - 1]; cp.tx_coding_format.id = 0x03; cp.rx_coding_format.id = 0x03; cp.tx_codec_frame_size = __cpu_to_le16(60); cp.rx_codec_frame_size = __cpu_to_le16(60); cp.in_bandwidth = __cpu_to_le32(0x1f40); cp.out_bandwidth = __cpu_to_le32(0x1f40); cp.in_coding_format.id = 0x03; cp.out_coding_format.id = 0x03; cp.in_coded_data_size = __cpu_to_le16(16); cp.out_coded_data_size = __cpu_to_le16(16); cp.in_pcm_data_format = 2; cp.out_pcm_data_format = 2; cp.in_pcm_sample_payload_msb_pos = 0; cp.out_pcm_sample_payload_msb_pos = 0; cp.in_data_path = conn->codec.data_path; cp.out_data_path = conn->codec.data_path; cp.in_transport_unit_size = 1; cp.out_transport_unit_size = 1; break; case BT_CODEC_CVSD: if (conn->parent && lmp_esco_capable(conn->parent)) { if (!find_next_esco_param(conn, esco_param_cvsd, ARRAY_SIZE(esco_param_cvsd))) return -EINVAL; param = &esco_param_cvsd[conn->attempt - 1]; } else { if (conn->attempt > ARRAY_SIZE(sco_param_cvsd)) return -EINVAL; param = &sco_param_cvsd[conn->attempt - 1]; } cp.tx_coding_format.id = 2; cp.rx_coding_format.id = 2; cp.tx_codec_frame_size = __cpu_to_le16(60); cp.rx_codec_frame_size = __cpu_to_le16(60); cp.in_bandwidth = __cpu_to_le32(16000); cp.out_bandwidth = __cpu_to_le32(16000); cp.in_coding_format.id = 4; cp.out_coding_format.id = 4; cp.in_coded_data_size = __cpu_to_le16(16); cp.out_coded_data_size = __cpu_to_le16(16); cp.in_pcm_data_format = 2; cp.out_pcm_data_format = 2; cp.in_pcm_sample_payload_msb_pos = 0; cp.out_pcm_sample_payload_msb_pos = 0; cp.in_data_path = conn->codec.data_path; cp.out_data_path = conn->codec.data_path; cp.in_transport_unit_size = 16; cp.out_transport_unit_size = 16; break; default: return -EINVAL; } cp.retrans_effort = param->retrans_effort; cp.pkt_type = __cpu_to_le16(param->pkt_type); cp.max_latency = __cpu_to_le16(param->max_latency); if (hci_send_cmd(hdev, HCI_OP_ENHANCED_SETUP_SYNC_CONN, sizeof(cp), &cp) < 0) return -EIO; return 0; } static bool hci_setup_sync_conn(struct hci_conn *conn, __u16 handle) { struct hci_dev *hdev = conn->hdev; struct hci_cp_setup_sync_conn cp; const struct sco_param *param; bt_dev_dbg(hdev, "hcon %p", conn); conn->state = BT_CONNECT; conn->out = true; conn->attempt++; cp.handle = cpu_to_le16(handle); cp.tx_bandwidth = cpu_to_le32(0x00001f40); cp.rx_bandwidth = cpu_to_le32(0x00001f40); cp.voice_setting = cpu_to_le16(conn->setting); switch (conn->setting & SCO_AIRMODE_MASK) { case SCO_AIRMODE_TRANSP: if (!find_next_esco_param(conn, esco_param_msbc, ARRAY_SIZE(esco_param_msbc))) return false; param = &esco_param_msbc[conn->attempt - 1]; break; case SCO_AIRMODE_CVSD: if (conn->parent && lmp_esco_capable(conn->parent)) { if (!find_next_esco_param(conn, esco_param_cvsd, ARRAY_SIZE(esco_param_cvsd))) return false; param = &esco_param_cvsd[conn->attempt - 1]; } else { if (conn->attempt > ARRAY_SIZE(sco_param_cvsd)) return false; param = &sco_param_cvsd[conn->attempt - 1]; } break; default: return false; } cp.retrans_effort = param->retrans_effort; cp.pkt_type = __cpu_to_le16(param->pkt_type); cp.max_latency = __cpu_to_le16(param->max_latency); if (hci_send_cmd(hdev, HCI_OP_SETUP_SYNC_CONN, sizeof(cp), &cp) < 0) return false; return true; } bool hci_setup_sync(struct hci_conn *conn, __u16 handle) { int result; struct conn_handle_t *conn_handle; if (enhanced_sync_conn_capable(conn->hdev)) { conn_handle = kzalloc(sizeof(*conn_handle), GFP_KERNEL); if (!conn_handle) return false; conn_handle->conn = conn; conn_handle->handle = handle; result = hci_cmd_sync_queue(conn->hdev, hci_enhanced_setup_sync, conn_handle, NULL); if (result < 0) kfree(conn_handle); return result == 0; } return hci_setup_sync_conn(conn, handle); } u8 hci_le_conn_update(struct hci_conn *conn, u16 min, u16 max, u16 latency, u16 to_multiplier) { struct hci_dev *hdev = conn->hdev; struct hci_conn_params *params; struct hci_cp_le_conn_update cp; hci_dev_lock(hdev); params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type); if (params) { params->conn_min_interval = min; params->conn_max_interval = max; params->conn_latency = latency; params->supervision_timeout = to_multiplier; } hci_dev_unlock(hdev); memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); cp.conn_interval_min = cpu_to_le16(min); cp.conn_interval_max = cpu_to_le16(max); cp.conn_latency = cpu_to_le16(latency); cp.supervision_timeout = cpu_to_le16(to_multiplier); cp.min_ce_len = cpu_to_le16(0x0000); cp.max_ce_len = cpu_to_le16(0x0000); hci_send_cmd(hdev, HCI_OP_LE_CONN_UPDATE, sizeof(cp), &cp); if (params) return 0x01; return 0x00; } void hci_le_start_enc(struct hci_conn *conn, __le16 ediv, __le64 rand, __u8 ltk[16], __u8 key_size) { struct hci_dev *hdev = conn->hdev; struct hci_cp_le_start_enc cp; BT_DBG("hcon %p", conn); memset(&cp, 0, sizeof(cp)); cp.handle = cpu_to_le16(conn->handle); cp.rand = rand; cp.ediv = ediv; memcpy(cp.ltk, ltk, key_size); hci_send_cmd(hdev, HCI_OP_LE_START_ENC, sizeof(cp), &cp); } /* Device _must_ be locked */ void hci_sco_setup(struct hci_conn *conn, __u8 status) { struct hci_link *link; link = list_first_entry_or_null(&conn->link_list, struct hci_link, list); if (!link || !link->conn) return; BT_DBG("hcon %p", conn); if (!status) { if (lmp_esco_capable(conn->hdev)) hci_setup_sync(link->conn, conn->handle); else hci_add_sco(link->conn, conn->handle); } else { hci_connect_cfm(link->conn, status); hci_conn_del(link->conn); } } static void hci_conn_timeout(struct work_struct *work) { struct hci_conn *conn = container_of(work, struct hci_conn, disc_work.work); int refcnt = atomic_read(&conn->refcnt); BT_DBG("hcon %p state %s", conn, state_to_string(conn->state)); WARN_ON(refcnt < 0); /* FIXME: It was observed that in pairing failed scenario, refcnt * drops below 0. Probably this is because l2cap_conn_del calls * l2cap_chan_del for each channel, and inside l2cap_chan_del conn is * dropped. After that loop hci_chan_del is called which also drops * conn. For now make sure that ACL is alive if refcnt is higher then 0, * otherwise drop it. */ if (refcnt > 0) return; hci_abort_conn(conn, hci_proto_disconn_ind(conn)); } /* Enter sniff mode */ static void hci_conn_idle(struct work_struct *work) { struct hci_conn *conn = container_of(work, struct hci_conn, idle_work.work); struct hci_dev *hdev = conn->hdev; BT_DBG("hcon %p mode %d", conn, conn->mode); if (!lmp_sniff_capable(hdev) || !lmp_sniff_capable(conn)) return; if (conn->mode != HCI_CM_ACTIVE || !(conn->link_policy & HCI_LP_SNIFF)) return; if (lmp_sniffsubr_capable(hdev) && lmp_sniffsubr_capable(conn)) { struct hci_cp_sniff_subrate cp; cp.handle = cpu_to_le16(conn->handle); cp.max_latency = cpu_to_le16(0); cp.min_remote_timeout = cpu_to_le16(0); cp.min_local_timeout = cpu_to_le16(0); hci_send_cmd(hdev, HCI_OP_SNIFF_SUBRATE, sizeof(cp), &cp); } if (!test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->flags)) { struct hci_cp_sniff_mode cp; cp.handle = cpu_to_le16(conn->handle); cp.max_interval = cpu_to_le16(hdev->sniff_max_interval); cp.min_interval = cpu_to_le16(hdev->sniff_min_interval); cp.attempt = cpu_to_le16(4); cp.timeout = cpu_to_le16(1); hci_send_cmd(hdev, HCI_OP_SNIFF_MODE, sizeof(cp), &cp); } } static void hci_conn_auto_accept(struct work_struct *work) { struct hci_conn *conn = container_of(work, struct hci_conn, auto_accept_work.work); hci_send_cmd(conn->hdev, HCI_OP_USER_CONFIRM_REPLY, sizeof(conn->dst), &conn->dst); } static void le_disable_advertising(struct hci_dev *hdev) { if (ext_adv_capable(hdev)) { struct hci_cp_le_set_ext_adv_enable cp; cp.enable = 0x00; cp.num_of_sets = 0x00; hci_send_cmd(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE, sizeof(cp), &cp); } else { u8 enable = 0x00; hci_send_cmd(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); } } static void le_conn_timeout(struct work_struct *work) { struct hci_conn *conn = container_of(work, struct hci_conn, le_conn_timeout.work); struct hci_dev *hdev = conn->hdev; BT_DBG(""); /* We could end up here due to having done directed advertising, * so clean up the state if necessary. This should however only * happen with broken hardware or if low duty cycle was used * (which doesn't have a timeout of its own). */ if (conn->role == HCI_ROLE_SLAVE) { /* Disable LE Advertising */ le_disable_advertising(hdev); hci_dev_lock(hdev); hci_conn_failed(conn, HCI_ERROR_ADVERTISING_TIMEOUT); hci_dev_unlock(hdev); return; } hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM); } struct iso_list_data { union { u8 cig; u8 big; }; union { u8 cis; u8 bis; u16 sync_handle; }; int count; bool big_term; bool pa_sync_term; bool big_sync_term; }; static void bis_list(struct hci_conn *conn, void *data) { struct iso_list_data *d = data; /* Skip if not broadcast/ANY address */ if (bacmp(&conn->dst, BDADDR_ANY)) return; if (d->big != conn->iso_qos.bcast.big || d->bis == BT_ISO_QOS_BIS_UNSET || d->bis != conn->iso_qos.bcast.bis) return; d->count++; } static int terminate_big_sync(struct hci_dev *hdev, void *data) { struct iso_list_data *d = data; bt_dev_dbg(hdev, "big 0x%2.2x bis 0x%2.2x", d->big, d->bis); hci_disable_per_advertising_sync(hdev, d->bis); hci_remove_ext_adv_instance_sync(hdev, d->bis, NULL); /* Only terminate BIG if it has been created */ if (!d->big_term) return 0; return hci_le_terminate_big_sync(hdev, d->big, HCI_ERROR_LOCAL_HOST_TERM); } static void terminate_big_destroy(struct hci_dev *hdev, void *data, int err) { kfree(data); } static int hci_le_terminate_big(struct hci_dev *hdev, struct hci_conn *conn) { struct iso_list_data *d; int ret; bt_dev_dbg(hdev, "big 0x%2.2x bis 0x%2.2x", conn->iso_qos.bcast.big, conn->iso_qos.bcast.bis); d = kzalloc(sizeof(*d), GFP_KERNEL); if (!d) return -ENOMEM; d->big = conn->iso_qos.bcast.big; d->bis = conn->iso_qos.bcast.bis; d->big_term = test_and_clear_bit(HCI_CONN_BIG_CREATED, &conn->flags); ret = hci_cmd_sync_queue(hdev, terminate_big_sync, d, terminate_big_destroy); if (ret) kfree(d); return ret; } static int big_terminate_sync(struct hci_dev *hdev, void *data) { struct iso_list_data *d = data; bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", d->big, d->sync_handle); if (d->big_sync_term) hci_le_big_terminate_sync(hdev, d->big); if (d->pa_sync_term) return hci_le_pa_terminate_sync(hdev, d->sync_handle); return 0; } static void find_bis(struct hci_conn *conn, void *data) { struct iso_list_data *d = data; /* Ignore if BIG doesn't match */ if (d->big != conn->iso_qos.bcast.big) return; d->count++; } static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *conn) { struct iso_list_data *d; int ret; bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", big, conn->sync_handle); d = kzalloc(sizeof(*d), GFP_KERNEL); if (!d) return -ENOMEM; d->big = big; d->sync_handle = conn->sync_handle; if (test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags)) { hci_conn_hash_list_flag(hdev, find_bis, ISO_LINK, HCI_CONN_PA_SYNC, d); if (!d->count) d->pa_sync_term = true; d->count = 0; } if (test_and_clear_bit(HCI_CONN_BIG_SYNC, &conn->flags)) { hci_conn_hash_list_flag(hdev, find_bis, ISO_LINK, HCI_CONN_BIG_SYNC, d); if (!d->count) d->big_sync_term = true; } ret = hci_cmd_sync_queue(hdev, big_terminate_sync, d, terminate_big_destroy); if (ret) kfree(d); return ret; } /* Cleanup BIS connection * * Detects if there any BIS left connected in a BIG * broadcaster: Remove advertising instance and terminate BIG. * broadcaster receiver: Teminate BIG sync and terminate PA sync. */ static void bis_cleanup(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; struct hci_conn *bis; bt_dev_dbg(hdev, "conn %p", conn); if (conn->role == HCI_ROLE_MASTER) { if (!test_and_clear_bit(HCI_CONN_PER_ADV, &conn->flags)) return; /* Check if ISO connection is a BIS and terminate advertising * set and BIG if there are no other connections using it. */ bis = hci_conn_hash_lookup_big(hdev, conn->iso_qos.bcast.big); if (bis) return; hci_le_terminate_big(hdev, conn); } else { hci_le_big_terminate(hdev, conn->iso_qos.bcast.big, conn); } } static int remove_cig_sync(struct hci_dev *hdev, void *data) { u8 handle = PTR_UINT(data); return hci_le_remove_cig_sync(hdev, handle); } static int hci_le_remove_cig(struct hci_dev *hdev, u8 handle) { bt_dev_dbg(hdev, "handle 0x%2.2x", handle); return hci_cmd_sync_queue(hdev, remove_cig_sync, UINT_PTR(handle), NULL); } static void find_cis(struct hci_conn *conn, void *data) { struct iso_list_data *d = data; /* Ignore broadcast or if CIG don't match */ if (!bacmp(&conn->dst, BDADDR_ANY) || d->cig != conn->iso_qos.ucast.cig) return; d->count++; } /* Cleanup CIS connection: * * Detects if there any CIS left connected in a CIG and remove it. */ static void cis_cleanup(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; struct iso_list_data d; if (conn->iso_qos.ucast.cig == BT_ISO_QOS_CIG_UNSET) return; memset(&d, 0, sizeof(d)); d.cig = conn->iso_qos.ucast.cig; /* Check if ISO connection is a CIS and remove CIG if there are * no other connections using it. */ hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_BOUND, &d); hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_CONNECT, &d); hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_CONNECTED, &d); if (d.count) return; hci_le_remove_cig(hdev, conn->iso_qos.ucast.cig); } static int hci_conn_hash_alloc_unset(struct hci_dev *hdev) { return ida_alloc_range(&hdev->unset_handle_ida, HCI_CONN_HANDLE_MAX + 1, U16_MAX, GFP_ATOMIC); } static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, u8 role, u16 handle) { struct hci_conn *conn; switch (type) { case ACL_LINK: if (!hdev->acl_mtu) return ERR_PTR(-ECONNREFUSED); break; case ISO_LINK: if (hdev->iso_mtu) /* Dedicated ISO Buffer exists */ break; fallthrough; case LE_LINK: if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU) return ERR_PTR(-ECONNREFUSED); if (!hdev->le_mtu && hdev->acl_mtu < HCI_MIN_LE_MTU) return ERR_PTR(-ECONNREFUSED); break; case SCO_LINK: case ESCO_LINK: if (!hdev->sco_pkts) /* Controller does not support SCO or eSCO over HCI */ return ERR_PTR(-ECONNREFUSED); break; default: return ERR_PTR(-ECONNREFUSED); } bt_dev_dbg(hdev, "dst %pMR handle 0x%4.4x", dst, handle); conn = kzalloc(sizeof(*conn), GFP_KERNEL); if (!conn) return ERR_PTR(-ENOMEM); bacpy(&conn->dst, dst); bacpy(&conn->src, &hdev->bdaddr); conn->handle = handle; conn->hdev = hdev; conn->type = type; conn->role = role; conn->mode = HCI_CM_ACTIVE; conn->state = BT_OPEN; conn->auth_type = HCI_AT_GENERAL_BONDING; conn->io_capability = hdev->io_capability; conn->remote_auth = 0xff; conn->key_type = 0xff; conn->rssi = HCI_RSSI_INVALID; conn->tx_power = HCI_TX_POWER_INVALID; conn->max_tx_power = HCI_TX_POWER_INVALID; conn->sync_handle = HCI_SYNC_HANDLE_INVALID; conn->sid = HCI_SID_INVALID; set_bit(HCI_CONN_POWER_SAVE, &conn->flags); conn->disc_timeout = HCI_DISCONN_TIMEOUT; /* Set Default Authenticated payload timeout to 30s */ conn->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT; if (conn->role == HCI_ROLE_MASTER) conn->out = true; switch (type) { case ACL_LINK: conn->pkt_type = hdev->pkt_type & ACL_PTYPE_MASK; conn->mtu = hdev->acl_mtu; break; case LE_LINK: /* conn->src should reflect the local identity address */ hci_copy_identity_address(hdev, &conn->src, &conn->src_type); conn->mtu = hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu; break; case ISO_LINK: /* conn->src should reflect the local identity address */ hci_copy_identity_address(hdev, &conn->src, &conn->src_type); /* set proper cleanup function */ if (!bacmp(dst, BDADDR_ANY)) conn->cleanup = bis_cleanup; else if (conn->role == HCI_ROLE_MASTER) conn->cleanup = cis_cleanup; conn->mtu = hdev->iso_mtu ? hdev->iso_mtu : hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu; break; case SCO_LINK: if (lmp_esco_capable(hdev)) conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) | (hdev->esco_type & EDR_ESCO_MASK); else conn->pkt_type = hdev->pkt_type & SCO_PTYPE_MASK; conn->mtu = hdev->sco_mtu; break; case ESCO_LINK: conn->pkt_type = hdev->esco_type & ~EDR_ESCO_MASK; conn->mtu = hdev->sco_mtu; break; } skb_queue_head_init(&conn->data_q); INIT_LIST_HEAD(&conn->chan_list); INIT_LIST_HEAD(&conn->link_list); INIT_DELAYED_WORK(&conn->disc_work, hci_conn_timeout); INIT_DELAYED_WORK(&conn->auto_accept_work, hci_conn_auto_accept); INIT_DELAYED_WORK(&conn->idle_work, hci_conn_idle); INIT_DELAYED_WORK(&conn->le_conn_timeout, le_conn_timeout); atomic_set(&conn->refcnt, 0); hci_dev_hold(hdev); hci_conn_hash_add(hdev, conn); /* The SCO and eSCO connections will only be notified when their * setup has been completed. This is different to ACL links which * can be notified right away. */ if (conn->type != SCO_LINK && conn->type != ESCO_LINK) { if (hdev->notify) hdev->notify(hdev, HCI_NOTIFY_CONN_ADD); } hci_conn_init_sysfs(conn); return conn; } struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type, bdaddr_t *dst, u8 role) { int handle; bt_dev_dbg(hdev, "dst %pMR", dst); handle = hci_conn_hash_alloc_unset(hdev); if (unlikely(handle < 0)) return ERR_PTR(-ECONNREFUSED); return __hci_conn_add(hdev, type, dst, role, handle); } struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, u8 role, u16 handle) { if (handle > HCI_CONN_HANDLE_MAX) return ERR_PTR(-EINVAL); return __hci_conn_add(hdev, type, dst, role, handle); } static void hci_conn_cleanup_child(struct hci_conn *conn, u8 reason) { if (!reason) reason = HCI_ERROR_REMOTE_USER_TERM; /* Due to race, SCO/ISO conn might be not established yet at this point, * and nothing else will clean it up. In other cases it is done via HCI * events. */ switch (conn->type) { case SCO_LINK: case ESCO_LINK: if (HCI_CONN_HANDLE_UNSET(conn->handle)) hci_conn_failed(conn, reason); break; case ISO_LINK: if ((conn->state != BT_CONNECTED && !test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) || test_bit(HCI_CONN_BIG_CREATED, &conn->flags)) hci_conn_failed(conn, reason); break; } } static void hci_conn_unlink(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; bt_dev_dbg(hdev, "hcon %p", conn); if (!conn->parent) { struct hci_link *link, *t; list_for_each_entry_safe(link, t, &conn->link_list, list) { struct hci_conn *child = link->conn; hci_conn_unlink(child); /* If hdev is down it means * hci_dev_close_sync/hci_conn_hash_flush is in progress * and links don't need to be cleanup as all connections * would be cleanup. */ if (!test_bit(HCI_UP, &hdev->flags)) continue; hci_conn_cleanup_child(child, conn->abort_reason); } return; } if (!conn->link) return; list_del_rcu(&conn->link->list); synchronize_rcu(); hci_conn_drop(conn->parent); hci_conn_put(conn->parent); conn->parent = NULL; kfree(conn->link); conn->link = NULL; } void hci_conn_del(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; BT_DBG("%s hcon %p handle %d", hdev->name, conn, conn->handle); hci_conn_unlink(conn); disable_delayed_work_sync(&conn->disc_work); disable_delayed_work_sync(&conn->auto_accept_work); disable_delayed_work_sync(&conn->idle_work); if (conn->type == ACL_LINK) { /* Unacked frames */ hdev->acl_cnt += conn->sent; } else if (conn->type == LE_LINK) { cancel_delayed_work(&conn->le_conn_timeout); if (hdev->le_pkts) hdev->le_cnt += conn->sent; else hdev->acl_cnt += conn->sent; } else { /* Unacked ISO frames */ if (conn->type == ISO_LINK) { if (hdev->iso_pkts) hdev->iso_cnt += conn->sent; else if (hdev->le_pkts) hdev->le_cnt += conn->sent; else hdev->acl_cnt += conn->sent; } } skb_queue_purge(&conn->data_q); /* Remove the connection from the list and cleanup its remaining * state. This is a separate function since for some cases like * BT_CONNECT_SCAN we *only* want the cleanup part without the * rest of hci_conn_del. */ hci_conn_cleanup(conn); /* Dequeue callbacks using connection pointer as data */ hci_cmd_sync_dequeue(hdev, NULL, conn, NULL); } struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src, uint8_t src_type) { int use_src = bacmp(src, BDADDR_ANY); struct hci_dev *hdev = NULL, *d; BT_DBG("%pMR -> %pMR", src, dst); read_lock(&hci_dev_list_lock); list_for_each_entry(d, &hci_dev_list, list) { if (!test_bit(HCI_UP, &d->flags) || hci_dev_test_flag(d, HCI_USER_CHANNEL)) continue; /* Simple routing: * No source address - find interface with bdaddr != dst * Source address - find interface with bdaddr == src */ if (use_src) { bdaddr_t id_addr; u8 id_addr_type; if (src_type == BDADDR_BREDR) { if (!lmp_bredr_capable(d)) continue; bacpy(&id_addr, &d->bdaddr); id_addr_type = BDADDR_BREDR; } else { if (!lmp_le_capable(d)) continue; hci_copy_identity_address(d, &id_addr, &id_addr_type); /* Convert from HCI to three-value type */ if (id_addr_type == ADDR_LE_DEV_PUBLIC) id_addr_type = BDADDR_LE_PUBLIC; else id_addr_type = BDADDR_LE_RANDOM; } if (!bacmp(&id_addr, src) && id_addr_type == src_type) { hdev = d; break; } } else { if (bacmp(&d->bdaddr, dst)) { hdev = d; break; } } } if (hdev) hdev = hci_dev_hold(hdev); read_unlock(&hci_dev_list_lock); return hdev; } EXPORT_SYMBOL(hci_get_route); /* This function requires the caller holds hdev->lock */ static void hci_le_conn_failed(struct hci_conn *conn, u8 status) { struct hci_dev *hdev = conn->hdev; hci_connect_le_scan_cleanup(conn, status); /* Enable advertising in case this was a failed connection * attempt as a peripheral. */ hci_enable_advertising(hdev); } /* This function requires the caller holds hdev->lock */ void hci_conn_failed(struct hci_conn *conn, u8 status) { struct hci_dev *hdev = conn->hdev; bt_dev_dbg(hdev, "status 0x%2.2x", status); switch (conn->type) { case LE_LINK: hci_le_conn_failed(conn, status); break; case ACL_LINK: mgmt_connect_failed(hdev, conn, status); break; } /* In case of BIG/PA sync failed, clear conn flags so that * the conns will be correctly cleaned up by ISO layer */ test_and_clear_bit(HCI_CONN_BIG_SYNC_FAILED, &conn->flags); test_and_clear_bit(HCI_CONN_PA_SYNC_FAILED, &conn->flags); conn->state = BT_CLOSED; hci_connect_cfm(conn, status); hci_conn_del(conn); } /* This function requires the caller holds hdev->lock */ u8 hci_conn_set_handle(struct hci_conn *conn, u16 handle) { struct hci_dev *hdev = conn->hdev; bt_dev_dbg(hdev, "hcon %p handle 0x%4.4x", conn, handle); if (conn->handle == handle) return 0; if (handle > HCI_CONN_HANDLE_MAX) { bt_dev_err(hdev, "Invalid handle: 0x%4.4x > 0x%4.4x", handle, HCI_CONN_HANDLE_MAX); return HCI_ERROR_INVALID_PARAMETERS; } /* If abort_reason has been sent it means the connection is being * aborted and the handle shall not be changed. */ if (conn->abort_reason) return conn->abort_reason; if (HCI_CONN_HANDLE_UNSET(conn->handle)) ida_free(&hdev->unset_handle_ida, conn->handle); conn->handle = handle; return 0; } struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, bool dst_resolved, u8 sec_level, u16 conn_timeout, u8 role, u8 phy, u8 sec_phy) { struct hci_conn *conn; struct smp_irk *irk; int err; /* Let's make sure that le is enabled.*/ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { if (lmp_le_capable(hdev)) return ERR_PTR(-ECONNREFUSED); return ERR_PTR(-EOPNOTSUPP); } /* Since the controller supports only one LE connection attempt at a * time, we return -EBUSY if there is any connection attempt running. */ if (hci_lookup_le_connect(hdev)) return ERR_PTR(-EBUSY); /* If there's already a connection object but it's not in * scanning state it means it must already be established, in * which case we can't do anything else except report a failure * to connect. */ conn = hci_conn_hash_lookup_le(hdev, dst, dst_type); if (conn && !test_bit(HCI_CONN_SCANNING, &conn->flags)) { return ERR_PTR(-EBUSY); } /* Check if the destination address has been resolved by the controller * since if it did then the identity address shall be used. */ if (!dst_resolved) { /* When given an identity address with existing identity * resolving key, the connection needs to be established * to a resolvable random address. * * Storing the resolvable random address is required here * to handle connection failures. The address will later * be resolved back into the original identity address * from the connect request. */ irk = hci_find_irk_by_addr(hdev, dst, dst_type); if (irk && bacmp(&irk->rpa, BDADDR_ANY)) { dst = &irk->rpa; dst_type = ADDR_LE_DEV_RANDOM; } } if (conn) { bacpy(&conn->dst, dst); } else { conn = hci_conn_add_unset(hdev, LE_LINK, dst, role); if (IS_ERR(conn)) return conn; hci_conn_hold(conn); conn->pending_sec_level = sec_level; } conn->dst_type = dst_type; conn->sec_level = BT_SECURITY_LOW; conn->conn_timeout = conn_timeout; conn->le_adv_phy = phy; conn->le_adv_sec_phy = sec_phy; err = hci_connect_le_sync(hdev, conn); if (err) { hci_conn_del(conn); return ERR_PTR(err); } return conn; } static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type) { struct hci_conn *conn; conn = hci_conn_hash_lookup_le(hdev, addr, type); if (!conn) return false; if (conn->state != BT_CONNECTED) return false; return true; } /* This function requires the caller holds hdev->lock */ static int hci_explicit_conn_params_set(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *params; if (is_connected(hdev, addr, addr_type)) return -EISCONN; params = hci_conn_params_lookup(hdev, addr, addr_type); if (!params) { params = hci_conn_params_add(hdev, addr, addr_type); if (!params) return -ENOMEM; /* If we created new params, mark them to be deleted in * hci_connect_le_scan_cleanup. It's different case than * existing disabled params, those will stay after cleanup. */ params->auto_connect = HCI_AUTO_CONN_EXPLICIT; } /* We're trying to connect, so make sure params are at pend_le_conns */ if (params->auto_connect == HCI_AUTO_CONN_DISABLED || params->auto_connect == HCI_AUTO_CONN_REPORT || params->auto_connect == HCI_AUTO_CONN_EXPLICIT) { hci_pend_le_list_del_init(params); hci_pend_le_list_add(params, &hdev->pend_le_conns); } params->explicit_connect = true; BT_DBG("addr %pMR (type %u) auto_connect %u", addr, addr_type, params->auto_connect); return 0; } static int qos_set_big(struct hci_dev *hdev, struct bt_iso_qos *qos) { struct hci_conn *conn; u8 big; /* Allocate a BIG if not set */ if (qos->bcast.big == BT_ISO_QOS_BIG_UNSET) { for (big = 0x00; big < 0xef; big++) { conn = hci_conn_hash_lookup_big(hdev, big); if (!conn) break; } if (big == 0xef) return -EADDRNOTAVAIL; /* Update BIG */ qos->bcast.big = big; } return 0; } static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos) { struct hci_conn *conn; u8 bis; /* Allocate BIS if not set */ if (qos->bcast.bis == BT_ISO_QOS_BIS_UNSET) { if (qos->bcast.big != BT_ISO_QOS_BIG_UNSET) { conn = hci_conn_hash_lookup_big(hdev, qos->bcast.big); if (conn) { /* If the BIG handle is already matched to an advertising * handle, do not allocate a new one. */ qos->bcast.bis = conn->iso_qos.bcast.bis; return 0; } } /* Find an unused adv set to advertise BIS, skip instance 0x00 * since it is reserved as general purpose set. */ for (bis = 0x01; bis < hdev->le_num_of_adv_sets; bis++) { conn = hci_conn_hash_lookup_bis(hdev, BDADDR_ANY, bis); if (!conn) break; } if (bis == hdev->le_num_of_adv_sets) return -EADDRNOTAVAIL; /* Update BIS */ qos->bcast.bis = bis; } return 0; } /* This function requires the caller holds hdev->lock */ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, struct bt_iso_qos *qos, __u8 base_len, __u8 *base) { struct hci_conn *conn; int err; /* Let's make sure that le is enabled.*/ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { if (lmp_le_capable(hdev)) return ERR_PTR(-ECONNREFUSED); return ERR_PTR(-EOPNOTSUPP); } err = qos_set_big(hdev, qos); if (err) return ERR_PTR(err); err = qos_set_bis(hdev, qos); if (err) return ERR_PTR(err); /* Check if the LE Create BIG command has already been sent */ conn = hci_conn_hash_lookup_per_adv_bis(hdev, dst, qos->bcast.big, qos->bcast.big); if (conn) return ERR_PTR(-EADDRINUSE); /* Check BIS settings against other bound BISes, since all * BISes in a BIG must have the same value for all parameters */ conn = hci_conn_hash_lookup_big(hdev, qos->bcast.big); if (conn && (memcmp(qos, &conn->iso_qos, sizeof(*qos)) || base_len != conn->le_per_adv_data_len || memcmp(conn->le_per_adv_data, base, base_len))) return ERR_PTR(-EADDRINUSE); conn = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_MASTER); if (IS_ERR(conn)) return conn; conn->state = BT_CONNECT; hci_conn_hold(conn); return conn; } /* This function requires the caller holds hdev->lock */ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, u8 dst_type, u8 sec_level, u16 conn_timeout, enum conn_reasons conn_reason) { struct hci_conn *conn; /* Let's make sure that le is enabled.*/ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) { if (lmp_le_capable(hdev)) return ERR_PTR(-ECONNREFUSED); return ERR_PTR(-EOPNOTSUPP); } /* Some devices send ATT messages as soon as the physical link is * established. To be able to handle these ATT messages, the user- * space first establishes the connection and then starts the pairing * process. * * So if a hci_conn object already exists for the following connection * attempt, we simply update pending_sec_level and auth_type fields * and return the object found. */ conn = hci_conn_hash_lookup_le(hdev, dst, dst_type); if (conn) { if (conn->pending_sec_level < sec_level) conn->pending_sec_level = sec_level; goto done; } BT_DBG("requesting refresh of dst_addr"); conn = hci_conn_add_unset(hdev, LE_LINK, dst, HCI_ROLE_MASTER); if (IS_ERR(conn)) return conn; if (hci_explicit_conn_params_set(hdev, dst, dst_type) < 0) { hci_conn_del(conn); return ERR_PTR(-EBUSY); } conn->state = BT_CONNECT; set_bit(HCI_CONN_SCANNING, &conn->flags); conn->dst_type = dst_type; conn->sec_level = BT_SECURITY_LOW; conn->pending_sec_level = sec_level; conn->conn_timeout = conn_timeout; conn->conn_reason = conn_reason; hci_update_passive_scan(hdev); done: hci_conn_hold(conn); return conn; } struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, u8 sec_level, u8 auth_type, enum conn_reasons conn_reason, u16 timeout) { struct hci_conn *acl; if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { if (lmp_bredr_capable(hdev)) return ERR_PTR(-ECONNREFUSED); return ERR_PTR(-EOPNOTSUPP); } /* Reject outgoing connection to device with same BD ADDR against * CVE-2020-26555 */ if (!bacmp(&hdev->bdaddr, dst)) { bt_dev_dbg(hdev, "Reject connection with same BD_ADDR %pMR\n", dst); return ERR_PTR(-ECONNREFUSED); } acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst); if (!acl) { acl = hci_conn_add_unset(hdev, ACL_LINK, dst, HCI_ROLE_MASTER); if (IS_ERR(acl)) return acl; } hci_conn_hold(acl); acl->conn_reason = conn_reason; if (acl->state == BT_OPEN || acl->state == BT_CLOSED) { int err; acl->sec_level = BT_SECURITY_LOW; acl->pending_sec_level = sec_level; acl->auth_type = auth_type; acl->conn_timeout = timeout; err = hci_connect_acl_sync(hdev, acl); if (err) { hci_conn_del(acl); return ERR_PTR(err); } } return acl; } static struct hci_link *hci_conn_link(struct hci_conn *parent, struct hci_conn *conn) { struct hci_dev *hdev = parent->hdev; struct hci_link *link; bt_dev_dbg(hdev, "parent %p hcon %p", parent, conn); if (conn->link) return conn->link; if (conn->parent) return NULL; link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) return NULL; link->conn = hci_conn_hold(conn); conn->link = link; conn->parent = hci_conn_get(parent); /* Use list_add_tail_rcu append to the list */ list_add_tail_rcu(&link->list, &parent->link_list); return link; } struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, __u16 setting, struct bt_codec *codec, u16 timeout) { struct hci_conn *acl; struct hci_conn *sco; struct hci_link *link; acl = hci_connect_acl(hdev, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING, CONN_REASON_SCO_CONNECT, timeout); if (IS_ERR(acl)) return acl; sco = hci_conn_hash_lookup_ba(hdev, type, dst); if (!sco) { sco = hci_conn_add_unset(hdev, type, dst, HCI_ROLE_MASTER); if (IS_ERR(sco)) { hci_conn_drop(acl); return sco; } } link = hci_conn_link(acl, sco); if (!link) { hci_conn_drop(acl); hci_conn_drop(sco); return ERR_PTR(-ENOLINK); } sco->setting = setting; sco->codec = *codec; if (acl->state == BT_CONNECTED && (sco->state == BT_OPEN || sco->state == BT_CLOSED)) { set_bit(HCI_CONN_POWER_SAVE, &acl->flags); hci_conn_enter_active_mode(acl, BT_POWER_FORCE_ACTIVE_ON); if (test_bit(HCI_CONN_MODE_CHANGE_PEND, &acl->flags)) { /* defer SCO setup until mode change completed */ set_bit(HCI_CONN_SCO_SETUP_PEND, &acl->flags); return sco; } hci_sco_setup(acl, 0x00); } return sco; } static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos) { struct hci_dev *hdev = conn->hdev; struct hci_cp_le_create_big cp; struct iso_list_data data; memset(&cp, 0, sizeof(cp)); data.big = qos->bcast.big; data.bis = qos->bcast.bis; data.count = 0; /* Create a BIS for each bound connection */ hci_conn_hash_list_state(hdev, bis_list, ISO_LINK, BT_BOUND, &data); cp.handle = qos->bcast.big; cp.adv_handle = qos->bcast.bis; cp.num_bis = data.count; hci_cpu_to_le24(qos->bcast.out.interval, cp.bis.sdu_interval); cp.bis.sdu = cpu_to_le16(qos->bcast.out.sdu); cp.bis.latency = cpu_to_le16(qos->bcast.out.latency); cp.bis.rtn = qos->bcast.out.rtn; cp.bis.phy = qos->bcast.out.phy; cp.bis.packing = qos->bcast.packing; cp.bis.framing = qos->bcast.framing; cp.bis.encryption = qos->bcast.encryption; memcpy(cp.bis.bcode, qos->bcast.bcode, sizeof(cp.bis.bcode)); return hci_send_cmd(hdev, HCI_OP_LE_CREATE_BIG, sizeof(cp), &cp); } static int set_cig_params_sync(struct hci_dev *hdev, void *data) { DEFINE_FLEX(struct hci_cp_le_set_cig_params, pdu, cis, num_cis, 0x1f); u8 cig_id = PTR_UINT(data); struct hci_conn *conn; struct bt_iso_qos *qos; u8 aux_num_cis = 0; u8 cis_id; conn = hci_conn_hash_lookup_cig(hdev, cig_id); if (!conn) return 0; qos = &conn->iso_qos; pdu->cig_id = cig_id; hci_cpu_to_le24(qos->ucast.out.interval, pdu->c_interval); hci_cpu_to_le24(qos->ucast.in.interval, pdu->p_interval); pdu->sca = qos->ucast.sca; pdu->packing = qos->ucast.packing; pdu->framing = qos->ucast.framing; pdu->c_latency = cpu_to_le16(qos->ucast.out.latency); pdu->p_latency = cpu_to_le16(qos->ucast.in.latency); /* Reprogram all CIS(s) with the same CIG, valid range are: * num_cis: 0x00 to 0x1F * cis_id: 0x00 to 0xEF */ for (cis_id = 0x00; cis_id < 0xf0 && aux_num_cis < pdu->num_cis; cis_id++) { struct hci_cis_params *cis; conn = hci_conn_hash_lookup_cis(hdev, NULL, 0, cig_id, cis_id); if (!conn) continue; qos = &conn->iso_qos; cis = &pdu->cis[aux_num_cis++]; cis->cis_id = cis_id; cis->c_sdu = cpu_to_le16(conn->iso_qos.ucast.out.sdu); cis->p_sdu = cpu_to_le16(conn->iso_qos.ucast.in.sdu); cis->c_phy = qos->ucast.out.phy ? qos->ucast.out.phy : qos->ucast.in.phy; cis->p_phy = qos->ucast.in.phy ? qos->ucast.in.phy : qos->ucast.out.phy; cis->c_rtn = qos->ucast.out.rtn; cis->p_rtn = qos->ucast.in.rtn; } pdu->num_cis = aux_num_cis; if (!pdu->num_cis) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_CIG_PARAMS, struct_size(pdu, cis, pdu->num_cis), pdu, HCI_CMD_TIMEOUT); } static bool hci_le_set_cig_params(struct hci_conn *conn, struct bt_iso_qos *qos) { struct hci_dev *hdev = conn->hdev; struct iso_list_data data; memset(&data, 0, sizeof(data)); /* Allocate first still reconfigurable CIG if not set */ if (qos->ucast.cig == BT_ISO_QOS_CIG_UNSET) { for (data.cig = 0x00; data.cig < 0xf0; data.cig++) { data.count = 0; hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_CONNECT, &data); if (data.count) continue; hci_conn_hash_list_state(hdev, find_cis, ISO_LINK, BT_CONNECTED, &data); if (!data.count) break; } if (data.cig == 0xf0) return false; /* Update CIG */ qos->ucast.cig = data.cig; } if (qos->ucast.cis != BT_ISO_QOS_CIS_UNSET) { if (hci_conn_hash_lookup_cis(hdev, NULL, 0, qos->ucast.cig, qos->ucast.cis)) return false; goto done; } /* Allocate first available CIS if not set */ for (data.cig = qos->ucast.cig, data.cis = 0x00; data.cis < 0xf0; data.cis++) { if (!hci_conn_hash_lookup_cis(hdev, NULL, 0, data.cig, data.cis)) { /* Update CIS */ qos->ucast.cis = data.cis; break; } } if (qos->ucast.cis == BT_ISO_QOS_CIS_UNSET) return false; done: if (hci_cmd_sync_queue(hdev, set_cig_params_sync, UINT_PTR(qos->ucast.cig), NULL) < 0) return false; return true; } struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos) { struct hci_conn *cis; cis = hci_conn_hash_lookup_cis(hdev, dst, dst_type, qos->ucast.cig, qos->ucast.cis); if (!cis) { cis = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_MASTER); if (IS_ERR(cis)) return cis; cis->cleanup = cis_cleanup; cis->dst_type = dst_type; cis->iso_qos.ucast.cig = BT_ISO_QOS_CIG_UNSET; cis->iso_qos.ucast.cis = BT_ISO_QOS_CIS_UNSET; } if (cis->state == BT_CONNECTED) return cis; /* Check if CIS has been set and the settings matches */ if (cis->state == BT_BOUND && !memcmp(&cis->iso_qos, qos, sizeof(*qos))) return cis; /* Update LINK PHYs according to QoS preference */ cis->le_tx_phy = qos->ucast.out.phy; cis->le_rx_phy = qos->ucast.in.phy; /* If output interval is not set use the input interval as it cannot be * 0x000000. */ if (!qos->ucast.out.interval) qos->ucast.out.interval = qos->ucast.in.interval; /* If input interval is not set use the output interval as it cannot be * 0x000000. */ if (!qos->ucast.in.interval) qos->ucast.in.interval = qos->ucast.out.interval; /* If output latency is not set use the input latency as it cannot be * 0x0000. */ if (!qos->ucast.out.latency) qos->ucast.out.latency = qos->ucast.in.latency; /* If input latency is not set use the output latency as it cannot be * 0x0000. */ if (!qos->ucast.in.latency) qos->ucast.in.latency = qos->ucast.out.latency; if (!hci_le_set_cig_params(cis, qos)) { hci_conn_drop(cis); return ERR_PTR(-EINVAL); } hci_conn_hold(cis); cis->iso_qos = *qos; cis->state = BT_BOUND; return cis; } bool hci_iso_setup_path(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; struct hci_cp_le_setup_iso_path cmd; memset(&cmd, 0, sizeof(cmd)); if (conn->iso_qos.ucast.out.sdu) { cmd.handle = cpu_to_le16(conn->handle); cmd.direction = 0x00; /* Input (Host to Controller) */ cmd.path = 0x00; /* HCI path if enabled */ cmd.codec = 0x03; /* Transparent Data */ if (hci_send_cmd(hdev, HCI_OP_LE_SETUP_ISO_PATH, sizeof(cmd), &cmd) < 0) return false; } if (conn->iso_qos.ucast.in.sdu) { cmd.handle = cpu_to_le16(conn->handle); cmd.direction = 0x01; /* Output (Controller to Host) */ cmd.path = 0x00; /* HCI path if enabled */ cmd.codec = 0x03; /* Transparent Data */ if (hci_send_cmd(hdev, HCI_OP_LE_SETUP_ISO_PATH, sizeof(cmd), &cmd) < 0) return false; } return true; } int hci_conn_check_create_cis(struct hci_conn *conn) { if (conn->type != ISO_LINK || !bacmp(&conn->dst, BDADDR_ANY)) return -EINVAL; if (!conn->parent || conn->parent->state != BT_CONNECTED || conn->state != BT_CONNECT || HCI_CONN_HANDLE_UNSET(conn->handle)) return 1; return 0; } static int hci_create_cis_sync(struct hci_dev *hdev, void *data) { return hci_le_create_cis_sync(hdev); } int hci_le_create_cis_pending(struct hci_dev *hdev) { struct hci_conn *conn; bool pending = false; rcu_read_lock(); list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { if (test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) { rcu_read_unlock(); return -EBUSY; } if (!hci_conn_check_create_cis(conn)) pending = true; } rcu_read_unlock(); if (!pending) return 0; /* Queue Create CIS */ return hci_cmd_sync_queue(hdev, hci_create_cis_sync, NULL, NULL); } static void hci_iso_qos_setup(struct hci_dev *hdev, struct hci_conn *conn, struct bt_iso_io_qos *qos, __u8 phy) { /* Only set MTU if PHY is enabled */ if (!qos->sdu && qos->phy) qos->sdu = conn->mtu; /* Use the same PHY as ACL if set to any */ if (qos->phy == BT_ISO_PHY_ANY) qos->phy = phy; /* Use LE ACL connection interval if not set */ if (!qos->interval) /* ACL interval unit in 1.25 ms to us */ qos->interval = conn->le_conn_interval * 1250; /* Use LE ACL connection latency if not set */ if (!qos->latency) qos->latency = conn->le_conn_latency; } static int create_big_sync(struct hci_dev *hdev, void *data) { struct hci_conn *conn = data; struct bt_iso_qos *qos = &conn->iso_qos; u16 interval, sync_interval = 0; u32 flags = 0; int err; if (qos->bcast.out.phy == 0x02) flags |= MGMT_ADV_FLAG_SEC_2M; /* Align intervals */ interval = (qos->bcast.out.interval / 1250) * qos->bcast.sync_factor; if (qos->bcast.bis) sync_interval = interval * 4; err = hci_start_per_adv_sync(hdev, qos->bcast.bis, conn->le_per_adv_data_len, conn->le_per_adv_data, flags, interval, interval, sync_interval); if (err) return err; return hci_le_create_big(conn, &conn->iso_qos); } static void create_pa_complete(struct hci_dev *hdev, void *data, int err) { bt_dev_dbg(hdev, ""); if (err) bt_dev_err(hdev, "Unable to create PA: %d", err); } static bool hci_conn_check_create_pa_sync(struct hci_conn *conn) { if (conn->type != ISO_LINK || conn->sid == HCI_SID_INVALID) return false; return true; } static int create_pa_sync(struct hci_dev *hdev, void *data) { struct hci_cp_le_pa_create_sync cp = {0}; struct hci_conn *conn; int err = 0; hci_dev_lock(hdev); rcu_read_lock(); /* The spec allows only one pending LE Periodic Advertising Create * Sync command at a time. If the command is pending now, don't do * anything. We check for pending connections after each PA Sync * Established event. * * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E * page 2493: * * If the Host issues this command when another HCI_LE_Periodic_ * Advertising_Create_Sync command is pending, the Controller shall * return the error code Command Disallowed (0x0C). */ list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { if (test_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags)) goto unlock; } list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { if (hci_conn_check_create_pa_sync(conn)) { struct bt_iso_qos *qos = &conn->iso_qos; cp.options = qos->bcast.options; cp.sid = conn->sid; cp.addr_type = conn->dst_type; bacpy(&cp.addr, &conn->dst); cp.skip = cpu_to_le16(qos->bcast.skip); cp.sync_timeout = cpu_to_le16(qos->bcast.sync_timeout); cp.sync_cte_type = qos->bcast.sync_cte_type; break; } } unlock: rcu_read_unlock(); hci_dev_unlock(hdev); if (bacmp(&cp.addr, BDADDR_ANY)) { hci_dev_set_flag(hdev, HCI_PA_SYNC); set_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); err = __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC, sizeof(cp), &cp, HCI_CMD_TIMEOUT); if (!err) err = hci_update_passive_scan_sync(hdev); if (err) { hci_dev_clear_flag(hdev, HCI_PA_SYNC); clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); } } return err; } int hci_pa_create_sync_pending(struct hci_dev *hdev) { /* Queue start pa_create_sync and scan */ return hci_cmd_sync_queue(hdev, create_pa_sync, NULL, create_pa_complete); } struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos) { struct hci_conn *conn; conn = hci_conn_add_unset(hdev, ISO_LINK, dst, HCI_ROLE_SLAVE); if (IS_ERR(conn)) return conn; conn->iso_qos = *qos; conn->dst_type = dst_type; conn->sid = sid; conn->state = BT_LISTEN; hci_conn_hold(conn); hci_pa_create_sync_pending(hdev); return conn; } static bool hci_conn_check_create_big_sync(struct hci_conn *conn) { if (!conn->num_bis) return false; return true; } static void big_create_sync_complete(struct hci_dev *hdev, void *data, int err) { bt_dev_dbg(hdev, ""); if (err) bt_dev_err(hdev, "Unable to create BIG sync: %d", err); } static int big_create_sync(struct hci_dev *hdev, void *data) { DEFINE_FLEX(struct hci_cp_le_big_create_sync, pdu, bis, num_bis, 0x11); struct hci_conn *conn; rcu_read_lock(); pdu->num_bis = 0; /* The spec allows only one pending LE BIG Create Sync command at * a time. If the command is pending now, don't do anything. We * check for pending connections after each BIG Sync Established * event. * * BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E * page 2586: * * If the Host sends this command when the Controller is in the * process of synchronizing to any BIG, i.e. the HCI_LE_BIG_Sync_ * Established event has not been generated, the Controller shall * return the error code Command Disallowed (0x0C). */ list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { if (test_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags)) goto unlock; } list_for_each_entry_rcu(conn, &hdev->conn_hash.list, list) { if (hci_conn_check_create_big_sync(conn)) { struct bt_iso_qos *qos = &conn->iso_qos; set_bit(HCI_CONN_CREATE_BIG_SYNC, &conn->flags); pdu->handle = qos->bcast.big; pdu->sync_handle = cpu_to_le16(conn->sync_handle); pdu->encryption = qos->bcast.encryption; memcpy(pdu->bcode, qos->bcast.bcode, sizeof(pdu->bcode)); pdu->mse = qos->bcast.mse; pdu->timeout = cpu_to_le16(qos->bcast.timeout); pdu->num_bis = conn->num_bis; memcpy(pdu->bis, conn->bis, conn->num_bis); break; } } unlock: rcu_read_unlock(); if (!pdu->num_bis) return 0; return hci_send_cmd(hdev, HCI_OP_LE_BIG_CREATE_SYNC, struct_size(pdu, bis, pdu->num_bis), pdu); } int hci_le_big_create_sync_pending(struct hci_dev *hdev) { /* Queue big_create_sync */ return hci_cmd_sync_queue_once(hdev, big_create_sync, NULL, big_create_sync_complete); } int hci_le_big_create_sync(struct hci_dev *hdev, struct hci_conn *hcon, struct bt_iso_qos *qos, __u16 sync_handle, __u8 num_bis, __u8 bis[]) { int err; if (num_bis < 0x01 || num_bis > ISO_MAX_NUM_BIS) return -EINVAL; err = qos_set_big(hdev, qos); if (err) return err; if (hcon) { /* Update hcon QoS */ hcon->iso_qos = *qos; hcon->num_bis = num_bis; memcpy(hcon->bis, bis, num_bis); } return hci_le_big_create_sync_pending(hdev); } static void create_big_complete(struct hci_dev *hdev, void *data, int err) { struct hci_conn *conn = data; bt_dev_dbg(hdev, "conn %p", conn); if (err) { bt_dev_err(hdev, "Unable to create BIG: %d", err); hci_connect_cfm(conn, err); hci_conn_del(conn); } } struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, struct bt_iso_qos *qos, __u8 base_len, __u8 *base) { struct hci_conn *conn; struct hci_conn *parent; __u8 eir[HCI_MAX_PER_AD_LENGTH]; struct hci_link *link; /* Look for any BIS that is open for rebinding */ conn = hci_conn_hash_lookup_big_state(hdev, qos->bcast.big, BT_OPEN); if (conn) { memcpy(qos, &conn->iso_qos, sizeof(*qos)); conn->state = BT_CONNECTED; return conn; } if (base_len && base) base_len = eir_append_service_data(eir, 0, 0x1851, base, base_len); /* We need hci_conn object using the BDADDR_ANY as dst */ conn = hci_add_bis(hdev, dst, qos, base_len, eir); if (IS_ERR(conn)) return conn; /* Update LINK PHYs according to QoS preference */ conn->le_tx_phy = qos->bcast.out.phy; conn->le_tx_phy = qos->bcast.out.phy; /* Add Basic Announcement into Peridic Adv Data if BASE is set */ if (base_len && base) { memcpy(conn->le_per_adv_data, eir, sizeof(eir)); conn->le_per_adv_data_len = base_len; } hci_iso_qos_setup(hdev, conn, &qos->bcast.out, conn->le_tx_phy ? conn->le_tx_phy : hdev->le_tx_def_phys); conn->iso_qos = *qos; conn->state = BT_BOUND; /* Link BISes together */ parent = hci_conn_hash_lookup_big(hdev, conn->iso_qos.bcast.big); if (parent && parent != conn) { link = hci_conn_link(parent, conn); hci_conn_drop(conn); if (!link) return ERR_PTR(-ENOLINK); } return conn; } static void bis_mark_per_adv(struct hci_conn *conn, void *data) { struct iso_list_data *d = data; /* Skip if not broadcast/ANY address */ if (bacmp(&conn->dst, BDADDR_ANY)) return; if (d->big != conn->iso_qos.bcast.big || d->bis == BT_ISO_QOS_BIS_UNSET || d->bis != conn->iso_qos.bcast.bis) return; set_bit(HCI_CONN_PER_ADV, &conn->flags); } struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos, __u8 base_len, __u8 *base) { struct hci_conn *conn; int err; struct iso_list_data data; conn = hci_bind_bis(hdev, dst, qos, base_len, base); if (IS_ERR(conn)) return conn; if (conn->state == BT_CONNECTED) return conn; data.big = qos->bcast.big; data.bis = qos->bcast.bis; /* Set HCI_CONN_PER_ADV for all bound connections, to mark that * the start periodic advertising and create BIG commands have * been queued */ hci_conn_hash_list_state(hdev, bis_mark_per_adv, ISO_LINK, BT_BOUND, &data); /* Queue start periodic advertising and create BIG */ err = hci_cmd_sync_queue(hdev, create_big_sync, conn, create_big_complete); if (err < 0) { hci_conn_drop(conn); return ERR_PTR(err); } return conn; } struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos) { struct hci_conn *le; struct hci_conn *cis; struct hci_link *link; if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) le = hci_connect_le(hdev, dst, dst_type, false, BT_SECURITY_LOW, HCI_LE_CONN_TIMEOUT, HCI_ROLE_SLAVE, 0, 0); else le = hci_connect_le_scan(hdev, dst, dst_type, BT_SECURITY_LOW, HCI_LE_CONN_TIMEOUT, CONN_REASON_ISO_CONNECT); if (IS_ERR(le)) return le; hci_iso_qos_setup(hdev, le, &qos->ucast.out, le->le_tx_phy ? le->le_tx_phy : hdev->le_tx_def_phys); hci_iso_qos_setup(hdev, le, &qos->ucast.in, le->le_rx_phy ? le->le_rx_phy : hdev->le_rx_def_phys); cis = hci_bind_cis(hdev, dst, dst_type, qos); if (IS_ERR(cis)) { hci_conn_drop(le); return cis; } link = hci_conn_link(le, cis); hci_conn_drop(cis); if (!link) { hci_conn_drop(le); return ERR_PTR(-ENOLINK); } cis->state = BT_CONNECT; hci_le_create_cis_pending(hdev); return cis; } /* Check link security requirement */ int hci_conn_check_link_mode(struct hci_conn *conn) { BT_DBG("hcon %p", conn); /* In Secure Connections Only mode, it is required that Secure * Connections is used and the link is encrypted with AES-CCM * using a P-256 authenticated combination key. */ if (hci_dev_test_flag(conn->hdev, HCI_SC_ONLY)) { if (!hci_conn_sc_enabled(conn) || !test_bit(HCI_CONN_AES_CCM, &conn->flags) || conn->key_type != HCI_LK_AUTH_COMBINATION_P256) return 0; } /* AES encryption is required for Level 4: * * BLUETOOTH CORE SPECIFICATION Version 5.2 | Vol 3, Part C * page 1319: * * 128-bit equivalent strength for link and encryption keys * required using FIPS approved algorithms (E0 not allowed, * SAFER+ not allowed, and P-192 not allowed; encryption key * not shortened) */ if (conn->sec_level == BT_SECURITY_FIPS && !test_bit(HCI_CONN_AES_CCM, &conn->flags)) { bt_dev_err(conn->hdev, "Invalid security: Missing AES-CCM usage"); return 0; } if (hci_conn_ssp_enabled(conn) && !test_bit(HCI_CONN_ENCRYPT, &conn->flags)) return 0; return 1; } /* Authenticate remote device */ static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) { BT_DBG("hcon %p", conn); if (conn->pending_sec_level > sec_level) sec_level = conn->pending_sec_level; if (sec_level > conn->sec_level) conn->pending_sec_level = sec_level; else if (test_bit(HCI_CONN_AUTH, &conn->flags)) return 1; /* Make sure we preserve an existing MITM requirement*/ auth_type |= (conn->auth_type & 0x01); conn->auth_type = auth_type; if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->flags)) { struct hci_cp_auth_requested cp; cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED, sizeof(cp), &cp); /* Set the ENCRYPT_PEND to trigger encryption after * authentication. */ if (!test_bit(HCI_CONN_ENCRYPT, &conn->flags)) set_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags); } return 0; } /* Encrypt the link */ static void hci_conn_encrypt(struct hci_conn *conn) { BT_DBG("hcon %p", conn); if (!test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) { struct hci_cp_set_conn_encrypt cp; cp.handle = cpu_to_le16(conn->handle); cp.encrypt = 0x01; hci_send_cmd(conn->hdev, HCI_OP_SET_CONN_ENCRYPT, sizeof(cp), &cp); } } /* Enable security */ int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type, bool initiator) { BT_DBG("hcon %p", conn); if (conn->type == LE_LINK) return smp_conn_security(conn, sec_level); /* For sdp we don't need the link key. */ if (sec_level == BT_SECURITY_SDP) return 1; /* For non 2.1 devices and low security level we don't need the link key. */ if (sec_level == BT_SECURITY_LOW && !hci_conn_ssp_enabled(conn)) return 1; /* For other security levels we need the link key. */ if (!test_bit(HCI_CONN_AUTH, &conn->flags)) goto auth; switch (conn->key_type) { case HCI_LK_AUTH_COMBINATION_P256: /* An authenticated FIPS approved combination key has * sufficient security for security level 4 or lower. */ if (sec_level <= BT_SECURITY_FIPS) goto encrypt; break; case HCI_LK_AUTH_COMBINATION_P192: /* An authenticated combination key has sufficient security for * security level 3 or lower. */ if (sec_level <= BT_SECURITY_HIGH) goto encrypt; break; case HCI_LK_UNAUTH_COMBINATION_P192: case HCI_LK_UNAUTH_COMBINATION_P256: /* An unauthenticated combination key has sufficient security * for security level 2 or lower. */ if (sec_level <= BT_SECURITY_MEDIUM) goto encrypt; break; case HCI_LK_COMBINATION: /* A combination key has always sufficient security for the * security levels 2 or lower. High security level requires the * combination key is generated using maximum PIN code length * (16). For pre 2.1 units. */ if (sec_level <= BT_SECURITY_MEDIUM || conn->pin_length == 16) goto encrypt; break; default: break; } auth: if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags)) return 0; if (initiator) set_bit(HCI_CONN_AUTH_INITIATOR, &conn->flags); if (!hci_conn_auth(conn, sec_level, auth_type)) return 0; encrypt: if (test_bit(HCI_CONN_ENCRYPT, &conn->flags)) { /* Ensure that the encryption key size has been read, * otherwise stall the upper layer responses. */ if (!conn->enc_key_size) return 0; /* Nothing else needed, all requirements are met */ return 1; } hci_conn_encrypt(conn); return 0; } EXPORT_SYMBOL(hci_conn_security); /* Check secure link requirement */ int hci_conn_check_secure(struct hci_conn *conn, __u8 sec_level) { BT_DBG("hcon %p", conn); /* Accept if non-secure or higher security level is required */ if (sec_level != BT_SECURITY_HIGH && sec_level != BT_SECURITY_FIPS) return 1; /* Accept if secure or higher security level is already present */ if (conn->sec_level == BT_SECURITY_HIGH || conn->sec_level == BT_SECURITY_FIPS) return 1; /* Reject not secure link */ return 0; } EXPORT_SYMBOL(hci_conn_check_secure); /* Switch role */ int hci_conn_switch_role(struct hci_conn *conn, __u8 role) { BT_DBG("hcon %p", conn); if (role == conn->role) return 1; if (!test_and_set_bit(HCI_CONN_RSWITCH_PEND, &conn->flags)) { struct hci_cp_switch_role cp; bacpy(&cp.bdaddr, &conn->dst); cp.role = role; hci_send_cmd(conn->hdev, HCI_OP_SWITCH_ROLE, sizeof(cp), &cp); } return 0; } EXPORT_SYMBOL(hci_conn_switch_role); /* Enter active mode */ void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active) { struct hci_dev *hdev = conn->hdev; BT_DBG("hcon %p mode %d", conn, conn->mode); if (conn->mode != HCI_CM_SNIFF) goto timer; if (!test_bit(HCI_CONN_POWER_SAVE, &conn->flags) && !force_active) goto timer; if (!test_and_set_bit(HCI_CONN_MODE_CHANGE_PEND, &conn->flags)) { struct hci_cp_exit_sniff_mode cp; cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(hdev, HCI_OP_EXIT_SNIFF_MODE, sizeof(cp), &cp); } timer: if (hdev->idle_timeout > 0) queue_delayed_work(hdev->workqueue, &conn->idle_work, msecs_to_jiffies(hdev->idle_timeout)); } /* Drop all connection on the device */ void hci_conn_hash_flush(struct hci_dev *hdev) { struct list_head *head = &hdev->conn_hash.list; struct hci_conn *conn; BT_DBG("hdev %s", hdev->name); /* We should not traverse the list here, because hci_conn_del * can remove extra links, which may cause the list traversal * to hit items that have already been released. */ while ((conn = list_first_entry_or_null(head, struct hci_conn, list)) != NULL) { conn->state = BT_CLOSED; hci_disconn_cfm(conn, HCI_ERROR_LOCAL_HOST_TERM); hci_conn_del(conn); } } static u32 get_link_mode(struct hci_conn *conn) { u32 link_mode = 0; if (conn->role == HCI_ROLE_MASTER) link_mode |= HCI_LM_MASTER; if (test_bit(HCI_CONN_ENCRYPT, &conn->flags)) link_mode |= HCI_LM_ENCRYPT; if (test_bit(HCI_CONN_AUTH, &conn->flags)) link_mode |= HCI_LM_AUTH; if (test_bit(HCI_CONN_SECURE, &conn->flags)) link_mode |= HCI_LM_SECURE; if (test_bit(HCI_CONN_FIPS, &conn->flags)) link_mode |= HCI_LM_FIPS; return link_mode; } int hci_get_conn_list(void __user *arg) { struct hci_conn *c; struct hci_conn_list_req req, *cl; struct hci_conn_info *ci; struct hci_dev *hdev; int n = 0, size, err; if (copy_from_user(&req, arg, sizeof(req))) return -EFAULT; if (!req.conn_num || req.conn_num > (PAGE_SIZE * 2) / sizeof(*ci)) return -EINVAL; size = sizeof(req) + req.conn_num * sizeof(*ci); cl = kmalloc(size, GFP_KERNEL); if (!cl) return -ENOMEM; hdev = hci_dev_get(req.dev_id); if (!hdev) { kfree(cl); return -ENODEV; } ci = cl->conn_info; hci_dev_lock(hdev); list_for_each_entry(c, &hdev->conn_hash.list, list) { bacpy(&(ci + n)->bdaddr, &c->dst); (ci + n)->handle = c->handle; (ci + n)->type = c->type; (ci + n)->out = c->out; (ci + n)->state = c->state; (ci + n)->link_mode = get_link_mode(c); if (++n >= req.conn_num) break; } hci_dev_unlock(hdev); cl->dev_id = hdev->id; cl->conn_num = n; size = sizeof(req) + n * sizeof(*ci); hci_dev_put(hdev); err = copy_to_user(arg, cl, size); kfree(cl); return err ? -EFAULT : 0; } int hci_get_conn_info(struct hci_dev *hdev, void __user *arg) { struct hci_conn_info_req req; struct hci_conn_info ci; struct hci_conn *conn; char __user *ptr = arg + sizeof(req); if (copy_from_user(&req, arg, sizeof(req))) return -EFAULT; hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, req.type, &req.bdaddr); if (conn) { bacpy(&ci.bdaddr, &conn->dst); ci.handle = conn->handle; ci.type = conn->type; ci.out = conn->out; ci.state = conn->state; ci.link_mode = get_link_mode(conn); } hci_dev_unlock(hdev); if (!conn) return -ENOENT; return copy_to_user(ptr, &ci, sizeof(ci)) ? -EFAULT : 0; } int hci_get_auth_info(struct hci_dev *hdev, void __user *arg) { struct hci_auth_info_req req; struct hci_conn *conn; if (copy_from_user(&req, arg, sizeof(req))) return -EFAULT; hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &req.bdaddr); if (conn) req.type = conn->auth_type; hci_dev_unlock(hdev); if (!conn) return -ENOENT; return copy_to_user(arg, &req, sizeof(req)) ? -EFAULT : 0; } struct hci_chan *hci_chan_create(struct hci_conn *conn) { struct hci_dev *hdev = conn->hdev; struct hci_chan *chan; BT_DBG("%s hcon %p", hdev->name, conn); if (test_bit(HCI_CONN_DROP, &conn->flags)) { BT_DBG("Refusing to create new hci_chan"); return NULL; } chan = kzalloc(sizeof(*chan), GFP_KERNEL); if (!chan) return NULL; chan->conn = hci_conn_get(conn); skb_queue_head_init(&chan->data_q); chan->state = BT_CONNECTED; list_add_rcu(&chan->list, &conn->chan_list); return chan; } void hci_chan_del(struct hci_chan *chan) { struct hci_conn *conn = chan->conn; struct hci_dev *hdev = conn->hdev; BT_DBG("%s hcon %p chan %p", hdev->name, conn, chan); list_del_rcu(&chan->list); synchronize_rcu(); /* Prevent new hci_chan's to be created for this hci_conn */ set_bit(HCI_CONN_DROP, &conn->flags); hci_conn_put(conn); skb_queue_purge(&chan->data_q); kfree(chan); } void hci_chan_list_flush(struct hci_conn *conn) { struct hci_chan *chan, *n; BT_DBG("hcon %p", conn); list_for_each_entry_safe(chan, n, &conn->chan_list, list) hci_chan_del(chan); } static struct hci_chan *__hci_chan_lookup_handle(struct hci_conn *hcon, __u16 handle) { struct hci_chan *hchan; list_for_each_entry(hchan, &hcon->chan_list, list) { if (hchan->handle == handle) return hchan; } return NULL; } struct hci_chan *hci_chan_lookup_handle(struct hci_dev *hdev, __u16 handle) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *hcon; struct hci_chan *hchan = NULL; rcu_read_lock(); list_for_each_entry_rcu(hcon, &h->list, list) { hchan = __hci_chan_lookup_handle(hcon, handle); if (hchan) break; } rcu_read_unlock(); return hchan; } u32 hci_conn_get_phy(struct hci_conn *conn) { u32 phys = 0; /* BLUETOOTH CORE SPECIFICATION Version 5.2 | Vol 2, Part B page 471: * Table 6.2: Packets defined for synchronous, asynchronous, and * CPB logical transport types. */ switch (conn->type) { case SCO_LINK: /* SCO logical transport (1 Mb/s): * HV1, HV2, HV3 and DV. */ phys |= BT_PHY_BR_1M_1SLOT; break; case ACL_LINK: /* ACL logical transport (1 Mb/s) ptt=0: * DH1, DM3, DH3, DM5 and DH5. */ phys |= BT_PHY_BR_1M_1SLOT; if (conn->pkt_type & (HCI_DM3 | HCI_DH3)) phys |= BT_PHY_BR_1M_3SLOT; if (conn->pkt_type & (HCI_DM5 | HCI_DH5)) phys |= BT_PHY_BR_1M_5SLOT; /* ACL logical transport (2 Mb/s) ptt=1: * 2-DH1, 2-DH3 and 2-DH5. */ if (!(conn->pkt_type & HCI_2DH1)) phys |= BT_PHY_EDR_2M_1SLOT; if (!(conn->pkt_type & HCI_2DH3)) phys |= BT_PHY_EDR_2M_3SLOT; if (!(conn->pkt_type & HCI_2DH5)) phys |= BT_PHY_EDR_2M_5SLOT; /* ACL logical transport (3 Mb/s) ptt=1: * 3-DH1, 3-DH3 and 3-DH5. */ if (!(conn->pkt_type & HCI_3DH1)) phys |= BT_PHY_EDR_3M_1SLOT; if (!(conn->pkt_type & HCI_3DH3)) phys |= BT_PHY_EDR_3M_3SLOT; if (!(conn->pkt_type & HCI_3DH5)) phys |= BT_PHY_EDR_3M_5SLOT; break; case ESCO_LINK: /* eSCO logical transport (1 Mb/s): EV3, EV4 and EV5 */ phys |= BT_PHY_BR_1M_1SLOT; if (!(conn->pkt_type & (ESCO_EV4 | ESCO_EV5))) phys |= BT_PHY_BR_1M_3SLOT; /* eSCO logical transport (2 Mb/s): 2-EV3, 2-EV5 */ if (!(conn->pkt_type & ESCO_2EV3)) phys |= BT_PHY_EDR_2M_1SLOT; if (!(conn->pkt_type & ESCO_2EV5)) phys |= BT_PHY_EDR_2M_3SLOT; /* eSCO logical transport (3 Mb/s): 3-EV3, 3-EV5 */ if (!(conn->pkt_type & ESCO_3EV3)) phys |= BT_PHY_EDR_3M_1SLOT; if (!(conn->pkt_type & ESCO_3EV5)) phys |= BT_PHY_EDR_3M_3SLOT; break; case LE_LINK: if (conn->le_tx_phy & HCI_LE_SET_PHY_1M) phys |= BT_PHY_LE_1M_TX; if (conn->le_rx_phy & HCI_LE_SET_PHY_1M) phys |= BT_PHY_LE_1M_RX; if (conn->le_tx_phy & HCI_LE_SET_PHY_2M) phys |= BT_PHY_LE_2M_TX; if (conn->le_rx_phy & HCI_LE_SET_PHY_2M) phys |= BT_PHY_LE_2M_RX; if (conn->le_tx_phy & HCI_LE_SET_PHY_CODED) phys |= BT_PHY_LE_CODED_TX; if (conn->le_rx_phy & HCI_LE_SET_PHY_CODED) phys |= BT_PHY_LE_CODED_RX; break; } return phys; } static int abort_conn_sync(struct hci_dev *hdev, void *data) { struct hci_conn *conn = data; if (!hci_conn_valid(hdev, conn)) return -ECANCELED; return hci_abort_conn_sync(hdev, conn, conn->abort_reason); } int hci_abort_conn(struct hci_conn *conn, u8 reason) { struct hci_dev *hdev = conn->hdev; /* If abort_reason has already been set it means the connection is * already being aborted so don't attempt to overwrite it. */ if (conn->abort_reason) return 0; bt_dev_dbg(hdev, "handle 0x%2.2x reason 0x%2.2x", conn->handle, reason); conn->abort_reason = reason; /* If the connection is pending check the command opcode since that * might be blocking on hci_cmd_sync_work while waiting its respective * event so we need to hci_cmd_sync_cancel to cancel it. * * hci_connect_le serializes the connection attempts so only one * connection can be in BT_CONNECT at time. */ if (conn->state == BT_CONNECT && hdev->req_status == HCI_REQ_PEND) { switch (hci_skb_event(hdev->sent_cmd)) { case HCI_EV_CONN_COMPLETE: case HCI_EV_LE_CONN_COMPLETE: case HCI_EV_LE_ENHANCED_CONN_COMPLETE: case HCI_EVT_LE_CIS_ESTABLISHED: hci_cmd_sync_cancel(hdev, ECANCELED); break; } /* Cancel connect attempt if still queued/pending */ } else if (!hci_cancel_connect_sync(hdev, conn)) { return 0; } /* Run immediately if on cmd_sync_work since this may be called * as a result to MGMT_OP_DISCONNECT/MGMT_OP_UNPAIR which does * already queue its callback on cmd_sync_work. */ return hci_cmd_sync_run_once(hdev, abort_conn_sync, conn, NULL); }
1738 5 3 3 3 3 3 1516 1516 1514 1519 547 547 2 2 2 2 2 1737 1735 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 // SPDX-License-Identifier: GPL-2.0 /* * bus.c - bus driver management * * Copyright (c) 2002-3 Patrick Mochel * Copyright (c) 2002-3 Open Source Development Labs * Copyright (c) 2007 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2007 Novell Inc. * Copyright (c) 2023 Greg Kroah-Hartman <gregkh@linuxfoundation.org> */ #include <linux/async.h> #include <linux/device/bus.h> #include <linux/device.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/string.h> #include <linux/mutex.h> #include <linux/sysfs.h> #include "base.h" #include "power/power.h" /* /sys/devices/system */ static struct kset *system_kset; /* /sys/bus */ static struct kset *bus_kset; #define to_bus_attr(_attr) container_of(_attr, struct bus_attribute, attr) /* * sysfs bindings for drivers */ #define to_drv_attr(_attr) container_of(_attr, struct driver_attribute, attr) #define DRIVER_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \ struct driver_attribute driver_attr_##_name = \ __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) static int __must_check bus_rescan_devices_helper(struct device *dev, void *data); /** * bus_to_subsys - Turn a struct bus_type into a struct subsys_private * * @bus: pointer to the struct bus_type to look up * * The driver core internals needs to work on the subsys_private structure, not * the external struct bus_type pointer. This function walks the list of * registered busses in the system and finds the matching one and returns the * internal struct subsys_private that relates to that bus. * * Note, the reference count of the return value is INCREMENTED if it is not * NULL. A call to subsys_put() must be done when finished with the pointer in * order for it to be properly freed. */ static struct subsys_private *bus_to_subsys(const struct bus_type *bus) { struct subsys_private *sp = NULL; struct kobject *kobj; if (!bus || !bus_kset) return NULL; spin_lock(&bus_kset->list_lock); if (list_empty(&bus_kset->list)) goto done; list_for_each_entry(kobj, &bus_kset->list, entry) { struct kset *kset = container_of(kobj, struct kset, kobj); sp = container_of_const(kset, struct subsys_private, subsys); if (sp->bus == bus) goto done; } sp = NULL; done: sp = subsys_get(sp); spin_unlock(&bus_kset->list_lock); return sp; } static const struct bus_type *bus_get(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); if (sp) return bus; return NULL; } static void bus_put(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); /* two puts are required as the call to bus_to_subsys incremented it again */ subsys_put(sp); subsys_put(sp); } static ssize_t drv_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct driver_attribute *drv_attr = to_drv_attr(attr); struct driver_private *drv_priv = to_driver(kobj); ssize_t ret = -EIO; if (drv_attr->show) ret = drv_attr->show(drv_priv->driver, buf); return ret; } static ssize_t drv_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct driver_attribute *drv_attr = to_drv_attr(attr); struct driver_private *drv_priv = to_driver(kobj); ssize_t ret = -EIO; if (drv_attr->store) ret = drv_attr->store(drv_priv->driver, buf, count); return ret; } static const struct sysfs_ops driver_sysfs_ops = { .show = drv_attr_show, .store = drv_attr_store, }; static void driver_release(struct kobject *kobj) { struct driver_private *drv_priv = to_driver(kobj); pr_debug("driver: '%s': %s\n", kobject_name(kobj), __func__); kfree(drv_priv); } static const struct kobj_type driver_ktype = { .sysfs_ops = &driver_sysfs_ops, .release = driver_release, }; /* * sysfs bindings for buses */ static ssize_t bus_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct bus_attribute *bus_attr = to_bus_attr(attr); struct subsys_private *subsys_priv = to_subsys_private(kobj); /* return -EIO for reading a bus attribute without show() */ ssize_t ret = -EIO; if (bus_attr->show) ret = bus_attr->show(subsys_priv->bus, buf); return ret; } static ssize_t bus_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct bus_attribute *bus_attr = to_bus_attr(attr); struct subsys_private *subsys_priv = to_subsys_private(kobj); /* return -EIO for writing a bus attribute without store() */ ssize_t ret = -EIO; if (bus_attr->store) ret = bus_attr->store(subsys_priv->bus, buf, count); return ret; } static const struct sysfs_ops bus_sysfs_ops = { .show = bus_attr_show, .store = bus_attr_store, }; int bus_create_file(const struct bus_type *bus, struct bus_attribute *attr) { struct subsys_private *sp = bus_to_subsys(bus); int error; if (!sp) return -EINVAL; error = sysfs_create_file(&sp->subsys.kobj, &attr->attr); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(bus_create_file); void bus_remove_file(const struct bus_type *bus, struct bus_attribute *attr) { struct subsys_private *sp = bus_to_subsys(bus); if (!sp) return; sysfs_remove_file(&sp->subsys.kobj, &attr->attr); subsys_put(sp); } EXPORT_SYMBOL_GPL(bus_remove_file); static void bus_release(struct kobject *kobj) { struct subsys_private *priv = to_subsys_private(kobj); lockdep_unregister_key(&priv->lock_key); kfree(priv); } static const struct kobj_type bus_ktype = { .sysfs_ops = &bus_sysfs_ops, .release = bus_release, }; static int bus_uevent_filter(const struct kobject *kobj) { const struct kobj_type *ktype = get_ktype(kobj); if (ktype == &bus_ktype) return 1; return 0; } static const struct kset_uevent_ops bus_uevent_ops = { .filter = bus_uevent_filter, }; /* Manually detach a device from its associated driver. */ static ssize_t unbind_store(struct device_driver *drv, const char *buf, size_t count) { const struct bus_type *bus = bus_get(drv->bus); struct device *dev; int err = -ENODEV; dev = bus_find_device_by_name(bus, NULL, buf); if (dev && dev->driver == drv) { device_driver_detach(dev); err = count; } put_device(dev); bus_put(bus); return err; } static DRIVER_ATTR_IGNORE_LOCKDEP(unbind, 0200, NULL, unbind_store); /* * Manually attach a device to a driver. * Note: the driver must want to bind to the device, * it is not possible to override the driver's id table. */ static ssize_t bind_store(struct device_driver *drv, const char *buf, size_t count) { const struct bus_type *bus = bus_get(drv->bus); struct device *dev; int err = -ENODEV; dev = bus_find_device_by_name(bus, NULL, buf); if (dev && driver_match_device(drv, dev)) { err = device_driver_attach(drv, dev); if (!err) { /* success */ err = count; } } put_device(dev); bus_put(bus); return err; } static DRIVER_ATTR_IGNORE_LOCKDEP(bind, 0200, NULL, bind_store); static ssize_t drivers_autoprobe_show(const struct bus_type *bus, char *buf) { struct subsys_private *sp = bus_to_subsys(bus); int ret; if (!sp) return -EINVAL; ret = sysfs_emit(buf, "%d\n", sp->drivers_autoprobe); subsys_put(sp); return ret; } static ssize_t drivers_autoprobe_store(const struct bus_type *bus, const char *buf, size_t count) { struct subsys_private *sp = bus_to_subsys(bus); if (!sp) return -EINVAL; if (buf[0] == '0') sp->drivers_autoprobe = 0; else sp->drivers_autoprobe = 1; subsys_put(sp); return count; } static ssize_t drivers_probe_store(const struct bus_type *bus, const char *buf, size_t count) { struct device *dev; int err = -EINVAL; dev = bus_find_device_by_name(bus, NULL, buf); if (!dev) return -ENODEV; if (bus_rescan_devices_helper(dev, NULL) == 0) err = count; put_device(dev); return err; } static struct device *next_device(struct klist_iter *i) { struct klist_node *n = klist_next(i); struct device *dev = NULL; struct device_private *dev_prv; if (n) { dev_prv = to_device_private_bus(n); dev = dev_prv->device; } return dev; } /** * bus_for_each_dev - device iterator. * @bus: bus type. * @start: device to start iterating from. * @data: data for the callback. * @fn: function to be called for each device. * * Iterate over @bus's list of devices, and call @fn for each, * passing it @data. If @start is not NULL, we use that device to * begin iterating from. * * We check the return of @fn each time. If it returns anything * other than 0, we break out and return that value. * * NOTE: The device that returns a non-zero value is not retained * in any way, nor is its refcount incremented. If the caller needs * to retain this data, it should do so, and increment the reference * count in the supplied callback. */ int bus_for_each_dev(const struct bus_type *bus, struct device *start, void *data, device_iter_t fn) { struct subsys_private *sp = bus_to_subsys(bus); struct klist_iter i; struct device *dev; int error = 0; if (!sp) return -EINVAL; klist_iter_init_node(&sp->klist_devices, &i, (start ? &start->p->knode_bus : NULL)); while (!error && (dev = next_device(&i))) error = fn(dev, data); klist_iter_exit(&i); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(bus_for_each_dev); /** * bus_find_device - device iterator for locating a particular device. * @bus: bus type * @start: Device to begin with * @data: Data to pass to match function * @match: Callback function to check device * * This is similar to the bus_for_each_dev() function above, but it * returns a reference to a device that is 'found' for later use, as * determined by the @match callback. * * The callback should return 0 if the device doesn't match and non-zero * if it does. If the callback returns non-zero, this function will * return to the caller and not iterate over any more devices. */ struct device *bus_find_device(const struct bus_type *bus, struct device *start, const void *data, device_match_t match) { struct subsys_private *sp = bus_to_subsys(bus); struct klist_iter i; struct device *dev; if (!sp) return NULL; klist_iter_init_node(&sp->klist_devices, &i, (start ? &start->p->knode_bus : NULL)); while ((dev = next_device(&i))) { if (match(dev, data)) { get_device(dev); break; } } klist_iter_exit(&i); subsys_put(sp); return dev; } EXPORT_SYMBOL_GPL(bus_find_device); static struct device_driver *next_driver(struct klist_iter *i) { struct klist_node *n = klist_next(i); struct driver_private *drv_priv; if (n) { drv_priv = container_of(n, struct driver_private, knode_bus); return drv_priv->driver; } return NULL; } /** * bus_for_each_drv - driver iterator * @bus: bus we're dealing with. * @start: driver to start iterating on. * @data: data to pass to the callback. * @fn: function to call for each driver. * * This is nearly identical to the device iterator above. * We iterate over each driver that belongs to @bus, and call * @fn for each. If @fn returns anything but 0, we break out * and return it. If @start is not NULL, we use it as the head * of the list. * * NOTE: we don't return the driver that returns a non-zero * value, nor do we leave the reference count incremented for that * driver. If the caller needs to know that info, it must set it * in the callback. It must also be sure to increment the refcount * so it doesn't disappear before returning to the caller. */ int bus_for_each_drv(const struct bus_type *bus, struct device_driver *start, void *data, int (*fn)(struct device_driver *, void *)) { struct subsys_private *sp = bus_to_subsys(bus); struct klist_iter i; struct device_driver *drv; int error = 0; if (!sp) return -EINVAL; klist_iter_init_node(&sp->klist_drivers, &i, start ? &start->p->knode_bus : NULL); while ((drv = next_driver(&i)) && !error) error = fn(drv, data); klist_iter_exit(&i); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(bus_for_each_drv); /** * bus_add_device - add device to bus * @dev: device being added * * - Add device's bus attributes. * - Create links to device's bus. * - Add the device to its bus's list of devices. */ int bus_add_device(struct device *dev) { struct subsys_private *sp = bus_to_subsys(dev->bus); int error; if (!sp) { /* * This is a normal operation for many devices that do not * have a bus assigned to them, just say that all went * well. */ return 0; } /* * Reference in sp is now incremented and will be dropped when * the device is removed from the bus */ pr_debug("bus: '%s': add device %s\n", sp->bus->name, dev_name(dev)); error = device_add_groups(dev, sp->bus->dev_groups); if (error) goto out_put; error = sysfs_create_link(&sp->devices_kset->kobj, &dev->kobj, dev_name(dev)); if (error) goto out_groups; error = sysfs_create_link(&dev->kobj, &sp->subsys.kobj, "subsystem"); if (error) goto out_subsys; klist_add_tail(&dev->p->knode_bus, &sp->klist_devices); return 0; out_subsys: sysfs_remove_link(&sp->devices_kset->kobj, dev_name(dev)); out_groups: device_remove_groups(dev, sp->bus->dev_groups); out_put: subsys_put(sp); return error; } /** * bus_probe_device - probe drivers for a new device * @dev: device to probe * * - Automatically probe for a driver if the bus allows it. */ void bus_probe_device(struct device *dev) { struct subsys_private *sp = bus_to_subsys(dev->bus); struct subsys_interface *sif; if (!sp) return; if (sp->drivers_autoprobe) device_initial_probe(dev); mutex_lock(&sp->mutex); list_for_each_entry(sif, &sp->interfaces, node) if (sif->add_dev) sif->add_dev(dev, sif); mutex_unlock(&sp->mutex); subsys_put(sp); } /** * bus_remove_device - remove device from bus * @dev: device to be removed * * - Remove device from all interfaces. * - Remove symlink from bus' directory. * - Delete device from bus's list. * - Detach from its driver. * - Drop reference taken in bus_add_device(). */ void bus_remove_device(struct device *dev) { struct subsys_private *sp = bus_to_subsys(dev->bus); struct subsys_interface *sif; if (!sp) return; mutex_lock(&sp->mutex); list_for_each_entry(sif, &sp->interfaces, node) if (sif->remove_dev) sif->remove_dev(dev, sif); mutex_unlock(&sp->mutex); sysfs_remove_link(&dev->kobj, "subsystem"); sysfs_remove_link(&sp->devices_kset->kobj, dev_name(dev)); device_remove_groups(dev, dev->bus->dev_groups); if (klist_node_attached(&dev->p->knode_bus)) klist_del(&dev->p->knode_bus); pr_debug("bus: '%s': remove device %s\n", dev->bus->name, dev_name(dev)); device_release_driver(dev); /* * Decrement the reference count twice, once for the bus_to_subsys() * call in the start of this function, and the second one from the * reference increment in bus_add_device() */ subsys_put(sp); subsys_put(sp); } static int __must_check add_bind_files(struct device_driver *drv) { int ret; ret = driver_create_file(drv, &driver_attr_unbind); if (ret == 0) { ret = driver_create_file(drv, &driver_attr_bind); if (ret) driver_remove_file(drv, &driver_attr_unbind); } return ret; } static void remove_bind_files(struct device_driver *drv) { driver_remove_file(drv, &driver_attr_bind); driver_remove_file(drv, &driver_attr_unbind); } static BUS_ATTR_WO(drivers_probe); static BUS_ATTR_RW(drivers_autoprobe); static int add_probe_files(const struct bus_type *bus) { int retval; retval = bus_create_file(bus, &bus_attr_drivers_probe); if (retval) goto out; retval = bus_create_file(bus, &bus_attr_drivers_autoprobe); if (retval) bus_remove_file(bus, &bus_attr_drivers_probe); out: return retval; } static void remove_probe_files(const struct bus_type *bus) { bus_remove_file(bus, &bus_attr_drivers_autoprobe); bus_remove_file(bus, &bus_attr_drivers_probe); } static ssize_t uevent_store(struct device_driver *drv, const char *buf, size_t count) { int rc; rc = kobject_synth_uevent(&drv->p->kobj, buf, count); return rc ? rc : count; } static DRIVER_ATTR_WO(uevent); /** * bus_add_driver - Add a driver to the bus. * @drv: driver. */ int bus_add_driver(struct device_driver *drv) { struct subsys_private *sp = bus_to_subsys(drv->bus); struct driver_private *priv; int error = 0; if (!sp) return -EINVAL; /* * Reference in sp is now incremented and will be dropped when * the driver is removed from the bus */ pr_debug("bus: '%s': add driver %s\n", sp->bus->name, drv->name); priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) { error = -ENOMEM; goto out_put_bus; } klist_init(&priv->klist_devices, NULL, NULL); priv->driver = drv; drv->p = priv; priv->kobj.kset = sp->drivers_kset; error = kobject_init_and_add(&priv->kobj, &driver_ktype, NULL, "%s", drv->name); if (error) goto out_unregister; klist_add_tail(&priv->knode_bus, &sp->klist_drivers); if (sp->drivers_autoprobe) { error = driver_attach(drv); if (error) goto out_del_list; } error = module_add_driver(drv->owner, drv); if (error) { printk(KERN_ERR "%s: failed to create module links for %s\n", __func__, drv->name); goto out_detach; } error = driver_create_file(drv, &driver_attr_uevent); if (error) { printk(KERN_ERR "%s: uevent attr (%s) failed\n", __func__, drv->name); } error = driver_add_groups(drv, sp->bus->drv_groups); if (error) { /* How the hell do we get out of this pickle? Give up */ printk(KERN_ERR "%s: driver_add_groups(%s) failed\n", __func__, drv->name); } if (!drv->suppress_bind_attrs) { error = add_bind_files(drv); if (error) { /* Ditto */ printk(KERN_ERR "%s: add_bind_files(%s) failed\n", __func__, drv->name); } } return 0; out_detach: driver_detach(drv); out_del_list: klist_del(&priv->knode_bus); out_unregister: kobject_put(&priv->kobj); /* drv->p is freed in driver_release() */ drv->p = NULL; out_put_bus: subsys_put(sp); return error; } /** * bus_remove_driver - delete driver from bus's knowledge. * @drv: driver. * * Detach the driver from the devices it controls, and remove * it from its bus's list of drivers. Finally, we drop the reference * to the bus we took in bus_add_driver(). */ void bus_remove_driver(struct device_driver *drv) { struct subsys_private *sp = bus_to_subsys(drv->bus); if (!sp) return; pr_debug("bus: '%s': remove driver %s\n", sp->bus->name, drv->name); if (!drv->suppress_bind_attrs) remove_bind_files(drv); driver_remove_groups(drv, sp->bus->drv_groups); driver_remove_file(drv, &driver_attr_uevent); klist_remove(&drv->p->knode_bus); driver_detach(drv); module_remove_driver(drv); kobject_put(&drv->p->kobj); /* * Decrement the reference count twice, once for the bus_to_subsys() * call in the start of this function, and the second one from the * reference increment in bus_add_driver() */ subsys_put(sp); subsys_put(sp); } /* Helper for bus_rescan_devices's iter */ static int __must_check bus_rescan_devices_helper(struct device *dev, void *data) { int ret = 0; if (!dev->driver) { if (dev->parent && dev->bus->need_parent_lock) device_lock(dev->parent); ret = device_attach(dev); if (dev->parent && dev->bus->need_parent_lock) device_unlock(dev->parent); } return ret < 0 ? ret : 0; } /** * bus_rescan_devices - rescan devices on the bus for possible drivers * @bus: the bus to scan. * * This function will look for devices on the bus with no driver * attached and rescan it against existing drivers to see if it matches * any by calling device_attach() for the unbound devices. */ int bus_rescan_devices(const struct bus_type *bus) { return bus_for_each_dev(bus, NULL, NULL, bus_rescan_devices_helper); } EXPORT_SYMBOL_GPL(bus_rescan_devices); /** * device_reprobe - remove driver for a device and probe for a new driver * @dev: the device to reprobe * * This function detaches the attached driver (if any) for the given * device and restarts the driver probing process. It is intended * to use if probing criteria changed during a devices lifetime and * driver attachment should change accordingly. */ int device_reprobe(struct device *dev) { if (dev->driver) device_driver_detach(dev); return bus_rescan_devices_helper(dev, NULL); } EXPORT_SYMBOL_GPL(device_reprobe); static void klist_devices_get(struct klist_node *n) { struct device_private *dev_prv = to_device_private_bus(n); struct device *dev = dev_prv->device; get_device(dev); } static void klist_devices_put(struct klist_node *n) { struct device_private *dev_prv = to_device_private_bus(n); struct device *dev = dev_prv->device; put_device(dev); } static ssize_t bus_uevent_store(const struct bus_type *bus, const char *buf, size_t count) { struct subsys_private *sp = bus_to_subsys(bus); int ret; if (!sp) return -EINVAL; ret = kobject_synth_uevent(&sp->subsys.kobj, buf, count); subsys_put(sp); if (ret) return ret; return count; } /* * "open code" the old BUS_ATTR() macro here. We want to use BUS_ATTR_WO() * here, but can not use it as earlier in the file we have * DEVICE_ATTR_WO(uevent), which would cause a clash with the with the store * function name. */ static struct bus_attribute bus_attr_uevent = __ATTR(uevent, 0200, NULL, bus_uevent_store); /** * bus_register - register a driver-core subsystem * @bus: bus to register * * Once we have that, we register the bus with the kobject * infrastructure, then register the children subsystems it has: * the devices and drivers that belong to the subsystem. */ int bus_register(const struct bus_type *bus) { int retval; struct subsys_private *priv; struct kobject *bus_kobj; struct lock_class_key *key; priv = kzalloc(sizeof(struct subsys_private), GFP_KERNEL); if (!priv) return -ENOMEM; priv->bus = bus; BLOCKING_INIT_NOTIFIER_HEAD(&priv->bus_notifier); bus_kobj = &priv->subsys.kobj; retval = kobject_set_name(bus_kobj, "%s", bus->name); if (retval) goto out; bus_kobj->kset = bus_kset; bus_kobj->ktype = &bus_ktype; priv->drivers_autoprobe = 1; retval = kset_register(&priv->subsys); if (retval) goto out; retval = bus_create_file(bus, &bus_attr_uevent); if (retval) goto bus_uevent_fail; priv->devices_kset = kset_create_and_add("devices", NULL, bus_kobj); if (!priv->devices_kset) { retval = -ENOMEM; goto bus_devices_fail; } priv->drivers_kset = kset_create_and_add("drivers", NULL, bus_kobj); if (!priv->drivers_kset) { retval = -ENOMEM; goto bus_drivers_fail; } INIT_LIST_HEAD(&priv->interfaces); key = &priv->lock_key; lockdep_register_key(key); __mutex_init(&priv->mutex, "subsys mutex", key); klist_init(&priv->klist_devices, klist_devices_get, klist_devices_put); klist_init(&priv->klist_drivers, NULL, NULL); retval = add_probe_files(bus); if (retval) goto bus_probe_files_fail; retval = sysfs_create_groups(bus_kobj, bus->bus_groups); if (retval) goto bus_groups_fail; pr_debug("bus: '%s': registered\n", bus->name); return 0; bus_groups_fail: remove_probe_files(bus); bus_probe_files_fail: kset_unregister(priv->drivers_kset); bus_drivers_fail: kset_unregister(priv->devices_kset); bus_devices_fail: bus_remove_file(bus, &bus_attr_uevent); bus_uevent_fail: kset_unregister(&priv->subsys); /* Above kset_unregister() will kfree @priv */ priv = NULL; out: kfree(priv); return retval; } EXPORT_SYMBOL_GPL(bus_register); /** * bus_unregister - remove a bus from the system * @bus: bus. * * Unregister the child subsystems and the bus itself. * Finally, we call bus_put() to release the refcount */ void bus_unregister(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct kobject *bus_kobj; if (!sp) return; pr_debug("bus: '%s': unregistering\n", bus->name); if (sp->dev_root) device_unregister(sp->dev_root); bus_kobj = &sp->subsys.kobj; sysfs_remove_groups(bus_kobj, bus->bus_groups); remove_probe_files(bus); bus_remove_file(bus, &bus_attr_uevent); kset_unregister(sp->drivers_kset); kset_unregister(sp->devices_kset); kset_unregister(&sp->subsys); subsys_put(sp); } EXPORT_SYMBOL_GPL(bus_unregister); int bus_register_notifier(const struct bus_type *bus, struct notifier_block *nb) { struct subsys_private *sp = bus_to_subsys(bus); int retval; if (!sp) return -EINVAL; retval = blocking_notifier_chain_register(&sp->bus_notifier, nb); subsys_put(sp); return retval; } EXPORT_SYMBOL_GPL(bus_register_notifier); int bus_unregister_notifier(const struct bus_type *bus, struct notifier_block *nb) { struct subsys_private *sp = bus_to_subsys(bus); int retval; if (!sp) return -EINVAL; retval = blocking_notifier_chain_unregister(&sp->bus_notifier, nb); subsys_put(sp); return retval; } EXPORT_SYMBOL_GPL(bus_unregister_notifier); void bus_notify(struct device *dev, enum bus_notifier_event value) { struct subsys_private *sp = bus_to_subsys(dev->bus); if (!sp) return; blocking_notifier_call_chain(&sp->bus_notifier, value, dev); subsys_put(sp); } struct kset *bus_get_kset(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct kset *kset; if (!sp) return NULL; kset = &sp->subsys; subsys_put(sp); return kset; } EXPORT_SYMBOL_GPL(bus_get_kset); /* * Yes, this forcibly breaks the klist abstraction temporarily. It * just wants to sort the klist, not change reference counts and * take/drop locks rapidly in the process. It does all this while * holding the lock for the list, so objects can't otherwise be * added/removed while we're swizzling. */ static void device_insertion_sort_klist(struct device *a, struct list_head *list, int (*compare)(const struct device *a, const struct device *b)) { struct klist_node *n; struct device_private *dev_prv; struct device *b; list_for_each_entry(n, list, n_node) { dev_prv = to_device_private_bus(n); b = dev_prv->device; if (compare(a, b) <= 0) { list_move_tail(&a->p->knode_bus.n_node, &b->p->knode_bus.n_node); return; } } list_move_tail(&a->p->knode_bus.n_node, list); } void bus_sort_breadthfirst(const struct bus_type *bus, int (*compare)(const struct device *a, const struct device *b)) { struct subsys_private *sp = bus_to_subsys(bus); LIST_HEAD(sorted_devices); struct klist_node *n, *tmp; struct device_private *dev_prv; struct device *dev; struct klist *device_klist; if (!sp) return; device_klist = &sp->klist_devices; spin_lock(&device_klist->k_lock); list_for_each_entry_safe(n, tmp, &device_klist->k_list, n_node) { dev_prv = to_device_private_bus(n); dev = dev_prv->device; device_insertion_sort_klist(dev, &sorted_devices, compare); } list_splice(&sorted_devices, &device_klist->k_list); spin_unlock(&device_klist->k_lock); subsys_put(sp); } EXPORT_SYMBOL_GPL(bus_sort_breadthfirst); struct subsys_dev_iter { struct klist_iter ki; const struct device_type *type; }; /** * subsys_dev_iter_init - initialize subsys device iterator * @iter: subsys iterator to initialize * @sp: the subsys private (i.e. bus) we wanna iterate over * @start: the device to start iterating from, if any * @type: device_type of the devices to iterate over, NULL for all * * Initialize subsys iterator @iter such that it iterates over devices * of @subsys. If @start is set, the list iteration will start there, * otherwise if it is NULL, the iteration starts at the beginning of * the list. */ static void subsys_dev_iter_init(struct subsys_dev_iter *iter, struct subsys_private *sp, struct device *start, const struct device_type *type) { struct klist_node *start_knode = NULL; if (start) start_knode = &start->p->knode_bus; klist_iter_init_node(&sp->klist_devices, &iter->ki, start_knode); iter->type = type; } /** * subsys_dev_iter_next - iterate to the next device * @iter: subsys iterator to proceed * * Proceed @iter to the next device and return it. Returns NULL if * iteration is complete. * * The returned device is referenced and won't be released till * iterator is proceed to the next device or exited. The caller is * free to do whatever it wants to do with the device including * calling back into subsys code. */ static struct device *subsys_dev_iter_next(struct subsys_dev_iter *iter) { struct klist_node *knode; struct device *dev; for (;;) { knode = klist_next(&iter->ki); if (!knode) return NULL; dev = to_device_private_bus(knode)->device; if (!iter->type || iter->type == dev->type) return dev; } } /** * subsys_dev_iter_exit - finish iteration * @iter: subsys iterator to finish * * Finish an iteration. Always call this function after iteration is * complete whether the iteration ran till the end or not. */ static void subsys_dev_iter_exit(struct subsys_dev_iter *iter) { klist_iter_exit(&iter->ki); } int subsys_interface_register(struct subsys_interface *sif) { struct subsys_private *sp; struct subsys_dev_iter iter; struct device *dev; if (!sif || !sif->subsys) return -ENODEV; sp = bus_to_subsys(sif->subsys); if (!sp) return -EINVAL; /* * Reference in sp is now incremented and will be dropped when * the interface is removed from the bus */ mutex_lock(&sp->mutex); list_add_tail(&sif->node, &sp->interfaces); if (sif->add_dev) { subsys_dev_iter_init(&iter, sp, NULL, NULL); while ((dev = subsys_dev_iter_next(&iter))) sif->add_dev(dev, sif); subsys_dev_iter_exit(&iter); } mutex_unlock(&sp->mutex); return 0; } EXPORT_SYMBOL_GPL(subsys_interface_register); void subsys_interface_unregister(struct subsys_interface *sif) { struct subsys_private *sp; struct subsys_dev_iter iter; struct device *dev; if (!sif || !sif->subsys) return; sp = bus_to_subsys(sif->subsys); if (!sp) return; mutex_lock(&sp->mutex); list_del_init(&sif->node); if (sif->remove_dev) { subsys_dev_iter_init(&iter, sp, NULL, NULL); while ((dev = subsys_dev_iter_next(&iter))) sif->remove_dev(dev, sif); subsys_dev_iter_exit(&iter); } mutex_unlock(&sp->mutex); /* * Decrement the reference count twice, once for the bus_to_subsys() * call in the start of this function, and the second one from the * reference increment in subsys_interface_register() */ subsys_put(sp); subsys_put(sp); } EXPORT_SYMBOL_GPL(subsys_interface_unregister); static void system_root_device_release(struct device *dev) { kfree(dev); } static int subsys_register(const struct bus_type *subsys, const struct attribute_group **groups, struct kobject *parent_of_root) { struct subsys_private *sp; struct device *dev; int err; err = bus_register(subsys); if (err < 0) return err; sp = bus_to_subsys(subsys); if (!sp) { err = -EINVAL; goto err_sp; } dev = kzalloc(sizeof(struct device), GFP_KERNEL); if (!dev) { err = -ENOMEM; goto err_dev; } err = dev_set_name(dev, "%s", subsys->name); if (err < 0) goto err_name; dev->kobj.parent = parent_of_root; dev->groups = groups; dev->release = system_root_device_release; err = device_register(dev); if (err < 0) goto err_dev_reg; sp->dev_root = dev; subsys_put(sp); return 0; err_dev_reg: put_device(dev); dev = NULL; err_name: kfree(dev); err_dev: subsys_put(sp); err_sp: bus_unregister(subsys); return err; } /** * subsys_system_register - register a subsystem at /sys/devices/system/ * @subsys: system subsystem * @groups: default attributes for the root device * * All 'system' subsystems have a /sys/devices/system/<name> root device * with the name of the subsystem. The root device can carry subsystem- * wide attributes. All registered devices are below this single root * device and are named after the subsystem with a simple enumeration * number appended. The registered devices are not explicitly named; * only 'id' in the device needs to be set. * * Do not use this interface for anything new, it exists for compatibility * with bad ideas only. New subsystems should use plain subsystems; and * add the subsystem-wide attributes should be added to the subsystem * directory itself and not some create fake root-device placed in * /sys/devices/system/<name>. */ int subsys_system_register(const struct bus_type *subsys, const struct attribute_group **groups) { return subsys_register(subsys, groups, &system_kset->kobj); } EXPORT_SYMBOL_GPL(subsys_system_register); /** * subsys_virtual_register - register a subsystem at /sys/devices/virtual/ * @subsys: virtual subsystem * @groups: default attributes for the root device * * All 'virtual' subsystems have a /sys/devices/system/<name> root device * with the name of the subystem. The root device can carry subsystem-wide * attributes. All registered devices are below this single root device. * There's no restriction on device naming. This is for kernel software * constructs which need sysfs interface. */ int subsys_virtual_register(const struct bus_type *subsys, const struct attribute_group **groups) { struct kobject *virtual_dir; virtual_dir = virtual_device_parent(); if (!virtual_dir) return -ENOMEM; return subsys_register(subsys, groups, virtual_dir); } EXPORT_SYMBOL_GPL(subsys_virtual_register); /** * driver_find - locate driver on a bus by its name. * @name: name of the driver. * @bus: bus to scan for the driver. * * Call kset_find_obj() to iterate over list of drivers on * a bus to find driver by name. Return driver if found. * * This routine provides no locking to prevent the driver it returns * from being unregistered or unloaded while the caller is using it. * The caller is responsible for preventing this. */ struct device_driver *driver_find(const char *name, const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct kobject *k; struct driver_private *priv; if (!sp) return NULL; k = kset_find_obj(sp->drivers_kset, name); subsys_put(sp); if (!k) return NULL; priv = to_driver(k); /* Drop reference added by kset_find_obj() */ kobject_put(k); return priv->driver; } EXPORT_SYMBOL_GPL(driver_find); /* * Warning, the value could go to "removed" instantly after calling this function, so be very * careful when calling it... */ bool bus_is_registered(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); bool is_initialized = false; if (sp) { is_initialized = true; subsys_put(sp); } return is_initialized; } /** * bus_get_dev_root - return a pointer to the "device root" of a bus * @bus: bus to return the device root of. * * If a bus has a "device root" structure, return it, WITH THE REFERENCE * COUNT INCREMENTED. * * Note, when finished with the device, a call to put_device() is required. * * If the device root is not present (or bus is not a valid pointer), NULL * will be returned. */ struct device *bus_get_dev_root(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct device *dev_root; if (!sp) return NULL; dev_root = get_device(sp->dev_root); subsys_put(sp); return dev_root; } EXPORT_SYMBOL_GPL(bus_get_dev_root); int __init buses_init(void) { bus_kset = kset_create_and_add("bus", &bus_uevent_ops, NULL); if (!bus_kset) return -ENOMEM; system_kset = kset_create_and_add("system", NULL, &devices_kset->kobj); if (!system_kset) { /* Do error handling here as devices_init() do */ kset_unregister(bus_kset); bus_kset = NULL; pr_err("%s: failed to create and add kset 'bus'\n", __func__); return -ENOMEM; } return 0; }
82 44 44 199 74 44 44 232 234 11 82 74 10 234 135 25 76 232 44 223 40 40 39 40 40 29 17 19 1 10 10 1 1 1 40 151 151 149 150 135 42 42 42 42 7 38 89 51 115 114 115 110 21 16 22 16 34 114 14 14 14 14 14 1 14 14 13 3 3 2 3 3 3 3 3 3 3 3 3 3 4 4 4 4 4 4 4 3 4 4 6 8 7 4 4 3 3 3 3 3 3 2 2 2 2 2 2 2 6 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 // SPDX-License-Identifier: GPL-2.0 #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/jiffies.h> #include <linux/module.h> #include <linux/cache.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/tcp.h> #include <linux/hash.h> #include <linux/tcp_metrics.h> #include <linux/vmalloc.h> #include <net/inet_connection_sock.h> #include <net/net_namespace.h> #include <net/request_sock.h> #include <net/inetpeer.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/dst.h> #include <net/tcp.h> #include <net/genetlink.h> static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, const struct inetpeer_addr *daddr, struct net *net, unsigned int hash); struct tcp_fastopen_metrics { u16 mss; u16 syn_loss:10, /* Recurring Fast Open SYN losses */ try_exp:2; /* Request w/ exp. option (once) */ unsigned long last_syn_loss; /* Last Fast Open SYN loss */ struct tcp_fastopen_cookie cookie; }; /* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility * Kernel only stores RTT and RTTVAR in usec resolution */ #define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2) struct tcp_metrics_block { struct tcp_metrics_block __rcu *tcpm_next; struct net *tcpm_net; struct inetpeer_addr tcpm_saddr; struct inetpeer_addr tcpm_daddr; unsigned long tcpm_stamp; u32 tcpm_lock; u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1]; struct tcp_fastopen_metrics tcpm_fastopen; struct rcu_head rcu_head; }; static inline struct net *tm_net(const struct tcp_metrics_block *tm) { /* Paired with the WRITE_ONCE() in tcpm_new() */ return READ_ONCE(tm->tcpm_net); } static bool tcp_metric_locked(struct tcp_metrics_block *tm, enum tcp_metric_index idx) { /* Paired with WRITE_ONCE() in tcpm_suck_dst() */ return READ_ONCE(tm->tcpm_lock) & (1 << idx); } static u32 tcp_metric_get(const struct tcp_metrics_block *tm, enum tcp_metric_index idx) { /* Paired with WRITE_ONCE() in tcp_metric_set() */ return READ_ONCE(tm->tcpm_vals[idx]); } static void tcp_metric_set(struct tcp_metrics_block *tm, enum tcp_metric_index idx, u32 val) { /* Paired with READ_ONCE() in tcp_metric_get() */ WRITE_ONCE(tm->tcpm_vals[idx], val); } static bool addr_same(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { return (a->family == b->family) && !inetpeer_addr_cmp(a, b); } struct tcpm_hash_bucket { struct tcp_metrics_block __rcu *chain; }; static struct tcpm_hash_bucket *tcp_metrics_hash __read_mostly; static unsigned int tcp_metrics_hash_log __read_mostly; static DEFINE_SPINLOCK(tcp_metrics_lock); static DEFINE_SEQLOCK(fastopen_seqlock); static void tcpm_suck_dst(struct tcp_metrics_block *tm, const struct dst_entry *dst, bool fastopen_clear) { u32 msval; u32 val; WRITE_ONCE(tm->tcpm_stamp, jiffies); val = 0; if (dst_metric_locked(dst, RTAX_RTT)) val |= 1 << TCP_METRIC_RTT; if (dst_metric_locked(dst, RTAX_RTTVAR)) val |= 1 << TCP_METRIC_RTTVAR; if (dst_metric_locked(dst, RTAX_SSTHRESH)) val |= 1 << TCP_METRIC_SSTHRESH; if (dst_metric_locked(dst, RTAX_CWND)) val |= 1 << TCP_METRIC_CWND; if (dst_metric_locked(dst, RTAX_REORDERING)) val |= 1 << TCP_METRIC_REORDERING; /* Paired with READ_ONCE() in tcp_metric_locked() */ WRITE_ONCE(tm->tcpm_lock, val); msval = dst_metric_raw(dst, RTAX_RTT); tcp_metric_set(tm, TCP_METRIC_RTT, msval * USEC_PER_MSEC); msval = dst_metric_raw(dst, RTAX_RTTVAR); tcp_metric_set(tm, TCP_METRIC_RTTVAR, msval * USEC_PER_MSEC); tcp_metric_set(tm, TCP_METRIC_SSTHRESH, dst_metric_raw(dst, RTAX_SSTHRESH)); tcp_metric_set(tm, TCP_METRIC_CWND, dst_metric_raw(dst, RTAX_CWND)); tcp_metric_set(tm, TCP_METRIC_REORDERING, dst_metric_raw(dst, RTAX_REORDERING)); if (fastopen_clear) { write_seqlock(&fastopen_seqlock); tm->tcpm_fastopen.mss = 0; tm->tcpm_fastopen.syn_loss = 0; tm->tcpm_fastopen.try_exp = 0; tm->tcpm_fastopen.cookie.exp = false; tm->tcpm_fastopen.cookie.len = 0; write_sequnlock(&fastopen_seqlock); } } #define TCP_METRICS_TIMEOUT (60 * 60 * HZ) static void tcpm_check_stamp(struct tcp_metrics_block *tm, const struct dst_entry *dst) { unsigned long limit; if (!tm) return; limit = READ_ONCE(tm->tcpm_stamp) + TCP_METRICS_TIMEOUT; if (unlikely(time_after(jiffies, limit))) tcpm_suck_dst(tm, dst, false); } #define TCP_METRICS_RECLAIM_DEPTH 5 #define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL #define deref_locked(p) \ rcu_dereference_protected(p, lockdep_is_held(&tcp_metrics_lock)) static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, struct inetpeer_addr *saddr, struct inetpeer_addr *daddr, unsigned int hash) { struct tcp_metrics_block *tm; struct net *net; bool reclaim = false; spin_lock_bh(&tcp_metrics_lock); net = dev_net(dst->dev); /* While waiting for the spin-lock the cache might have been populated * with this entry and so we have to check again. */ tm = __tcp_get_metrics(saddr, daddr, net, hash); if (tm == TCP_METRICS_RECLAIM_PTR) { reclaim = true; tm = NULL; } if (tm) { tcpm_check_stamp(tm, dst); goto out_unlock; } if (unlikely(reclaim)) { struct tcp_metrics_block *oldest; oldest = deref_locked(tcp_metrics_hash[hash].chain); for (tm = deref_locked(oldest->tcpm_next); tm; tm = deref_locked(tm->tcpm_next)) { if (time_before(READ_ONCE(tm->tcpm_stamp), READ_ONCE(oldest->tcpm_stamp))) oldest = tm; } tm = oldest; } else { tm = kzalloc(sizeof(*tm), GFP_ATOMIC); if (!tm) goto out_unlock; } /* Paired with the READ_ONCE() in tm_net() */ WRITE_ONCE(tm->tcpm_net, net); tm->tcpm_saddr = *saddr; tm->tcpm_daddr = *daddr; tcpm_suck_dst(tm, dst, reclaim); if (likely(!reclaim)) { tm->tcpm_next = tcp_metrics_hash[hash].chain; rcu_assign_pointer(tcp_metrics_hash[hash].chain, tm); } out_unlock: spin_unlock_bh(&tcp_metrics_lock); return tm; } static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth) { if (tm) return tm; if (depth > TCP_METRICS_RECLAIM_DEPTH) return TCP_METRICS_RECLAIM_PTR; return NULL; } static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *saddr, const struct inetpeer_addr *daddr, struct net *net, unsigned int hash) { struct tcp_metrics_block *tm; int depth = 0; for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_saddr, saddr) && addr_same(&tm->tcpm_daddr, daddr) && net_eq(tm_net(tm), net)) break; depth++; } return tcp_get_encode(tm, depth); } static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, struct dst_entry *dst) { struct tcp_metrics_block *tm; struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net; saddr.family = req->rsk_ops->family; daddr.family = req->rsk_ops->family; switch (daddr.family) { case AF_INET: inetpeer_set_addr_v4(&saddr, inet_rsk(req)->ir_loc_addr); inetpeer_set_addr_v4(&daddr, inet_rsk(req)->ir_rmt_addr); hash = ipv4_addr_hash(inet_rsk(req)->ir_rmt_addr); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: inetpeer_set_addr_v6(&saddr, &inet_rsk(req)->ir_v6_loc_addr); inetpeer_set_addr_v6(&daddr, &inet_rsk(req)->ir_v6_rmt_addr); hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr); break; #endif default: return NULL; } net = dev_net(dst->dev); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_saddr, &saddr) && addr_same(&tm->tcpm_daddr, &daddr) && net_eq(tm_net(tm), net)) break; } tcpm_check_stamp(tm, dst); return tm; } static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, struct dst_entry *dst, bool create) { struct tcp_metrics_block *tm; struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net; if (sk->sk_family == AF_INET) { inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr); inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr); hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr); } #if IS_ENABLED(CONFIG_IPV6) else if (sk->sk_family == AF_INET6) { if (ipv6_addr_v4mapped(&sk->sk_v6_daddr)) { inetpeer_set_addr_v4(&saddr, inet_sk(sk)->inet_saddr); inetpeer_set_addr_v4(&daddr, inet_sk(sk)->inet_daddr); hash = ipv4_addr_hash(inet_sk(sk)->inet_daddr); } else { inetpeer_set_addr_v6(&saddr, &sk->sk_v6_rcv_saddr); inetpeer_set_addr_v6(&daddr, &sk->sk_v6_daddr); hash = ipv6_addr_hash(&sk->sk_v6_daddr); } } #endif else return NULL; net = dev_net(dst->dev); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); tm = __tcp_get_metrics(&saddr, &daddr, net, hash); if (tm == TCP_METRICS_RECLAIM_PTR) tm = NULL; if (!tm && create) tm = tcpm_new(dst, &saddr, &daddr, hash); else tcpm_check_stamp(tm, dst); return tm; } /* Save metrics learned by this TCP session. This function is called * only, when TCP finishes successfully i.e. when it enters TIME-WAIT * or goes from LAST-ACK to CLOSE. */ void tcp_update_metrics(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct tcp_metrics_block *tm; unsigned long rtt; u32 val; int m; sk_dst_confirm(sk); if (READ_ONCE(net->ipv4.sysctl_tcp_nometrics_save) || !dst) return; rcu_read_lock(); if (icsk->icsk_backoff || !tp->srtt_us) { /* This session failed to estimate rtt. Why? * Probably, no packets returned in time. Reset our * results. */ tm = tcp_get_metrics(sk, dst, false); if (tm && !tcp_metric_locked(tm, TCP_METRIC_RTT)) tcp_metric_set(tm, TCP_METRIC_RTT, 0); goto out_unlock; } else tm = tcp_get_metrics(sk, dst, true); if (!tm) goto out_unlock; rtt = tcp_metric_get(tm, TCP_METRIC_RTT); m = rtt - tp->srtt_us; /* If newly calculated rtt larger than stored one, store new * one. Otherwise, use EWMA. Remember, rtt overestimation is * always better than underestimation. */ if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) { if (m <= 0) rtt = tp->srtt_us; else rtt -= (m >> 3); tcp_metric_set(tm, TCP_METRIC_RTT, rtt); } if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) { unsigned long var; if (m < 0) m = -m; /* Scale deviation to rttvar fixed point */ m >>= 1; if (m < tp->mdev_us) m = tp->mdev_us; var = tcp_metric_get(tm, TCP_METRIC_RTTVAR); if (m >= var) var = m; else var -= (var - m) >> 2; tcp_metric_set(tm, TCP_METRIC_RTTVAR, var); } if (tcp_in_initial_slowstart(tp)) { /* Slow start still did not finish. */ if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) && !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val && (tcp_snd_cwnd(tp) >> 1) > val) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, tcp_snd_cwnd(tp) >> 1); } if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { val = tcp_metric_get(tm, TCP_METRIC_CWND); if (tcp_snd_cwnd(tp) > val) tcp_metric_set(tm, TCP_METRIC_CWND, tcp_snd_cwnd(tp)); } } else if (!tcp_in_slow_start(tp) && icsk->icsk_ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) && !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, max(tcp_snd_cwnd(tp) >> 1, tp->snd_ssthresh)); if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { val = tcp_metric_get(tm, TCP_METRIC_CWND); tcp_metric_set(tm, TCP_METRIC_CWND, (val + tcp_snd_cwnd(tp)) >> 1); } } else { /* Else slow start did not finish, cwnd is non-sense, * ssthresh may be also invalid. */ if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { val = tcp_metric_get(tm, TCP_METRIC_CWND); tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_ssthresh) >> 1); } if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) && !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val && tp->snd_ssthresh > val) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, tp->snd_ssthresh); } if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) { val = tcp_metric_get(tm, TCP_METRIC_REORDERING); if (val < tp->reordering && tp->reordering != READ_ONCE(net->ipv4.sysctl_tcp_reordering)) tcp_metric_set(tm, TCP_METRIC_REORDERING, tp->reordering); } } WRITE_ONCE(tm->tcpm_stamp, jiffies); out_unlock: rcu_read_unlock(); } /* Initialize metrics on socket. */ void tcp_init_metrics(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct tcp_metrics_block *tm; u32 val, crtt = 0; /* cached RTT scaled by 8 */ sk_dst_confirm(sk); /* ssthresh may have been reduced unnecessarily during. * 3WHS. Restore it back to its initial default. */ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; if (!dst) goto reset; rcu_read_lock(); tm = tcp_get_metrics(sk, dst, false); if (!tm) { rcu_read_unlock(); goto reset; } if (tcp_metric_locked(tm, TCP_METRIC_CWND)) tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND); val = READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) ? 0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH); if (val) { tp->snd_ssthresh = val; if (tp->snd_ssthresh > tp->snd_cwnd_clamp) tp->snd_ssthresh = tp->snd_cwnd_clamp; } val = tcp_metric_get(tm, TCP_METRIC_REORDERING); if (val && tp->reordering != val) tp->reordering = val; crtt = tcp_metric_get(tm, TCP_METRIC_RTT); rcu_read_unlock(); reset: /* The initial RTT measurement from the SYN/SYN-ACK is not ideal * to seed the RTO for later data packets because SYN packets are * small. Use the per-dst cached values to seed the RTO but keep * the RTT estimator variables intact (e.g., srtt, mdev, rttvar). * Later the RTO will be updated immediately upon obtaining the first * data RTT sample (tcp_rtt_estimator()). Hence the cached RTT only * influences the first RTO but not later RTT estimation. * * But if RTT is not available from the SYN (due to retransmits or * syn cookies) or the cache, force a conservative 3secs timeout. * * A bit of theory. RTT is time passed after "normal" sized packet * is sent until it is ACKed. In normal circumstances sending small * packets force peer to delay ACKs and calculation is correct too. * The algorithm is adaptive and, provided we follow specs, it * NEVER underestimate RTT. BUT! If peer tries to make some clever * tricks sort of "quick acks" for time long enough to decrease RTT * to low value, and then abruptly stops to do it and starts to delay * ACKs, wait for troubles. */ if (crtt > tp->srtt_us) { /* Set RTO like tcp_rtt_estimator(), but from cached RTT. */ crtt /= 8 * USEC_PER_SEC / HZ; inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk)); } else if (tp->srtt_us == 0) { /* RFC6298: 5.7 We've failed to get a valid RTT sample from * 3WHS. This is most likely due to retransmission, * including spurious one. Reset the RTO back to 3secs * from the more aggressive 1sec to avoid more spurious * retransmission. */ tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK); tp->mdev_us = tp->mdev_max_us = tp->rttvar_us; inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK; } } bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) { struct tcp_metrics_block *tm; bool ret; if (!dst) return false; rcu_read_lock(); tm = __tcp_get_metrics_req(req, dst); if (tm && tcp_metric_get(tm, TCP_METRIC_RTT)) ret = true; else ret = false; rcu_read_unlock(); return ret; } void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie) { struct tcp_metrics_block *tm; rcu_read_lock(); tm = tcp_get_metrics(sk, __sk_dst_get(sk), false); if (tm) { struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; unsigned int seq; do { seq = read_seqbegin(&fastopen_seqlock); if (tfom->mss) *mss = tfom->mss; *cookie = tfom->cookie; if (cookie->len <= 0 && tfom->try_exp == 1) cookie->exp = true; } while (read_seqretry(&fastopen_seqlock, seq)); } rcu_read_unlock(); } void tcp_fastopen_cache_set(struct sock *sk, u16 mss, struct tcp_fastopen_cookie *cookie, bool syn_lost, u16 try_exp) { struct dst_entry *dst = __sk_dst_get(sk); struct tcp_metrics_block *tm; if (!dst) return; rcu_read_lock(); tm = tcp_get_metrics(sk, dst, true); if (tm) { struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; write_seqlock_bh(&fastopen_seqlock); if (mss) tfom->mss = mss; if (cookie && cookie->len > 0) tfom->cookie = *cookie; else if (try_exp > tfom->try_exp && tfom->cookie.len <= 0 && !tfom->cookie.exp) tfom->try_exp = try_exp; if (syn_lost) { ++tfom->syn_loss; tfom->last_syn_loss = jiffies; } else tfom->syn_loss = 0; write_sequnlock_bh(&fastopen_seqlock); } rcu_read_unlock(); } static struct genl_family tcp_metrics_nl_family; static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] = { [TCP_METRICS_ATTR_ADDR_IPV4] = { .type = NLA_U32, }, [TCP_METRICS_ATTR_ADDR_IPV6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [TCP_METRICS_ATTR_SADDR_IPV4] = { .type = NLA_U32, }, [TCP_METRICS_ATTR_SADDR_IPV6] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), /* Following attributes are not received for GET/DEL, * we keep them for reference */ #if 0 [TCP_METRICS_ATTR_AGE] = { .type = NLA_MSECS, }, [TCP_METRICS_ATTR_TW_TSVAL] = { .type = NLA_U32, }, [TCP_METRICS_ATTR_TW_TS_STAMP] = { .type = NLA_S32, }, [TCP_METRICS_ATTR_VALS] = { .type = NLA_NESTED, }, [TCP_METRICS_ATTR_FOPEN_MSS] = { .type = NLA_U16, }, [TCP_METRICS_ATTR_FOPEN_SYN_DROPS] = { .type = NLA_U16, }, [TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS] = { .type = NLA_MSECS, }, [TCP_METRICS_ATTR_FOPEN_COOKIE] = { .type = NLA_BINARY, .len = TCP_FASTOPEN_COOKIE_MAX, }, #endif }; /* Add attributes, caller cancels its header on failure */ static int tcp_metrics_fill_info(struct sk_buff *msg, struct tcp_metrics_block *tm) { struct nlattr *nest; int i; switch (tm->tcpm_daddr.family) { case AF_INET: if (nla_put_in_addr(msg, TCP_METRICS_ATTR_ADDR_IPV4, inetpeer_get_addr_v4(&tm->tcpm_daddr)) < 0) goto nla_put_failure; if (nla_put_in_addr(msg, TCP_METRICS_ATTR_SADDR_IPV4, inetpeer_get_addr_v4(&tm->tcpm_saddr)) < 0) goto nla_put_failure; break; case AF_INET6: if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_ADDR_IPV6, inetpeer_get_addr_v6(&tm->tcpm_daddr)) < 0) goto nla_put_failure; if (nla_put_in6_addr(msg, TCP_METRICS_ATTR_SADDR_IPV6, inetpeer_get_addr_v6(&tm->tcpm_saddr)) < 0) goto nla_put_failure; break; default: return -EAFNOSUPPORT; } if (nla_put_msecs(msg, TCP_METRICS_ATTR_AGE, jiffies - READ_ONCE(tm->tcpm_stamp), TCP_METRICS_ATTR_PAD) < 0) goto nla_put_failure; { int n = 0; nest = nla_nest_start_noflag(msg, TCP_METRICS_ATTR_VALS); if (!nest) goto nla_put_failure; for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) { u32 val = tcp_metric_get(tm, i); if (!val) continue; if (i == TCP_METRIC_RTT) { if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1, val) < 0) goto nla_put_failure; n++; val = max(val / 1000, 1U); } if (i == TCP_METRIC_RTTVAR) { if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1, val) < 0) goto nla_put_failure; n++; val = max(val / 1000, 1U); } if (nla_put_u32(msg, i + 1, val) < 0) goto nla_put_failure; n++; } if (n) nla_nest_end(msg, nest); else nla_nest_cancel(msg, nest); } { struct tcp_fastopen_metrics tfom_copy[1], *tfom; unsigned int seq; do { seq = read_seqbegin(&fastopen_seqlock); tfom_copy[0] = tm->tcpm_fastopen; } while (read_seqretry(&fastopen_seqlock, seq)); tfom = tfom_copy; if (tfom->mss && nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_MSS, tfom->mss) < 0) goto nla_put_failure; if (tfom->syn_loss && (nla_put_u16(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROPS, tfom->syn_loss) < 0 || nla_put_msecs(msg, TCP_METRICS_ATTR_FOPEN_SYN_DROP_TS, jiffies - tfom->last_syn_loss, TCP_METRICS_ATTR_PAD) < 0)) goto nla_put_failure; if (tfom->cookie.len > 0 && nla_put(msg, TCP_METRICS_ATTR_FOPEN_COOKIE, tfom->cookie.len, tfom->cookie.val) < 0) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static int tcp_metrics_dump_info(struct sk_buff *skb, struct netlink_callback *cb, struct tcp_metrics_block *tm) { void *hdr; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &tcp_metrics_nl_family, NLM_F_MULTI, TCP_METRICS_CMD_GET); if (!hdr) return -EMSGSIZE; if (tcp_metrics_fill_info(skb, tm) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int tcp_metrics_nl_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); unsigned int max_rows = 1U << tcp_metrics_hash_log; unsigned int row, s_row = cb->args[0]; int s_col = cb->args[1], col = s_col; int res = 0; for (row = s_row; row < max_rows; row++, s_col = 0) { struct tcp_metrics_block *tm; struct tcpm_hash_bucket *hb = tcp_metrics_hash + row; rcu_read_lock(); for (col = 0, tm = rcu_dereference(hb->chain); tm; tm = rcu_dereference(tm->tcpm_next), col++) { if (!net_eq(tm_net(tm), net)) continue; if (col < s_col) continue; res = tcp_metrics_dump_info(skb, cb, tm); if (res < 0) { rcu_read_unlock(); goto done; } } rcu_read_unlock(); } done: cb->args[0] = row; cb->args[1] = col; return res; } static int __parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, unsigned int *hash, int optional, int v4, int v6) { struct nlattr *a; a = info->attrs[v4]; if (a) { inetpeer_set_addr_v4(addr, nla_get_in_addr(a)); if (hash) *hash = ipv4_addr_hash(inetpeer_get_addr_v4(addr)); return 0; } a = info->attrs[v6]; if (a) { struct in6_addr in6; in6 = nla_get_in6_addr(a); inetpeer_set_addr_v6(addr, &in6); if (hash) *hash = ipv6_addr_hash(inetpeer_get_addr_v6(addr)); return 0; } return optional ? 1 : -EAFNOSUPPORT; } static int parse_nl_addr(struct genl_info *info, struct inetpeer_addr *addr, unsigned int *hash, int optional) { return __parse_nl_addr(info, addr, hash, optional, TCP_METRICS_ATTR_ADDR_IPV4, TCP_METRICS_ATTR_ADDR_IPV6); } static int parse_nl_saddr(struct genl_info *info, struct inetpeer_addr *addr) { return __parse_nl_addr(info, addr, NULL, 0, TCP_METRICS_ATTR_SADDR_IPV4, TCP_METRICS_ATTR_SADDR_IPV6); } static int tcp_metrics_nl_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct tcp_metrics_block *tm; struct inetpeer_addr saddr, daddr; unsigned int hash; struct sk_buff *msg; struct net *net = genl_info_net(info); void *reply; int ret; bool src = true; ret = parse_nl_addr(info, &daddr, &hash, 0); if (ret < 0) return ret; ret = parse_nl_saddr(info, &saddr); if (ret < 0) src = false; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; reply = genlmsg_put_reply(msg, info, &tcp_metrics_nl_family, 0, info->genlhdr->cmd); if (!reply) goto nla_put_failure; hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); ret = -ESRCH; rcu_read_lock(); for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; tm = rcu_dereference(tm->tcpm_next)) { if (addr_same(&tm->tcpm_daddr, &daddr) && (!src || addr_same(&tm->tcpm_saddr, &saddr)) && net_eq(tm_net(tm), net)) { ret = tcp_metrics_fill_info(msg, tm); break; } } rcu_read_unlock(); if (ret < 0) goto out_free; genlmsg_end(msg, reply); return genlmsg_reply(msg, info); nla_put_failure: ret = -EMSGSIZE; out_free: nlmsg_free(msg); return ret; } static void tcp_metrics_flush_all(struct net *net) { unsigned int max_rows = 1U << tcp_metrics_hash_log; struct tcpm_hash_bucket *hb = tcp_metrics_hash; struct tcp_metrics_block *tm; unsigned int row; for (row = 0; row < max_rows; row++, hb++) { struct tcp_metrics_block __rcu **pp = &hb->chain; bool match; if (!rcu_access_pointer(*pp)) continue; spin_lock_bh(&tcp_metrics_lock); for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { match = net ? net_eq(tm_net(tm), net) : !refcount_read(&tm_net(tm)->ns.count); if (match) { rcu_assign_pointer(*pp, tm->tcpm_next); kfree_rcu(tm, rcu_head); } else { pp = &tm->tcpm_next; } } spin_unlock_bh(&tcp_metrics_lock); cond_resched(); } } static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct tcpm_hash_bucket *hb; struct tcp_metrics_block *tm; struct tcp_metrics_block __rcu **pp; struct inetpeer_addr saddr, daddr; unsigned int hash; struct net *net = genl_info_net(info); int ret; bool src = true, found = false; ret = parse_nl_addr(info, &daddr, &hash, 1); if (ret < 0) return ret; if (ret > 0) { tcp_metrics_flush_all(net); return 0; } ret = parse_nl_saddr(info, &saddr); if (ret < 0) src = false; hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); hb = tcp_metrics_hash + hash; pp = &hb->chain; spin_lock_bh(&tcp_metrics_lock); for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { if (addr_same(&tm->tcpm_daddr, &daddr) && (!src || addr_same(&tm->tcpm_saddr, &saddr)) && net_eq(tm_net(tm), net)) { rcu_assign_pointer(*pp, tm->tcpm_next); kfree_rcu(tm, rcu_head); found = true; } else { pp = &tm->tcpm_next; } } spin_unlock_bh(&tcp_metrics_lock); if (!found) return -ESRCH; return 0; } static const struct genl_small_ops tcp_metrics_nl_ops[] = { { .cmd = TCP_METRICS_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tcp_metrics_nl_cmd_get, .dumpit = tcp_metrics_nl_dump, }, { .cmd = TCP_METRICS_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = tcp_metrics_nl_cmd_del, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family tcp_metrics_nl_family __ro_after_init = { .hdrsize = 0, .name = TCP_METRICS_GENL_NAME, .version = TCP_METRICS_GENL_VERSION, .maxattr = TCP_METRICS_ATTR_MAX, .policy = tcp_metrics_nl_policy, .netnsok = true, .parallel_ops = true, .module = THIS_MODULE, .small_ops = tcp_metrics_nl_ops, .n_small_ops = ARRAY_SIZE(tcp_metrics_nl_ops), .resv_start_op = TCP_METRICS_CMD_DEL + 1, }; static unsigned int tcpmhash_entries __initdata; static int __init set_tcpmhash_entries(char *str) { ssize_t ret; if (!str) return 0; ret = kstrtouint(str, 0, &tcpmhash_entries); if (ret) return 0; return 1; } __setup("tcpmhash_entries=", set_tcpmhash_entries); static void __init tcp_metrics_hash_alloc(void) { unsigned int slots = tcpmhash_entries; size_t size; if (!slots) { if (totalram_pages() >= 128 * 1024) slots = 16 * 1024; else slots = 8 * 1024; } tcp_metrics_hash_log = order_base_2(slots); size = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log; tcp_metrics_hash = kvzalloc(size, GFP_KERNEL); if (!tcp_metrics_hash) panic("Could not allocate the tcp_metrics hash table\n"); } static void __net_exit tcp_net_metrics_exit_batch(struct list_head *net_exit_list) { tcp_metrics_flush_all(NULL); } static __net_initdata struct pernet_operations tcp_net_metrics_ops = { .exit_batch = tcp_net_metrics_exit_batch, }; void __init tcp_metrics_init(void) { int ret; tcp_metrics_hash_alloc(); ret = register_pernet_subsys(&tcp_net_metrics_ops); if (ret < 0) panic("Could not register tcp_net_metrics_ops\n"); ret = genl_register_family(&tcp_metrics_nl_family); if (ret < 0) panic("Could not register tcp_metrics generic netlink\n"); }
41 53 48 43 9 48 11 51 51 13 32 32 57 1 56 56 11 2 10 11 10 3 16 16 1 11 4 24 24 24 55 51 12 11 1 11 5 5 5 2 2 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 // SPDX-License-Identifier: GPL-2.0-or-later /* * Spanning tree protocol; interface code * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <net/switchdev.h> #include "br_private.h" #include "br_private_stp.h" /* Port id is composed of priority and port number. * NB: some bits of priority are dropped to * make room for more ports. */ static inline port_id br_make_port_id(__u8 priority, __u16 port_no) { return ((u16)priority << BR_PORT_BITS) | (port_no & ((1<<BR_PORT_BITS)-1)); } #define BR_MAX_PORT_PRIORITY ((u16)~0 >> BR_PORT_BITS) /* called under bridge lock */ void br_init_port(struct net_bridge_port *p) { int err; p->port_id = br_make_port_id(p->priority, p->port_no); br_become_designated_port(p); br_set_state(p, BR_STATE_BLOCKING); p->topology_change_ack = 0; p->config_pending = 0; err = __set_ageing_time(p->dev, p->br->ageing_time); if (err) netdev_err(p->dev, "failed to offload ageing time\n"); } /* NO locks held */ void br_stp_enable_bridge(struct net_bridge *br) { struct net_bridge_port *p; spin_lock_bh(&br->lock); if (br->stp_enabled == BR_KERNEL_STP) mod_timer(&br->hello_timer, jiffies + br->hello_time); mod_delayed_work(system_long_wq, &br->gc_work, HZ / 10); br_config_bpdu_generation(br); list_for_each_entry(p, &br->port_list, list) { if (netif_running(p->dev) && netif_oper_up(p->dev)) br_stp_enable_port(p); } spin_unlock_bh(&br->lock); } /* NO locks held */ void br_stp_disable_bridge(struct net_bridge *br) { struct net_bridge_port *p; spin_lock_bh(&br->lock); list_for_each_entry(p, &br->port_list, list) { if (p->state != BR_STATE_DISABLED) br_stp_disable_port(p); } __br_set_topology_change(br, 0); br->topology_change_detected = 0; spin_unlock_bh(&br->lock); del_timer_sync(&br->hello_timer); del_timer_sync(&br->topology_change_timer); del_timer_sync(&br->tcn_timer); cancel_delayed_work_sync(&br->gc_work); } /* called under bridge lock */ void br_stp_enable_port(struct net_bridge_port *p) { br_init_port(p); br_port_state_selection(p->br); br_ifinfo_notify(RTM_NEWLINK, NULL, p); } /* called under bridge lock */ void br_stp_disable_port(struct net_bridge_port *p) { struct net_bridge *br = p->br; int wasroot; wasroot = br_is_root_bridge(br); br_become_designated_port(p); br_set_state(p, BR_STATE_DISABLED); p->topology_change_ack = 0; p->config_pending = 0; br_ifinfo_notify(RTM_NEWLINK, NULL, p); del_timer(&p->message_age_timer); del_timer(&p->forward_delay_timer); del_timer(&p->hold_timer); if (!rcu_access_pointer(p->backup_port)) br_fdb_delete_by_port(br, p, 0, 0); br_multicast_disable_port(p); br_configuration_update(br); br_port_state_selection(br); if (br_is_root_bridge(br) && !wasroot) br_become_root_bridge(br); } static int br_stp_call_user(struct net_bridge *br, char *arg) { char *argv[] = { BR_STP_PROG, br->dev->name, arg, NULL }; char *envp[] = { NULL }; int rc; /* call userspace STP and report program errors */ rc = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); if (rc > 0) { if (rc & 0xff) br_debug(br, BR_STP_PROG " received signal %d\n", rc & 0x7f); else br_debug(br, BR_STP_PROG " exited with code %d\n", (rc >> 8) & 0xff); } return rc; } static void br_stp_start(struct net_bridge *br) { int err = -ENOENT; if (net_eq(dev_net(br->dev), &init_net)) err = br_stp_call_user(br, "start"); if (err && err != -ENOENT) br_err(br, "failed to start userspace STP (%d)\n", err); spin_lock_bh(&br->lock); if (br->bridge_forward_delay < BR_MIN_FORWARD_DELAY) __br_set_forward_delay(br, BR_MIN_FORWARD_DELAY); else if (br->bridge_forward_delay > BR_MAX_FORWARD_DELAY) __br_set_forward_delay(br, BR_MAX_FORWARD_DELAY); if (!err) { br->stp_enabled = BR_USER_STP; br_debug(br, "userspace STP started\n"); } else { br->stp_enabled = BR_KERNEL_STP; br_debug(br, "using kernel STP\n"); /* To start timers on any ports left in blocking */ if (br->dev->flags & IFF_UP) mod_timer(&br->hello_timer, jiffies + br->hello_time); br_port_state_selection(br); } spin_unlock_bh(&br->lock); } static void br_stp_stop(struct net_bridge *br) { int err; if (br->stp_enabled == BR_USER_STP) { err = br_stp_call_user(br, "stop"); if (err) br_err(br, "failed to stop userspace STP (%d)\n", err); /* To start timers on any ports left in blocking */ spin_lock_bh(&br->lock); br_port_state_selection(br); spin_unlock_bh(&br->lock); } br->stp_enabled = BR_NO_STP; } int br_stp_set_enabled(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { ASSERT_RTNL(); if (br_mrp_enabled(br)) { NL_SET_ERR_MSG_MOD(extack, "STP can't be enabled if MRP is already enabled"); return -EINVAL; } if (val) { if (br->stp_enabled == BR_NO_STP) br_stp_start(br); } else { if (br->stp_enabled != BR_NO_STP) br_stp_stop(br); } return 0; } /* called under bridge lock */ void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr) { /* should be aligned on 2 bytes for ether_addr_equal() */ unsigned short oldaddr_aligned[ETH_ALEN >> 1]; unsigned char *oldaddr = (unsigned char *)oldaddr_aligned; struct net_bridge_port *p; int wasroot; wasroot = br_is_root_bridge(br); br_fdb_change_mac_address(br, addr); memcpy(oldaddr, br->bridge_id.addr, ETH_ALEN); memcpy(br->bridge_id.addr, addr, ETH_ALEN); eth_hw_addr_set(br->dev, addr); list_for_each_entry(p, &br->port_list, list) { if (ether_addr_equal(p->designated_bridge.addr, oldaddr)) memcpy(p->designated_bridge.addr, addr, ETH_ALEN); if (ether_addr_equal(p->designated_root.addr, oldaddr)) memcpy(p->designated_root.addr, addr, ETH_ALEN); } br_configuration_update(br); br_port_state_selection(br); if (br_is_root_bridge(br) && !wasroot) br_become_root_bridge(br); } /* should be aligned on 2 bytes for ether_addr_equal() */ static const unsigned short br_mac_zero_aligned[ETH_ALEN >> 1]; /* called under bridge lock */ bool br_stp_recalculate_bridge_id(struct net_bridge *br) { const unsigned char *br_mac_zero = (const unsigned char *)br_mac_zero_aligned; const unsigned char *addr = br_mac_zero; struct net_bridge_port *p; /* user has chosen a value so keep it */ if (br->dev->addr_assign_type == NET_ADDR_SET) return false; list_for_each_entry(p, &br->port_list, list) { if (addr == br_mac_zero || memcmp(p->dev->dev_addr, addr, ETH_ALEN) < 0) addr = p->dev->dev_addr; } if (ether_addr_equal(br->bridge_id.addr, addr)) return false; /* no change */ br_stp_change_bridge_id(br, addr); return true; } /* Acquires and releases bridge lock */ void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio) { struct net_bridge_port *p; int wasroot; spin_lock_bh(&br->lock); wasroot = br_is_root_bridge(br); list_for_each_entry(p, &br->port_list, list) { if (p->state != BR_STATE_DISABLED && br_is_designated_port(p)) { p->designated_bridge.prio[0] = (newprio >> 8) & 0xFF; p->designated_bridge.prio[1] = newprio & 0xFF; } } br->bridge_id.prio[0] = (newprio >> 8) & 0xFF; br->bridge_id.prio[1] = newprio & 0xFF; br_configuration_update(br); br_port_state_selection(br); if (br_is_root_bridge(br) && !wasroot) br_become_root_bridge(br); spin_unlock_bh(&br->lock); } /* called under bridge lock */ int br_stp_set_port_priority(struct net_bridge_port *p, unsigned long newprio) { port_id new_port_id; if (newprio > BR_MAX_PORT_PRIORITY) return -ERANGE; new_port_id = br_make_port_id(newprio, p->port_no); if (br_is_designated_port(p)) p->designated_port = new_port_id; p->port_id = new_port_id; p->priority = newprio; if (!memcmp(&p->br->bridge_id, &p->designated_bridge, 8) && p->port_id < p->designated_port) { br_become_designated_port(p); br_port_state_selection(p->br); } return 0; } /* called under bridge lock */ int br_stp_set_path_cost(struct net_bridge_port *p, unsigned long path_cost) { if (path_cost < BR_MIN_PATH_COST || path_cost > BR_MAX_PATH_COST) return -ERANGE; p->flags |= BR_ADMIN_COST; p->path_cost = path_cost; br_configuration_update(p->br); br_port_state_selection(p->br); return 0; } ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id) { return sprintf(buf, "%.2x%.2x.%.2x%.2x%.2x%.2x%.2x%.2x\n", id->prio[0], id->prio[1], id->addr[0], id->addr[1], id->addr[2], id->addr[3], id->addr[4], id->addr[5]); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_BOOTMEM_INFO_H #define __LINUX_BOOTMEM_INFO_H #include <linux/mm.h> #include <linux/kmemleak.h> /* * Types for free bootmem stored in the low bits of page->private. */ enum bootmem_type { MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 1, SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE, MIX_SECTION_INFO, NODE_INFO, MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, }; #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE void __init register_page_bootmem_info_node(struct pglist_data *pgdat); void get_page_bootmem(unsigned long info, struct page *page, enum bootmem_type type); void put_page_bootmem(struct page *page); static inline enum bootmem_type bootmem_type(const struct page *page) { return (unsigned long)page->private & 0xf; } static inline unsigned long bootmem_info(const struct page *page) { return (unsigned long)page->private >> 4; } /* * Any memory allocated via the memblock allocator and not via the * buddy will be marked reserved already in the memmap. For those * pages, we can call this function to free it to buddy allocator. */ static inline void free_bootmem_page(struct page *page) { enum bootmem_type type = bootmem_type(page); /* * The reserve_bootmem_region sets the reserved flag on bootmem * pages. */ VM_BUG_ON_PAGE(page_ref_count(page) != 2, page); if (type == SECTION_INFO || type == MIX_SECTION_INFO) put_page_bootmem(page); else VM_BUG_ON_PAGE(1, page); } #else static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) { } static inline void put_page_bootmem(struct page *page) { } static inline enum bootmem_type bootmem_type(const struct page *page) { return SECTION_INFO; } static inline unsigned long bootmem_info(const struct page *page) { return 0; } static inline void get_page_bootmem(unsigned long info, struct page *page, enum bootmem_type type) { } static inline void free_bootmem_page(struct page *page) { kmemleak_free_part_phys(PFN_PHYS(page_to_pfn(page)), PAGE_SIZE); free_reserved_page(page); } #endif #endif /* __LINUX_BOOTMEM_INFO_H */
38 2243 2245 715 1719 2247 2245 2248 3 3 3 3 3 3 3 32 17 15 32 72 1580 3579 3226 1634 3 3 226 231 1 25 25 73 73 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 // SPDX-License-Identifier: GPL-2.0-only /* * net/core/dst.c Protocol independent destination cache. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * */ #include <linux/bitops.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/workqueue.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> #include <net/net_namespace.h> #include <linux/sched.h> #include <linux/prefetch.h> #include <net/lwtunnel.h> #include <net/xfrm.h> #include <net/dst.h> #include <net/dst_metadata.h> int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } EXPORT_SYMBOL(dst_discard_out); const struct dst_metrics dst_default_metrics = { /* This initializer is needed to force linker to place this variable * into const section. Otherwise it might end into bss section. * We really want to avoid false sharing on this variable, and catch * any writes on it. */ .refcnt = REFCOUNT_INIT(1), }; EXPORT_SYMBOL(dst_default_metrics); void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_obsolete, unsigned short flags) { dst->dev = dev; netdev_hold(dev, &dst->dev_tracker, GFP_ATOMIC); dst->ops = ops; dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif dst->input = dst_discard; dst->output = dst_discard_out; dst->error = 0; dst->obsolete = initial_obsolete; dst->header_len = 0; dst->trailer_len = 0; #ifdef CONFIG_IP_ROUTE_CLASSID dst->tclassid = 0; #endif dst->lwtstate = NULL; rcuref_init(&dst->__rcuref, 1); INIT_LIST_HEAD(&dst->rt_uncached); dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); } EXPORT_SYMBOL(dst_init); void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_obsolete, unsigned short flags) { struct dst_entry *dst; if (ops->gc && !(flags & DST_NOCOUNT) && dst_entries_get_fast(ops) > ops->gc_thresh) ops->gc(ops); dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); if (!dst) return NULL; dst_init(dst, ops, dev, initial_obsolete, flags); return dst; } EXPORT_SYMBOL(dst_alloc); static void dst_destroy(struct dst_entry *dst) { struct dst_entry *child = NULL; smp_rmb(); #ifdef CONFIG_XFRM if (dst->xfrm) { struct xfrm_dst *xdst = (struct xfrm_dst *) dst; child = xdst->child; } #endif if (dst->ops->destroy) dst->ops->destroy(dst); netdev_put(dst->dev, &dst->dev_tracker); lwtstate_put(dst->lwtstate); if (dst->flags & DST_METADATA) metadata_dst_free((struct metadata_dst *)dst); else kmem_cache_free(dst->ops->kmem_cachep, dst); dst = child; if (dst) dst_release_immediate(dst); } static void dst_destroy_rcu(struct rcu_head *head) { struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); dst_destroy(dst); } /* Operations to mark dst as DEAD and clean up the net device referenced * by dst: * 1. put the dst under blackhole interface and discard all tx/rx packets * on this route. * 2. release the net_device * This function should be called when removing routes from the fib tree * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to * make the next dst_ops->check() fail. */ void dst_dev_put(struct dst_entry *dst) { struct net_device *dev = dst->dev; dst->obsolete = DST_OBSOLETE_DEAD; if (dst->ops->ifdown) dst->ops->ifdown(dst, dev); dst->input = dst_discard; dst->output = dst_discard_out; dst->dev = blackhole_netdev; netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, GFP_ATOMIC); } EXPORT_SYMBOL(dst_dev_put); static void dst_count_dec(struct dst_entry *dst) { if (!(dst->flags & DST_NOCOUNT)) dst_entries_add(dst->ops, -1); } void dst_release(struct dst_entry *dst) { if (dst && rcuref_put(&dst->__rcuref)) { dst_count_dec(dst); call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); } } EXPORT_SYMBOL(dst_release); void dst_release_immediate(struct dst_entry *dst) { if (dst && rcuref_put(&dst->__rcuref)) { dst_count_dec(dst); dst_destroy(dst); } } EXPORT_SYMBOL(dst_release_immediate); u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) { struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC); if (p) { struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old); unsigned long prev, new; refcount_set(&p->refcnt, 1); memcpy(p->metrics, old_p->metrics, sizeof(p->metrics)); new = (unsigned long) p; prev = cmpxchg(&dst->_metrics, old, new); if (prev != old) { kfree(p); p = (struct dst_metrics *)__DST_METRICS_PTR(prev); if (prev & DST_METRICS_READ_ONLY) p = NULL; } else if (prev & DST_METRICS_REFCOUNTED) { if (refcount_dec_and_test(&old_p->refcnt)) kfree(old_p); } } BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0); return (u32 *)p; } EXPORT_SYMBOL(dst_cow_metrics_generic); /* Caller asserts that dst_metrics_read_only(dst) is false. */ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) { unsigned long prev, new; new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY; prev = cmpxchg(&dst->_metrics, old, new); if (prev == old) kfree(__DST_METRICS_PTR(old)); } EXPORT_SYMBOL(__dst_destroy_metrics_generic); struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie) { return NULL; } u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old) { return NULL; } struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { return NULL; } void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { } EXPORT_SYMBOL_GPL(dst_blackhole_update_pmtu); void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { } EXPORT_SYMBOL_GPL(dst_blackhole_redirect); unsigned int dst_blackhole_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); return mtu ? : dst->dev->mtu; } EXPORT_SYMBOL_GPL(dst_blackhole_mtu); static struct dst_ops dst_blackhole_ops = { .family = AF_UNSPEC, .neigh_lookup = dst_blackhole_neigh_lookup, .check = dst_blackhole_check, .cow_metrics = dst_blackhole_cow_metrics, .update_pmtu = dst_blackhole_update_pmtu, .redirect = dst_blackhole_redirect, .mtu = dst_blackhole_mtu, }; static void __metadata_dst_init(struct metadata_dst *md_dst, enum metadata_type type, u8 optslen) { struct dst_entry *dst; dst = &md_dst->dst; dst_init(dst, &dst_blackhole_ops, NULL, DST_OBSOLETE_NONE, DST_METADATA | DST_NOCOUNT); memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); md_dst->type = type; } struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, gfp_t flags) { struct metadata_dst *md_dst; md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); if (!md_dst) return NULL; __metadata_dst_init(md_dst, type, optslen); return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc); void metadata_dst_free(struct metadata_dst *md_dst) { #ifdef CONFIG_DST_CACHE if (md_dst->type == METADATA_IP_TUNNEL) dst_cache_destroy(&md_dst->u.tun_info.dst_cache); #endif if (md_dst->type == METADATA_XFRM) dst_release(md_dst->u.xfrm_info.dst_orig); kfree(md_dst); } EXPORT_SYMBOL_GPL(metadata_dst_free); struct metadata_dst __percpu * metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) { int cpu; struct metadata_dst __percpu *md_dst; md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen, __alignof__(struct metadata_dst), flags); if (!md_dst) return NULL; for_each_possible_cpu(cpu) __metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen); return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst) { int cpu; for_each_possible_cpu(cpu) { struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu); #ifdef CONFIG_DST_CACHE if (one_md_dst->type == METADATA_IP_TUNNEL) dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache); #endif if (one_md_dst->type == METADATA_XFRM) dst_release(one_md_dst->u.xfrm_info.dst_orig); } free_percpu(md_dst); } EXPORT_SYMBOL_GPL(metadata_dst_free_percpu);
6 5 1 9 4 8 7 1 5 1 1 8 7 1 6 6 5 5 5 5 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 // SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" struct pause_req_info { struct ethnl_req_info base; enum ethtool_mac_stats_src src; }; #define PAUSE_REQINFO(__req_base) \ container_of(__req_base, struct pause_req_info, base) struct pause_reply_data { struct ethnl_reply_data base; struct ethtool_pauseparam pauseparam; struct ethtool_pause_stats pausestat; }; #define PAUSE_REPDATA(__reply_base) \ container_of(__reply_base, struct pause_reply_data, base) const struct nla_policy ethnl_pause_get_policy[] = { [ETHTOOL_A_PAUSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), [ETHTOOL_A_PAUSE_STATS_SRC] = NLA_POLICY_MAX(NLA_U32, ETHTOOL_MAC_STATS_SRC_PMAC), }; static int pause_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb, struct netlink_ext_ack *extack) { enum ethtool_mac_stats_src src = ETHTOOL_MAC_STATS_SRC_AGGREGATE; struct pause_req_info *req_info = PAUSE_REQINFO(req_base); if (tb[ETHTOOL_A_PAUSE_STATS_SRC]) { if (!(req_base->flags & ETHTOOL_FLAG_STATS)) { NL_SET_ERR_MSG_MOD(extack, "ETHTOOL_FLAG_STATS must be set when requesting a source of stats"); return -EINVAL; } src = nla_get_u32(tb[ETHTOOL_A_PAUSE_STATS_SRC]); } req_info->src = src; return 0; } static int pause_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { const struct pause_req_info *req_info = PAUSE_REQINFO(req_base); struct pause_reply_data *data = PAUSE_REPDATA(reply_base); enum ethtool_mac_stats_src src = req_info->src; struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_pauseparam) return -EOPNOTSUPP; ethtool_stats_init((u64 *)&data->pausestat, sizeof(data->pausestat) / 8); data->pausestat.src = src; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; if ((src == ETHTOOL_MAC_STATS_SRC_EMAC || src == ETHTOOL_MAC_STATS_SRC_PMAC) && !__ethtool_dev_mm_supported(dev)) { NL_SET_ERR_MSG_MOD(info->extack, "Device does not support MAC merge layer"); ethnl_ops_complete(dev); return -EOPNOTSUPP; } dev->ethtool_ops->get_pauseparam(dev, &data->pauseparam); if (req_base->flags & ETHTOOL_FLAG_STATS && dev->ethtool_ops->get_pause_stats) dev->ethtool_ops->get_pause_stats(dev, &data->pausestat); ethnl_ops_complete(dev); return 0; } static int pause_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { int n = nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */ nla_total_size(sizeof(u8)) + /* _PAUSE_RX */ nla_total_size(sizeof(u8)); /* _PAUSE_TX */ if (req_base->flags & ETHTOOL_FLAG_STATS) n += nla_total_size(0) + /* _PAUSE_STATS */ nla_total_size(sizeof(u32)) + /* _PAUSE_STATS_SRC */ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_PAUSE_STAT_CNT; return n; } static int ethtool_put_stat(struct sk_buff *skb, u64 val, u16 attrtype, u16 padtype) { if (val == ETHTOOL_STAT_NOT_SET) return 0; if (nla_put_u64_64bit(skb, attrtype, val, padtype)) return -EMSGSIZE; return 0; } static int pause_put_stats(struct sk_buff *skb, const struct ethtool_pause_stats *pause_stats) { const u16 pad = ETHTOOL_A_PAUSE_STAT_PAD; struct nlattr *nest; if (nla_put_u32(skb, ETHTOOL_A_PAUSE_STATS_SRC, pause_stats->src)) return -EMSGSIZE; nest = nla_nest_start(skb, ETHTOOL_A_PAUSE_STATS); if (!nest) return -EMSGSIZE; if (ethtool_put_stat(skb, pause_stats->tx_pause_frames, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, pad) || ethtool_put_stat(skb, pause_stats->rx_pause_frames, ETHTOOL_A_PAUSE_STAT_RX_FRAMES, pad)) goto err_cancel; nla_nest_end(skb, nest); return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int pause_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct pause_reply_data *data = PAUSE_REPDATA(reply_base); const struct ethtool_pauseparam *pauseparam = &data->pauseparam; if (nla_put_u8(skb, ETHTOOL_A_PAUSE_AUTONEG, !!pauseparam->autoneg) || nla_put_u8(skb, ETHTOOL_A_PAUSE_RX, !!pauseparam->rx_pause) || nla_put_u8(skb, ETHTOOL_A_PAUSE_TX, !!pauseparam->tx_pause)) return -EMSGSIZE; if (req_base->flags & ETHTOOL_FLAG_STATS && pause_put_stats(skb, &data->pausestat)) return -EMSGSIZE; return 0; } /* PAUSE_SET */ const struct nla_policy ethnl_pause_set_policy[] = { [ETHTOOL_A_PAUSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_RX] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 }, }; static int ethnl_set_pause_validate(struct ethnl_req_info *req_info, struct genl_info *info) { const struct ethtool_ops *ops = req_info->dev->ethtool_ops; return ops->get_pauseparam && ops->set_pauseparam ? 1 : -EOPNOTSUPP; } static int ethnl_set_pause(struct ethnl_req_info *req_info, struct genl_info *info) { struct net_device *dev = req_info->dev; struct ethtool_pauseparam params = {}; struct nlattr **tb = info->attrs; bool mod = false; int ret; dev->ethtool_ops->get_pauseparam(dev, &params); ethnl_update_bool32(&params.autoneg, tb[ETHTOOL_A_PAUSE_AUTONEG], &mod); ethnl_update_bool32(&params.rx_pause, tb[ETHTOOL_A_PAUSE_RX], &mod); ethnl_update_bool32(&params.tx_pause, tb[ETHTOOL_A_PAUSE_TX], &mod); if (!mod) return 0; ret = dev->ethtool_ops->set_pauseparam(dev, &params); return ret < 0 ? ret : 1; } const struct ethnl_request_ops ethnl_pause_request_ops = { .request_cmd = ETHTOOL_MSG_PAUSE_GET, .reply_cmd = ETHTOOL_MSG_PAUSE_GET_REPLY, .hdr_attr = ETHTOOL_A_PAUSE_HEADER, .req_info_size = sizeof(struct pause_req_info), .reply_data_size = sizeof(struct pause_reply_data), .parse_request = pause_parse_request, .prepare_data = pause_prepare_data, .reply_size = pause_reply_size, .fill_reply = pause_fill_reply, .set_validate = ethnl_set_pause_validate, .set = ethnl_set_pause, .set_ntf_cmd = ETHTOOL_MSG_PAUSE_NTF, };
6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 /* SPDX-License-Identifier: GPL-2.0 */ /* * win_minmax.h: windowed min/max tracker by Kathleen Nichols. * */ #ifndef MINMAX_H #define MINMAX_H #include <linux/types.h> /* A single data point for our parameterized min-max tracker */ struct minmax_sample { u32 t; /* time measurement was taken */ u32 v; /* value measured */ }; /* State for the parameterized min-max tracker */ struct minmax { struct minmax_sample s[3]; }; static inline u32 minmax_get(const struct minmax *m) { return m->s[0].v; } static inline u32 minmax_reset(struct minmax *m, u32 t, u32 meas) { struct minmax_sample val = { .t = t, .v = meas }; m->s[2] = m->s[1] = m->s[0] = val; return m->s[0].v; } u32 minmax_running_max(struct minmax *m, u32 win, u32 t, u32 meas); u32 minmax_running_min(struct minmax *m, u32 win, u32 t, u32 meas); #endif
662 646 13 32 12 21 28 572 88 641 29 7 660 29 29 244 322 58 109 405 405 611 573 88 9 88 256 401 401 574 88 611 572 88 611 7 7 21 604 258 401 401 573 88 608 21 14 7 14 7 21 21 21 579 88 606 670 16 660 671 37 641 22 667 622 150 666 83 82 82 82 605 606 604 590 26 283 564 568 88 587 36 607 607 606 282 565 281 566 21 19 1 1 682 681 682 682 683 681 28 28 27 27 28 28 14 5 5 5 5 5 5 5 5 5 5 670 669 671 670 654 4 21 653 28 28 28 28 3 21 21 4 28 28 671 657 670 15 660 662 669 669 5 13 667 664 667 667 666 667 659 5 5 670 443 284 283 670 208 564 672 670 665 39 186 616 12 8 668 670 669 671 670 672 671 671 3 5 5 5 5 5 5 5 5 5 5 657 109 109 109 109 659 627 109 15 109 659 606 109 64 29 43 114 505 9 256 61 192 257 9 255 5 254 255 209 30 7 24 24 608 452 196 195 421 3 2 82 665 4 666 15 3 674 51 12 671 417 83 351 350 417 114 165 594 594 489 160 48 113 129 510 609 677 669 670 655 607 610 81 2 85 676 677 515 203 677 510 204 5 5 5 1 1 8 8 8 7 7 2 3 2 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 // SPDX-License-Identifier: GPL-2.0-or-later /* * VMA-specific functions. */ #include "vma_internal.h" #include "vma.h" struct mmap_state { struct mm_struct *mm; struct vma_iterator *vmi; unsigned long addr; unsigned long end; pgoff_t pgoff; unsigned long pglen; unsigned long flags; struct file *file; unsigned long charged; bool retry_merge; struct vm_area_struct *prev; struct vm_area_struct *next; /* Unmapping state. */ struct vma_munmap_struct vms; struct ma_state mas_detach; struct maple_tree mt_detach; }; #define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_) \ struct mmap_state name = { \ .mm = mm_, \ .vmi = vmi_, \ .addr = addr_, \ .end = (addr_) + (len_), \ .pgoff = pgoff_, \ .pglen = PHYS_PFN(len_), \ .flags = flags_, \ .file = file_, \ } #define VMG_MMAP_STATE(name, map_, vma_) \ struct vma_merge_struct name = { \ .mm = (map_)->mm, \ .vmi = (map_)->vmi, \ .start = (map_)->addr, \ .end = (map_)->end, \ .flags = (map_)->flags, \ .pgoff = (map_)->pgoff, \ .file = (map_)->file, \ .prev = (map_)->prev, \ .vma = vma_, \ .next = (vma_) ? NULL : (map_)->next, \ .state = VMA_MERGE_START, \ .merge_flags = VMG_FLAG_DEFAULT, \ } static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; if (!mpol_equal(vmg->policy, vma_policy(vma))) return false; /* * VM_SOFTDIRTY should not prevent from VMA merging, if we * match the flags but dirty bit -- the caller should mark * merged VMA as dirty. If dirty bit won't be excluded from * comparison, we increase pressure on the memory system forcing * the kernel to generate new VMAs when old one could be * extended instead. */ if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY) return false; if (vma->vm_file != vmg->file) return false; if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx)) return false; if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name)) return false; return true; } static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1, struct anon_vma *anon_vma2, struct vm_area_struct *vma) { /* * The list_is_singular() test is to avoid merging VMA cloned from * parents. This can improve scalability caused by anon_vma lock. */ if ((!anon_vma1 || !anon_vma2) && (!vma || list_is_singular(&vma->anon_vma_chain))) return true; return anon_vma1 == anon_vma2; } /* Are the anon_vma's belonging to each VMA compatible with one another? */ static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1, struct vm_area_struct *vma2) { return is_mergeable_anon_vma(vma1->anon_vma, vma2->anon_vma, NULL); } /* * init_multi_vma_prep() - Initializer for struct vma_prepare * @vp: The vma_prepare struct * @vma: The vma that will be altered once locked * @next: The next vma if it is to be adjusted * @remove: The first vma to be removed * @remove2: The second vma to be removed */ static void init_multi_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma, struct vm_area_struct *next, struct vm_area_struct *remove, struct vm_area_struct *remove2) { memset(vp, 0, sizeof(struct vma_prepare)); vp->vma = vma; vp->anon_vma = vma->anon_vma; vp->remove = remove; vp->remove2 = remove2; vp->adj_next = next; if (!vp->anon_vma && next) vp->anon_vma = next->anon_vma; vp->file = vma->vm_file; if (vp->file) vp->mapping = vma->vm_file->f_mapping; } /* * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) * in front of (at a lower virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. * * We don't check here for the merged mmap wrapping around the end of pagecache * indices (16TB on ia32) because do_mmap() does not permit mmap's which * wrap, nor mmaps which cover the final page at index -1UL. * * We assume the vma may be removed as part of the merge. */ static bool can_vma_merge_before(struct vma_merge_struct *vmg) { pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); if (is_mergeable_vma(vmg, /* merge_next = */ true) && is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) { if (vmg->next->vm_pgoff == vmg->pgoff + pglen) return true; } return false; } /* * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff) * beyond (at a higher virtual address and file offset than) the vma. * * We cannot merge two vmas if they have differently assigned (non-NULL) * anon_vmas, nor if same anon_vma is assigned but offsets incompatible. * * We assume that vma is not removed as part of the merge. */ static bool can_vma_merge_after(struct vma_merge_struct *vmg) { if (is_mergeable_vma(vmg, /* merge_next = */ false) && is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) { if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff) return true; } return false; } static void __vma_link_file(struct vm_area_struct *vma, struct address_space *mapping) { if (vma_is_shared_maywrite(vma)) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_insert(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* * Requires inode->i_mapping->i_mmap_rwsem */ static void __remove_shared_vm_struct(struct vm_area_struct *vma, struct address_space *mapping) { if (vma_is_shared_maywrite(vma)) mapping_unmap_writable(mapping); flush_dcache_mmap_lock(mapping); vma_interval_tree_remove(vma, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); } /* * vma has some anon_vma assigned, and is already inserted on that * anon_vma's interval trees. * * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the * vma must be removed from the anon_vma's interval trees using * anon_vma_interval_tree_pre_update_vma(). * * After the update, the vma will be reinserted using * anon_vma_interval_tree_post_update_vma(). * * The entire update must be protected by exclusive mmap_lock and by * the root anon_vma's mutex. */ static void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma) { struct anon_vma_chain *avc; list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root); } static void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) { struct anon_vma_chain *avc; list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); } /* * vma_prepare() - Helper function for handling locking VMAs prior to altering * @vp: The initialized vma_prepare struct */ static void vma_prepare(struct vma_prepare *vp) { if (vp->file) { uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); if (vp->adj_next) uprobe_munmap(vp->adj_next, vp->adj_next->vm_start, vp->adj_next->vm_end); i_mmap_lock_write(vp->mapping); if (vp->insert && vp->insert->vm_file) { /* * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. */ __vma_link_file(vp->insert, vp->insert->vm_file->f_mapping); } } if (vp->anon_vma) { anon_vma_lock_write(vp->anon_vma); anon_vma_interval_tree_pre_update_vma(vp->vma); if (vp->adj_next) anon_vma_interval_tree_pre_update_vma(vp->adj_next); } if (vp->file) { flush_dcache_mmap_lock(vp->mapping); vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap); if (vp->adj_next) vma_interval_tree_remove(vp->adj_next, &vp->mapping->i_mmap); } } /* * vma_complete- Helper function for handling the unlocking after altering VMAs, * or for inserting a VMA. * * @vp: The vma_prepare struct * @vmi: The vma iterator * @mm: The mm_struct */ static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi, struct mm_struct *mm) { if (vp->file) { if (vp->adj_next) vma_interval_tree_insert(vp->adj_next, &vp->mapping->i_mmap); vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap); flush_dcache_mmap_unlock(vp->mapping); } if (vp->remove && vp->file) { __remove_shared_vm_struct(vp->remove, vp->mapping); if (vp->remove2) __remove_shared_vm_struct(vp->remove2, vp->mapping); } else if (vp->insert) { /* * split_vma has split insert from vma, and needs * us to insert it before dropping the locks * (it may either follow vma or precede it). */ vma_iter_store(vmi, vp->insert); mm->map_count++; } if (vp->anon_vma) { anon_vma_interval_tree_post_update_vma(vp->vma); if (vp->adj_next) anon_vma_interval_tree_post_update_vma(vp->adj_next); anon_vma_unlock_write(vp->anon_vma); } if (vp->file) { i_mmap_unlock_write(vp->mapping); uprobe_mmap(vp->vma); if (vp->adj_next) uprobe_mmap(vp->adj_next); } if (vp->remove) { again: vma_mark_detached(vp->remove, true); if (vp->file) { uprobe_munmap(vp->remove, vp->remove->vm_start, vp->remove->vm_end); fput(vp->file); } if (vp->remove->anon_vma) anon_vma_merge(vp->vma, vp->remove); mm->map_count--; mpol_put(vma_policy(vp->remove)); if (!vp->remove2) WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end); vm_area_free(vp->remove); /* * In mprotect's case 6 (see comments on vma_merge), * we are removing both mid and next vmas */ if (vp->remove2) { vp->remove = vp->remove2; vp->remove2 = NULL; goto again; } } if (vp->insert && vp->file) uprobe_mmap(vp->insert); } /* * init_vma_prep() - Initializer wrapper for vma_prepare struct * @vp: The vma_prepare struct * @vma: The vma that will be altered once locked */ static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma) { init_multi_vma_prep(vp, vma, NULL, NULL, NULL); } /* * Can the proposed VMA be merged with the left (previous) VMA taking into * account the start position of the proposed range. */ static bool can_vma_merge_left(struct vma_merge_struct *vmg) { return vmg->prev && vmg->prev->vm_end == vmg->start && can_vma_merge_after(vmg); } /* * Can the proposed VMA be merged with the right (next) VMA taking into * account the end position of the proposed range. * * In addition, if we can merge with the left VMA, ensure that left and right * anon_vma's are also compatible. */ static bool can_vma_merge_right(struct vma_merge_struct *vmg, bool can_merge_left) { if (!vmg->next || vmg->end != vmg->next->vm_start || !can_vma_merge_before(vmg)) return false; if (!can_merge_left) return true; /* * If we can merge with prev (left) and next (right), indicating that * each VMA's anon_vma is compatible with the proposed anon_vma, this * does not mean prev and next are compatible with EACH OTHER. * * We therefore check this in addition to mergeability to either side. */ return are_anon_vmas_compatible(vmg->prev, vmg->next); } /* * Close a vm structure and free it. */ void remove_vma(struct vm_area_struct *vma, bool unreachable) { might_sleep(); vma_close(vma); if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); if (unreachable) __vm_area_free(vma); else vm_area_free(vma); } /* * Get rid of page table information in the indicated region. * * Called with the mm semaphore held. */ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next) { struct mm_struct *mm = vma->vm_mm; struct mmu_gather tlb; tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end, /* mm_wr_locked = */ true); mas_set(mas, vma->vm_end); free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING, /* mm_wr_locked = */ true); tlb_finish_mmu(&tlb); } /* * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. * VMA Iterator will point to the original VMA. */ static __must_check int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { struct vma_prepare vp; struct vm_area_struct *new; int err; WARN_ON(vma->vm_start >= addr); WARN_ON(vma->vm_end <= addr); if (vma->vm_ops && vma->vm_ops->may_split) { err = vma->vm_ops->may_split(vma, addr); if (err) return err; } new = vm_area_dup(vma); if (!new) return -ENOMEM; if (new_below) { new->vm_end = addr; } else { new->vm_start = addr; new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } err = -ENOMEM; vma_iter_config(vmi, new->vm_start, new->vm_end); if (vma_iter_prealloc(vmi, new)) goto out_free_vma; err = vma_dup_policy(vma, new); if (err) goto out_free_vmi; err = anon_vma_clone(new, vma); if (err) goto out_free_mpol; if (new->vm_file) get_file(new->vm_file); if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); vma_start_write(vma); vma_start_write(new); init_vma_prep(&vp, vma); vp.insert = new; vma_prepare(&vp); vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); if (new_below) { vma->vm_start = addr; vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT; } else { vma->vm_end = addr; } /* vma_complete stores the new vma */ vma_complete(&vp, vmi, vma->vm_mm); validate_mm(vma->vm_mm); /* Success. */ if (new_below) vma_next(vmi); else vma_prev(vmi); return 0; out_free_mpol: mpol_put(vma_policy(new)); out_free_vmi: vma_iter_free(vmi); out_free_vma: vm_area_free(new); return err; } /* * Split a vma into two pieces at address 'addr', a new vma is allocated * either for the first part or the tail. */ static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, int new_below) { if (vma->vm_mm->map_count >= sysctl_max_map_count) return -ENOMEM; return __split_vma(vmi, vma, addr, new_below); } /* * dup_anon_vma() - Helper function to duplicate anon_vma * @dst: The destination VMA * @src: The source VMA * @dup: Pointer to the destination VMA when successful. * * Returns: 0 on success. */ static int dup_anon_vma(struct vm_area_struct *dst, struct vm_area_struct *src, struct vm_area_struct **dup) { /* * Easily overlooked: when mprotect shifts the boundary, make sure the * expanding vma has anon_vma set if the shrinking vma had, to cover any * anon pages imported. */ if (src->anon_vma && !dst->anon_vma) { int ret; vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; ret = anon_vma_clone(dst, src); if (ret) return ret; *dup = dst; } return 0; } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; struct vm_area_struct *vma; VMA_ITERATOR(vmi, mm, 0); mt_validate(&mm->mm_mt); for_each_vma(vmi, vma) { #ifdef CONFIG_DEBUG_VM_RB struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; #endif unsigned long vmi_start, vmi_end; bool warn = 0; vmi_start = vma_iter_addr(&vmi); vmi_end = vma_iter_end(&vmi); if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm)) warn = 1; if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm)) warn = 1; if (warn) { pr_emerg("issue in %s\n", current->comm); dump_stack(); dump_vma(vma); pr_emerg("tree range: %px start %lx end %lx\n", vma, vmi_start, vmi_end - 1); vma_iter_dump_tree(&vmi); } #ifdef CONFIG_DEBUG_VM_RB if (anon_vma) { anon_vma_lock_read(anon_vma); list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_verify(avc); anon_vma_unlock_read(anon_vma); } #endif /* Check for a infinite loop */ if (++i > mm->map_count + 10) { i = -1; break; } } if (i != mm->map_count) { pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); bug = 1; } VM_BUG_ON_MM(bug, mm); } #endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ /* Actually perform the VMA merge operation. */ static int commit_merge(struct vma_merge_struct *vmg, struct vm_area_struct *adjust, struct vm_area_struct *remove, struct vm_area_struct *remove2, long adj_start, bool expanded) { struct vma_prepare vp; init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2); VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && vp.anon_vma != adjust->anon_vma); if (expanded) { /* Note: vma iterator must be pointing to 'start'. */ vma_iter_config(vmg->vmi, vmg->start, vmg->end); } else { vma_iter_config(vmg->vmi, adjust->vm_start + adj_start, adjust->vm_end); } if (vma_iter_prealloc(vmg->vmi, vmg->vma)) return -ENOMEM; vma_prepare(&vp); vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start); vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff); if (expanded) vma_iter_store(vmg->vmi, vmg->vma); if (adj_start) { adjust->vm_start += adj_start; adjust->vm_pgoff += PHYS_PFN(adj_start); if (adj_start < 0) { WARN_ON(expanded); vma_iter_store(vmg->vmi, adjust); } } vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm); return 0; } /* We can only remove VMAs when merging if they do not have a close hook. */ static bool can_merge_remove_vma(struct vm_area_struct *vma) { return !vma->vm_ops || !vma->vm_ops->close; } /* * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its * attributes modified. * * @vmg: Describes the modifications being made to a VMA and associated * metadata. * * When the attributes of a range within a VMA change, then it might be possible * for immediately adjacent VMAs to be merged into that VMA due to having * identical properties. * * This function checks for the existence of any such mergeable VMAs and updates * the maple tree describing the @vmg->vma->vm_mm address space to account for * this, as well as any VMAs shrunk/expanded/deleted as a result of this merge. * * As part of this operation, if a merge occurs, the @vmg object will have its * vma, start, end, and pgoff fields modified to execute the merge. Subsequent * calls to this function should reset these fields. * * Returns: The merged VMA if merge succeeds, or NULL otherwise. * * ASSUMPTIONS: * - The caller must assign the VMA to be modifed to @vmg->vma. * - The caller must have set @vmg->prev to the previous VMA, if there is one. * - The caller must not set @vmg->next, as we determine this. * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. * - vmi must be positioned within [@vmg->vma->vm_start, @vmg->vma->vm_end). */ static __must_check struct vm_area_struct *vma_merge_existing_range( struct vma_merge_struct *vmg) { struct vm_area_struct *vma = vmg->vma; struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next, *res; struct vm_area_struct *anon_dup = NULL; struct vm_area_struct *adjust = NULL; unsigned long start = vmg->start; unsigned long end = vmg->end; bool left_side = vma && start == vma->vm_start; bool right_side = vma && end == vma->vm_end; int err = 0; long adj_start = 0; bool merge_will_delete_vma, merge_will_delete_next; bool merge_left, merge_right, merge_both; bool expanded; mmap_assert_write_locked(vmg->mm); VM_WARN_ON_VMG(!vma, vmg); /* We are modifying a VMA, so caller must specify. */ VM_WARN_ON_VMG(vmg->next, vmg); /* We set this. */ VM_WARN_ON_VMG(prev && start <= prev->vm_start, vmg); VM_WARN_ON_VMG(start >= end, vmg); /* * If vma == prev, then we are offset into a VMA. Otherwise, if we are * not, we must span a portion of the VMA. */ VM_WARN_ON_VMG(vma && ((vma != prev && vmg->start != vma->vm_start) || vmg->end > vma->vm_end), vmg); /* The vmi must be positioned within vmg->vma. */ VM_WARN_ON_VMG(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start && vma_iter_addr(vmg->vmi) < vma->vm_end), vmg); vmg->state = VMA_MERGE_NOMERGE; /* * If a special mapping or if the range being modified is neither at the * furthermost left or right side of the VMA, then we have no chance of * merging and should abort. */ if (vmg->flags & VM_SPECIAL || (!left_side && !right_side)) return NULL; if (left_side) merge_left = can_vma_merge_left(vmg); else merge_left = false; if (right_side) { next = vmg->next = vma_iter_next_range(vmg->vmi); vma_iter_prev_range(vmg->vmi); merge_right = can_vma_merge_right(vmg, merge_left); } else { merge_right = false; next = NULL; } if (merge_left) /* If merging prev, position iterator there. */ vma_prev(vmg->vmi); else if (!merge_right) /* If we have nothing to merge, abort. */ return NULL; merge_both = merge_left && merge_right; /* If we span the entire VMA, a merge implies it will be deleted. */ merge_will_delete_vma = left_side && right_side; /* * If we need to remove vma in its entirety but are unable to do so, * we have no sensible recourse but to abort the merge. */ if (merge_will_delete_vma && !can_merge_remove_vma(vma)) return NULL; /* * If we merge both VMAs, then next is also deleted. This implies * merge_will_delete_vma also. */ merge_will_delete_next = merge_both; /* * If we cannot delete next, then we can reduce the operation to merging * prev and vma (thereby deleting vma). */ if (merge_will_delete_next && !can_merge_remove_vma(next)) { merge_will_delete_next = false; merge_right = false; merge_both = false; } /* No matter what happens, we will be adjusting vma. */ vma_start_write(vma); if (merge_left) vma_start_write(prev); if (merge_right) vma_start_write(next); if (merge_both) { /* * |<----->| * |-------*********-------| * prev vma next * extend delete delete */ vmg->vma = prev; vmg->start = prev->vm_start; vmg->end = next->vm_end; vmg->pgoff = prev->vm_pgoff; /* * We already ensured anon_vma compatibility above, so now it's * simply a case of, if prev has no anon_vma object, which of * next or vma contains the anon_vma we must duplicate. */ err = dup_anon_vma(prev, next->anon_vma ? next : vma, &anon_dup); } else if (merge_left) { /* * |<----->| OR * |<--------->| * |-------************* * prev vma * extend shrink/delete */ vmg->vma = prev; vmg->start = prev->vm_start; vmg->pgoff = prev->vm_pgoff; if (!merge_will_delete_vma) { adjust = vma; adj_start = vmg->end - vma->vm_start; } err = dup_anon_vma(prev, vma, &anon_dup); } else { /* merge_right */ /* * |<----->| OR * |<--------->| * *************-------| * vma next * shrink/delete extend */ pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); VM_WARN_ON_VMG(!merge_right, vmg); /* If we are offset into a VMA, then prev must be vma. */ VM_WARN_ON_VMG(vmg->start > vma->vm_start && prev && vma != prev, vmg); if (merge_will_delete_vma) { vmg->vma = next; vmg->end = next->vm_end; vmg->pgoff = next->vm_pgoff - pglen; } else { /* * We shrink vma and expand next. * * IMPORTANT: This is the ONLY case where the final * merged VMA is NOT vmg->vma, but rather vmg->next. */ vmg->start = vma->vm_start; vmg->end = start; vmg->pgoff = vma->vm_pgoff; adjust = next; adj_start = -(vma->vm_end - start); } err = dup_anon_vma(next, vma, &anon_dup); } if (err) goto abort; /* * In nearly all cases, we expand vmg->vma. There is one exception - * merge_right where we partially span the VMA. In this case we shrink * the end of vmg->vma and adjust the start of vmg->next accordingly. */ expanded = !merge_right || merge_will_delete_vma; if (commit_merge(vmg, adjust, merge_will_delete_vma ? vma : NULL, merge_will_delete_next ? next : NULL, adj_start, expanded)) { if (anon_dup) unlink_anon_vmas(anon_dup); vmg->state = VMA_MERGE_ERROR_NOMEM; return NULL; } res = merge_left ? prev : next; khugepaged_enter_vma(res, vmg->flags); vmg->state = VMA_MERGE_SUCCESS; return res; abort: vma_iter_set(vmg->vmi, start); vma_iter_load(vmg->vmi); vmg->state = VMA_MERGE_ERROR_NOMEM; return NULL; } /* * vma_merge_new_range - Attempt to merge a new VMA into address space * * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end * (exclusive), which we try to merge with any adjacent VMAs if possible. * * We are about to add a VMA to the address space starting at @vmg->start and * ending at @vmg->end. There are three different possible scenarios: * * 1. There is a VMA with identical properties immediately adjacent to the * proposed new VMA [@vmg->start, @vmg->end) either before or after it - * EXPAND that VMA: * * Proposed: |-----| or |-----| * Existing: |----| |----| * * 2. There are VMAs with identical properties immediately adjacent to the * proposed new VMA [@vmg->start, @vmg->end) both before AND after it - * EXPAND the former and REMOVE the latter: * * Proposed: |-----| * Existing: |----| |----| * * 3. There are no VMAs immediately adjacent to the proposed new VMA or those * VMAs do not have identical attributes - NO MERGE POSSIBLE. * * In instances where we can merge, this function returns the expanded VMA which * will have its range adjusted accordingly and the underlying maple tree also * adjusted. * * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer * to the VMA we expanded. * * This function adjusts @vmg to provide @vmg->next if not already specified, * and adjusts [@vmg->start, @vmg->end) to span the expanded range. * * ASSUMPTIONS: * - The caller must hold a WRITE lock on the mm_struct->mmap_lock. * - The caller must have determined that [@vmg->start, @vmg->end) is empty, other than VMAs that will be unmapped should the operation succeed. * - The caller must have specified the previous vma in @vmg->prev. * - The caller must have specified the next vma in @vmg->next. * - The caller must have positioned the vmi at or before the gap. */ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) { struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next = vmg->next; unsigned long end = vmg->end; bool can_merge_left, can_merge_right; bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND; mmap_assert_write_locked(vmg->mm); VM_WARN_ON_VMG(vmg->vma, vmg); /* vmi must point at or before the gap. */ VM_WARN_ON_VMG(vma_iter_addr(vmg->vmi) > end, vmg); vmg->state = VMA_MERGE_NOMERGE; /* Special VMAs are unmergeable, also if no prev/next. */ if ((vmg->flags & VM_SPECIAL) || (!prev && !next)) return NULL; can_merge_left = can_vma_merge_left(vmg); can_merge_right = !just_expand && can_vma_merge_right(vmg, can_merge_left); /* If we can merge with the next VMA, adjust vmg accordingly. */ if (can_merge_right) { vmg->end = next->vm_end; vmg->vma = next; } /* If we can merge with the previous VMA, adjust vmg accordingly. */ if (can_merge_left) { vmg->start = prev->vm_start; vmg->vma = prev; vmg->pgoff = prev->vm_pgoff; /* * If this merge would result in removal of the next VMA but we * are not permitted to do so, reduce the operation to merging * prev and vma. */ if (can_merge_right && !can_merge_remove_vma(next)) vmg->end = end; /* In expand-only case we are already positioned at prev. */ if (!just_expand) { /* Equivalent to going to the previous range. */ vma_prev(vmg->vmi); } } /* * Now try to expand adjacent VMA(s). This takes care of removing the * following VMA if we have VMAs on both sides. */ if (vmg->vma && !vma_expand(vmg)) { khugepaged_enter_vma(vmg->vma, vmg->flags); vmg->state = VMA_MERGE_SUCCESS; return vmg->vma; } return NULL; } /* * vma_expand - Expand an existing VMA * * @vmg: Describes a VMA expansion operation. * * Expand @vma to vmg->start and vmg->end. Can expand off the start and end. * Will expand over vmg->next if it's different from vmg->vma and vmg->end == * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with * vmg->next needs to be handled by the caller. * * Returns: 0 on success. * * ASSUMPTIONS: * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock. * - The caller must have set @vmg->vma and @vmg->next. */ int vma_expand(struct vma_merge_struct *vmg) { struct vm_area_struct *anon_dup = NULL; bool remove_next = false; struct vm_area_struct *vma = vmg->vma; struct vm_area_struct *next = vmg->next; mmap_assert_write_locked(vmg->mm); vma_start_write(vma); if (next && (vma != next) && (vmg->end == next->vm_end)) { int ret; remove_next = true; /* This should already have been checked by this point. */ VM_WARN_ON_VMG(!can_merge_remove_vma(next), vmg); vma_start_write(next); ret = dup_anon_vma(vma, next, &anon_dup); if (ret) return ret; } /* Not merging but overwriting any part of next is not handled. */ VM_WARN_ON_VMG(next && !remove_next && next != vma && vmg->end > next->vm_start, vmg); /* Only handles expanding */ VM_WARN_ON_VMG(vma->vm_start < vmg->start || vma->vm_end > vmg->end, vmg); if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true)) goto nomem; return 0; nomem: vmg->state = VMA_MERGE_ERROR_NOMEM; if (anon_dup) unlink_anon_vmas(anon_dup); return -ENOMEM; } /* * vma_shrink() - Reduce an existing VMAs memory area * @vmi: The vma iterator * @vma: The VMA to modify * @start: The new start * @end: The new end * * Returns: 0 on success, -ENOMEM otherwise */ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff) { struct vma_prepare vp; WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); if (vma->vm_start < start) vma_iter_config(vmi, vma->vm_start, start); else vma_iter_config(vmi, end, vma->vm_end); if (vma_iter_prealloc(vmi, NULL)) return -ENOMEM; vma_start_write(vma); init_vma_prep(&vp, vma); vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, 0); vma_iter_clear(vmi); vma_set_range(vma, start, end, pgoff); vma_complete(&vp, vmi, vma->vm_mm); validate_mm(vma->vm_mm); return 0; } static inline void vms_clear_ptes(struct vma_munmap_struct *vms, struct ma_state *mas_detach, bool mm_wr_locked) { struct mmu_gather tlb; if (!vms->clear_ptes) /* Nothing to do */ return; /* * We can free page tables without write-locking mmap_lock because VMAs * were isolated before we downgraded mmap_lock. */ mas_set(mas_detach, 1); tlb_gather_mmu(&tlb, vms->vma->vm_mm); update_hiwater_rss(vms->vma->vm_mm); unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, vms->vma_count, mm_wr_locked); mas_set(mas_detach, 1); /* start and end may be different if there is no prev or next vma. */ free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked); tlb_finish_mmu(&tlb); vms->clear_ptes = false; } static void vms_clean_up_area(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *vma; if (!vms->nr_pages) return; vms_clear_ptes(vms, mas_detach, true); mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) vma_close(vma); } /* * vms_complete_munmap_vmas() - Finish the munmap() operation * @vms: The vma munmap struct * @mas_detach: The maple state of the detached vmas * * This updates the mm_struct, unmaps the region, frees the resources * used for the munmap() and may downgrade the lock - if requested. Everything * needed to be done once the vma maple tree is updated. */ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *vma; struct mm_struct *mm; mm = current->mm; mm->map_count -= vms->vma_count; mm->locked_vm -= vms->locked_vm; if (vms->unlock) mmap_write_downgrade(mm); if (!vms->nr_pages) return; vms_clear_ptes(vms, mas_detach, !vms->unlock); /* Update high watermark before we lower total_vm */ update_hiwater_vm(mm); /* Stat accounting */ WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) - vms->nr_pages); /* Paranoid bookkeeping */ VM_WARN_ON(vms->exec_vm > mm->exec_vm); VM_WARN_ON(vms->stack_vm > mm->stack_vm); VM_WARN_ON(vms->data_vm > mm->data_vm); mm->exec_vm -= vms->exec_vm; mm->stack_vm -= vms->stack_vm; mm->data_vm -= vms->data_vm; /* Remove and clean up vmas */ mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) remove_vma(vma, /* unreachable = */ false); vm_unacct_memory(vms->nr_accounted); validate_mm(mm); if (vms->unlock) mmap_read_unlock(mm); __mt_destroy(mas_detach->tree); } /* * reattach_vmas() - Undo any munmap work and free resources * @mas_detach: The maple state with the detached maple tree * * Reattach any detached vmas and free up the maple tree used to track the vmas. */ static void reattach_vmas(struct ma_state *mas_detach) { struct vm_area_struct *vma; mas_set(mas_detach, 0); mas_for_each(mas_detach, vma, ULONG_MAX) vma_mark_detached(vma, false); __mt_destroy(mas_detach->tree); } /* * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree * for removal at a later date. Handles splitting first and last if necessary * and marking the vmas as isolated. * * @vms: The vma munmap struct * @mas_detach: The maple state tracking the detached tree * * Return: 0 on success, error otherwise */ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *next = NULL; int error; /* * If we need to split any vma, do it now to save pain later. * Does it split the first one? */ if (vms->start > vms->vma->vm_start) { /* * Make sure that map_count on return from munmap() will * not exceed its limit; but let map_count go just above * its limit temporarily, to help free resources as expected. */ if (vms->end < vms->vma->vm_end && vms->vma->vm_mm->map_count >= sysctl_max_map_count) { error = -ENOMEM; goto map_count_exceeded; } /* Don't bother splitting the VMA if we can't unmap it anyway */ if (!can_modify_vma(vms->vma)) { error = -EPERM; goto start_split_failed; } error = __split_vma(vms->vmi, vms->vma, vms->start, 1); if (error) goto start_split_failed; } vms->prev = vma_prev(vms->vmi); if (vms->prev) vms->unmap_start = vms->prev->vm_end; /* * Detach a range of VMAs from the mm. Using next as a temp variable as * it is always overwritten. */ for_each_vma_range(*(vms->vmi), next, vms->end) { long nrpages; if (!can_modify_vma(next)) { error = -EPERM; goto modify_vma_failed; } /* Does it split the end? */ if (next->vm_end > vms->end) { error = __split_vma(vms->vmi, next, vms->end, 0); if (error) goto end_split_failed; } vma_start_write(next); mas_set(mas_detach, vms->vma_count++); error = mas_store_gfp(mas_detach, next, GFP_KERNEL); if (error) goto munmap_gather_failed; vma_mark_detached(next, true); nrpages = vma_pages(next); vms->nr_pages += nrpages; if (next->vm_flags & VM_LOCKED) vms->locked_vm += nrpages; if (next->vm_flags & VM_ACCOUNT) vms->nr_accounted += nrpages; if (is_exec_mapping(next->vm_flags)) vms->exec_vm += nrpages; else if (is_stack_mapping(next->vm_flags)) vms->stack_vm += nrpages; else if (is_data_mapping(next->vm_flags)) vms->data_vm += nrpages; if (vms->uf) { /* * If userfaultfd_unmap_prep returns an error the vmas * will remain split, but userland will get a * highly unexpected error anyway. This is no * different than the case where the first of the two * __split_vma fails, but we don't undo the first * split, despite we could. This is unlikely enough * failure that it's not worth optimizing it for. */ error = userfaultfd_unmap_prep(next, vms->start, vms->end, vms->uf); if (error) goto userfaultfd_error; } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE BUG_ON(next->vm_start < vms->start); BUG_ON(next->vm_start > vms->end); #endif } vms->next = vma_next(vms->vmi); if (vms->next) vms->unmap_end = vms->next->vm_start; #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) /* Make sure no VMAs are about to be lost. */ { MA_STATE(test, mas_detach->tree, 0, 0); struct vm_area_struct *vma_mas, *vma_test; int test_count = 0; vma_iter_set(vms->vmi, vms->start); rcu_read_lock(); vma_test = mas_find(&test, vms->vma_count - 1); for_each_vma_range(*(vms->vmi), vma_mas, vms->end) { BUG_ON(vma_mas != vma_test); test_count++; vma_test = mas_next(&test, vms->vma_count - 1); } rcu_read_unlock(); BUG_ON(vms->vma_count != test_count); } #endif while (vma_iter_addr(vms->vmi) > vms->start) vma_iter_prev_range(vms->vmi); vms->clear_ptes = true; return 0; userfaultfd_error: munmap_gather_failed: end_split_failed: modify_vma_failed: reattach_vmas(mas_detach); start_split_failed: map_count_exceeded: return error; } /* * init_vma_munmap() - Initializer wrapper for vma_munmap_struct * @vms: The vma munmap struct * @vmi: The vma iterator * @vma: The first vm_area_struct to munmap * @start: The aligned start address to munmap * @end: The aligned end address to munmap * @uf: The userfaultfd list_head * @unlock: Unlock after the operation. Only unlocked on success */ static void init_vma_munmap(struct vma_munmap_struct *vms, struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct list_head *uf, bool unlock) { vms->vmi = vmi; vms->vma = vma; if (vma) { vms->start = start; vms->end = end; } else { vms->start = vms->end = 0; } vms->unlock = unlock; vms->uf = uf; vms->vma_count = 0; vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0; vms->exec_vm = vms->stack_vm = vms->data_vm = 0; vms->unmap_start = FIRST_USER_ADDRESS; vms->unmap_end = USER_PGTABLES_CEILING; vms->clear_ptes = false; } /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator * @vma: The starting vm_area_struct * @mm: The mm_struct * @start: The aligned start address to munmap. * @end: The aligned end address to munmap. * @uf: The userfaultfd list_head * @unlock: Set to true to drop the mmap_lock. unlocking only happens on * success. * * Return: 0 on success and drops the lock if so directed, error and leaves the * lock held otherwise. */ int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool unlock) { struct maple_tree mt_detach; MA_STATE(mas_detach, &mt_detach, 0, 0); mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_on_stack(mt_detach); struct vma_munmap_struct vms; int error; init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock); error = vms_gather_munmap_vmas(&vms, &mas_detach); if (error) goto gather_failed; error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL); if (error) goto clear_tree_failed; /* Point of no return */ vms_complete_munmap_vmas(&vms, &mas_detach); return 0; clear_tree_failed: reattach_vmas(&mas_detach); gather_failed: validate_mm(mm); return error; } /* * do_vmi_munmap() - munmap a given range. * @vmi: The vma iterator * @mm: The mm_struct * @start: The start address to munmap * @len: The length of the range to munmap * @uf: The userfaultfd list_head * @unlock: set to true if the user wants to drop the mmap_lock on success * * This function takes a @mas that is either pointing to the previous VMA or set * to MA_START and sets it up to remove the mapping(s). The @len will be * aligned. * * Return: 0 on success and drops the lock if so directed, error and leaves the * lock held otherwise. */ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock) { unsigned long end; struct vm_area_struct *vma; if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) return -EINVAL; end = start + PAGE_ALIGN(len); if (end == start) return -EINVAL; /* Find the first overlapping VMA */ vma = vma_find(vmi, end); if (!vma) { if (unlock) mmap_write_unlock(mm); return 0; } return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock); } /* * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd * context and anonymous VMA name within the range [start, end). * * As a result, we might be able to merge the newly modified VMA range with an * adjacent VMA with identical properties. * * If no merge is possible and the range does not span the entirety of the VMA, * we then need to split the VMA to accommodate the change. * * The function returns either the merged VMA, the original VMA if a split was * required instead, or an error if the split failed. */ static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg) { struct vm_area_struct *vma = vmg->vma; struct vm_area_struct *merged; /* First, try to merge. */ merged = vma_merge_existing_range(vmg); if (merged) return merged; /* Split any preceding portion of the VMA. */ if (vma->vm_start < vmg->start) { int err = split_vma(vmg->vmi, vma, vmg->start, 1); if (err) return ERR_PTR(err); } /* Split any trailing portion of the VMA. */ if (vma->vm_end > vmg->end) { int err = split_vma(vmg->vmi, vma, vmg->end, 0); if (err) return ERR_PTR(err); } return vma; } struct vm_area_struct *vma_modify_flags( struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.flags = new_flags; return vma_modify(&vmg); } struct vm_area_struct *vma_modify_flags_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, struct anon_vma_name *new_name) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.flags = new_flags; vmg.anon_name = new_name; return vma_modify(&vmg); } struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct mempolicy *new_pol) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.policy = new_pol; return vma_modify(&vmg); } struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long new_flags, struct vm_userfaultfd_ctx new_ctx) { VMG_VMA_STATE(vmg, vmi, prev, vma, start, end); vmg.flags = new_flags; vmg.uffd_ctx = new_ctx; return vma_modify(&vmg); } /* * Expand vma by delta bytes, potentially merging with an immediately adjacent * VMA with identical properties. */ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long delta) { VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta); vmg.next = vma_iter_next_rewind(vmi, NULL); vmg.vma = NULL; /* We use the VMA to populate VMG fields only. */ return vma_merge_new_range(&vmg); } void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb) { vb->count = 0; } static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb) { struct address_space *mapping; int i; mapping = vb->vmas[0]->vm_file->f_mapping; i_mmap_lock_write(mapping); for (i = 0; i < vb->count; i++) { VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping); __remove_shared_vm_struct(vb->vmas[i], mapping); } i_mmap_unlock_write(mapping); unlink_file_vma_batch_init(vb); } void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, struct vm_area_struct *vma) { if (vma->vm_file == NULL) return; if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) || vb->count == ARRAY_SIZE(vb->vmas)) unlink_file_vma_batch_process(vb); vb->vmas[vb->count] = vma; vb->count++; } void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb) { if (vb->count > 0) unlink_file_vma_batch_process(vb); } /* * Unlink a file-based vm structure from its interval tree, to hide * vma from rmap and vmtruncate before freeing its page tables. */ void unlink_file_vma(struct vm_area_struct *vma) { struct file *file = vma->vm_file; if (file) { struct address_space *mapping = file->f_mapping; i_mmap_lock_write(mapping); __remove_shared_vm_struct(vma, mapping); i_mmap_unlock_write(mapping); } } void vma_link_file(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct address_space *mapping; if (file) { mapping = file->f_mapping; i_mmap_lock_write(mapping); __vma_link_file(vma, mapping); i_mmap_unlock_write(mapping); } } int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { VMA_ITERATOR(vmi, mm, 0); vma_iter_config(&vmi, vma->vm_start, vma->vm_end); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; vma_start_write(vma); vma_iter_store(&vmi, vma); vma_link_file(vma); mm->map_count++; validate_mm(mm); return 0; } /* * Copy the vma structure to a new location in the same mm, * prior to moving page table entries, to effect an mremap move. */ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks) { struct vm_area_struct *vma = *vmap; unsigned long vma_start = vma->vm_start; struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma; bool faulted_in_anon_vma = true; VMA_ITERATOR(vmi, mm, addr); VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len); /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. */ if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) { pgoff = addr >> PAGE_SHIFT; faulted_in_anon_vma = false; } new_vma = find_vma_prev(mm, addr, &vmg.prev); if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ vmg.vma = NULL; /* New VMA range. */ vmg.pgoff = pgoff; vmg.next = vma_iter_next_rewind(&vmi, NULL); new_vma = vma_merge_new_range(&vmg); if (new_vma) { /* * Source vma may have been merged into new_vma */ if (unlikely(vma_start >= new_vma->vm_start && vma_start < new_vma->vm_end)) { /* * The only way we can get a vma_merge with * self during an mremap is if the vma hasn't * been faulted in yet and we were allowed to * reset the dst vma->vm_pgoff to the * destination address of the mremap to allow * the merge to happen. mremap must change the * vm_pgoff linearity between src and dst vmas * (in turn preventing a vma_merge) to be * safe. It is only safe to keep the vm_pgoff * linear if there are no pages mapped yet. */ VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma); *vmap = vma = new_vma; } *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff); } else { new_vma = vm_area_dup(vma); if (!new_vma) goto out; vma_set_range(new_vma, addr, addr + len, pgoff); if (vma_dup_policy(vma, new_vma)) goto out_free_vma; if (anon_vma_clone(new_vma, vma)) goto out_free_mempol; if (new_vma->vm_file) get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); if (vma_link(mm, new_vma)) goto out_vma_link; *need_rmap_locks = false; } return new_vma; out_vma_link: vma_close(new_vma); if (new_vma->vm_file) fput(new_vma->vm_file); unlink_anon_vmas(new_vma); out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: vm_area_free(new_vma); out: return NULL; } /* * Rough compatibility check to quickly see if it's even worth looking * at sharing an anon_vma. * * They need to have the same vm_file, and the flags can only differ * in things that mprotect may change. * * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that * we can merge the two vma's. For example, we refuse to merge a vma if * there is a vm_ops->close() function, because that indicates that the * driver is doing some kind of reference counting. But that doesn't * really matter for the anon_vma sharing case. */ static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) { return a->vm_end == b->vm_start && mpol_equal(vma_policy(a), vma_policy(b)) && a->vm_file == b->vm_file && !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) && b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); } /* * Do some basic sanity checking to see if we can re-use the anon_vma * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be * the same as 'old', the other will be the new one that is trying * to share the anon_vma. * * NOTE! This runs with mmap_lock held for reading, so it is possible that * the anon_vma of 'old' is concurrently in the process of being set up * by another page fault trying to merge _that_. But that's ok: if it * is being set up, that automatically means that it will be a singleton * acceptable for merging, so we can do all of this optimistically. But * we do that READ_ONCE() to make sure that we never re-load the pointer. * * IOW: that the "list_is_singular()" test on the anon_vma_chain only * matters for the 'stable anon_vma' case (ie the thing we want to avoid * is to return an anon_vma that is "complex" due to having gone through * a fork). * * We also make sure that the two vma's are compatible (adjacent, * and with the same memory policies). That's all stable, even with just * a read lock on the mmap_lock. */ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) { if (anon_vma_compatible(a, b)) { struct anon_vma *anon_vma = READ_ONCE(old->anon_vma); if (anon_vma && list_is_singular(&old->anon_vma_chain)) return anon_vma; } return NULL; } /* * find_mergeable_anon_vma is used by anon_vma_prepare, to check * neighbouring vmas for a suitable anon_vma, before it goes off * to allocate a new anon_vma. It checks because a repetitive * sequence of mprotects and faults may otherwise lead to distinct * anon_vmas being allocated, preventing vma merge in subsequent * mprotect. */ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = NULL; struct vm_area_struct *prev, *next; VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end); /* Try next first. */ next = vma_iter_load(&vmi); if (next) { anon_vma = reusable_anon_vma(next, vma, next); if (anon_vma) return anon_vma; } prev = vma_prev(&vmi); VM_BUG_ON_VMA(prev != vma, vma); prev = vma_prev(&vmi); /* Try prev next. */ if (prev) anon_vma = reusable_anon_vma(prev, prev, vma); /* * We might reach here with anon_vma == NULL if we can't find * any reusable anon_vma. * There's no absolute need to look only at touching neighbours: * we could search further afield for "compatible" anon_vmas. * But it would probably just be a waste of time searching, * or lead to too many vmas hanging off the same anon_vma. * We're trying to allow mprotect remerging later on, * not trying to minimize memory used for anon_vmas. */ return anon_vma; } static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops) { return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite); } static bool vma_is_shared_writable(struct vm_area_struct *vma) { return (vma->vm_flags & (VM_WRITE | VM_SHARED)) == (VM_WRITE | VM_SHARED); } static bool vma_fs_can_writeback(struct vm_area_struct *vma) { /* No managed pages to writeback. */ if (vma->vm_flags & VM_PFNMAP) return false; return vma->vm_file && vma->vm_file->f_mapping && mapping_can_writeback(vma->vm_file->f_mapping); } /* * Does this VMA require the underlying folios to have their dirty state * tracked? */ bool vma_needs_dirty_tracking(struct vm_area_struct *vma) { /* Only shared, writable VMAs require dirty tracking. */ if (!vma_is_shared_writable(vma)) return false; /* Does the filesystem need to be notified? */ if (vm_ops_needs_writenotify(vma->vm_ops)) return true; /* * Even if the filesystem doesn't indicate a need for writenotify, if it * can writeback, dirty tracking is still required. */ return vma_fs_can_writeback(vma); } /* * Some shared mappings will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot * to the private version (using protection_map[] without the * VM_SHARED bit). */ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) { /* If it was private or non-writable, the write bit is already clear */ if (!vma_is_shared_writable(vma)) return false; /* The backer wishes to know when pages are first written to? */ if (vm_ops_needs_writenotify(vma->vm_ops)) return true; /* The open routine did something to the protections that pgprot_modify * won't preserve? */ if (pgprot_val(vm_page_prot) != pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags))) return false; /* * Do we need to track softdirty? hugetlb does not support softdirty * tracking yet. */ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) return true; /* Do we need write faults for uffd-wp tracking? */ if (userfaultfd_wp(vma)) return true; /* Can the mapping track the dirty pages? */ return vma_fs_can_writeback(vma); } static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock); /* * We can safely modify head.next after taking the * anon_vma->root->rwsem. If some other vma in this mm shares * the same anon_vma we won't take it again. * * No need of atomic instructions here, head.next * can't change from under us thanks to the * anon_vma->root->rwsem. */ if (__test_and_set_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); } } static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) { if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { /* * AS_MM_ALL_LOCKS can't change from under us because * we hold the mm_all_locks_mutex. * * Operations on ->flags have to be atomic because * even if AS_MM_ALL_LOCKS is stable thanks to the * mm_all_locks_mutex, there may be other cpus * changing other bitflags in parallel to us. */ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock); } } /* * This operation locks against the VM for all pte/vma/mm related * operations that could ever happen on a certain mm. This includes * vmtruncate, try_to_unmap, and all page faults. * * The caller must take the mmap_lock in write mode before calling * mm_take_all_locks(). The caller isn't allowed to release the * mmap_lock until mm_drop_all_locks() returns. * * mmap_lock in write mode is required in order to block all operations * that could modify pagetables and free pages without need of * altering the vma layout. It's also needed in write mode to avoid new * anon_vmas to be associated with existing vmas. * * A single task can't take more than one mm_take_all_locks() in a row * or it would deadlock. * * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in * mapping->flags avoid to take the same lock twice, if more than one * vma in this mm is backed by the same anon_vma or address_space. * * We take locks in following order, accordingly to comment at beginning * of mm/rmap.c: * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for * hugetlb mapping); * - all vmas marked locked * - all i_mmap_rwsem locks; * - all anon_vma->rwseml * * We can take all locks within these types randomly because the VM code * doesn't nest them and we protected from parallel mm_take_all_locks() by * mm_all_locks_mutex. * * mm_take_all_locks() and mm_drop_all_locks are expensive operations * that may have to take thousand of locks. * * mm_take_all_locks() can fail if it's interrupted by signals. */ int mm_take_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; VMA_ITERATOR(vmi, mm, 0); mmap_assert_write_locked(mm); mutex_lock(&mm_all_locks_mutex); /* * vma_start_write() does not have a complement in mm_drop_all_locks() * because vma_start_write() is always asymmetrical; it marks a VMA as * being written to until mmap_write_unlock() or mmap_write_downgrade() * is reached. */ for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; vma_start_write(vma); } vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && is_vm_hugetlb_page(vma)) vm_lock_mapping(mm, vma->vm_file->f_mapping); } vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && !is_vm_hugetlb_page(vma)) vm_lock_mapping(mm, vma->vm_file->f_mapping); } vma_iter_init(&vmi, mm, 0); for_each_vma(vmi, vma) { if (signal_pending(current)) goto out_unlock; if (vma->anon_vma) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) vm_lock_anon_vma(mm, avc->anon_vma); } return 0; out_unlock: mm_drop_all_locks(mm); return -EINTR; } static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. * * We must however clear the bitflag before unlocking * the vma so the users using the anon_vma->rb_root will * never see our bitflag. * * No need of atomic instructions here, head.next * can't change from under us until we release the * anon_vma->root->rwsem. */ if (!__test_and_clear_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) BUG(); anon_vma_unlock_write(anon_vma); } } static void vm_unlock_mapping(struct address_space *mapping) { if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) { /* * AS_MM_ALL_LOCKS can't change to 0 from under us * because we hold the mm_all_locks_mutex. */ i_mmap_unlock_write(mapping); if (!test_and_clear_bit(AS_MM_ALL_LOCKS, &mapping->flags)) BUG(); } } /* * The mmap_lock cannot be released by the caller until * mm_drop_all_locks() returns. */ void mm_drop_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; VMA_ITERATOR(vmi, mm, 0); mmap_assert_write_locked(mm); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); for_each_vma(vmi, vma) { if (vma->anon_vma) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) vm_unlock_anon_vma(avc->anon_vma); if (vma->vm_file && vma->vm_file->f_mapping) vm_unlock_mapping(vma->vm_file->f_mapping); } mutex_unlock(&mm_all_locks_mutex); } /* * We account for memory if it's a private writeable mapping, * not hugepages and VM_NORESERVE wasn't set. */ static bool accountable_mapping(struct file *file, vm_flags_t vm_flags) { /* * hugetlb has its own accounting separate from the core VM * VM_HUGETLB may not be set yet so we cannot check for that flag. */ if (file && is_file_hugepages(file)) return false; return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } /* * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap() * operation. * @vms: The vma unmap structure * @mas_detach: The maple state with the detached maple tree * * Reattach any detached vmas, free up the maple tree used to track the vmas. * If that's not possible because the ptes are cleared (and vm_ops->closed() may * have been called), then a NULL is written over the vmas and the vmas are * removed (munmap() completed). */ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct ma_state *mas = &vms->vmi->mas; if (!vms->nr_pages) return; if (vms->clear_ptes) return reattach_vmas(mas_detach); /* * Aborting cannot just call the vm_ops open() because they are often * not symmetrical and state data has been lost. Resort to the old * failure method of leaving a gap where the MAP_FIXED mapping failed. */ mas_set_range(mas, vms->start, vms->end - 1); mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL); /* Clean up the insertion of the unfortunate gap */ vms_complete_munmap_vmas(vms, mas_detach); } /* * __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be * unmapped once the map operation is completed, check limits, account mapping * and clean up any pre-existing VMAs. * * @map: Mapping state. * @uf: Userfaultfd context list. * * Returns: 0 on success, error code otherwise. */ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf) { int error; struct vma_iterator *vmi = map->vmi; struct vma_munmap_struct *vms = &map->vms; /* Find the first overlapping VMA and initialise unmap state. */ vms->vma = vma_find(vmi, map->end); init_vma_munmap(vms, vmi, vms->vma, map->addr, map->end, uf, /* unlock = */ false); /* OK, we have overlapping VMAs - prepare to unmap them. */ if (vms->vma) { mt_init_flags(&map->mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_on_stack(map->mt_detach); mas_init(&map->mas_detach, &map->mt_detach, /* addr = */ 0); /* Prepare to unmap any existing mapping in the area */ error = vms_gather_munmap_vmas(vms, &map->mas_detach); if (error) { /* On error VMAs will already have been reattached. */ vms->nr_pages = 0; return error; } map->next = vms->next; map->prev = vms->prev; } else { map->next = vma_iter_next_rewind(vmi, &map->prev); } /* Check against address space limit. */ if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages)) return -ENOMEM; /* Private writable mapping: check memory availability. */ if (accountable_mapping(map->file, map->flags)) { map->charged = map->pglen; map->charged -= vms->nr_accounted; if (map->charged) { error = security_vm_enough_memory_mm(map->mm, map->charged); if (error) return error; } vms->nr_accounted = 0; map->flags |= VM_ACCOUNT; } /* * Clear PTEs while the vma is still in the tree so that rmap * cannot race with the freeing later in the truncate scenario. * This is also needed for mmap_file(), which is why vm_ops * close function is called. */ vms_clean_up_area(vms, &map->mas_detach); return 0; } static int __mmap_new_file_vma(struct mmap_state *map, struct vm_area_struct *vma) { struct vma_iterator *vmi = map->vmi; int error; vma->vm_file = get_file(map->file); error = mmap_file(vma->vm_file, vma); if (error) { fput(vma->vm_file); vma->vm_file = NULL; vma_iter_set(vmi, vma->vm_end); /* Undo any partial mapping done by a device driver. */ unmap_region(&vmi->mas, vma, map->prev, map->next); return error; } /* Drivers cannot alter the address of the VMA. */ WARN_ON_ONCE(map->addr != vma->vm_start); /* * Drivers should not permit writability when previously it was * disallowed. */ VM_WARN_ON_ONCE(map->flags != vma->vm_flags && !(map->flags & VM_MAYWRITE) && (vma->vm_flags & VM_MAYWRITE)); /* If the flags change (and are mergeable), let's retry later. */ map->retry_merge = vma->vm_flags != map->flags && !(vma->vm_flags & VM_SPECIAL); map->flags = vma->vm_flags; return 0; } /* * __mmap_new_vma() - Allocate a new VMA for the region, as merging was not * possible. * * @map: Mapping state. * @vmap: Output pointer for the new VMA. * * Returns: Zero on success, or an error. */ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) { struct vma_iterator *vmi = map->vmi; int error = 0; struct vm_area_struct *vma; /* * Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ vma = vm_area_alloc(map->mm); if (!vma) return -ENOMEM; vma_iter_config(vmi, map->addr, map->end); vma_set_range(vma, map->addr, map->end, map->pgoff); vm_flags_init(vma, map->flags); vma->vm_page_prot = vm_get_page_prot(map->flags); if (vma_iter_prealloc(vmi, vma)) { error = -ENOMEM; goto free_vma; } if (map->file) error = __mmap_new_file_vma(map, vma); else if (map->flags & VM_SHARED) error = shmem_zero_setup(vma); else vma_set_anonymous(vma); if (error) goto free_iter_vma; #ifdef CONFIG_SPARC64 /* TODO: Fix SPARC ADI! */ WARN_ON_ONCE(!arch_validate_flags(map->flags)); #endif /* Lock the VMA since it is modified after insertion into VMA tree */ vma_start_write(vma); vma_iter_store(vmi, vma); map->mm->map_count++; vma_link_file(vma); /* * vma_merge_new_range() calls khugepaged_enter_vma() too, the below * call covers the non-merge case. */ khugepaged_enter_vma(vma, map->flags); ksm_add_vma(vma); *vmap = vma; return 0; free_iter_vma: vma_iter_free(vmi); free_vma: vm_area_free(vma); return error; } /* * __mmap_complete() - Unmap any VMAs we overlap, account memory mapping * statistics, handle locking and finalise the VMA. * * @map: Mapping state. * @vma: Merged or newly allocated VMA for the mmap()'d region. */ static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) { struct mm_struct *mm = map->mm; unsigned long vm_flags = vma->vm_flags; perf_event_mmap(vma); /* Unmap any existing mapping in the area. */ vms_complete_munmap_vmas(&map->vms, &map->mas_detach); vm_stat_account(mm, vma->vm_flags, map->pglen); if (vm_flags & VM_LOCKED) { if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(mm)) vm_flags_clear(vma, VM_LOCKED_MASK); else mm->locked_vm += map->pglen; } if (vma->vm_file) uprobe_mmap(vma); /* * New (or expanded) vma always get soft dirty status. * Otherwise user-space soft-dirty page tracker won't * be able to distinguish situation when vma area unmapped, * then new mapped in-place (which must be aimed as * a completely new data area). */ vm_flags_set(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); } static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; int error; VMA_ITERATOR(vmi, mm, addr); MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); error = __mmap_prepare(&map, uf); if (error) goto abort_munmap; /* Attempt to merge with adjacent VMAs... */ if (map.prev || map.next) { VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL); vma = vma_merge_new_range(&vmg); } /* ...but if we can't, allocate a new VMA. */ if (!vma) { error = __mmap_new_vma(&map, &vma); if (error) goto unacct_error; } /* If flags changed, we might be able to merge, so try again. */ if (map.retry_merge) { struct vm_area_struct *merged; VMG_MMAP_STATE(vmg, &map, vma); vma_iter_config(map.vmi, map.addr, map.end); merged = vma_merge_existing_range(&vmg); if (merged) vma = merged; } __mmap_complete(&map, vma); return addr; /* Accounting was done by __mmap_prepare(). */ unacct_error: if (map.charged) vm_unacct_memory(map.charged); abort_munmap: vms_abort_munmap_vmas(&map.vms, &map.mas_detach); return error; } /** * mmap_region() - Actually perform the userland mapping of a VMA into * current->mm with known, aligned and overflow-checked @addr and @len, and * correctly determined VMA flags @vm_flags and page offset @pgoff. * * This is an internal memory management function, and should not be used * directly. * * The caller must write-lock current->mm->mmap_lock. * * @file: If a file-backed mapping, a pointer to the struct file describing the * file to be mapped, otherwise NULL. * @addr: The page-aligned address at which to perform the mapping. * @len: The page-aligned, non-zero, length of the mapping. * @vm_flags: The VMA flags which should be applied to the mapping. * @pgoff: If @file is specified, the page offset into the file, if not then * the virtual page offset in memory of the anonymous mapping. * @uf: Optionally, a pointer to a list head used for tracking userfaultfd unmap * events. * * Returns: Either an error, or the address at which the requested mapping has * been performed. */ unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { unsigned long ret; bool writable_file_mapping = false; mmap_assert_write_locked(current->mm); /* Check to see if MDWE is applicable. */ if (map_deny_write_exec(vm_flags, vm_flags)) return -EACCES; /* Allow architectures to sanity-check the vm_flags. */ if (!arch_validate_flags(vm_flags)) return -EINVAL; /* Map writable and ensure this isn't a sealed memfd. */ if (file && is_shared_maywrite(vm_flags)) { int error = mapping_map_writable(file->f_mapping); if (error) return error; writable_file_mapping = true; } ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); /* Clear our write mapping regardless of error. */ if (writable_file_mapping) mapping_unmap_writable(file->f_mapping); validate_mm(current->mm); return ret; } /* * do_brk_flags() - Increase the brk vma if the flags match. * @vmi: The vma iterator * @addr: The start address * @len: The length of the increase * @vma: The vma, * @flags: The VMA Flags * * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags * do not match then create a new anonymous VMA. Eventually we may be able to * do some brk-specific accounting here. */ int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; /* * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. */ flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) return -ENOMEM; if (mm->map_count > sysctl_max_map_count) return -ENOMEM; if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) return -ENOMEM; /* * Expand the existing vma if possible; Note that singular lists do not * occur after forking, so the expand will only happen on new VMAs. */ if (vma && vma->vm_end == addr) { VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr)); vmg.prev = vma; /* vmi is positioned at prev, which this mode expects. */ vmg.merge_flags = VMG_FLAG_JUST_EXPAND; if (vma_merge_new_range(&vmg)) goto out; else if (vmg_nomem(&vmg)) goto unacct_fail; } if (vma) vma_iter_next_range(vmi); /* create a vma struct for an anonymous mapping */ vma = vm_area_alloc(mm); if (!vma) goto unacct_fail; vma_set_anonymous(vma); vma_set_range(vma, addr, addr + len, addr >> PAGE_SHIFT); vm_flags_init(vma, flags); vma->vm_page_prot = vm_get_page_prot(flags); vma_start_write(vma); if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; mm->map_count++; validate_mm(mm); ksm_add_vma(vma); out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; mm->data_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vm_flags_set(vma, VM_SOFTDIRTY); return 0; mas_store_fail: vm_area_free(vma); unacct_fail: vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; } /** * unmapped_area() - Find an area between the low_limit and the high_limit with * the correct alignment and offset, all from @info. Note: current->mm is used * for the search. * * @info: The unmapped area information including the range [low_limit - * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ unsigned long unmapped_area(struct vm_unmapped_area_info *info) { unsigned long length, gap; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; VMA_ITERATOR(vmi, current->mm, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask + info->start_gap; if (length < info->length) return -ENOMEM; low_limit = info->low_limit; if (low_limit < mmap_min_addr) low_limit = mmap_min_addr; high_limit = info->high_limit; retry: if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length)) return -ENOMEM; /* * Adjust for the gap first so it doesn't interfere with the * later alignment. The first step is the minimum needed to * fulill the start gap, the next steps is the minimum to align * that. It is the minimum needed to fulill both. */ gap = vma_iter_addr(&vmi) + info->start_gap; gap += (info->align_offset - gap) & info->align_mask; tmp = vma_next(&vmi); if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ if (vm_start_gap(tmp) < gap + length - 1) { low_limit = tmp->vm_end; vma_iter_reset(&vmi); goto retry; } } else { tmp = vma_prev(&vmi); if (tmp && vm_end_gap(tmp) > gap) { low_limit = vm_end_gap(tmp); vma_iter_reset(&vmi); goto retry; } } return gap; } /** * unmapped_area_topdown() - Find an area between the low_limit and the * high_limit with the correct alignment and offset at the highest available * address, all from @info. Note: current->mm is used for the search. * * @info: The unmapped area information including the range [low_limit - * high_limit), the alignment offset and mask. * * Return: A memory address or -ENOMEM. */ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { unsigned long length, gap, gap_end; unsigned long low_limit, high_limit; struct vm_area_struct *tmp; VMA_ITERATOR(vmi, current->mm, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask + info->start_gap; if (length < info->length) return -ENOMEM; low_limit = info->low_limit; if (low_limit < mmap_min_addr) low_limit = mmap_min_addr; high_limit = info->high_limit; retry: if (vma_iter_area_highest(&vmi, low_limit, high_limit, length)) return -ENOMEM; gap = vma_iter_end(&vmi) - info->length; gap -= (gap - info->align_offset) & info->align_mask; gap_end = vma_iter_end(&vmi); tmp = vma_next(&vmi); if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */ if (vm_start_gap(tmp) < gap_end) { high_limit = vm_start_gap(tmp); vma_iter_reset(&vmi); goto retry; } } else { tmp = vma_prev(&vmi); if (tmp && vm_end_gap(tmp) > gap) { high_limit = tmp->vm_start; vma_iter_reset(&vmi); goto retry; } } return gap; } /* * Verify that the stack growth is acceptable and * update accounting. This is shared with both the * grow-up and grow-down cases. */ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow) { struct mm_struct *mm = vma->vm_mm; unsigned long new_start; /* address space limit tests */ if (!may_expand_vm(mm, vma->vm_flags, grow)) return -ENOMEM; /* Stack limit test */ if (size > rlimit(RLIMIT_STACK)) return -ENOMEM; /* mlock limit tests */ if (!mlock_future_ok(mm, vma->vm_flags, grow << PAGE_SHIFT)) return -ENOMEM; /* Check to ensure the stack will not grow into a hugetlb-only region */ new_start = (vma->vm_flags & VM_GROWSUP) ? vma->vm_start : vma->vm_end - size; if (is_hugepage_only_range(vma->vm_mm, new_start, size)) return -EFAULT; /* * Overcommit.. This must be the final test, as it will * update security statistics. */ if (security_vm_enough_memory_mm(mm, grow)) return -ENOMEM; return 0; } #if defined(CONFIG_STACK_GROWSUP) /* * PA-RISC uses this for its stack. * vma is the last one with address > vma->vm_end. Have to extend vma. */ int expand_upwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *next; unsigned long gap_addr; int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; mmap_assert_write_locked(mm); /* Guard against exceeding limits of the address space. */ address &= PAGE_MASK; if (address >= (TASK_SIZE & PAGE_MASK)) return -ENOMEM; address += PAGE_SIZE; /* Enforce stack_guard_gap */ gap_addr = address + stack_guard_gap; /* Guard against overflow */ if (gap_addr < address || gap_addr > TASK_SIZE) gap_addr = TASK_SIZE; next = find_vma_intersection(mm, vma->vm_end, gap_addr); if (next && vma_is_accessible(next)) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ } if (next) vma_iter_prev_range_limit(&vmi, address); vma_iter_config(&vmi, vma->vm_start, address); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) { vma_iter_free(&vmi); return -ENOMEM; } /* Lock the VMA before expanding to prevent concurrent page faults */ vma_start_write(vma); /* We update the anon VMA tree. */ anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address > vma->vm_end) { unsigned long size, grow; size = address - vma->vm_start; grow = (address - vma->vm_end) >> PAGE_SHIFT; error = -ENOMEM; if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; /* Overwrite old entry in mtree. */ vma_iter_store(&vmi, vma); anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } } anon_vma_unlock_write(vma->anon_vma); vma_iter_free(&vmi); validate_mm(mm); return error; } #endif /* CONFIG_STACK_GROWSUP */ /* * vma is the first one with address < vma->vm_start. Have to extend vma. * mmap_lock held for writing. */ int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *prev; int error = 0; VMA_ITERATOR(vmi, mm, vma->vm_start); if (!(vma->vm_flags & VM_GROWSDOWN)) return -EFAULT; mmap_assert_write_locked(mm); address &= PAGE_MASK; if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) return -EPERM; /* Enforce stack_guard_gap */ prev = vma_prev(&vmi); /* Check that both stack segments have the same anon_vma? */ if (prev) { if (!(prev->vm_flags & VM_GROWSDOWN) && vma_is_accessible(prev) && (address - prev->vm_end < stack_guard_gap)) return -ENOMEM; } if (prev) vma_iter_next_range_limit(&vmi, vma->vm_start); vma_iter_config(&vmi, address, vma->vm_end); if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ if (unlikely(anon_vma_prepare(vma))) { vma_iter_free(&vmi); return -ENOMEM; } /* Lock the VMA before expanding to prevent concurrent page faults */ vma_start_write(vma); /* We update the anon VMA tree. */ anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ if (address < vma->vm_start) { unsigned long size, grow; size = vma->vm_end - address; grow = (vma->vm_start - address) >> PAGE_SHIFT; error = -ENOMEM; if (grow <= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; /* Overwrite old entry in mtree. */ vma_iter_store(&vmi, vma); anon_vma_interval_tree_post_update_vma(vma); perf_event_mmap(vma); } } } anon_vma_unlock_write(vma->anon_vma); vma_iter_free(&vmi); validate_mm(mm); return error; } int __vm_munmap(unsigned long start, size_t len, bool unlock) { int ret; struct mm_struct *mm = current->mm; LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, start); if (mmap_write_lock_killable(mm)) return -EINTR; ret = do_vmi_munmap(&vmi, mm, start, len, &uf, unlock); if (ret || !unlock) mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); return ret; }
108 107 108 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 /* * Copyright (c) 2006,2007 The Regents of the University of Michigan. * All rights reserved. * * Andy Adamson <andros@citi.umich.edu> * Fred Isaman <iisaman@umich.edu> * * permission is granted to use, copy, create derivative works and * redistribute this software and such derivative works for any purpose, * so long as the name of the university of michigan is not used in * any advertising or publicity pertaining to the use or distribution * of this software without specific, written prior authorization. if * the above copyright notice or any other identification of the * university of michigan is included in any copy of any portion of * this software, then the disclaimer below must also be included. * * this software is provided as is, without representation from the * university of michigan as to its fitness for any purpose, and without * warranty by the university of michigan of any kind, either express * or implied, including without limitation the implied warranties of * merchantability and fitness for a particular purpose. the regents * of the university of michigan shall not be liable for any damages, * including special, indirect, incidental, or consequential damages, * with respect to any claim arising out or in connection with the use * of the software, even if it has been or is hereafter advised of the * possibility of such damages. */ #include <linux/module.h> #include <linux/blkdev.h> #include "blocklayout.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD static void nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b) { int i; *p++ = cpu_to_be32(1); *p++ = cpu_to_be32(b->type); *p++ = cpu_to_be32(b->simple.nr_sigs); for (i = 0; i < b->simple.nr_sigs; i++) { p = xdr_encode_hyper(p, b->simple.sigs[i].offset); p = xdr_encode_opaque(p, b->simple.sigs[i].sig, b->simple.sigs[i].sig_len); } } dev_t bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, gfp_t gfp_mask) { struct net *net = server->nfs_client->cl_net; struct nfs_net *nn = net_generic(net, nfs_net_id); struct bl_dev_msg *reply = &nn->bl_mount_reply; struct bl_pipe_msg bl_pipe_msg; struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; struct bl_msg_hdr *bl_msg; DECLARE_WAITQUEUE(wq, current); dev_t dev = 0; int rc; dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); mutex_lock(&nn->bl_mutex); bl_pipe_msg.bl_wq = &nn->bl_wq; b->simple.len += 4; /* single volume */ if (b->simple.len > PAGE_SIZE) goto out_unlock; memset(msg, 0, sizeof(*msg)); msg->len = sizeof(*bl_msg) + b->simple.len; msg->data = kzalloc(msg->len, gfp_mask); if (!msg->data) goto out_unlock; bl_msg = msg->data; bl_msg->type = BL_DEVICE_MOUNT; bl_msg->totallen = b->simple.len; nfs4_encode_simple(msg->data + sizeof(*bl_msg), b); dprintk("%s CALLING USERSPACE DAEMON\n", __func__); add_wait_queue(&nn->bl_wq, &wq); rc = rpc_queue_upcall(nn->bl_device_pipe, msg); if (rc < 0) { remove_wait_queue(&nn->bl_wq, &wq); goto out_free_data; } set_current_state(TASK_UNINTERRUPTIBLE); schedule(); remove_wait_queue(&nn->bl_wq, &wq); if (reply->status != BL_DEVICE_REQUEST_PROC) { printk(KERN_WARNING "%s failed to decode device: %d\n", __func__, reply->status); goto out_free_data; } dev = MKDEV(reply->major, reply->minor); out_free_data: kfree(msg->data); out_unlock: mutex_unlock(&nn->bl_mutex); return dev; } static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { struct nfs_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info, nfs_net_id); if (mlen != sizeof (struct bl_dev_msg)) return -EINVAL; if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0) return -EFAULT; wake_up(&nn->bl_wq); return mlen; } static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) { struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg); if (msg->errno >= 0) return; wake_up(bl_pipe_msg->bl_wq); } static const struct rpc_pipe_ops bl_upcall_ops = { .upcall = rpc_pipe_generic_upcall, .downcall = bl_pipe_downcall, .destroy_msg = bl_pipe_destroy_msg, }; static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb, struct rpc_pipe *pipe) { struct dentry *dir, *dentry; dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME); if (dir == NULL) return ERR_PTR(-ENOENT); dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); dput(dir); return dentry; } static void nfs4blocklayout_unregister_sb(struct super_block *sb, struct rpc_pipe *pipe) { if (pipe->dentry) rpc_unlink(pipe->dentry); } static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct super_block *sb = ptr; struct net *net = sb->s_fs_info; struct nfs_net *nn = net_generic(net, nfs_net_id); struct dentry *dentry; int ret = 0; if (!try_module_get(THIS_MODULE)) return 0; if (nn->bl_device_pipe == NULL) { module_put(THIS_MODULE); return 0; } switch (event) { case RPC_PIPEFS_MOUNT: dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); break; } nn->bl_device_pipe->dentry = dentry; break; case RPC_PIPEFS_UMOUNT: if (nn->bl_device_pipe->dentry) nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe); break; default: ret = -ENOTSUPP; break; } module_put(THIS_MODULE); return ret; } static struct notifier_block nfs4blocklayout_block = { .notifier_call = rpc_pipefs_event, }; static struct dentry *nfs4blocklayout_register_net(struct net *net, struct rpc_pipe *pipe) { struct super_block *pipefs_sb; struct dentry *dentry; pipefs_sb = rpc_get_sb_net(net); if (!pipefs_sb) return NULL; dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe); rpc_put_sb_net(net); return dentry; } static void nfs4blocklayout_unregister_net(struct net *net, struct rpc_pipe *pipe) { struct super_block *pipefs_sb; pipefs_sb = rpc_get_sb_net(net); if (pipefs_sb) { nfs4blocklayout_unregister_sb(pipefs_sb, pipe); rpc_put_sb_net(net); } } static int nfs4blocklayout_net_init(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); struct dentry *dentry; mutex_init(&nn->bl_mutex); init_waitqueue_head(&nn->bl_wq); nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); if (IS_ERR(nn->bl_device_pipe)) return PTR_ERR(nn->bl_device_pipe); dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe); if (IS_ERR(dentry)) { rpc_destroy_pipe_data(nn->bl_device_pipe); return PTR_ERR(dentry); } nn->bl_device_pipe->dentry = dentry; return 0; } static void nfs4blocklayout_net_exit(struct net *net) { struct nfs_net *nn = net_generic(net, nfs_net_id); nfs4blocklayout_unregister_net(net, nn->bl_device_pipe); rpc_destroy_pipe_data(nn->bl_device_pipe); nn->bl_device_pipe = NULL; } static struct pernet_operations nfs4blocklayout_net_ops = { .init = nfs4blocklayout_net_init, .exit = nfs4blocklayout_net_exit, }; int __init bl_init_pipefs(void) { int ret; ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block); if (ret) goto out; ret = register_pernet_subsys(&nfs4blocklayout_net_ops); if (ret) goto out_unregister_notifier; return 0; out_unregister_notifier: rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); out: return ret; } void bl_cleanup_pipefs(void) { rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); unregister_pernet_subsys(&nfs4blocklayout_net_ops); }
8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 // SPDX-License-Identifier: GPL-2.0-only /* * * Authors: * Alexander Aring <aar@pengutronix.de> * * Based on: net/wireless/sysfs.c */ #include <linux/device.h> #include <linux/rtnetlink.h> #include <net/cfg802154.h> #include "core.h" #include "sysfs.h" #include "rdev-ops.h" static inline struct cfg802154_registered_device * dev_to_rdev(struct device *dev) { return container_of(dev, struct cfg802154_registered_device, wpan_phy.dev); } #define SHOW_FMT(name, fmt, member) \ static ssize_t name ## _show(struct device *dev, \ struct device_attribute *attr, \ char *buf) \ { \ return sprintf(buf, fmt "\n", dev_to_rdev(dev)->member); \ } \ static DEVICE_ATTR_RO(name) SHOW_FMT(index, "%d", wpan_phy_idx); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wpan_phy *wpan_phy = &dev_to_rdev(dev)->wpan_phy; return sprintf(buf, "%s\n", dev_name(&wpan_phy->dev)); } static DEVICE_ATTR_RO(name); static void wpan_phy_release(struct device *dev) { struct cfg802154_registered_device *rdev = dev_to_rdev(dev); cfg802154_dev_free(rdev); } static struct attribute *pmib_attrs[] = { &dev_attr_index.attr, &dev_attr_name.attr, NULL, }; ATTRIBUTE_GROUPS(pmib); #ifdef CONFIG_PM_SLEEP static int wpan_phy_suspend(struct device *dev) { struct cfg802154_registered_device *rdev = dev_to_rdev(dev); int ret = 0; if (rdev->ops->suspend) { rtnl_lock(); ret = rdev_suspend(rdev); rtnl_unlock(); } return ret; } static int wpan_phy_resume(struct device *dev) { struct cfg802154_registered_device *rdev = dev_to_rdev(dev); int ret = 0; if (rdev->ops->resume) { rtnl_lock(); ret = rdev_resume(rdev); rtnl_unlock(); } return ret; } static SIMPLE_DEV_PM_OPS(wpan_phy_pm_ops, wpan_phy_suspend, wpan_phy_resume); #define WPAN_PHY_PM_OPS (&wpan_phy_pm_ops) #else #define WPAN_PHY_PM_OPS NULL #endif const struct class wpan_phy_class = { .name = "ieee802154", .dev_release = wpan_phy_release, .dev_groups = pmib_groups, .pm = WPAN_PHY_PM_OPS, }; int wpan_phy_sysfs_init(void) { return class_register(&wpan_phy_class); } void wpan_phy_sysfs_exit(void) { class_unregister(&wpan_phy_class); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SPECIAL_INSNS_H #define _ASM_X86_SPECIAL_INSNS_H #ifdef __KERNEL__ #include <asm/nops.h> #include <asm/processor-flags.h> #include <linux/errno.h> #include <linux/irqflags.h> #include <linux/jump_label.h> /* * The compiler should not reorder volatile asm statements with respect to each * other: they should execute in program order. However GCC 4.9.x and 5.x have * a bug (which was fixed in 8.1, 7.3 and 6.5) where they might reorder * volatile asm. The write functions are not affected since they have memory * clobbers preventing reordering. To prevent reads from being reordered with * respect to writes, use a dummy memory operand. */ #define __FORCE_ORDER "m"(*(unsigned int *)0x1000UL) void native_write_cr0(unsigned long val); static inline unsigned long native_read_cr0(void) { unsigned long val; asm volatile("mov %%cr0,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } static __always_inline unsigned long native_read_cr2(void) { unsigned long val; asm volatile("mov %%cr2,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } static __always_inline void native_write_cr2(unsigned long val) { asm volatile("mov %0,%%cr2": : "r" (val) : "memory"); } static inline unsigned long __native_read_cr3(void) { unsigned long val; asm volatile("mov %%cr3,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } static inline void native_write_cr3(unsigned long val) { asm volatile("mov %0,%%cr3": : "r" (val) : "memory"); } static inline unsigned long native_read_cr4(void) { unsigned long val; #ifdef CONFIG_X86_32 /* * This could fault if CR4 does not exist. Non-existent CR4 * is functionally equivalent to CR4 == 0. Keep it simple and pretend * that CR4 == 0 on CPUs that don't have CR4. */ asm volatile("1: mov %%cr4, %0\n" "2:\n" _ASM_EXTABLE(1b, 2b) : "=r" (val) : "0" (0), __FORCE_ORDER); #else /* CR4 always exists on x86_64. */ asm volatile("mov %%cr4,%0\n\t" : "=r" (val) : __FORCE_ORDER); #endif return val; } void native_write_cr4(unsigned long val); #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS static inline u32 rdpkru(void) { u32 ecx = 0; u32 edx, pkru; /* * "rdpkru" instruction. Places PKRU contents in to EAX, * clears EDX and requires that ecx=0. */ asm volatile(".byte 0x0f,0x01,0xee\n\t" : "=a" (pkru), "=d" (edx) : "c" (ecx)); return pkru; } static inline void wrpkru(u32 pkru) { u32 ecx = 0, edx = 0; /* * "wrpkru" instruction. Loads contents in EAX to PKRU, * requires that ecx = edx = 0. */ asm volatile(".byte 0x0f,0x01,0xef\n\t" : : "a" (pkru), "c"(ecx), "d"(edx)); } #else static inline u32 rdpkru(void) { return 0; } static inline void wrpkru(u32 pkru) { } #endif static __always_inline void wbinvd(void) { asm volatile("wbinvd": : :"memory"); } static inline unsigned long __read_cr4(void) { return native_read_cr4(); } #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else static inline unsigned long read_cr0(void) { return native_read_cr0(); } static inline void write_cr0(unsigned long x) { native_write_cr0(x); } static __always_inline unsigned long read_cr2(void) { return native_read_cr2(); } static __always_inline void write_cr2(unsigned long x) { native_write_cr2(x); } /* * Careful! CR3 contains more than just an address. You probably want * read_cr3_pa() instead. */ static inline unsigned long __read_cr3(void) { return __native_read_cr3(); } static inline void write_cr3(unsigned long x) { native_write_cr3(x); } static inline void __write_cr4(unsigned long x) { native_write_cr4(x); } #endif /* CONFIG_PARAVIRT_XXL */ static __always_inline void clflush(volatile void *__p) { asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); } static inline void clflushopt(volatile void *__p) { alternative_io(".byte 0x3e; clflush %0", ".byte 0x66; clflush %0", X86_FEATURE_CLFLUSHOPT, "+m" (*(volatile char __force *)__p)); } static inline void clwb(volatile void *__p) { volatile struct { char x[64]; } *p = __p; asm volatile(ALTERNATIVE_2( ".byte 0x3e; clflush (%[pax])", ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */ X86_FEATURE_CLFLUSHOPT, ".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */ X86_FEATURE_CLWB) : [p] "+m" (*p) : [pax] "a" (p)); } #ifdef CONFIG_X86_USER_SHADOW_STACK static inline int write_user_shstk_64(u64 __user *addr, u64 val) { asm goto("1: wrussq %[val], %[addr]\n" _ASM_EXTABLE(1b, %l[fail]) :: [addr] "m" (*addr), [val] "r" (val) :: fail); return 0; fail: return -EFAULT; } #endif /* CONFIG_X86_USER_SHADOW_STACK */ #define nop() asm volatile ("nop") static __always_inline void serialize(void) { /* Instruction opcode for SERIALIZE; supported in binutils >= 2.35. */ asm volatile(".byte 0xf, 0x1, 0xe8" ::: "memory"); } /* The dst parameter must be 64-bytes aligned */ static inline void movdir64b(void *dst, const void *src) { const struct { char _[64]; } *__src = src; struct { char _[64]; } *__dst = dst; /* * MOVDIR64B %(rdx), rax. * * Both __src and __dst must be memory constraints in order to tell the * compiler that no other memory accesses should be reordered around * this one. * * Also, both must be supplied as lvalues because this tells * the compiler what the object is (its size) the instruction accesses. * I.e., not the pointers but what they point to, thus the deref'ing '*'. */ asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02" : "+m" (*__dst) : "m" (*__src), "a" (__dst), "d" (__src)); } static inline void movdir64b_io(void __iomem *dst, const void *src) { movdir64b((void __force *)dst, src); } /** * enqcmds - Enqueue a command in supervisor (CPL0) mode * @dst: destination, in MMIO space (must be 512-bit aligned) * @src: 512 bits memory operand * * The ENQCMDS instruction allows software to write a 512-bit command to * a 512-bit-aligned special MMIO region that supports the instruction. * A return status is loaded into the ZF flag in the RFLAGS register. * ZF = 0 equates to success, and ZF = 1 indicates retry or error. * * This function issues the ENQCMDS instruction to submit data from * kernel space to MMIO space, in a unit of 512 bits. Order of data access * is not guaranteed, nor is a memory barrier performed afterwards. It * returns 0 on success and -EAGAIN on failure. * * Warning: Do not use this helper unless your driver has checked that the * ENQCMDS instruction is supported on the platform and the device accepts * ENQCMDS. */ static inline int enqcmds(void __iomem *dst, const void *src) { const struct { char _[64]; } *__src = src; struct { char _[64]; } __iomem *__dst = dst; bool zf; /* * ENQCMDS %(rdx), rax * * See movdir64b()'s comment on operand specification. */ asm volatile(".byte 0xf3, 0x0f, 0x38, 0xf8, 0x02, 0x66, 0x90" CC_SET(z) : CC_OUT(z) (zf), "+m" (*__dst) : "m" (*__src), "a" (__dst), "d" (__src)); /* Submission failure is indicated via EFLAGS.ZF=1 */ if (zf) return -EAGAIN; return 0; } static __always_inline void tile_release(void) { /* * Instruction opcode for TILERELEASE; supported in binutils * version >= 2.36. */ asm volatile(".byte 0xc4, 0xe2, 0x78, 0x49, 0xc0"); } #endif /* __KERNEL__ */ #endif /* _ASM_X86_SPECIAL_INSNS_H */
219 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the ICMP protocol. * * Version: @(#)icmp.h 1.0.3 04/28/93 * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_ICMP_H #define _LINUX_ICMP_H #include <linux/skbuff.h> #include <uapi/linux/icmp.h> #include <uapi/linux/errqueue.h> static inline struct icmphdr *icmp_hdr(const struct sk_buff *skb) { return (struct icmphdr *)skb_transport_header(skb); } static inline bool icmp_is_err(int type) { switch (type) { case ICMP_DEST_UNREACH: case ICMP_SOURCE_QUENCH: case ICMP_REDIRECT: case ICMP_TIME_EXCEEDED: case ICMP_PARAMETERPROB: return true; } return false; } void ip_icmp_error_rfc4884(const struct sk_buff *skb, struct sock_ee_data_rfc4884 *out, int thlen, int off); #endif /* _LINUX_ICMP_H */
69 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 // SPDX-License-Identifier: GPL-2.0-only /* * Software WEP encryption implementation * Copyright 2002, Jouni Malinen <jkmaline@cc.hut.fi> * Copyright 2003, Instant802 Networks, Inc. * Copyright (C) 2023 Intel Corporation */ #include <linux/netdevice.h> #include <linux/types.h> #include <linux/random.h> #include <linux/compiler.h> #include <linux/crc32.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/mm.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/unaligned.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "wep.h" void ieee80211_wep_init(struct ieee80211_local *local) { /* start WEP IV from a random value */ get_random_bytes(&local->wep_iv, IEEE80211_WEP_IV_LEN); } static inline bool ieee80211_wep_weak_iv(u32 iv, int keylen) { /* * Fluhrer, Mantin, and Shamir have reported weaknesses in the * key scheduling algorithm of RC4. At least IVs (KeyByte + 3, * 0xff, N) can be used to speedup attacks, so avoid using them. */ if ((iv & 0xff00) == 0xff00) { u8 B = (iv >> 16) & 0xff; if (B >= 3 && B < 3 + keylen) return true; } return false; } static void ieee80211_wep_get_iv(struct ieee80211_local *local, int keylen, int keyidx, u8 *iv) { local->wep_iv++; if (ieee80211_wep_weak_iv(local->wep_iv, keylen)) local->wep_iv += 0x0100; if (!iv) return; *iv++ = (local->wep_iv >> 16) & 0xff; *iv++ = (local->wep_iv >> 8) & 0xff; *iv++ = local->wep_iv & 0xff; *iv++ = keyidx << 6; } static u8 *ieee80211_wep_add_iv(struct ieee80211_local *local, struct sk_buff *skb, int keylen, int keyidx) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); unsigned int hdrlen; u8 *newhdr; hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); if (WARN_ON(skb_headroom(skb) < IEEE80211_WEP_IV_LEN)) return NULL; hdrlen = ieee80211_hdrlen(hdr->frame_control); newhdr = skb_push(skb, IEEE80211_WEP_IV_LEN); memmove(newhdr, newhdr + IEEE80211_WEP_IV_LEN, hdrlen); /* the HW only needs room for the IV, but not the actual IV */ if (info->control.hw_key && (info->control.hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) return newhdr + hdrlen; ieee80211_wep_get_iv(local, keylen, keyidx, newhdr + hdrlen); return newhdr + hdrlen; } static void ieee80211_wep_remove_iv(struct ieee80211_local *local, struct sk_buff *skb, struct ieee80211_key *key) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; unsigned int hdrlen; hdrlen = ieee80211_hdrlen(hdr->frame_control); memmove(skb->data + IEEE80211_WEP_IV_LEN, skb->data, hdrlen); skb_pull(skb, IEEE80211_WEP_IV_LEN); } /* Perform WEP encryption using given key. data buffer must have tailroom * for 4-byte ICV. data_len must not include this ICV. Note: this function * does _not_ add IV. data = RC4(data | CRC32(data)) */ int ieee80211_wep_encrypt_data(struct arc4_ctx *ctx, u8 *rc4key, size_t klen, u8 *data, size_t data_len) { __le32 icv; icv = cpu_to_le32(~crc32_le(~0, data, data_len)); put_unaligned(icv, (__le32 *)(data + data_len)); arc4_setkey(ctx, rc4key, klen); arc4_crypt(ctx, data, data, data_len + IEEE80211_WEP_ICV_LEN); memzero_explicit(ctx, sizeof(*ctx)); return 0; } /* Perform WEP encryption on given skb. 4 bytes of extra space (IV) in the * beginning of the buffer 4 bytes of extra space (ICV) in the end of the * buffer will be added. Both IV and ICV will be transmitted, so the * payload length increases with 8 bytes. * * WEP frame payload: IV + TX key idx, RC4(data), ICV = RC4(CRC32(data)) */ int ieee80211_wep_encrypt(struct ieee80211_local *local, struct sk_buff *skb, const u8 *key, int keylen, int keyidx) { u8 *iv; size_t len; u8 rc4key[3 + WLAN_KEY_LEN_WEP104]; if (WARN_ON(skb_tailroom(skb) < IEEE80211_WEP_ICV_LEN)) return -1; iv = ieee80211_wep_add_iv(local, skb, keylen, keyidx); if (!iv) return -1; len = skb->len - (iv + IEEE80211_WEP_IV_LEN - skb->data); /* Prepend 24-bit IV to RC4 key */ memcpy(rc4key, iv, 3); /* Copy rest of the WEP key (the secret part) */ memcpy(rc4key + 3, key, keylen); /* Add room for ICV */ skb_put(skb, IEEE80211_WEP_ICV_LEN); return ieee80211_wep_encrypt_data(&local->wep_tx_ctx, rc4key, keylen + 3, iv + IEEE80211_WEP_IV_LEN, len); } /* Perform WEP decryption using given key. data buffer includes encrypted * payload, including 4-byte ICV, but _not_ IV. data_len must not include ICV. * Return 0 on success and -1 on ICV mismatch. */ int ieee80211_wep_decrypt_data(struct arc4_ctx *ctx, u8 *rc4key, size_t klen, u8 *data, size_t data_len) { __le32 crc; arc4_setkey(ctx, rc4key, klen); arc4_crypt(ctx, data, data, data_len + IEEE80211_WEP_ICV_LEN); memzero_explicit(ctx, sizeof(*ctx)); crc = cpu_to_le32(~crc32_le(~0, data, data_len)); if (memcmp(&crc, data + data_len, IEEE80211_WEP_ICV_LEN) != 0) /* ICV mismatch */ return -1; return 0; } /* Perform WEP decryption on given skb. Buffer includes whole WEP part of * the frame: IV (4 bytes), encrypted payload (including SNAP header), * ICV (4 bytes). skb->len includes both IV and ICV. * * Returns 0 if frame was decrypted successfully and ICV was correct and -1 on * failure. If frame is OK, IV and ICV will be removed, i.e., decrypted payload * is moved to the beginning of the skb and skb length will be reduced. */ static int ieee80211_wep_decrypt(struct ieee80211_local *local, struct sk_buff *skb, struct ieee80211_key *key) { u32 klen; u8 rc4key[3 + WLAN_KEY_LEN_WEP104]; u8 keyidx; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; unsigned int hdrlen; size_t len; int ret = 0; if (!ieee80211_has_protected(hdr->frame_control)) return -1; hdrlen = ieee80211_hdrlen(hdr->frame_control); if (skb->len < hdrlen + IEEE80211_WEP_IV_LEN + IEEE80211_WEP_ICV_LEN) return -1; len = skb->len - hdrlen - IEEE80211_WEP_IV_LEN - IEEE80211_WEP_ICV_LEN; keyidx = skb->data[hdrlen + 3] >> 6; if (!key || keyidx != key->conf.keyidx) return -1; klen = 3 + key->conf.keylen; /* Prepend 24-bit IV to RC4 key */ memcpy(rc4key, skb->data + hdrlen, 3); /* Copy rest of the WEP key (the secret part) */ memcpy(rc4key + 3, key->conf.key, key->conf.keylen); if (ieee80211_wep_decrypt_data(&local->wep_rx_ctx, rc4key, klen, skb->data + hdrlen + IEEE80211_WEP_IV_LEN, len)) ret = -1; /* Trim ICV */ skb_trim(skb, skb->len - IEEE80211_WEP_ICV_LEN); /* Remove IV */ memmove(skb->data + IEEE80211_WEP_IV_LEN, skb->data, hdrlen); skb_pull(skb, IEEE80211_WEP_IV_LEN); return ret; } ieee80211_rx_result ieee80211_crypto_wep_decrypt(struct ieee80211_rx_data *rx) { struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; __le16 fc = hdr->frame_control; if (!ieee80211_is_data(fc) && !ieee80211_is_auth(fc)) return RX_CONTINUE; if (!(status->flag & RX_FLAG_DECRYPTED)) { if (skb_linearize(rx->skb)) return RX_DROP_U_OOM; if (ieee80211_wep_decrypt(rx->local, rx->skb, rx->key)) return RX_DROP_U_WEP_DEC_FAIL; } else if (!(status->flag & RX_FLAG_IV_STRIPPED)) { if (!pskb_may_pull(rx->skb, ieee80211_hdrlen(fc) + IEEE80211_WEP_IV_LEN)) return RX_DROP_U_NO_IV; ieee80211_wep_remove_iv(rx->local, rx->skb, rx->key); /* remove ICV */ if (!(status->flag & RX_FLAG_ICV_STRIPPED) && pskb_trim(rx->skb, rx->skb->len - IEEE80211_WEP_ICV_LEN)) return RX_DROP_U_NO_ICV; } return RX_CONTINUE; } static int wep_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_key_conf *hw_key = info->control.hw_key; if (!hw_key) { if (ieee80211_wep_encrypt(tx->local, skb, tx->key->conf.key, tx->key->conf.keylen, tx->key->conf.keyidx)) return -1; } else if ((hw_key->flags & IEEE80211_KEY_FLAG_GENERATE_IV) || (hw_key->flags & IEEE80211_KEY_FLAG_PUT_IV_SPACE)) { if (!ieee80211_wep_add_iv(tx->local, skb, tx->key->conf.keylen, tx->key->conf.keyidx)) return -1; } return 0; } ieee80211_tx_result ieee80211_crypto_wep_encrypt(struct ieee80211_tx_data *tx) { struct sk_buff *skb; ieee80211_tx_set_protected(tx); skb_queue_walk(&tx->skbs, skb) { if (wep_encrypt_skb(tx, skb) < 0) { I802_DEBUG_INC(tx->local->tx_handlers_drop_wep); return TX_DROP; } } return TX_CONTINUE; }
119 84 81 60 6 15 47 48 47 21 47 47 47 5 83 84 19 82 60 47 40 1 8 39 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 /* * INETPEER - A storage for permanent information about peers * * This source is covered by the GNU GPL, the same as all kernel sources. * * Authors: Andrey V. Savochkin <saw@msu.ru> */ #include <linux/cache.h> #include <linux/module.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/spinlock.h> #include <linux/random.h> #include <linux/timer.h> #include <linux/time.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/net.h> #include <linux/workqueue.h> #include <net/ip.h> #include <net/inetpeer.h> #include <net/secure_seq.h> /* * Theory of operations. * We keep one entry for each peer IP address. The nodes contains long-living * information about the peer which doesn't depend on routes. * * Nodes are removed only when reference counter goes to 0. * When it's happened the node may be removed when a sufficient amount of * time has been passed since its last use. The less-recently-used entry can * also be removed if the pool is overloaded i.e. if the total amount of * entries is greater-or-equal than the threshold. * * Node pool is organised as an RB tree. * Such an implementation has been chosen not just for fun. It's a way to * prevent easy and efficient DoS attacks by creating hash collisions. A huge * amount of long living nodes in a single hash slot would significantly delay * lookups performed with disabled BHs. * * Serialisation issues. * 1. Nodes may appear in the tree only with the pool lock held. * 2. Nodes may disappear from the tree only with the pool lock held * AND reference count being 0. * 3. Global variable peer_total is modified under the pool lock. * 4. struct inet_peer fields modification: * rb_node: pool lock * refcnt: atomically against modifications on other CPU; * usually under some other lock to prevent node disappearing * daddr: unchangeable */ static struct kmem_cache *peer_cachep __ro_after_init; void inet_peer_base_init(struct inet_peer_base *bp) { bp->rb_root = RB_ROOT; seqlock_init(&bp->lock); bp->total = 0; } EXPORT_IPV6_MOD_GPL(inet_peer_base_init); #define PEER_MAX_GC 32 /* Exported for sysctl_net_ipv4. */ int inet_peer_threshold __read_mostly; /* start to throw entries more * aggressively at this stage */ int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ /* Called from ip_output.c:ip_init */ void __init inet_initpeers(void) { u64 nr_entries; /* 1% of physical memory */ nr_entries = div64_ul((u64)totalram_pages() << PAGE_SHIFT, 100 * L1_CACHE_ALIGN(sizeof(struct inet_peer))); inet_peer_threshold = clamp_val(nr_entries, 4096, 65536 + 128); peer_cachep = KMEM_CACHE(inet_peer, SLAB_HWCACHE_ALIGN | SLAB_PANIC); } /* Called with rcu_read_lock() or base->lock held */ static struct inet_peer *lookup(const struct inetpeer_addr *daddr, struct inet_peer_base *base, unsigned int seq, struct inet_peer *gc_stack[], unsigned int *gc_cnt, struct rb_node **parent_p, struct rb_node ***pp_p) { struct rb_node **pp, *parent, *next; struct inet_peer *p; u32 now; pp = &base->rb_root.rb_node; parent = NULL; while (1) { int cmp; next = rcu_dereference_raw(*pp); if (!next) break; parent = next; p = rb_entry(parent, struct inet_peer, rb_node); cmp = inetpeer_addr_cmp(daddr, &p->daddr); if (cmp == 0) { now = jiffies; if (READ_ONCE(p->dtime) != now) WRITE_ONCE(p->dtime, now); return p; } if (gc_stack) { if (*gc_cnt < PEER_MAX_GC) gc_stack[(*gc_cnt)++] = p; } else if (unlikely(read_seqretry(&base->lock, seq))) { break; } if (cmp == -1) pp = &next->rb_left; else pp = &next->rb_right; } *parent_p = parent; *pp_p = pp; return NULL; } /* perform garbage collect on all items stacked during a lookup */ static void inet_peer_gc(struct inet_peer_base *base, struct inet_peer *gc_stack[], unsigned int gc_cnt) { int peer_threshold, peer_maxttl, peer_minttl; struct inet_peer *p; __u32 delta, ttl; int i; peer_threshold = READ_ONCE(inet_peer_threshold); peer_maxttl = READ_ONCE(inet_peer_maxttl); peer_minttl = READ_ONCE(inet_peer_minttl); if (base->total >= peer_threshold) ttl = 0; /* be aggressive */ else ttl = peer_maxttl - (peer_maxttl - peer_minttl) / HZ * base->total / peer_threshold * HZ; for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; delta = (__u32)jiffies - READ_ONCE(p->dtime); if (delta < ttl || !refcount_dec_if_one(&p->refcnt)) gc_stack[i] = NULL; } for (i = 0; i < gc_cnt; i++) { p = gc_stack[i]; if (p) { rb_erase(&p->rb_node, &base->rb_root); base->total--; kfree_rcu(p, rcu); } } } /* Must be called under RCU : No refcount change is done here. */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, const struct inetpeer_addr *daddr) { struct inet_peer *p, *gc_stack[PEER_MAX_GC]; struct rb_node **pp, *parent; unsigned int gc_cnt, seq; /* Attempt a lockless lookup first. * Because of a concurrent writer, we might not find an existing entry. */ seq = read_seqbegin(&base->lock); p = lookup(daddr, base, seq, NULL, &gc_cnt, &parent, &pp); if (p) return p; /* retry an exact lookup, taking the lock before. * At least, nodes should be hot in our cache. */ parent = NULL; write_seqlock_bh(&base->lock); gc_cnt = 0; p = lookup(daddr, base, seq, gc_stack, &gc_cnt, &parent, &pp); if (!p) { p = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); if (p) { p->daddr = *daddr; p->dtime = (__u32)jiffies; refcount_set(&p->refcnt, 1); atomic_set(&p->rid, 0); p->metrics[RTAX_LOCK-1] = INETPEER_METRICS_NEW; p->rate_tokens = 0; p->n_redirects = 0; /* 60*HZ is arbitrary, but chosen enough high so that the first * calculation of tokens is at its maximum. */ p->rate_last = jiffies - 60*HZ; rb_link_node(&p->rb_node, parent, pp); rb_insert_color(&p->rb_node, &base->rb_root); base->total++; } } if (gc_cnt) inet_peer_gc(base, gc_stack, gc_cnt); write_sequnlock_bh(&base->lock); return p; } EXPORT_IPV6_MOD_GPL(inet_getpeer); void inet_putpeer(struct inet_peer *p) { if (refcount_dec_and_test(&p->refcnt)) kfree_rcu(p, rcu); } /* * Check transmit rate limitation for given message. * The rate information is held in the inet_peer entries now. * This function is generic and could be used for other purposes * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. * * Note that the same inet_peer fields are modified by functions in * route.c too, but these work for packet destinations while xrlim_allow * works for icmp destinations. This means the rate limiting information * for one "ip object" is shared - and these ICMPs are twice limited: * by source and by destination. * * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate * SHOULD allow setting of rate limits * * Shared between ICMPv4 and ICMPv6. */ #define XRLIM_BURST_FACTOR 6 bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) { unsigned long now, token, otoken, delta; bool rc = false; if (!peer) return true; token = otoken = READ_ONCE(peer->rate_tokens); now = jiffies; delta = now - READ_ONCE(peer->rate_last); if (delta) { WRITE_ONCE(peer->rate_last, now); token += delta; if (token > XRLIM_BURST_FACTOR * timeout) token = XRLIM_BURST_FACTOR * timeout; } if (token >= timeout) { token -= timeout; rc = true; } if (token != otoken) WRITE_ONCE(peer->rate_tokens, token); return rc; } EXPORT_IPV6_MOD(inet_peer_xrlim_allow); void inetpeer_invalidate_tree(struct inet_peer_base *base) { struct rb_node *p = rb_first(&base->rb_root); while (p) { struct inet_peer *peer = rb_entry(p, struct inet_peer, rb_node); p = rb_next(p); rb_erase(&peer->rb_node, &base->rb_root); inet_putpeer(peer); cond_resched(); } base->total = 0; } EXPORT_IPV6_MOD(inetpeer_invalidate_tree);
2 101 97 85 85 85 85 85 85 85 85 85 85 85 85 85 85 2 3 43 2 23 31 8 25 9 9 9 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 /* * net/tipc/link.c: TIPC link code * * Copyright (c) 1996-2007, 2012-2016, Ericsson AB * Copyright (c) 2004-2007, 2010-2013, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "subscr.h" #include "link.h" #include "bcast.h" #include "socket.h" #include "name_distr.h" #include "discover.h" #include "netlink.h" #include "monitor.h" #include "trace.h" #include "crypto.h" #include <linux/pkt_sched.h> struct tipc_stats { u32 sent_pkts; u32 recv_pkts; u32 sent_states; u32 recv_states; u32 sent_probes; u32 recv_probes; u32 sent_nacks; u32 recv_nacks; u32 sent_acks; u32 sent_bundled; u32 sent_bundles; u32 recv_bundled; u32 recv_bundles; u32 retransmitted; u32 sent_fragmented; u32 sent_fragments; u32 recv_fragmented; u32 recv_fragments; u32 link_congs; /* # port sends blocked by congestion */ u32 deferred_recv; u32 duplicates; u32 max_queue_sz; /* send queue size high water mark */ u32 accu_queue_sz; /* used for send queue size profiling */ u32 queue_sz_counts; /* used for send queue size profiling */ u32 msg_length_counts; /* used for message length profiling */ u32 msg_lengths_total; /* used for message length profiling */ u32 msg_length_profile[7]; /* used for msg. length profiling */ }; /** * struct tipc_link - TIPC link data structure * @addr: network address of link's peer node * @name: link name character string * @net: pointer to namespace struct * @peer_session: link session # being used by peer end of link * @peer_bearer_id: bearer id used by link's peer endpoint * @bearer_id: local bearer id used by link * @tolerance: minimum link continuity loss needed to reset link [in ms] * @abort_limit: # of unacknowledged continuity probes needed to reset link * @state: current state of link FSM * @peer_caps: bitmap describing capabilities of peer node * @silent_intv_cnt: # of timer intervals without any reception from peer * @priority: current link priority * @net_plane: current link network plane ('A' through 'H') * @mon_state: cookie with information needed by link monitor * @mtu: current maximum packet size for this link * @advertised_mtu: advertised own mtu when link is being established * @backlogq: queue for messages waiting to be sent * @ackers: # of peers that needs to ack each packet before it can be released * @acked: # last packet acked by a certain peer. Used for broadcast. * @rcv_nxt: next sequence number to expect for inbound messages * @inputq: buffer queue for messages to be delivered upwards * @namedq: buffer queue for name table messages to be delivered upwards * @wakeupq: linked list of wakeup msgs waiting for link congestion to abate * @reasm_buf: head of partially reassembled inbound message fragments * @stats: collects statistics regarding link activity * @session: session to be used by link * @snd_nxt_state: next send seq number * @rcv_nxt_state: next rcv seq number * @in_session: have received ACTIVATE_MSG from peer * @active: link is active * @if_name: associated interface name * @rst_cnt: link reset counter * @drop_point: seq number for failover handling (FIXME) * @failover_reasm_skb: saved failover msg ptr (FIXME) * @failover_deferdq: deferred message queue for failover processing (FIXME) * @transmq: the link's transmit queue * @backlog: link's backlog by priority (importance) * @snd_nxt: next sequence number to be used * @rcv_unacked: # messages read by user, but not yet acked back to peer * @deferdq: deferred receive queue * @window: sliding window size for congestion handling * @min_win: minimal send window to be used by link * @ssthresh: slow start threshold for congestion handling * @max_win: maximal send window to be used by link * @cong_acks: congestion acks for congestion avoidance (FIXME) * @checkpoint: seq number for congestion window size handling * @reasm_tnlmsg: fragmentation/reassembly area for tunnel protocol message * @last_gap: last gap ack blocks for bcast (FIXME) * @last_ga: ptr to gap ack blocks * @bc_rcvlink: the peer specific link used for broadcast reception * @bc_sndlink: the namespace global link used for broadcast sending * @nack_state: bcast nack state * @bc_peer_is_up: peer has acked the bcast init msg */ struct tipc_link { u32 addr; char name[TIPC_MAX_LINK_NAME]; struct net *net; /* Management and link supervision data */ u16 peer_session; u16 session; u16 snd_nxt_state; u16 rcv_nxt_state; u32 peer_bearer_id; u32 bearer_id; u32 tolerance; u32 abort_limit; u32 state; u16 peer_caps; bool in_session; bool active; u32 silent_intv_cnt; char if_name[TIPC_MAX_IF_NAME]; u32 priority; char net_plane; struct tipc_mon_state mon_state; u16 rst_cnt; /* Failover/synch */ u16 drop_point; struct sk_buff *failover_reasm_skb; struct sk_buff_head failover_deferdq; /* Max packet negotiation */ u16 mtu; u16 advertised_mtu; /* Sending */ struct sk_buff_head transmq; struct sk_buff_head backlogq; struct { u16 len; u16 limit; struct sk_buff *target_bskb; } backlog[5]; u16 snd_nxt; /* Reception */ u16 rcv_nxt; u32 rcv_unacked; struct sk_buff_head deferdq; struct sk_buff_head *inputq; struct sk_buff_head *namedq; /* Congestion handling */ struct sk_buff_head wakeupq; u16 window; u16 min_win; u16 ssthresh; u16 max_win; u16 cong_acks; u16 checkpoint; /* Fragmentation/reassembly */ struct sk_buff *reasm_buf; struct sk_buff *reasm_tnlmsg; /* Broadcast */ u16 ackers; u16 acked; u16 last_gap; struct tipc_gap_ack_blks *last_ga; struct tipc_link *bc_rcvlink; struct tipc_link *bc_sndlink; u8 nack_state; bool bc_peer_is_up; /* Statistics */ struct tipc_stats stats; }; /* * Error message prefixes */ static const char *link_co_err = "Link tunneling error, "; static const char *link_rst_msg = "Resetting link "; /* Send states for broadcast NACKs */ enum { BC_NACK_SND_CONDITIONAL, BC_NACK_SND_UNCONDITIONAL, BC_NACK_SND_SUPPRESS, }; #define TIPC_BC_RETR_LIM (jiffies + msecs_to_jiffies(10)) #define TIPC_UC_RETR_TIME (jiffies + msecs_to_jiffies(1)) /* Link FSM states: */ enum { LINK_ESTABLISHED = 0xe, LINK_ESTABLISHING = 0xe << 4, LINK_RESET = 0x1 << 8, LINK_RESETTING = 0x2 << 12, LINK_PEER_RESET = 0xd << 16, LINK_FAILINGOVER = 0xf << 20, LINK_SYNCHING = 0xc << 24 }; static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *xmitq); static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, bool probe_reply, u16 rcvgap, int tolerance, int priority, struct sk_buff_head *xmitq); static void link_print(struct tipc_link *l, const char *str); static int tipc_link_build_nack_msg(struct tipc_link *l, struct sk_buff_head *xmitq); static void tipc_link_build_bc_init_msg(struct tipc_link *l, struct sk_buff_head *xmitq); static u8 __tipc_build_gap_ack_blks(struct tipc_gap_ack_blks *ga, struct tipc_link *l, u8 start_index); static u16 tipc_build_gap_ack_blks(struct tipc_link *l, struct tipc_msg *hdr); static int tipc_link_advance_transmq(struct tipc_link *l, struct tipc_link *r, u16 acked, u16 gap, struct tipc_gap_ack_blks *ga, struct sk_buff_head *xmitq, bool *retransmitted, int *rc); static void tipc_link_update_cwin(struct tipc_link *l, int released, bool retransmitted); /* * Simple non-static link routines (i.e. referenced outside this file) */ bool tipc_link_is_up(struct tipc_link *l) { return l->state & (LINK_ESTABLISHED | LINK_SYNCHING); } bool tipc_link_peer_is_down(struct tipc_link *l) { return l->state == LINK_PEER_RESET; } bool tipc_link_is_reset(struct tipc_link *l) { return l->state & (LINK_RESET | LINK_FAILINGOVER | LINK_ESTABLISHING); } bool tipc_link_is_establishing(struct tipc_link *l) { return l->state == LINK_ESTABLISHING; } bool tipc_link_is_synching(struct tipc_link *l) { return l->state == LINK_SYNCHING; } bool tipc_link_is_failingover(struct tipc_link *l) { return l->state == LINK_FAILINGOVER; } bool tipc_link_is_blocked(struct tipc_link *l) { return l->state & (LINK_RESETTING | LINK_PEER_RESET | LINK_FAILINGOVER); } static bool link_is_bc_sndlink(struct tipc_link *l) { return !l->bc_sndlink; } static bool link_is_bc_rcvlink(struct tipc_link *l) { return ((l->bc_rcvlink == l) && !link_is_bc_sndlink(l)); } void tipc_link_set_active(struct tipc_link *l, bool active) { l->active = active; } u32 tipc_link_id(struct tipc_link *l) { return l->peer_bearer_id << 16 | l->bearer_id; } int tipc_link_min_win(struct tipc_link *l) { return l->min_win; } int tipc_link_max_win(struct tipc_link *l) { return l->max_win; } int tipc_link_prio(struct tipc_link *l) { return l->priority; } unsigned long tipc_link_tolerance(struct tipc_link *l) { return l->tolerance; } struct sk_buff_head *tipc_link_inputq(struct tipc_link *l) { return l->inputq; } char tipc_link_plane(struct tipc_link *l) { return l->net_plane; } struct net *tipc_link_net(struct tipc_link *l) { return l->net; } void tipc_link_update_caps(struct tipc_link *l, u16 capabilities) { l->peer_caps = capabilities; } void tipc_link_add_bc_peer(struct tipc_link *snd_l, struct tipc_link *uc_l, struct sk_buff_head *xmitq) { struct tipc_link *rcv_l = uc_l->bc_rcvlink; snd_l->ackers++; rcv_l->acked = snd_l->snd_nxt - 1; snd_l->state = LINK_ESTABLISHED; tipc_link_build_bc_init_msg(uc_l, xmitq); } void tipc_link_remove_bc_peer(struct tipc_link *snd_l, struct tipc_link *rcv_l, struct sk_buff_head *xmitq) { u16 ack = snd_l->snd_nxt - 1; snd_l->ackers--; rcv_l->bc_peer_is_up = true; rcv_l->state = LINK_ESTABLISHED; tipc_link_bc_ack_rcv(rcv_l, ack, 0, NULL, xmitq, NULL); trace_tipc_link_reset(rcv_l, TIPC_DUMP_ALL, "bclink removed!"); tipc_link_reset(rcv_l); rcv_l->state = LINK_RESET; if (!snd_l->ackers) { trace_tipc_link_reset(snd_l, TIPC_DUMP_ALL, "zero ackers!"); tipc_link_reset(snd_l); snd_l->state = LINK_RESET; __skb_queue_purge(xmitq); } } int tipc_link_bc_peers(struct tipc_link *l) { return l->ackers; } static u16 link_bc_rcv_gap(struct tipc_link *l) { struct sk_buff *skb = skb_peek(&l->deferdq); u16 gap = 0; if (more(l->snd_nxt, l->rcv_nxt)) gap = l->snd_nxt - l->rcv_nxt; if (skb) gap = buf_seqno(skb) - l->rcv_nxt; return gap; } void tipc_link_set_mtu(struct tipc_link *l, int mtu) { l->mtu = mtu; } int tipc_link_mtu(struct tipc_link *l) { return l->mtu; } int tipc_link_mss(struct tipc_link *l) { #ifdef CONFIG_TIPC_CRYPTO return l->mtu - INT_H_SIZE - EMSG_OVERHEAD; #else return l->mtu - INT_H_SIZE; #endif } u16 tipc_link_rcv_nxt(struct tipc_link *l) { return l->rcv_nxt; } u16 tipc_link_acked(struct tipc_link *l) { return l->acked; } char *tipc_link_name(struct tipc_link *l) { return l->name; } u32 tipc_link_state(struct tipc_link *l) { return l->state; } /** * tipc_link_create - create a new link * @net: pointer to associated network namespace * @if_name: associated interface name * @bearer_id: id (index) of associated bearer * @tolerance: link tolerance to be used by link * @net_plane: network plane (A,B,c..) this link belongs to * @mtu: mtu to be advertised by link * @priority: priority to be used by link * @min_win: minimal send window to be used by link * @max_win: maximal send window to be used by link * @session: session to be used by link * @peer: node id of peer node * @peer_caps: bitmap describing peer node capabilities * @bc_sndlink: the namespace global link used for broadcast sending * @bc_rcvlink: the peer specific link used for broadcast reception * @inputq: queue to put messages ready for delivery * @namedq: queue to put binding table update messages ready for delivery * @link: return value, pointer to put the created link * @self: local unicast link id * @peer_id: 128-bit ID of peer * * Return: true if link was created, otherwise false */ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, int tolerance, char net_plane, u32 mtu, int priority, u32 min_win, u32 max_win, u32 session, u32 self, u32 peer, u8 *peer_id, u16 peer_caps, struct tipc_link *bc_sndlink, struct tipc_link *bc_rcvlink, struct sk_buff_head *inputq, struct sk_buff_head *namedq, struct tipc_link **link) { char peer_str[NODE_ID_STR_LEN] = {0,}; char self_str[NODE_ID_STR_LEN] = {0,}; struct tipc_link *l; l = kzalloc(sizeof(*l), GFP_ATOMIC); if (!l) return false; *link = l; l->session = session; /* Set link name for unicast links only */ if (peer_id) { tipc_nodeid2string(self_str, tipc_own_id(net)); if (strlen(self_str) > 16) sprintf(self_str, "%x", self); tipc_nodeid2string(peer_str, peer_id); if (strlen(peer_str) > 16) sprintf(peer_str, "%x", peer); } /* Peer i/f name will be completed by reset/activate message */ snprintf(l->name, sizeof(l->name), "%s:%s-%s:unknown", self_str, if_name, peer_str); strcpy(l->if_name, if_name); l->addr = peer; l->peer_caps = peer_caps; l->net = net; l->in_session = false; l->bearer_id = bearer_id; l->tolerance = tolerance; if (bc_rcvlink) bc_rcvlink->tolerance = tolerance; l->net_plane = net_plane; l->advertised_mtu = mtu; l->mtu = mtu; l->priority = priority; tipc_link_set_queue_limits(l, min_win, max_win); l->ackers = 1; l->bc_sndlink = bc_sndlink; l->bc_rcvlink = bc_rcvlink; l->inputq = inputq; l->namedq = namedq; l->state = LINK_RESETTING; __skb_queue_head_init(&l->transmq); __skb_queue_head_init(&l->backlogq); __skb_queue_head_init(&l->deferdq); __skb_queue_head_init(&l->failover_deferdq); skb_queue_head_init(&l->wakeupq); skb_queue_head_init(l->inputq); return true; } /** * tipc_link_bc_create - create new link to be used for broadcast * @net: pointer to associated network namespace * @mtu: mtu to be used initially if no peers * @min_win: minimal send window to be used by link * @max_win: maximal send window to be used by link * @inputq: queue to put messages ready for delivery * @namedq: queue to put binding table update messages ready for delivery * @link: return value, pointer to put the created link * @ownnode: identity of own node * @peer: node id of peer node * @peer_id: 128-bit ID of peer * @peer_caps: bitmap describing peer node capabilities * @bc_sndlink: the namespace global link used for broadcast sending * * Return: true if link was created, otherwise false */ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, u8 *peer_id, int mtu, u32 min_win, u32 max_win, u16 peer_caps, struct sk_buff_head *inputq, struct sk_buff_head *namedq, struct tipc_link *bc_sndlink, struct tipc_link **link) { struct tipc_link *l; if (!tipc_link_create(net, "", MAX_BEARERS, 0, 'Z', mtu, 0, min_win, max_win, 0, ownnode, peer, NULL, peer_caps, bc_sndlink, NULL, inputq, namedq, link)) return false; l = *link; if (peer_id) { char peer_str[NODE_ID_STR_LEN] = {0,}; tipc_nodeid2string(peer_str, peer_id); if (strlen(peer_str) > 16) sprintf(peer_str, "%x", peer); /* Broadcast receiver link name: "broadcast-link:<peer>" */ snprintf(l->name, sizeof(l->name), "%s:%s", tipc_bclink_name, peer_str); } else { strcpy(l->name, tipc_bclink_name); } trace_tipc_link_reset(l, TIPC_DUMP_ALL, "bclink created!"); tipc_link_reset(l); l->state = LINK_RESET; l->ackers = 0; l->bc_rcvlink = l; /* Broadcast send link is always up */ if (link_is_bc_sndlink(l)) l->state = LINK_ESTABLISHED; /* Disable replicast if even a single peer doesn't support it */ if (link_is_bc_rcvlink(l) && !(peer_caps & TIPC_BCAST_RCAST)) tipc_bcast_toggle_rcast(net, false); return true; } /** * tipc_link_fsm_evt - link finite state machine * @l: pointer to link * @evt: state machine event to be processed */ int tipc_link_fsm_evt(struct tipc_link *l, int evt) { int rc = 0; int old_state = l->state; switch (l->state) { case LINK_RESETTING: switch (evt) { case LINK_PEER_RESET_EVT: l->state = LINK_PEER_RESET; break; case LINK_RESET_EVT: l->state = LINK_RESET; break; case LINK_FAILURE_EVT: case LINK_FAILOVER_BEGIN_EVT: case LINK_ESTABLISH_EVT: case LINK_FAILOVER_END_EVT: case LINK_SYNCH_BEGIN_EVT: case LINK_SYNCH_END_EVT: default: goto illegal_evt; } break; case LINK_RESET: switch (evt) { case LINK_PEER_RESET_EVT: l->state = LINK_ESTABLISHING; break; case LINK_FAILOVER_BEGIN_EVT: l->state = LINK_FAILINGOVER; break; case LINK_FAILURE_EVT: case LINK_RESET_EVT: case LINK_ESTABLISH_EVT: case LINK_FAILOVER_END_EVT: break; case LINK_SYNCH_BEGIN_EVT: case LINK_SYNCH_END_EVT: default: goto illegal_evt; } break; case LINK_PEER_RESET: switch (evt) { case LINK_RESET_EVT: l->state = LINK_ESTABLISHING; break; case LINK_PEER_RESET_EVT: case LINK_ESTABLISH_EVT: case LINK_FAILURE_EVT: break; case LINK_SYNCH_BEGIN_EVT: case LINK_SYNCH_END_EVT: case LINK_FAILOVER_BEGIN_EVT: case LINK_FAILOVER_END_EVT: default: goto illegal_evt; } break; case LINK_FAILINGOVER: switch (evt) { case LINK_FAILOVER_END_EVT: l->state = LINK_RESET; break; case LINK_PEER_RESET_EVT: case LINK_RESET_EVT: case LINK_ESTABLISH_EVT: case LINK_FAILURE_EVT: break; case LINK_FAILOVER_BEGIN_EVT: case LINK_SYNCH_BEGIN_EVT: case LINK_SYNCH_END_EVT: default: goto illegal_evt; } break; case LINK_ESTABLISHING: switch (evt) { case LINK_ESTABLISH_EVT: l->state = LINK_ESTABLISHED; break; case LINK_FAILOVER_BEGIN_EVT: l->state = LINK_FAILINGOVER; break; case LINK_RESET_EVT: l->state = LINK_RESET; break; case LINK_FAILURE_EVT: case LINK_PEER_RESET_EVT: case LINK_SYNCH_BEGIN_EVT: case LINK_FAILOVER_END_EVT: break; case LINK_SYNCH_END_EVT: default: goto illegal_evt; } break; case LINK_ESTABLISHED: switch (evt) { case LINK_PEER_RESET_EVT: l->state = LINK_PEER_RESET; rc |= TIPC_LINK_DOWN_EVT; break; case LINK_FAILURE_EVT: l->state = LINK_RESETTING; rc |= TIPC_LINK_DOWN_EVT; break; case LINK_RESET_EVT: l->state = LINK_RESET; break; case LINK_ESTABLISH_EVT: case LINK_SYNCH_END_EVT: break; case LINK_SYNCH_BEGIN_EVT: l->state = LINK_SYNCHING; break; case LINK_FAILOVER_BEGIN_EVT: case LINK_FAILOVER_END_EVT: default: goto illegal_evt; } break; case LINK_SYNCHING: switch (evt) { case LINK_PEER_RESET_EVT: l->state = LINK_PEER_RESET; rc |= TIPC_LINK_DOWN_EVT; break; case LINK_FAILURE_EVT: l->state = LINK_RESETTING; rc |= TIPC_LINK_DOWN_EVT; break; case LINK_RESET_EVT: l->state = LINK_RESET; break; case LINK_ESTABLISH_EVT: case LINK_SYNCH_BEGIN_EVT: break; case LINK_SYNCH_END_EVT: l->state = LINK_ESTABLISHED; break; case LINK_FAILOVER_BEGIN_EVT: case LINK_FAILOVER_END_EVT: default: goto illegal_evt; } break; default: pr_err("Unknown FSM state %x in %s\n", l->state, l->name); } trace_tipc_link_fsm(l->name, old_state, l->state, evt); return rc; illegal_evt: pr_err("Illegal FSM event %x in state %x on link %s\n", evt, l->state, l->name); trace_tipc_link_fsm(l->name, old_state, l->state, evt); return rc; } /* link_profile_stats - update statistical profiling of traffic */ static void link_profile_stats(struct tipc_link *l) { struct sk_buff *skb; struct tipc_msg *msg; int length; /* Update counters used in statistical profiling of send traffic */ l->stats.accu_queue_sz += skb_queue_len(&l->transmq); l->stats.queue_sz_counts++; skb = skb_peek(&l->transmq); if (!skb) return; msg = buf_msg(skb); length = msg_size(msg); if (msg_user(msg) == MSG_FRAGMENTER) { if (msg_type(msg) != FIRST_FRAGMENT) return; length = msg_size(msg_inner_hdr(msg)); } l->stats.msg_lengths_total += length; l->stats.msg_length_counts++; if (length <= 64) l->stats.msg_length_profile[0]++; else if (length <= 256) l->stats.msg_length_profile[1]++; else if (length <= 1024) l->stats.msg_length_profile[2]++; else if (length <= 4096) l->stats.msg_length_profile[3]++; else if (length <= 16384) l->stats.msg_length_profile[4]++; else if (length <= 32768) l->stats.msg_length_profile[5]++; else l->stats.msg_length_profile[6]++; } /** * tipc_link_too_silent - check if link is "too silent" * @l: tipc link to be checked * * Return: true if the link 'silent_intv_cnt' is about to reach the * 'abort_limit' value, otherwise false */ bool tipc_link_too_silent(struct tipc_link *l) { return (l->silent_intv_cnt + 2 > l->abort_limit); } /* tipc_link_timeout - perform periodic task as instructed from node timeout */ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq) { int mtyp = 0; int rc = 0; bool state = false; bool probe = false; bool setup = false; u16 bc_snt = l->bc_sndlink->snd_nxt - 1; u16 bc_acked = l->bc_rcvlink->acked; struct tipc_mon_state *mstate = &l->mon_state; trace_tipc_link_timeout(l, TIPC_DUMP_NONE, " "); trace_tipc_link_too_silent(l, TIPC_DUMP_ALL, " "); switch (l->state) { case LINK_ESTABLISHED: case LINK_SYNCHING: mtyp = STATE_MSG; link_profile_stats(l); tipc_mon_get_state(l->net, l->addr, mstate, l->bearer_id); if (mstate->reset || (l->silent_intv_cnt > l->abort_limit)) return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); state = bc_acked != bc_snt; state |= l->bc_rcvlink->rcv_unacked; state |= l->rcv_unacked; state |= !skb_queue_empty(&l->transmq); probe = mstate->probing; probe |= l->silent_intv_cnt; if (probe || mstate->monitoring) l->silent_intv_cnt++; probe |= !skb_queue_empty(&l->deferdq); if (l->snd_nxt == l->checkpoint) { tipc_link_update_cwin(l, 0, 0); probe = true; } l->checkpoint = l->snd_nxt; break; case LINK_RESET: setup = l->rst_cnt++ <= 4; setup |= !(l->rst_cnt % 16); mtyp = RESET_MSG; break; case LINK_ESTABLISHING: setup = true; mtyp = ACTIVATE_MSG; break; case LINK_PEER_RESET: case LINK_RESETTING: case LINK_FAILINGOVER: break; default: break; } if (state || probe || setup) tipc_link_build_proto_msg(l, mtyp, probe, 0, 0, 0, 0, xmitq); return rc; } /** * link_schedule_user - schedule a message sender for wakeup after congestion * @l: congested link * @hdr: header of message that is being sent * Create pseudo msg to send back to user when congestion abates */ static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr) { u32 dnode = tipc_own_addr(l->net); u32 dport = msg_origport(hdr); struct sk_buff *skb; /* Create and schedule wakeup pseudo message */ skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0, dnode, l->addr, dport, 0, 0); if (!skb) return -ENOBUFS; msg_set_dest_droppable(buf_msg(skb), true); TIPC_SKB_CB(skb)->chain_imp = msg_importance(hdr); skb_queue_tail(&l->wakeupq, skb); l->stats.link_congs++; trace_tipc_link_conges(l, TIPC_DUMP_ALL, "wakeup scheduled!"); return -ELINKCONG; } /** * link_prepare_wakeup - prepare users for wakeup after congestion * @l: congested link * Wake up a number of waiting users, as permitted by available space * in the send queue */ static void link_prepare_wakeup(struct tipc_link *l) { struct sk_buff_head *wakeupq = &l->wakeupq; struct sk_buff_head *inputq = l->inputq; struct sk_buff *skb, *tmp; struct sk_buff_head tmpq; int avail[5] = {0,}; int imp = 0; __skb_queue_head_init(&tmpq); for (; imp <= TIPC_SYSTEM_IMPORTANCE; imp++) avail[imp] = l->backlog[imp].limit - l->backlog[imp].len; skb_queue_walk_safe(wakeupq, skb, tmp) { imp = TIPC_SKB_CB(skb)->chain_imp; if (avail[imp] <= 0) continue; avail[imp]--; __skb_unlink(skb, wakeupq); __skb_queue_tail(&tmpq, skb); } spin_lock_bh(&inputq->lock); skb_queue_splice_tail(&tmpq, inputq); spin_unlock_bh(&inputq->lock); } /** * tipc_link_set_skb_retransmit_time - set the time at which retransmission of * the given skb should be next attempted * @skb: skb to set a future retransmission time for * @l: link the skb will be transmitted on */ static void tipc_link_set_skb_retransmit_time(struct sk_buff *skb, struct tipc_link *l) { if (link_is_bc_sndlink(l)) TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; else TIPC_SKB_CB(skb)->nxt_retr = TIPC_UC_RETR_TIME; } void tipc_link_reset(struct tipc_link *l) { struct sk_buff_head list; u32 imp; __skb_queue_head_init(&list); l->in_session = false; /* Force re-synch of peer session number before establishing */ l->peer_session--; l->session++; l->mtu = l->advertised_mtu; spin_lock_bh(&l->wakeupq.lock); skb_queue_splice_init(&l->wakeupq, &list); spin_unlock_bh(&l->wakeupq.lock); spin_lock_bh(&l->inputq->lock); skb_queue_splice_init(&list, l->inputq); spin_unlock_bh(&l->inputq->lock); __skb_queue_purge(&l->transmq); __skb_queue_purge(&l->deferdq); __skb_queue_purge(&l->backlogq); __skb_queue_purge(&l->failover_deferdq); for (imp = 0; imp <= TIPC_SYSTEM_IMPORTANCE; imp++) { l->backlog[imp].len = 0; l->backlog[imp].target_bskb = NULL; } kfree_skb(l->reasm_buf); kfree_skb(l->reasm_tnlmsg); kfree_skb(l->failover_reasm_skb); l->reasm_buf = NULL; l->reasm_tnlmsg = NULL; l->failover_reasm_skb = NULL; l->rcv_unacked = 0; l->snd_nxt = 1; l->rcv_nxt = 1; l->snd_nxt_state = 1; l->rcv_nxt_state = 1; l->acked = 0; l->last_gap = 0; kfree(l->last_ga); l->last_ga = NULL; l->silent_intv_cnt = 0; l->rst_cnt = 0; l->bc_peer_is_up = false; memset(&l->mon_state, 0, sizeof(l->mon_state)); tipc_link_reset_stats(l); } /** * tipc_link_xmit(): enqueue buffer list according to queue situation * @l: link to use * @list: chain of buffers containing message * @xmitq: returned list of packets to be sent by caller * * Consumes the buffer chain. * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted * Return: 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS */ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, struct sk_buff_head *xmitq) { struct sk_buff_head *backlogq = &l->backlogq; struct sk_buff_head *transmq = &l->transmq; struct sk_buff *skb, *_skb; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u16 ack = l->rcv_nxt - 1; u16 seqno = l->snd_nxt; int pkt_cnt = skb_queue_len(list); unsigned int mss = tipc_link_mss(l); unsigned int cwin = l->window; unsigned int mtu = l->mtu; struct tipc_msg *hdr; bool new_bundle; int rc = 0; int imp; if (pkt_cnt <= 0) return 0; hdr = buf_msg(skb_peek(list)); if (unlikely(msg_size(hdr) > mtu)) { pr_warn("Too large msg, purging xmit list %d %d %d %d %d!\n", skb_queue_len(list), msg_user(hdr), msg_type(hdr), msg_size(hdr), mtu); __skb_queue_purge(list); return -EMSGSIZE; } imp = msg_importance(hdr); /* Allow oversubscription of one data msg per source at congestion */ if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) { if (imp == TIPC_SYSTEM_IMPORTANCE) { pr_warn("%s<%s>, link overflow", link_rst_msg, l->name); return -ENOBUFS; } rc = link_schedule_user(l, hdr); } if (pkt_cnt > 1) { l->stats.sent_fragmented++; l->stats.sent_fragments += pkt_cnt; } /* Prepare each packet for sending, and add to relevant queue: */ while ((skb = __skb_dequeue(list))) { if (likely(skb_queue_len(transmq) < cwin)) { hdr = buf_msg(skb); msg_set_seqno(hdr, seqno); msg_set_ack(hdr, ack); msg_set_bcast_ack(hdr, bc_ack); _skb = skb_clone(skb, GFP_ATOMIC); if (!_skb) { kfree_skb(skb); __skb_queue_purge(list); return -ENOBUFS; } __skb_queue_tail(transmq, skb); tipc_link_set_skb_retransmit_time(skb, l); __skb_queue_tail(xmitq, _skb); TIPC_SKB_CB(skb)->ackers = l->ackers; l->rcv_unacked = 0; l->stats.sent_pkts++; seqno++; continue; } if (tipc_msg_try_bundle(l->backlog[imp].target_bskb, &skb, mss, l->addr, &new_bundle)) { if (skb) { /* Keep a ref. to the skb for next try */ l->backlog[imp].target_bskb = skb; l->backlog[imp].len++; __skb_queue_tail(backlogq, skb); } else { if (new_bundle) { l->stats.sent_bundles++; l->stats.sent_bundled++; } l->stats.sent_bundled++; } continue; } l->backlog[imp].target_bskb = NULL; l->backlog[imp].len += (1 + skb_queue_len(list)); __skb_queue_tail(backlogq, skb); skb_queue_splice_tail_init(list, backlogq); } l->snd_nxt = seqno; return rc; } static void tipc_link_update_cwin(struct tipc_link *l, int released, bool retransmitted) { int bklog_len = skb_queue_len(&l->backlogq); struct sk_buff_head *txq = &l->transmq; int txq_len = skb_queue_len(txq); u16 cwin = l->window; /* Enter fast recovery */ if (unlikely(retransmitted)) { l->ssthresh = max_t(u16, l->window / 2, 300); l->window = min_t(u16, l->ssthresh, l->window); return; } /* Enter slow start */ if (unlikely(!released)) { l->ssthresh = max_t(u16, l->window / 2, 300); l->window = l->min_win; return; } /* Don't increase window if no pressure on the transmit queue */ if (txq_len + bklog_len < cwin) return; /* Don't increase window if there are holes the transmit queue */ if (txq_len && l->snd_nxt - buf_seqno(skb_peek(txq)) != txq_len) return; l->cong_acks += released; /* Slow start */ if (cwin <= l->ssthresh) { l->window = min_t(u16, cwin + released, l->max_win); return; } /* Congestion avoidance */ if (l->cong_acks < cwin) return; l->window = min_t(u16, ++cwin, l->max_win); l->cong_acks = 0; } static void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq) { u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; struct sk_buff_head *txq = &l->transmq; struct sk_buff *skb, *_skb; u16 ack = l->rcv_nxt - 1; u16 seqno = l->snd_nxt; struct tipc_msg *hdr; u16 cwin = l->window; u32 imp; while (skb_queue_len(txq) < cwin) { skb = skb_peek(&l->backlogq); if (!skb) break; _skb = skb_clone(skb, GFP_ATOMIC); if (!_skb) break; __skb_dequeue(&l->backlogq); hdr = buf_msg(skb); imp = msg_importance(hdr); l->backlog[imp].len--; if (unlikely(skb == l->backlog[imp].target_bskb)) l->backlog[imp].target_bskb = NULL; __skb_queue_tail(&l->transmq, skb); tipc_link_set_skb_retransmit_time(skb, l); __skb_queue_tail(xmitq, _skb); TIPC_SKB_CB(skb)->ackers = l->ackers; msg_set_seqno(hdr, seqno); msg_set_ack(hdr, ack); msg_set_bcast_ack(hdr, bc_ack); l->rcv_unacked = 0; l->stats.sent_pkts++; seqno++; } l->snd_nxt = seqno; } /** * link_retransmit_failure() - Detect repeated retransmit failures * @l: tipc link sender * @r: tipc link receiver (= l in case of unicast) * @rc: returned code * * Return: true if the repeated retransmit failures happens, otherwise * false */ static bool link_retransmit_failure(struct tipc_link *l, struct tipc_link *r, int *rc) { struct sk_buff *skb = skb_peek(&l->transmq); struct tipc_msg *hdr; if (!skb) return false; if (!TIPC_SKB_CB(skb)->retr_cnt) return false; if (!time_after(jiffies, TIPC_SKB_CB(skb)->retr_stamp + msecs_to_jiffies(r->tolerance * 10))) return false; hdr = buf_msg(skb); if (link_is_bc_sndlink(l) && !less(r->acked, msg_seqno(hdr))) return false; pr_warn("Retransmission failure on link <%s>\n", l->name); link_print(l, "State of link "); pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n", msg_user(hdr), msg_type(hdr), msg_size(hdr), msg_errcode(hdr)); pr_info("sqno %u, prev: %x, dest: %x\n", msg_seqno(hdr), msg_prevnode(hdr), msg_destnode(hdr)); pr_info("retr_stamp %d, retr_cnt %d\n", jiffies_to_msecs(TIPC_SKB_CB(skb)->retr_stamp), TIPC_SKB_CB(skb)->retr_cnt); trace_tipc_list_dump(&l->transmq, true, "retrans failure!"); trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!"); trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!"); if (link_is_bc_sndlink(l)) { r->state = LINK_RESET; *rc |= TIPC_LINK_DOWN_EVT; } else { *rc |= tipc_link_fsm_evt(l, LINK_FAILURE_EVT); } return true; } /* tipc_data_input - deliver data and name distr msgs to upper layer * * Consumes buffer if message is of right type * Node lock must be held */ static bool tipc_data_input(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *inputq) { struct sk_buff_head *mc_inputq = l->bc_rcvlink->inputq; struct tipc_msg *hdr = buf_msg(skb); switch (msg_user(hdr)) { case TIPC_LOW_IMPORTANCE: case TIPC_MEDIUM_IMPORTANCE: case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: if (unlikely(msg_in_group(hdr) || msg_mcast(hdr))) { skb_queue_tail(mc_inputq, skb); return true; } fallthrough; case CONN_MANAGER: skb_queue_tail(inputq, skb); return true; case GROUP_PROTOCOL: skb_queue_tail(mc_inputq, skb); return true; case NAME_DISTRIBUTOR: l->bc_rcvlink->state = LINK_ESTABLISHED; skb_queue_tail(l->namedq, skb); return true; case MSG_BUNDLER: case TUNNEL_PROTOCOL: case MSG_FRAGMENTER: case BCAST_PROTOCOL: return false; #ifdef CONFIG_TIPC_CRYPTO case MSG_CRYPTO: if (sysctl_tipc_key_exchange_enabled && TIPC_SKB_CB(skb)->decrypted) { tipc_crypto_msg_rcv(l->net, skb); return true; } fallthrough; #endif default: pr_warn("Dropping received illegal msg type\n"); kfree_skb(skb); return true; } } /* tipc_link_input - process packet that has passed link protocol check * * Consumes buffer */ static int tipc_link_input(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *inputq, struct sk_buff **reasm_skb) { struct tipc_msg *hdr = buf_msg(skb); struct sk_buff *iskb; struct sk_buff_head tmpq; int usr = msg_user(hdr); int pos = 0; if (usr == MSG_BUNDLER) { skb_queue_head_init(&tmpq); l->stats.recv_bundles++; l->stats.recv_bundled += msg_msgcnt(hdr); while (tipc_msg_extract(skb, &iskb, &pos)) tipc_data_input(l, iskb, &tmpq); tipc_skb_queue_splice_tail(&tmpq, inputq); return 0; } else if (usr == MSG_FRAGMENTER) { l->stats.recv_fragments++; if (tipc_buf_append(reasm_skb, &skb)) { l->stats.recv_fragmented++; tipc_data_input(l, skb, inputq); } else if (!*reasm_skb && !link_is_bc_rcvlink(l)) { pr_warn_ratelimited("Unable to build fragment list\n"); return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); } return 0; } else if (usr == BCAST_PROTOCOL) { tipc_bcast_lock(l->net); tipc_link_bc_init_rcv(l->bc_rcvlink, hdr); tipc_bcast_unlock(l->net); } kfree_skb(skb); return 0; } /* tipc_link_tnl_rcv() - receive TUNNEL_PROTOCOL message, drop or process the * inner message along with the ones in the old link's * deferdq * @l: tunnel link * @skb: TUNNEL_PROTOCOL message * @inputq: queue to put messages ready for delivery */ static int tipc_link_tnl_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *inputq) { struct sk_buff **reasm_skb = &l->failover_reasm_skb; struct sk_buff **reasm_tnlmsg = &l->reasm_tnlmsg; struct sk_buff_head *fdefq = &l->failover_deferdq; struct tipc_msg *hdr = buf_msg(skb); struct sk_buff *iskb; int ipos = 0; int rc = 0; u16 seqno; if (msg_type(hdr) == SYNCH_MSG) { kfree_skb(skb); return 0; } /* Not a fragment? */ if (likely(!msg_nof_fragms(hdr))) { if (unlikely(!tipc_msg_extract(skb, &iskb, &ipos))) { pr_warn_ratelimited("Unable to extract msg, defq: %d\n", skb_queue_len(fdefq)); return 0; } kfree_skb(skb); } else { /* Set fragment type for buf_append */ if (msg_fragm_no(hdr) == 1) msg_set_type(hdr, FIRST_FRAGMENT); else if (msg_fragm_no(hdr) < msg_nof_fragms(hdr)) msg_set_type(hdr, FRAGMENT); else msg_set_type(hdr, LAST_FRAGMENT); if (!tipc_buf_append(reasm_tnlmsg, &skb)) { /* Successful but non-complete reassembly? */ if (*reasm_tnlmsg || link_is_bc_rcvlink(l)) return 0; pr_warn_ratelimited("Unable to reassemble tunnel msg\n"); return tipc_link_fsm_evt(l, LINK_FAILURE_EVT); } iskb = skb; } do { seqno = buf_seqno(iskb); if (unlikely(less(seqno, l->drop_point))) { kfree_skb(iskb); continue; } if (unlikely(seqno != l->drop_point)) { __tipc_skb_queue_sorted(fdefq, seqno, iskb); continue; } l->drop_point++; if (!tipc_data_input(l, iskb, inputq)) rc |= tipc_link_input(l, iskb, inputq, reasm_skb); if (unlikely(rc)) break; } while ((iskb = __tipc_skb_dequeue(fdefq, l->drop_point))); return rc; } /** * tipc_get_gap_ack_blks - get Gap ACK blocks from PROTOCOL/STATE_MSG * @ga: returned pointer to the Gap ACK blocks if any * @l: the tipc link * @hdr: the PROTOCOL/STATE_MSG header * @uc: desired Gap ACK blocks type, i.e. unicast (= 1) or broadcast (= 0) * * Return: the total Gap ACK blocks size */ u16 tipc_get_gap_ack_blks(struct tipc_gap_ack_blks **ga, struct tipc_link *l, struct tipc_msg *hdr, bool uc) { struct tipc_gap_ack_blks *p; u16 sz = 0; /* Does peer support the Gap ACK blocks feature? */ if (l->peer_caps & TIPC_GAP_ACK_BLOCK) { p = (struct tipc_gap_ack_blks *)msg_data(hdr); sz = ntohs(p->len); /* Sanity check */ if (sz == struct_size(p, gacks, size_add(p->ugack_cnt, p->bgack_cnt))) { /* Good, check if the desired type exists */ if ((uc && p->ugack_cnt) || (!uc && p->bgack_cnt)) goto ok; /* Backward compatible: peer might not support bc, but uc? */ } else if (uc && sz == struct_size(p, gacks, p->ugack_cnt)) { if (p->ugack_cnt) { p->bgack_cnt = 0; goto ok; } } } /* Other cases: ignore! */ p = NULL; ok: *ga = p; return sz; } static u8 __tipc_build_gap_ack_blks(struct tipc_gap_ack_blks *ga, struct tipc_link *l, u8 start_index) { struct tipc_gap_ack *gacks = &ga->gacks[start_index]; struct sk_buff *skb = skb_peek(&l->deferdq); u16 expect, seqno = 0; u8 n = 0; if (!skb) return 0; expect = buf_seqno(skb); skb_queue_walk(&l->deferdq, skb) { seqno = buf_seqno(skb); if (unlikely(more(seqno, expect))) { gacks[n].ack = htons(expect - 1); gacks[n].gap = htons(seqno - expect); if (++n >= MAX_GAP_ACK_BLKS / 2) { pr_info_ratelimited("Gacks on %s: %d, ql: %d!\n", l->name, n, skb_queue_len(&l->deferdq)); return n; } } else if (unlikely(less(seqno, expect))) { pr_warn("Unexpected skb in deferdq!\n"); continue; } expect = seqno + 1; } /* last block */ gacks[n].ack = htons(seqno); gacks[n].gap = 0; n++; return n; } /* tipc_build_gap_ack_blks - build Gap ACK blocks * @l: tipc unicast link * @hdr: the tipc message buffer to store the Gap ACK blocks after built * * The function builds Gap ACK blocks for both the unicast & broadcast receiver * links of a certain peer, the buffer after built has the network data format * as found at the struct tipc_gap_ack_blks definition. * * returns the actual allocated memory size */ static u16 tipc_build_gap_ack_blks(struct tipc_link *l, struct tipc_msg *hdr) { struct tipc_link *bcl = l->bc_rcvlink; struct tipc_gap_ack_blks *ga; u16 len; ga = (struct tipc_gap_ack_blks *)msg_data(hdr); /* Start with broadcast link first */ tipc_bcast_lock(bcl->net); msg_set_bcast_ack(hdr, bcl->rcv_nxt - 1); msg_set_bc_gap(hdr, link_bc_rcv_gap(bcl)); ga->bgack_cnt = __tipc_build_gap_ack_blks(ga, bcl, 0); tipc_bcast_unlock(bcl->net); /* Now for unicast link, but an explicit NACK only (???) */ ga->ugack_cnt = (msg_seq_gap(hdr)) ? __tipc_build_gap_ack_blks(ga, l, ga->bgack_cnt) : 0; /* Total len */ len = struct_size(ga, gacks, size_add(ga->bgack_cnt, ga->ugack_cnt)); ga->len = htons(len); return len; } /* tipc_link_advance_transmq - advance TIPC link transmq queue by releasing * acked packets, also doing retransmissions if * gaps found * @l: tipc link with transmq queue to be advanced * @r: tipc link "receiver" i.e. in case of broadcast (= "l" if unicast) * @acked: seqno of last packet acked by peer without any gaps before * @gap: # of gap packets * @ga: buffer pointer to Gap ACK blocks from peer * @xmitq: queue for accumulating the retransmitted packets if any * @retransmitted: returned boolean value if a retransmission is really issued * @rc: returned code e.g. TIPC_LINK_DOWN_EVT if a repeated retransmit failures * happens (- unlikely case) * * Return: the number of packets released from the link transmq */ static int tipc_link_advance_transmq(struct tipc_link *l, struct tipc_link *r, u16 acked, u16 gap, struct tipc_gap_ack_blks *ga, struct sk_buff_head *xmitq, bool *retransmitted, int *rc) { struct tipc_gap_ack_blks *last_ga = r->last_ga, *this_ga = NULL; struct tipc_gap_ack *gacks = NULL; struct sk_buff *skb, *_skb, *tmp; struct tipc_msg *hdr; u32 qlen = skb_queue_len(&l->transmq); u16 nacked = acked, ngap = gap, gack_cnt = 0; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u16 ack = l->rcv_nxt - 1; u16 seqno, n = 0; u16 end = r->acked, start = end, offset = r->last_gap; u16 si = (last_ga) ? last_ga->start_index : 0; bool is_uc = !link_is_bc_sndlink(l); bool bc_has_acked = false; trace_tipc_link_retrans(r, acked + 1, acked + gap, &l->transmq); /* Determine Gap ACK blocks if any for the particular link */ if (ga && is_uc) { /* Get the Gap ACKs, uc part */ gack_cnt = ga->ugack_cnt; gacks = &ga->gacks[ga->bgack_cnt]; } else if (ga) { /* Copy the Gap ACKs, bc part, for later renewal if needed */ this_ga = kmemdup(ga, struct_size(ga, gacks, ga->bgack_cnt), GFP_ATOMIC); if (likely(this_ga)) { this_ga->start_index = 0; /* Start with the bc Gap ACKs */ gack_cnt = this_ga->bgack_cnt; gacks = &this_ga->gacks[0]; } else { /* Hmm, we can get in trouble..., simply ignore it */ pr_warn_ratelimited("Ignoring bc Gap ACKs, no memory\n"); } } /* Advance the link transmq */ skb_queue_walk_safe(&l->transmq, skb, tmp) { seqno = buf_seqno(skb); next_gap_ack: if (less_eq(seqno, nacked)) { if (is_uc) goto release; /* Skip packets peer has already acked */ if (!more(seqno, r->acked)) continue; /* Get the next of last Gap ACK blocks */ while (more(seqno, end)) { if (!last_ga || si >= last_ga->bgack_cnt) break; start = end + offset + 1; end = ntohs(last_ga->gacks[si].ack); offset = ntohs(last_ga->gacks[si].gap); si++; WARN_ONCE(more(start, end) || (!offset && si < last_ga->bgack_cnt) || si > MAX_GAP_ACK_BLKS, "Corrupted Gap ACK: %d %d %d %d %d\n", start, end, offset, si, last_ga->bgack_cnt); } /* Check against the last Gap ACK block */ if (tipc_in_range(seqno, start, end)) continue; /* Update/release the packet peer is acking */ bc_has_acked = true; if (--TIPC_SKB_CB(skb)->ackers) continue; release: /* release skb */ __skb_unlink(skb, &l->transmq); kfree_skb(skb); } else if (less_eq(seqno, nacked + ngap)) { /* First gap: check if repeated retrans failures? */ if (unlikely(seqno == acked + 1 && link_retransmit_failure(l, r, rc))) { /* Ignore this bc Gap ACKs if any */ kfree(this_ga); this_ga = NULL; break; } /* retransmit skb if unrestricted*/ if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) continue; tipc_link_set_skb_retransmit_time(skb, l); _skb = pskb_copy(skb, GFP_ATOMIC); if (!_skb) continue; hdr = buf_msg(_skb); msg_set_ack(hdr, ack); msg_set_bcast_ack(hdr, bc_ack); _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; if (!is_uc) r->stats.retransmitted++; *retransmitted = true; /* Increase actual retrans counter & mark first time */ if (!TIPC_SKB_CB(skb)->retr_cnt++) TIPC_SKB_CB(skb)->retr_stamp = jiffies; } else { /* retry with Gap ACK blocks if any */ if (n >= gack_cnt) break; nacked = ntohs(gacks[n].ack); ngap = ntohs(gacks[n].gap); n++; goto next_gap_ack; } } /* Renew last Gap ACK blocks for bc if needed */ if (bc_has_acked) { if (this_ga) { kfree(last_ga); r->last_ga = this_ga; r->last_gap = gap; } else if (last_ga) { if (less(acked, start)) { si--; offset = start - acked - 1; } else if (less(acked, end)) { acked = end; } if (si < last_ga->bgack_cnt) { last_ga->start_index = si; r->last_gap = offset; } else { kfree(last_ga); r->last_ga = NULL; r->last_gap = 0; } } else { r->last_gap = 0; } r->acked = acked; } else { kfree(this_ga); } return qlen - skb_queue_len(&l->transmq); } /* tipc_link_build_state_msg: prepare link state message for transmission * * Note that sending of broadcast ack is coordinated among nodes, to reduce * risk of ack storms towards the sender */ int tipc_link_build_state_msg(struct tipc_link *l, struct sk_buff_head *xmitq) { if (!l) return 0; /* Broadcast ACK must be sent via a unicast link => defer to caller */ if (link_is_bc_rcvlink(l)) { if (((l->rcv_nxt ^ tipc_own_addr(l->net)) & 0xf) != 0xf) return 0; l->rcv_unacked = 0; /* Use snd_nxt to store peer's snd_nxt in broadcast rcv link */ l->snd_nxt = l->rcv_nxt; return TIPC_LINK_SND_STATE; } /* Unicast ACK */ l->rcv_unacked = 0; l->stats.sent_acks++; tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, 0, xmitq); return 0; } /* tipc_link_build_reset_msg: prepare link RESET or ACTIVATE message */ void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq) { int mtyp = RESET_MSG; struct sk_buff *skb; if (l->state == LINK_ESTABLISHING) mtyp = ACTIVATE_MSG; tipc_link_build_proto_msg(l, mtyp, 0, 0, 0, 0, 0, xmitq); /* Inform peer that this endpoint is going down if applicable */ skb = skb_peek_tail(xmitq); if (skb && (l->state == LINK_RESET)) msg_set_peer_stopping(buf_msg(skb), 1); } /* tipc_link_build_nack_msg: prepare link nack message for transmission * Note that sending of broadcast NACK is coordinated among nodes, to * reduce the risk of NACK storms towards the sender */ static int tipc_link_build_nack_msg(struct tipc_link *l, struct sk_buff_head *xmitq) { u32 def_cnt = ++l->stats.deferred_recv; struct sk_buff_head *dfq = &l->deferdq; u32 defq_len = skb_queue_len(dfq); int match1, match2; if (link_is_bc_rcvlink(l)) { match1 = def_cnt & 0xf; match2 = tipc_own_addr(l->net) & 0xf; if (match1 == match2) return TIPC_LINK_SND_STATE; return 0; } if (defq_len >= 3 && !((defq_len - 3) % 16)) { u16 rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, rcvgap, 0, 0, xmitq); } return 0; } /* tipc_link_rcv - process TIPC packets/messages arriving from off-node * @l: the link that should handle the message * @skb: TIPC packet * @xmitq: queue to place packets to be sent after this call */ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *xmitq) { struct sk_buff_head *defq = &l->deferdq; struct tipc_msg *hdr = buf_msg(skb); u16 seqno, rcv_nxt, win_lim; int released = 0; int rc = 0; /* Verify and update link state */ if (unlikely(msg_user(hdr) == LINK_PROTOCOL)) return tipc_link_proto_rcv(l, skb, xmitq); /* Don't send probe at next timeout expiration */ l->silent_intv_cnt = 0; do { hdr = buf_msg(skb); seqno = msg_seqno(hdr); rcv_nxt = l->rcv_nxt; win_lim = rcv_nxt + TIPC_MAX_LINK_WIN; if (unlikely(!tipc_link_is_up(l))) { if (l->state == LINK_ESTABLISHING) rc = TIPC_LINK_UP_EVT; kfree_skb(skb); break; } /* Drop if outside receive window */ if (unlikely(less(seqno, rcv_nxt) || more(seqno, win_lim))) { l->stats.duplicates++; kfree_skb(skb); break; } released += tipc_link_advance_transmq(l, l, msg_ack(hdr), 0, NULL, NULL, NULL, NULL); /* Defer delivery if sequence gap */ if (unlikely(seqno != rcv_nxt)) { if (!__tipc_skb_queue_sorted(defq, seqno, skb)) l->stats.duplicates++; rc |= tipc_link_build_nack_msg(l, xmitq); break; } /* Deliver packet */ l->rcv_nxt++; l->stats.recv_pkts++; if (unlikely(msg_user(hdr) == TUNNEL_PROTOCOL)) rc |= tipc_link_tnl_rcv(l, skb, l->inputq); else if (!tipc_data_input(l, skb, l->inputq)) rc |= tipc_link_input(l, skb, l->inputq, &l->reasm_buf); if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN)) rc |= tipc_link_build_state_msg(l, xmitq); if (unlikely(rc & ~TIPC_LINK_SND_STATE)) break; } while ((skb = __tipc_skb_dequeue(defq, l->rcv_nxt))); /* Forward queues and wake up waiting users */ if (released) { tipc_link_update_cwin(l, released, 0); tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) link_prepare_wakeup(l); } return rc; } static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe, bool probe_reply, u16 rcvgap, int tolerance, int priority, struct sk_buff_head *xmitq) { struct tipc_mon_state *mstate = &l->mon_state; struct sk_buff_head *dfq = &l->deferdq; struct tipc_link *bcl = l->bc_rcvlink; struct tipc_msg *hdr; struct sk_buff *skb; bool node_up = tipc_link_is_up(bcl); u16 glen = 0, bc_rcvgap = 0; int dlen = 0; void *data; /* Don't send protocol message during reset or link failover */ if (tipc_link_is_blocked(l)) return; if (!tipc_link_is_up(l) && (mtyp == STATE_MSG)) return; if ((probe || probe_reply) && !skb_queue_empty(dfq)) rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt; skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE, tipc_max_domain_size + MAX_GAP_ACK_BLKS_SZ, l->addr, tipc_own_addr(l->net), 0, 0, 0); if (!skb) return; hdr = buf_msg(skb); data = msg_data(hdr); msg_set_session(hdr, l->session); msg_set_bearer_id(hdr, l->bearer_id); msg_set_net_plane(hdr, l->net_plane); msg_set_next_sent(hdr, l->snd_nxt); msg_set_ack(hdr, l->rcv_nxt - 1); msg_set_bcast_ack(hdr, bcl->rcv_nxt - 1); msg_set_bc_ack_invalid(hdr, !node_up); msg_set_last_bcast(hdr, l->bc_sndlink->snd_nxt - 1); msg_set_link_tolerance(hdr, tolerance); msg_set_linkprio(hdr, priority); msg_set_redundant_link(hdr, node_up); msg_set_seq_gap(hdr, 0); msg_set_seqno(hdr, l->snd_nxt + U16_MAX / 2); if (mtyp == STATE_MSG) { if (l->peer_caps & TIPC_LINK_PROTO_SEQNO) msg_set_seqno(hdr, l->snd_nxt_state++); msg_set_seq_gap(hdr, rcvgap); bc_rcvgap = link_bc_rcv_gap(bcl); msg_set_bc_gap(hdr, bc_rcvgap); msg_set_probe(hdr, probe); msg_set_is_keepalive(hdr, probe || probe_reply); if (l->peer_caps & TIPC_GAP_ACK_BLOCK) glen = tipc_build_gap_ack_blks(l, hdr); tipc_mon_prep(l->net, data + glen, &dlen, mstate, l->bearer_id); msg_set_size(hdr, INT_H_SIZE + glen + dlen); skb_trim(skb, INT_H_SIZE + glen + dlen); l->stats.sent_states++; l->rcv_unacked = 0; } else { /* RESET_MSG or ACTIVATE_MSG */ if (mtyp == ACTIVATE_MSG) { msg_set_dest_session_valid(hdr, 1); msg_set_dest_session(hdr, l->peer_session); } msg_set_max_pkt(hdr, l->advertised_mtu); strcpy(data, l->if_name); msg_set_size(hdr, INT_H_SIZE + TIPC_MAX_IF_NAME); skb_trim(skb, INT_H_SIZE + TIPC_MAX_IF_NAME); } if (probe) l->stats.sent_probes++; if (rcvgap) l->stats.sent_nacks++; if (bc_rcvgap) bcl->stats.sent_nacks++; skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, skb); trace_tipc_proto_build(skb, false, l->name); } void tipc_link_create_dummy_tnl_msg(struct tipc_link *l, struct sk_buff_head *xmitq) { u32 onode = tipc_own_addr(l->net); struct tipc_msg *hdr, *ihdr; struct sk_buff_head tnlq; struct sk_buff *skb; u32 dnode = l->addr; __skb_queue_head_init(&tnlq); skb = tipc_msg_create(TUNNEL_PROTOCOL, FAILOVER_MSG, INT_H_SIZE, BASIC_H_SIZE, dnode, onode, 0, 0, 0); if (!skb) { pr_warn("%sunable to create tunnel packet\n", link_co_err); return; } hdr = buf_msg(skb); msg_set_msgcnt(hdr, 1); msg_set_bearer_id(hdr, l->peer_bearer_id); ihdr = (struct tipc_msg *)msg_data(hdr); tipc_msg_init(onode, ihdr, TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG, BASIC_H_SIZE, dnode); msg_set_errcode(ihdr, TIPC_ERR_NO_PORT); __skb_queue_tail(&tnlq, skb); tipc_link_xmit(l, &tnlq, xmitq); } /* tipc_link_tnl_prepare(): prepare and return a list of tunnel packets * with contents of the link's transmit and backlog queues. */ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl, int mtyp, struct sk_buff_head *xmitq) { struct sk_buff_head *fdefq = &tnl->failover_deferdq; struct sk_buff *skb, *tnlskb; struct tipc_msg *hdr, tnlhdr; struct sk_buff_head *queue = &l->transmq; struct sk_buff_head tmpxq, tnlq, frags; u16 pktlen, pktcnt, seqno = l->snd_nxt; bool pktcnt_need_update = false; u16 syncpt; int rc; if (!tnl) return; __skb_queue_head_init(&tnlq); /* Link Synching: * From now on, send only one single ("dummy") SYNCH message * to peer. The SYNCH message does not contain any data, just * a header conveying the synch point to the peer. */ if (mtyp == SYNCH_MSG && (tnl->peer_caps & TIPC_TUNNEL_ENHANCED)) { tnlskb = tipc_msg_create(TUNNEL_PROTOCOL, SYNCH_MSG, INT_H_SIZE, 0, l->addr, tipc_own_addr(l->net), 0, 0, 0); if (!tnlskb) { pr_warn("%sunable to create dummy SYNCH_MSG\n", link_co_err); return; } hdr = buf_msg(tnlskb); syncpt = l->snd_nxt + skb_queue_len(&l->backlogq) - 1; msg_set_syncpt(hdr, syncpt); msg_set_bearer_id(hdr, l->peer_bearer_id); __skb_queue_tail(&tnlq, tnlskb); tipc_link_xmit(tnl, &tnlq, xmitq); return; } __skb_queue_head_init(&tmpxq); __skb_queue_head_init(&frags); /* At least one packet required for safe algorithm => add dummy */ skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG, BASIC_H_SIZE, 0, l->addr, tipc_own_addr(l->net), 0, 0, TIPC_ERR_NO_PORT); if (!skb) { pr_warn("%sunable to create tunnel packet\n", link_co_err); return; } __skb_queue_tail(&tnlq, skb); tipc_link_xmit(l, &tnlq, &tmpxq); __skb_queue_purge(&tmpxq); /* Initialize reusable tunnel packet header */ tipc_msg_init(tipc_own_addr(l->net), &tnlhdr, TUNNEL_PROTOCOL, mtyp, INT_H_SIZE, l->addr); if (mtyp == SYNCH_MSG) pktcnt = l->snd_nxt - buf_seqno(skb_peek(&l->transmq)); else pktcnt = skb_queue_len(&l->transmq); pktcnt += skb_queue_len(&l->backlogq); msg_set_msgcnt(&tnlhdr, pktcnt); msg_set_bearer_id(&tnlhdr, l->peer_bearer_id); tnl: /* Wrap each packet into a tunnel packet */ skb_queue_walk(queue, skb) { hdr = buf_msg(skb); if (queue == &l->backlogq) msg_set_seqno(hdr, seqno++); pktlen = msg_size(hdr); /* Tunnel link MTU is not large enough? This could be * due to: * 1) Link MTU has just changed or set differently; * 2) Or FAILOVER on the top of a SYNCH message * * The 2nd case should not happen if peer supports * TIPC_TUNNEL_ENHANCED */ if (pktlen > tnl->mtu - INT_H_SIZE) { if (mtyp == FAILOVER_MSG && (tnl->peer_caps & TIPC_TUNNEL_ENHANCED)) { rc = tipc_msg_fragment(skb, &tnlhdr, tnl->mtu, &frags); if (rc) { pr_warn("%sunable to frag msg: rc %d\n", link_co_err, rc); return; } pktcnt += skb_queue_len(&frags) - 1; pktcnt_need_update = true; skb_queue_splice_tail_init(&frags, &tnlq); continue; } /* Unluckily, peer doesn't have TIPC_TUNNEL_ENHANCED * => Just warn it and return! */ pr_warn_ratelimited("%stoo large msg <%d, %d>: %d!\n", link_co_err, msg_user(hdr), msg_type(hdr), msg_size(hdr)); return; } msg_set_size(&tnlhdr, pktlen + INT_H_SIZE); tnlskb = tipc_buf_acquire(pktlen + INT_H_SIZE, GFP_ATOMIC); if (!tnlskb) { pr_warn("%sunable to send packet\n", link_co_err); return; } skb_copy_to_linear_data(tnlskb, &tnlhdr, INT_H_SIZE); skb_copy_to_linear_data_offset(tnlskb, INT_H_SIZE, hdr, pktlen); __skb_queue_tail(&tnlq, tnlskb); } if (queue != &l->backlogq) { queue = &l->backlogq; goto tnl; } if (pktcnt_need_update) skb_queue_walk(&tnlq, skb) { hdr = buf_msg(skb); msg_set_msgcnt(hdr, pktcnt); } tipc_link_xmit(tnl, &tnlq, xmitq); if (mtyp == FAILOVER_MSG) { tnl->drop_point = l->rcv_nxt; tnl->failover_reasm_skb = l->reasm_buf; l->reasm_buf = NULL; /* Failover the link's deferdq */ if (unlikely(!skb_queue_empty(fdefq))) { pr_warn("Link failover deferdq not empty: %d!\n", skb_queue_len(fdefq)); __skb_queue_purge(fdefq); } skb_queue_splice_init(&l->deferdq, fdefq); } } /** * tipc_link_failover_prepare() - prepare tnl for link failover * * This is a special version of the precursor - tipc_link_tnl_prepare(), * see the tipc_node_link_failover() for details * * @l: failover link * @tnl: tunnel link * @xmitq: queue for messages to be xmited */ void tipc_link_failover_prepare(struct tipc_link *l, struct tipc_link *tnl, struct sk_buff_head *xmitq) { struct sk_buff_head *fdefq = &tnl->failover_deferdq; tipc_link_create_dummy_tnl_msg(tnl, xmitq); /* This failover link endpoint was never established before, * so it has not received anything from peer. * Otherwise, it must be a normal failover situation or the * node has entered SELF_DOWN_PEER_LEAVING and both peer nodes * would have to start over from scratch instead. */ tnl->drop_point = 1; tnl->failover_reasm_skb = NULL; /* Initiate the link's failover deferdq */ if (unlikely(!skb_queue_empty(fdefq))) { pr_warn("Link failover deferdq not empty: %d!\n", skb_queue_len(fdefq)); __skb_queue_purge(fdefq); } } /* tipc_link_validate_msg(): validate message against current link state * Returns true if message should be accepted, otherwise false */ bool tipc_link_validate_msg(struct tipc_link *l, struct tipc_msg *hdr) { u16 curr_session = l->peer_session; u16 session = msg_session(hdr); int mtyp = msg_type(hdr); if (msg_user(hdr) != LINK_PROTOCOL) return true; switch (mtyp) { case RESET_MSG: if (!l->in_session) return true; /* Accept only RESET with new session number */ return more(session, curr_session); case ACTIVATE_MSG: if (!l->in_session) return true; /* Accept only ACTIVATE with new or current session number */ return !less(session, curr_session); case STATE_MSG: /* Accept only STATE with current session number */ if (!l->in_session) return false; if (session != curr_session) return false; /* Extra sanity check */ if (!tipc_link_is_up(l) && msg_ack(hdr)) return false; if (!(l->peer_caps & TIPC_LINK_PROTO_SEQNO)) return true; /* Accept only STATE with new sequence number */ return !less(msg_seqno(hdr), l->rcv_nxt_state); default: return false; } } /* tipc_link_proto_rcv(): receive link level protocol message : * Note that network plane id propagates through the network, and may * change at any time. The node with lowest numerical id determines * network plane */ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *xmitq) { struct tipc_msg *hdr = buf_msg(skb); struct tipc_gap_ack_blks *ga = NULL; bool reply = msg_probe(hdr), retransmitted = false; u32 dlen = msg_data_sz(hdr), glen = 0, msg_max; u16 peers_snd_nxt = msg_next_sent(hdr); u16 peers_tol = msg_link_tolerance(hdr); u16 peers_prio = msg_linkprio(hdr); u16 gap = msg_seq_gap(hdr); u16 ack = msg_ack(hdr); u16 rcv_nxt = l->rcv_nxt; u16 rcvgap = 0; int mtyp = msg_type(hdr); int rc = 0, released; char *if_name; void *data; trace_tipc_proto_rcv(skb, false, l->name); if (dlen > U16_MAX) goto exit; if (tipc_link_is_blocked(l) || !xmitq) goto exit; if (tipc_own_addr(l->net) > msg_prevnode(hdr)) l->net_plane = msg_net_plane(hdr); if (skb_linearize(skb)) goto exit; hdr = buf_msg(skb); data = msg_data(hdr); if (!tipc_link_validate_msg(l, hdr)) { trace_tipc_skb_dump(skb, false, "PROTO invalid (1)!"); trace_tipc_link_dump(l, TIPC_DUMP_NONE, "PROTO invalid (1)!"); goto exit; } switch (mtyp) { case RESET_MSG: case ACTIVATE_MSG: msg_max = msg_max_pkt(hdr); if (msg_max < tipc_bearer_min_mtu(l->net, l->bearer_id)) break; /* Complete own link name with peer's interface name */ if_name = strrchr(l->name, ':') + 1; if (sizeof(l->name) - (if_name - l->name) <= TIPC_MAX_IF_NAME) break; if (msg_data_sz(hdr) < TIPC_MAX_IF_NAME) break; strncpy(if_name, data, TIPC_MAX_IF_NAME); /* Update own tolerance if peer indicates a non-zero value */ if (tipc_in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) { l->tolerance = peers_tol; l->bc_rcvlink->tolerance = peers_tol; } /* Update own priority if peer's priority is higher */ if (tipc_in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI)) l->priority = peers_prio; /* If peer is going down we want full re-establish cycle */ if (msg_peer_stopping(hdr)) { rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); break; } /* If this endpoint was re-created while peer was ESTABLISHING * it doesn't know current session number. Force re-synch. */ if (mtyp == ACTIVATE_MSG && msg_dest_session_valid(hdr) && l->session != msg_dest_session(hdr)) { if (less(l->session, msg_dest_session(hdr))) l->session = msg_dest_session(hdr) + 1; break; } /* ACTIVATE_MSG serves as PEER_RESET if link is already down */ if (mtyp == RESET_MSG || !tipc_link_is_up(l)) rc = tipc_link_fsm_evt(l, LINK_PEER_RESET_EVT); /* ACTIVATE_MSG takes up link if it was already locally reset */ if (mtyp == ACTIVATE_MSG && l->state == LINK_ESTABLISHING) rc = TIPC_LINK_UP_EVT; l->peer_session = msg_session(hdr); l->in_session = true; l->peer_bearer_id = msg_bearer_id(hdr); if (l->mtu > msg_max) l->mtu = msg_max; break; case STATE_MSG: /* Validate Gap ACK blocks, drop if invalid */ glen = tipc_get_gap_ack_blks(&ga, l, hdr, true); if (glen > dlen) break; l->rcv_nxt_state = msg_seqno(hdr) + 1; /* Update own tolerance if peer indicates a non-zero value */ if (tipc_in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) { l->tolerance = peers_tol; l->bc_rcvlink->tolerance = peers_tol; } /* Update own prio if peer indicates a different value */ if ((peers_prio != l->priority) && tipc_in_range(peers_prio, 1, TIPC_MAX_LINK_PRI)) { l->priority = peers_prio; rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); } l->silent_intv_cnt = 0; l->stats.recv_states++; if (msg_probe(hdr)) l->stats.recv_probes++; if (!tipc_link_is_up(l)) { if (l->state == LINK_ESTABLISHING) rc = TIPC_LINK_UP_EVT; break; } tipc_mon_rcv(l->net, data + glen, dlen - glen, l->addr, &l->mon_state, l->bearer_id); /* Send NACK if peer has sent pkts we haven't received yet */ if ((reply || msg_is_keepalive(hdr)) && more(peers_snd_nxt, rcv_nxt) && !tipc_link_is_synching(l) && skb_queue_empty(&l->deferdq)) rcvgap = peers_snd_nxt - l->rcv_nxt; if (rcvgap || reply) tipc_link_build_proto_msg(l, STATE_MSG, 0, reply, rcvgap, 0, 0, xmitq); released = tipc_link_advance_transmq(l, l, ack, gap, ga, xmitq, &retransmitted, &rc); if (gap) l->stats.recv_nacks++; if (released || retransmitted) tipc_link_update_cwin(l, released, retransmitted); if (released) tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) link_prepare_wakeup(l); } exit: kfree_skb(skb); return rc; } /* tipc_link_build_bc_proto_msg() - create broadcast protocol message */ static bool tipc_link_build_bc_proto_msg(struct tipc_link *l, bool bcast, u16 peers_snd_nxt, struct sk_buff_head *xmitq) { struct sk_buff *skb; struct tipc_msg *hdr; struct sk_buff *dfrd_skb = skb_peek(&l->deferdq); u16 ack = l->rcv_nxt - 1; u16 gap_to = peers_snd_nxt - 1; skb = tipc_msg_create(BCAST_PROTOCOL, STATE_MSG, INT_H_SIZE, 0, l->addr, tipc_own_addr(l->net), 0, 0, 0); if (!skb) return false; hdr = buf_msg(skb); msg_set_last_bcast(hdr, l->bc_sndlink->snd_nxt - 1); msg_set_bcast_ack(hdr, ack); msg_set_bcgap_after(hdr, ack); if (dfrd_skb) gap_to = buf_seqno(dfrd_skb) - 1; msg_set_bcgap_to(hdr, gap_to); msg_set_non_seq(hdr, bcast); __skb_queue_tail(xmitq, skb); return true; } /* tipc_link_build_bc_init_msg() - synchronize broadcast link endpoints. * * Give a newly added peer node the sequence number where it should * start receiving and acking broadcast packets. */ static void tipc_link_build_bc_init_msg(struct tipc_link *l, struct sk_buff_head *xmitq) { struct sk_buff_head list; __skb_queue_head_init(&list); if (!tipc_link_build_bc_proto_msg(l->bc_rcvlink, false, 0, &list)) return; msg_set_bc_ack_invalid(buf_msg(skb_peek(&list)), true); tipc_link_xmit(l, &list, xmitq); } /* tipc_link_bc_init_rcv - receive initial broadcast synch data from peer */ void tipc_link_bc_init_rcv(struct tipc_link *l, struct tipc_msg *hdr) { int mtyp = msg_type(hdr); u16 peers_snd_nxt = msg_bc_snd_nxt(hdr); if (tipc_link_is_up(l)) return; if (msg_user(hdr) == BCAST_PROTOCOL) { l->rcv_nxt = peers_snd_nxt; l->state = LINK_ESTABLISHED; return; } if (l->peer_caps & TIPC_BCAST_SYNCH) return; if (msg_peer_node_is_up(hdr)) return; /* Compatibility: accept older, less safe initial synch data */ if ((mtyp == RESET_MSG) || (mtyp == ACTIVATE_MSG)) l->rcv_nxt = peers_snd_nxt; } /* tipc_link_bc_sync_rcv - update rcv link according to peer's send state */ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr, struct sk_buff_head *xmitq) { u16 peers_snd_nxt = msg_bc_snd_nxt(hdr); int rc = 0; if (!tipc_link_is_up(l)) return rc; if (!msg_peer_node_is_up(hdr)) return rc; /* Open when peer acknowledges our bcast init msg (pkt #1) */ if (msg_ack(hdr)) l->bc_peer_is_up = true; if (!l->bc_peer_is_up) return rc; /* Ignore if peers_snd_nxt goes beyond receive window */ if (more(peers_snd_nxt, l->rcv_nxt + l->window)) return rc; l->snd_nxt = peers_snd_nxt; if (link_bc_rcv_gap(l)) rc |= TIPC_LINK_SND_STATE; /* Return now if sender supports nack via STATE messages */ if (l->peer_caps & TIPC_BCAST_STATE_NACK) return rc; /* Otherwise, be backwards compatible */ if (!more(peers_snd_nxt, l->rcv_nxt)) { l->nack_state = BC_NACK_SND_CONDITIONAL; return 0; } /* Don't NACK if one was recently sent or peeked */ if (l->nack_state == BC_NACK_SND_SUPPRESS) { l->nack_state = BC_NACK_SND_UNCONDITIONAL; return 0; } /* Conditionally delay NACK sending until next synch rcv */ if (l->nack_state == BC_NACK_SND_CONDITIONAL) { l->nack_state = BC_NACK_SND_UNCONDITIONAL; if ((peers_snd_nxt - l->rcv_nxt) < TIPC_MIN_LINK_WIN) return 0; } /* Send NACK now but suppress next one */ tipc_link_build_bc_proto_msg(l, true, peers_snd_nxt, xmitq); l->nack_state = BC_NACK_SND_SUPPRESS; return 0; } int tipc_link_bc_ack_rcv(struct tipc_link *r, u16 acked, u16 gap, struct tipc_gap_ack_blks *ga, struct sk_buff_head *xmitq, struct sk_buff_head *retrq) { struct tipc_link *l = r->bc_sndlink; bool unused = false; int rc = 0; if (!tipc_link_is_up(r) || !r->bc_peer_is_up) return 0; if (gap) { l->stats.recv_nacks++; r->stats.recv_nacks++; } if (less(acked, r->acked) || (acked == r->acked && !gap && !ga)) return 0; trace_tipc_link_bc_ack(r, acked, gap, &l->transmq); tipc_link_advance_transmq(l, r, acked, gap, ga, retrq, &unused, &rc); tipc_link_advance_backlog(l, xmitq); if (unlikely(!skb_queue_empty(&l->wakeupq))) link_prepare_wakeup(l); return rc; } /* tipc_link_bc_nack_rcv(): receive broadcast nack message * This function is here for backwards compatibility, since * no BCAST_PROTOCOL/STATE messages occur from TIPC v2.5. */ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb, struct sk_buff_head *xmitq) { struct tipc_msg *hdr = buf_msg(skb); u32 dnode = msg_destnode(hdr); int mtyp = msg_type(hdr); u16 acked = msg_bcast_ack(hdr); u16 from = acked + 1; u16 to = msg_bcgap_to(hdr); u16 peers_snd_nxt = to + 1; int rc = 0; kfree_skb(skb); if (!tipc_link_is_up(l) || !l->bc_peer_is_up) return 0; if (mtyp != STATE_MSG) return 0; if (dnode == tipc_own_addr(l->net)) { rc = tipc_link_bc_ack_rcv(l, acked, to - acked, NULL, xmitq, xmitq); l->stats.recv_nacks++; return rc; } /* Msg for other node => suppress own NACK at next sync if applicable */ if (more(peers_snd_nxt, l->rcv_nxt) && !less(l->rcv_nxt, from)) l->nack_state = BC_NACK_SND_SUPPRESS; return 0; } void tipc_link_set_queue_limits(struct tipc_link *l, u32 min_win, u32 max_win) { int max_bulk = TIPC_MAX_PUBL / (l->mtu / ITEM_SIZE); l->min_win = min_win; l->ssthresh = max_win; l->max_win = max_win; l->window = min_win; l->backlog[TIPC_LOW_IMPORTANCE].limit = min_win * 2; l->backlog[TIPC_MEDIUM_IMPORTANCE].limit = min_win * 4; l->backlog[TIPC_HIGH_IMPORTANCE].limit = min_win * 6; l->backlog[TIPC_CRITICAL_IMPORTANCE].limit = min_win * 8; l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = max_bulk; } /** * tipc_link_reset_stats - reset link statistics * @l: pointer to link */ void tipc_link_reset_stats(struct tipc_link *l) { memset(&l->stats, 0, sizeof(l->stats)); } static void link_print(struct tipc_link *l, const char *str) { struct sk_buff *hskb = skb_peek(&l->transmq); u16 head = hskb ? msg_seqno(buf_msg(hskb)) : l->snd_nxt - 1; u16 tail = l->snd_nxt - 1; pr_info("%s Link <%s> state %x\n", str, l->name, l->state); pr_info("XMTQ: %u [%u-%u], BKLGQ: %u, SNDNX: %u, RCVNX: %u\n", skb_queue_len(&l->transmq), head, tail, skb_queue_len(&l->backlogq), l->snd_nxt, l->rcv_nxt); } /* Parse and validate nested (link) properties valid for media, bearer and link */ int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]) { int err; err = nla_parse_nested_deprecated(props, TIPC_NLA_PROP_MAX, prop, tipc_nl_prop_policy, NULL); if (err) return err; if (props[TIPC_NLA_PROP_PRIO]) { u32 prio; prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); if (prio > TIPC_MAX_LINK_PRI) return -EINVAL; } if (props[TIPC_NLA_PROP_TOL]) { u32 tol; tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); if ((tol < TIPC_MIN_LINK_TOL) || (tol > TIPC_MAX_LINK_TOL)) return -EINVAL; } if (props[TIPC_NLA_PROP_WIN]) { u32 max_win; max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); if (max_win < TIPC_DEF_LINK_WIN || max_win > TIPC_MAX_LINK_WIN) return -EINVAL; } return 0; } static int __tipc_nl_add_stats(struct sk_buff *skb, struct tipc_stats *s) { int i; struct nlattr *stats; struct nla_map { u32 key; u32 val; }; struct nla_map map[] = { {TIPC_NLA_STATS_RX_INFO, 0}, {TIPC_NLA_STATS_RX_FRAGMENTS, s->recv_fragments}, {TIPC_NLA_STATS_RX_FRAGMENTED, s->recv_fragmented}, {TIPC_NLA_STATS_RX_BUNDLES, s->recv_bundles}, {TIPC_NLA_STATS_RX_BUNDLED, s->recv_bundled}, {TIPC_NLA_STATS_TX_INFO, 0}, {TIPC_NLA_STATS_TX_FRAGMENTS, s->sent_fragments}, {TIPC_NLA_STATS_TX_FRAGMENTED, s->sent_fragmented}, {TIPC_NLA_STATS_TX_BUNDLES, s->sent_bundles}, {TIPC_NLA_STATS_TX_BUNDLED, s->sent_bundled}, {TIPC_NLA_STATS_MSG_PROF_TOT, (s->msg_length_counts) ? s->msg_length_counts : 1}, {TIPC_NLA_STATS_MSG_LEN_CNT, s->msg_length_counts}, {TIPC_NLA_STATS_MSG_LEN_TOT, s->msg_lengths_total}, {TIPC_NLA_STATS_MSG_LEN_P0, s->msg_length_profile[0]}, {TIPC_NLA_STATS_MSG_LEN_P1, s->msg_length_profile[1]}, {TIPC_NLA_STATS_MSG_LEN_P2, s->msg_length_profile[2]}, {TIPC_NLA_STATS_MSG_LEN_P3, s->msg_length_profile[3]}, {TIPC_NLA_STATS_MSG_LEN_P4, s->msg_length_profile[4]}, {TIPC_NLA_STATS_MSG_LEN_P5, s->msg_length_profile[5]}, {TIPC_NLA_STATS_MSG_LEN_P6, s->msg_length_profile[6]}, {TIPC_NLA_STATS_RX_STATES, s->recv_states}, {TIPC_NLA_STATS_RX_PROBES, s->recv_probes}, {TIPC_NLA_STATS_RX_NACKS, s->recv_nacks}, {TIPC_NLA_STATS_RX_DEFERRED, s->deferred_recv}, {TIPC_NLA_STATS_TX_STATES, s->sent_states}, {TIPC_NLA_STATS_TX_PROBES, s->sent_probes}, {TIPC_NLA_STATS_TX_NACKS, s->sent_nacks}, {TIPC_NLA_STATS_TX_ACKS, s->sent_acks}, {TIPC_NLA_STATS_RETRANSMITTED, s->retransmitted}, {TIPC_NLA_STATS_DUPLICATES, s->duplicates}, {TIPC_NLA_STATS_LINK_CONGS, s->link_congs}, {TIPC_NLA_STATS_MAX_QUEUE, s->max_queue_sz}, {TIPC_NLA_STATS_AVG_QUEUE, s->queue_sz_counts ? (s->accu_queue_sz / s->queue_sz_counts) : 0} }; stats = nla_nest_start_noflag(skb, TIPC_NLA_LINK_STATS); if (!stats) return -EMSGSIZE; for (i = 0; i < ARRAY_SIZE(map); i++) if (nla_put_u32(skb, map[i].key, map[i].val)) goto msg_full; nla_nest_end(skb, stats); return 0; msg_full: nla_nest_cancel(skb, stats); return -EMSGSIZE; } /* Caller should hold appropriate locks to protect the link */ int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg, struct tipc_link *link, int nlflags) { u32 self = tipc_own_addr(net); struct nlattr *attrs; struct nlattr *prop; void *hdr; int err; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, nlflags, TIPC_NL_LINK_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK); if (!attrs) goto msg_full; if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, link->name)) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_LINK_DEST, tipc_cluster_mask(self))) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_LINK_MTU, link->mtu)) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, link->stats.recv_pkts)) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, link->stats.sent_pkts)) goto attr_msg_full; if (tipc_link_is_up(link)) if (nla_put_flag(msg->skb, TIPC_NLA_LINK_UP)) goto attr_msg_full; if (link->active) if (nla_put_flag(msg->skb, TIPC_NLA_LINK_ACTIVE)) goto attr_msg_full; prop = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK_PROP); if (!prop) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_PRIO, link->priority)) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_TOL, link->tolerance)) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, link->window)) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_PRIO, link->priority)) goto prop_msg_full; nla_nest_end(msg->skb, prop); err = __tipc_nl_add_stats(msg->skb, &link->stats); if (err) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; prop_msg_full: nla_nest_cancel(msg->skb, prop); attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } static int __tipc_nl_add_bc_link_stat(struct sk_buff *skb, struct tipc_stats *stats) { int i; struct nlattr *nest; struct nla_map { __u32 key; __u32 val; }; struct nla_map map[] = { {TIPC_NLA_STATS_RX_INFO, stats->recv_pkts}, {TIPC_NLA_STATS_RX_FRAGMENTS, stats->recv_fragments}, {TIPC_NLA_STATS_RX_FRAGMENTED, stats->recv_fragmented}, {TIPC_NLA_STATS_RX_BUNDLES, stats->recv_bundles}, {TIPC_NLA_STATS_RX_BUNDLED, stats->recv_bundled}, {TIPC_NLA_STATS_TX_INFO, stats->sent_pkts}, {TIPC_NLA_STATS_TX_FRAGMENTS, stats->sent_fragments}, {TIPC_NLA_STATS_TX_FRAGMENTED, stats->sent_fragmented}, {TIPC_NLA_STATS_TX_BUNDLES, stats->sent_bundles}, {TIPC_NLA_STATS_TX_BUNDLED, stats->sent_bundled}, {TIPC_NLA_STATS_RX_NACKS, stats->recv_nacks}, {TIPC_NLA_STATS_RX_DEFERRED, stats->deferred_recv}, {TIPC_NLA_STATS_TX_NACKS, stats->sent_nacks}, {TIPC_NLA_STATS_TX_ACKS, stats->sent_acks}, {TIPC_NLA_STATS_RETRANSMITTED, stats->retransmitted}, {TIPC_NLA_STATS_DUPLICATES, stats->duplicates}, {TIPC_NLA_STATS_LINK_CONGS, stats->link_congs}, {TIPC_NLA_STATS_MAX_QUEUE, stats->max_queue_sz}, {TIPC_NLA_STATS_AVG_QUEUE, stats->queue_sz_counts ? (stats->accu_queue_sz / stats->queue_sz_counts) : 0} }; nest = nla_nest_start_noflag(skb, TIPC_NLA_LINK_STATS); if (!nest) return -EMSGSIZE; for (i = 0; i < ARRAY_SIZE(map); i++) if (nla_put_u32(skb, map[i].key, map[i].val)) goto msg_full; nla_nest_end(skb, nest); return 0; msg_full: nla_nest_cancel(skb, nest); return -EMSGSIZE; } int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg, struct tipc_link *bcl) { int err; void *hdr; struct nlattr *attrs; struct nlattr *prop; u32 bc_mode = tipc_bcast_get_mode(net); u32 bc_ratio = tipc_bcast_get_broadcast_ratio(net); if (!bcl) return 0; tipc_bcast_lock(net); hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_LINK_GET); if (!hdr) { tipc_bcast_unlock(net); return -EMSGSIZE; } attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK); if (!attrs) goto msg_full; /* The broadcast link is always up */ if (nla_put_flag(msg->skb, TIPC_NLA_LINK_UP)) goto attr_msg_full; if (nla_put_flag(msg->skb, TIPC_NLA_LINK_BROADCAST)) goto attr_msg_full; if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, bcl->name)) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, 0)) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, 0)) goto attr_msg_full; prop = nla_nest_start_noflag(msg->skb, TIPC_NLA_LINK_PROP); if (!prop) goto attr_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->max_win)) goto prop_msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_PROP_BROADCAST, bc_mode)) goto prop_msg_full; if (bc_mode & BCLINK_MODE_SEL) if (nla_put_u32(msg->skb, TIPC_NLA_PROP_BROADCAST_RATIO, bc_ratio)) goto prop_msg_full; nla_nest_end(msg->skb, prop); err = __tipc_nl_add_bc_link_stat(msg->skb, &bcl->stats); if (err) goto attr_msg_full; tipc_bcast_unlock(net); nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; prop_msg_full: nla_nest_cancel(msg->skb, prop); attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: tipc_bcast_unlock(net); genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } void tipc_link_set_tolerance(struct tipc_link *l, u32 tol, struct sk_buff_head *xmitq) { l->tolerance = tol; if (l->bc_rcvlink) l->bc_rcvlink->tolerance = tol; if (tipc_link_is_up(l)) tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, tol, 0, xmitq); } void tipc_link_set_prio(struct tipc_link *l, u32 prio, struct sk_buff_head *xmitq) { l->priority = prio; tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, 0, prio, xmitq); } void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit) { l->abort_limit = limit; } /** * tipc_link_dump - dump TIPC link data * @l: tipc link to be dumped * @dqueues: bitmask to decide if any link queue to be dumped? * - TIPC_DUMP_NONE: don't dump link queues * - TIPC_DUMP_TRANSMQ: dump link transmq queue * - TIPC_DUMP_BACKLOGQ: dump link backlog queue * - TIPC_DUMP_DEFERDQ: dump link deferd queue * - TIPC_DUMP_INPUTQ: dump link input queue * - TIPC_DUMP_WAKEUP: dump link wakeup queue * - TIPC_DUMP_ALL: dump all the link queues above * @buf: returned buffer of dump data in format */ int tipc_link_dump(struct tipc_link *l, u16 dqueues, char *buf) { int i = 0; size_t sz = (dqueues) ? LINK_LMAX : LINK_LMIN; struct sk_buff_head *list; struct sk_buff *hskb, *tskb; u32 len; if (!l) { i += scnprintf(buf, sz, "link data: (null)\n"); return i; } i += scnprintf(buf, sz, "link data: %x", l->addr); i += scnprintf(buf + i, sz - i, " %x", l->state); i += scnprintf(buf + i, sz - i, " %u", l->in_session); i += scnprintf(buf + i, sz - i, " %u", l->session); i += scnprintf(buf + i, sz - i, " %u", l->peer_session); i += scnprintf(buf + i, sz - i, " %u", l->snd_nxt); i += scnprintf(buf + i, sz - i, " %u", l->rcv_nxt); i += scnprintf(buf + i, sz - i, " %u", l->snd_nxt_state); i += scnprintf(buf + i, sz - i, " %u", l->rcv_nxt_state); i += scnprintf(buf + i, sz - i, " %x", l->peer_caps); i += scnprintf(buf + i, sz - i, " %u", l->silent_intv_cnt); i += scnprintf(buf + i, sz - i, " %u", l->rst_cnt); i += scnprintf(buf + i, sz - i, " %u", 0); i += scnprintf(buf + i, sz - i, " %u", 0); i += scnprintf(buf + i, sz - i, " %u", l->acked); list = &l->transmq; len = skb_queue_len(list); hskb = skb_peek(list); tskb = skb_peek_tail(list); i += scnprintf(buf + i, sz - i, " | %u %u %u", len, (hskb) ? msg_seqno(buf_msg(hskb)) : 0, (tskb) ? msg_seqno(buf_msg(tskb)) : 0); list = &l->deferdq; len = skb_queue_len(list); hskb = skb_peek(list); tskb = skb_peek_tail(list); i += scnprintf(buf + i, sz - i, " | %u %u %u", len, (hskb) ? msg_seqno(buf_msg(hskb)) : 0, (tskb) ? msg_seqno(buf_msg(tskb)) : 0); list = &l->backlogq; len = skb_queue_len(list); hskb = skb_peek(list); tskb = skb_peek_tail(list); i += scnprintf(buf + i, sz - i, " | %u %u %u", len, (hskb) ? msg_seqno(buf_msg(hskb)) : 0, (tskb) ? msg_seqno(buf_msg(tskb)) : 0); list = l->inputq; len = skb_queue_len(list); hskb = skb_peek(list); tskb = skb_peek_tail(list); i += scnprintf(buf + i, sz - i, " | %u %u %u\n", len, (hskb) ? msg_seqno(buf_msg(hskb)) : 0, (tskb) ? msg_seqno(buf_msg(tskb)) : 0); if (dqueues & TIPC_DUMP_TRANSMQ) { i += scnprintf(buf + i, sz - i, "transmq: "); i += tipc_list_dump(&l->transmq, false, buf + i); } if (dqueues & TIPC_DUMP_BACKLOGQ) { i += scnprintf(buf + i, sz - i, "backlogq: <%u %u %u %u %u>, ", l->backlog[TIPC_LOW_IMPORTANCE].len, l->backlog[TIPC_MEDIUM_IMPORTANCE].len, l->backlog[TIPC_HIGH_IMPORTANCE].len, l->backlog[TIPC_CRITICAL_IMPORTANCE].len, l->backlog[TIPC_SYSTEM_IMPORTANCE].len); i += tipc_list_dump(&l->backlogq, false, buf + i); } if (dqueues & TIPC_DUMP_DEFERDQ) { i += scnprintf(buf + i, sz - i, "deferdq: "); i += tipc_list_dump(&l->deferdq, false, buf + i); } if (dqueues & TIPC_DUMP_INPUTQ) { i += scnprintf(buf + i, sz - i, "inputq: "); i += tipc_list_dump(l->inputq, false, buf + i); } if (dqueues & TIPC_DUMP_WAKEUP) { i += scnprintf(buf + i, sz - i, "wakeup: "); i += tipc_list_dump(&l->wakeupq, false, buf + i); } return i; }
1 1 3 8 8 8 8 8 8 5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2012 by Pablo Neira Ayuso <pablo@netfilter.org> * (C) 2012 by Vyatta Inc. <http://www.vyatta.com> */ #include <linux/types.h> #include <linux/netfilter.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/stddef.h> #include <linux/err.h> #include <linux/percpu.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/export.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_timeout.h> const struct nf_ct_timeout_hooks __rcu *nf_ct_timeout_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_timeout_hook); static int untimeout(struct nf_conn *ct, void *timeout) { struct nf_conn_timeout *timeout_ext = nf_ct_timeout_find(ct); if (timeout_ext) { const struct nf_ct_timeout *t; t = rcu_access_pointer(timeout_ext->timeout); if (!timeout || t == timeout) RCU_INIT_POINTER(timeout_ext->timeout, NULL); } /* We are not intended to delete this conntrack. */ return 0; } void nf_ct_untimeout(struct net *net, struct nf_ct_timeout *timeout) { struct nf_ct_iter_data iter_data = { .net = net, .data = timeout, }; nf_ct_iterate_cleanup_net(untimeout, &iter_data); } EXPORT_SYMBOL_GPL(nf_ct_untimeout); static void __nf_ct_timeout_put(struct nf_ct_timeout *timeout) { const struct nf_ct_timeout_hooks *h = rcu_dereference(nf_ct_timeout_hook); if (h) h->timeout_put(timeout); } int nf_ct_set_timeout(struct net *net, struct nf_conn *ct, u8 l3num, u8 l4num, const char *timeout_name) { const struct nf_ct_timeout_hooks *h; struct nf_ct_timeout *timeout; struct nf_conn_timeout *timeout_ext; const char *errmsg = NULL; int ret = 0; rcu_read_lock(); h = rcu_dereference(nf_ct_timeout_hook); if (!h) { ret = -ENOENT; errmsg = "Timeout policy base is empty"; goto out; } timeout = h->timeout_find_get(net, timeout_name); if (!timeout) { ret = -ENOENT; pr_info_ratelimited("No such timeout policy \"%s\"\n", timeout_name); goto out; } if (timeout->l3num != l3num) { ret = -EINVAL; pr_info_ratelimited("Timeout policy `%s' can only be used by " "L%d protocol number %d\n", timeout_name, 3, timeout->l3num); goto err_put_timeout; } /* Make sure the timeout policy matches any existing protocol tracker, * otherwise default to generic. */ if (timeout->l4proto->l4proto != l4num) { ret = -EINVAL; pr_info_ratelimited("Timeout policy `%s' can only be used by " "L%d protocol number %d\n", timeout_name, 4, timeout->l4proto->l4proto); goto err_put_timeout; } timeout_ext = nf_ct_timeout_ext_add(ct, timeout, GFP_ATOMIC); if (!timeout_ext) { ret = -ENOMEM; goto err_put_timeout; } rcu_read_unlock(); return ret; err_put_timeout: __nf_ct_timeout_put(timeout); out: rcu_read_unlock(); if (errmsg) pr_info_ratelimited("%s\n", errmsg); return ret; } EXPORT_SYMBOL_GPL(nf_ct_set_timeout); void nf_ct_destroy_timeout(struct nf_conn *ct) { struct nf_conn_timeout *timeout_ext; const struct nf_ct_timeout_hooks *h; rcu_read_lock(); h = rcu_dereference(nf_ct_timeout_hook); if (h) { timeout_ext = nf_ct_timeout_find(ct); if (timeout_ext) { struct nf_ct_timeout *t; t = rcu_dereference(timeout_ext->timeout); if (t) h->timeout_put(t); RCU_INIT_POINTER(timeout_ext->timeout, NULL); } } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_ct_destroy_timeout);
18 18 5 5 227 215 18 5 235 235 5 230 207 203 3 1 2 12 219 216 3 206 3 10 5 2 9 234 8 2 3 8 5 243 3 204 13 3 3 243 5 10 5 238 5 242 243 243 242 18 18 219 243 185 11 231 209 220 220 221 220 221 220 208 209 209 29 30 48 220 39 38 5 9 8 6 14 1 11 8 5 11 7 9 1 19 15 15 19 35 34 35 10 2 10 10 2 2 2 10 10 2 10 10 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2011 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/skbuff.h> #include <linux/gfp.h> #include <net/xfrm.h> #include <linux/siphash.h> #include <linux/rtnetlink.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> #include <uapi/linux/netfilter/nf_nat.h> #include "nf_internals.h" #define NF_NAT_MAX_ATTEMPTS 128 #define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4) static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; static DEFINE_MUTEX(nf_nat_proto_mutex); static unsigned int nat_net_id __read_mostly; static struct hlist_head *nf_nat_bysource __read_mostly; static unsigned int nf_nat_htable_size __read_mostly; static siphash_aligned_key_t nf_nat_hash_rnd; struct nf_nat_lookup_hook_priv { struct nf_hook_entries __rcu *entries; struct rcu_head rcu_head; }; struct nf_nat_hooks_net { struct nf_hook_ops *nat_hook_ops; unsigned int users; }; struct nat_net { struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO]; }; #ifdef CONFIG_XFRM static void nf_nat_ipv4_decode_session(struct sk_buff *skb, const struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned long statusbit, struct flowi *fl) { const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; struct flowi4 *fl4 = &fl->u.ip4; if (ct->status & statusbit) { fl4->daddr = t->dst.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_dport = t->dst.u.all; } statusbit ^= IPS_NAT_MASK; if (ct->status & statusbit) { fl4->saddr = t->src.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_sport = t->src.u.all; } } static void nf_nat_ipv6_decode_session(struct sk_buff *skb, const struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned long statusbit, struct flowi *fl) { #if IS_ENABLED(CONFIG_IPV6) const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; struct flowi6 *fl6 = &fl->u.ip6; if (ct->status & statusbit) { fl6->daddr = t->dst.u3.in6; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_dport = t->dst.u.all; } statusbit ^= IPS_NAT_MASK; if (ct->status & statusbit) { fl6->saddr = t->src.u3.in6; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_sport = t->src.u.all; } #endif } static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; unsigned long statusbit; u8 family; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) return; family = nf_ct_l3num(ct); dir = CTINFO2DIR(ctinfo); if (dir == IP_CT_DIR_ORIGINAL) statusbit = IPS_DST_NAT; else statusbit = IPS_SRC_NAT; switch (family) { case NFPROTO_IPV4: nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl); return; case NFPROTO_IPV6: nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl); return; } } #endif /* CONFIG_XFRM */ /* We keep an extra hash for each conntrack, for fast searching. */ static unsigned int hash_by_src(const struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { unsigned int hash; struct { struct nf_conntrack_man src; u32 net_mix; u32 protonum; u32 zone; } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); memset(&combined, 0, sizeof(combined)); /* Original src, to ensure we map it consistently if poss. */ combined.src = tuple->src; combined.net_mix = net_hash_mix(net); combined.protonum = tuple->dst.protonum; /* Zone ID can be used provided its valid for both directions */ if (zone->dir == NF_CT_DEFAULT_ZONE_DIR) combined.zone = zone->id; hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd); return reciprocal_scale(hash, nf_nat_htable_size); } /** * nf_nat_used_tuple - check if proposed nat tuple clashes with existing entry * @tuple: proposed NAT binding * @ignored_conntrack: our (unconfirmed) conntrack entry * * A conntrack entry can be inserted to the connection tracking table * if there is no existing entry with an identical tuple in either direction. * * Example: * INITIATOR -> NAT/PAT -> RESPONDER * * INITIATOR passes through NAT/PAT ("us") and SNAT is done (saddr rewrite). * Then, later, NAT/PAT itself also connects to RESPONDER. * * This will not work if the SNAT done earlier has same IP:PORT source pair. * * Conntrack table has: * ORIGINAL: $IP_INITIATOR:$SPORT -> $IP_RESPONDER:$DPORT * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT * * and new locally originating connection wants: * ORIGINAL: $IP_NAT:$SPORT -> $IP_RESPONDER:$DPORT * REPLY: $IP_RESPONDER:$DPORT -> $IP_NAT:$SPORT * * ... which would mean incoming packets cannot be distinguished between * the existing and the newly added entry (identical IP_CT_DIR_REPLY tuple). * * @return: true if the proposed NAT mapping collides with an existing entry. */ static int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) { /* Conntrack tracking doesn't keep track of outgoing tuples; only * incoming ones. NAT means they don't have a fixed mapping, * so we invert the tuple and look for the incoming reply. * * We could keep a separate hash if this proves too slow. */ struct nf_conntrack_tuple reply; nf_ct_invert_tuple(&reply, tuple); return nf_conntrack_tuple_taken(&reply, ignored_conntrack); } static bool nf_nat_allow_clash(const struct nf_conn *ct) { return nf_ct_l4proto_find(nf_ct_protonum(ct))->allow_clash; } /** * nf_nat_used_tuple_new - check if to-be-inserted conntrack collides with existing entry * @tuple: proposed NAT binding * @ignored_ct: our (unconfirmed) conntrack entry * * Same as nf_nat_used_tuple, but also check for rare clash in reverse * direction. Should be called only when @tuple has not been altered, i.e. * @ignored_conntrack will not be subject to NAT. * * @return: true if the proposed NAT mapping collides with existing entry. */ static noinline bool nf_nat_used_tuple_new(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_ct) { static const unsigned long uses_nat = IPS_NAT_MASK | IPS_SEQ_ADJUST_BIT; const struct nf_conntrack_tuple_hash *thash; const struct nf_conntrack_zone *zone; struct nf_conn *ct; bool taken = true; struct net *net; if (!nf_nat_used_tuple(tuple, ignored_ct)) return false; if (!nf_nat_allow_clash(ignored_ct)) return true; /* Initial choice clashes with existing conntrack. * Check for (rare) reverse collision. * * This can happen when new packets are received in both directions * at the exact same time on different CPUs. * * Without SMP, first packet creates new conntrack entry and second * packet is resolved as established reply packet. * * With parallel processing, both packets could be picked up as * new and both get their own ct entry allocated. * * If ignored_conntrack and colliding ct are not subject to NAT then * pretend the tuple is available and let later clash resolution * handle this at insertion time. * * Without it, the 'reply' packet has its source port rewritten * by nat engine. */ if (READ_ONCE(ignored_ct->status) & uses_nat) return true; net = nf_ct_net(ignored_ct); zone = nf_ct_zone(ignored_ct); thash = nf_conntrack_find_get(net, zone, tuple); if (unlikely(!thash)) /* clashing entry went away */ return false; ct = nf_ct_tuplehash_to_ctrack(thash); /* NB: IP_CT_DIR_ORIGINAL should be impossible because * nf_nat_used_tuple() handles origin collisions. * * Handle remote chance other CPU confirmed its ct right after. */ if (thash->tuple.dst.dir != IP_CT_DIR_REPLY) goto out; /* clashing connection subject to NAT? Retry with new tuple. */ if (READ_ONCE(ct->status) & uses_nat) goto out; if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, &ignored_ct->tuplehash[IP_CT_DIR_REPLY].tuple) && nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, &ignored_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) { taken = false; goto out; } out: nf_ct_put(ct); return taken; } static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags) { static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT | IPS_DYING; static const unsigned long flags_needed = IPS_SRC_NAT; enum tcp_conntrack old_state; old_state = READ_ONCE(ct->proto.tcp.state); if (old_state < TCP_CONNTRACK_TIME_WAIT) return false; if (flags & flags_refuse) return false; return (flags & flags_needed) == flags_needed; } /* reverse direction will send packets to new source, so * make sure such packets are invalid. */ static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new) { return (__s32)(new->proto.tcp.seen[0].td_end - old->proto.tcp.seen[0].td_end) > 0; } static int nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack, unsigned int attempts_left) { static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD; struct nf_conntrack_tuple_hash *thash; const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple reply; unsigned long flags; struct nf_conn *ct; bool taken = true; struct net *net; nf_ct_invert_tuple(&reply, tuple); if (attempts_left > NF_NAT_HARDER_THRESH || tuple->dst.protonum != IPPROTO_TCP || ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT) return nf_conntrack_tuple_taken(&reply, ignored_conntrack); /* :ast few attempts to find a free tcp port. Destructive * action: evict colliding if its in timewait state and the * tcp sequence number has advanced past the one used by the * old entry. */ net = nf_ct_net(ignored_conntrack); zone = nf_ct_zone(ignored_conntrack); thash = nf_conntrack_find_get(net, zone, &reply); if (!thash) return false; ct = nf_ct_tuplehash_to_ctrack(thash); if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL) goto out; if (WARN_ON_ONCE(ct == ignored_conntrack)) goto out; flags = READ_ONCE(ct->status); if (!nf_nat_may_kill(ct, flags)) goto out; if (!nf_seq_has_advanced(ct, ignored_conntrack)) goto out; /* Even if we can evict do not reuse if entry is offloaded. */ if (nf_ct_kill(ct)) taken = flags & flags_offload; out: nf_ct_put(ct); return taken; } static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, const struct nf_nat_range2 *range) { if (t->src.l3num == NFPROTO_IPV4) return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 && ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0; } /* Is the manipable part of the tuple between min and max incl? */ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, const union nf_conntrack_man_proto *min, const union nf_conntrack_man_proto *max) { __be16 port; switch (tuple->dst.protonum) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); case IPPROTO_GRE: /* all fall though */ case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_DCCP: case IPPROTO_SCTP: if (maniptype == NF_NAT_MANIP_SRC) port = tuple->src.u.all; else port = tuple->dst.u.all; return ntohs(port) >= ntohs(min->all) && ntohs(port) <= ntohs(max->all); default: return true; } } /* If we source map this tuple so reply looks like reply_tuple, will * that meet the constraints of range. */ static int nf_in_range(const struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range) { /* If we are supposed to map IPs, then we must be in the * range specified, otherwise let this drag us onto a new src IP. */ if (range->flags & NF_NAT_RANGE_MAP_IPS && !nf_nat_inet_in_range(tuple, range)) return 0; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) return 1; return l4proto_in_range(tuple, NF_NAT_MANIP_SRC, &range->min_proto, &range->max_proto); } static inline int same_src(const struct nf_conn *ct, const struct nf_conntrack_tuple *tuple) { const struct nf_conntrack_tuple *t; t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; return (t->dst.protonum == tuple->dst.protonum && nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) && t->src.u.all == tuple->src.u.all); } /* Only called for SRC manip */ static int find_appropriate_src(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *result, const struct nf_nat_range2 *range) { unsigned int h = hash_by_src(net, zone, tuple); const struct nf_conn *ct; hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { if (same_src(ct, tuple) && net_eq(net, nf_ct_net(ct)) && nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { /* Copy source part from reply tuple. */ nf_ct_invert_tuple(result, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); result->dst = tuple->dst; if (nf_in_range(result, range)) return 1; } } return 0; } /* For [FUTURE] fragmentation handling, we want the least-used * src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports * 1-65535, we don't do pro-rata allocation based on ports; we choose * the ip with the lowest src-ip/dst-ip/proto usage. */ static void find_best_ips_proto(const struct nf_conntrack_zone *zone, struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range, const struct nf_conn *ct, enum nf_nat_manip_type maniptype) { union nf_inet_addr *var_ipp; unsigned int i, max; /* Host order */ u32 minip, maxip, j, dist; bool full_range; /* No IP mapping? Do nothing. */ if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) return; if (maniptype == NF_NAT_MANIP_SRC) var_ipp = &tuple->src.u3; else var_ipp = &tuple->dst.u3; /* Fast path: only one choice. */ if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) { *var_ipp = range->min_addr; return; } if (nf_ct_l3num(ct) == NFPROTO_IPV4) max = sizeof(var_ipp->ip) / sizeof(u32) - 1; else max = sizeof(var_ipp->ip6) / sizeof(u32) - 1; /* Hashing source and destination IPs gives a fairly even * spread in practice (if there are a small number of IPs * involved, there usually aren't that many connections * anyway). The consistency means that servers see the same * client coming from the same IP (some Internet Banking sites * like this), even across reboots. */ j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32), range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id); full_range = false; for (i = 0; i <= max; i++) { /* If first bytes of the address are at the maximum, use the * distance. Otherwise use the full range. */ if (!full_range) { minip = ntohl((__force __be32)range->min_addr.all[i]); maxip = ntohl((__force __be32)range->max_addr.all[i]); dist = maxip - minip + 1; } else { minip = 0; dist = ~0; } var_ipp->all[i] = (__force __u32) htonl(minip + reciprocal_scale(j, dist)); if (var_ipp->all[i] != range->max_addr.all[i]) full_range = true; if (!(range->flags & NF_NAT_RANGE_PERSISTENT)) j ^= (__force u32)tuple->dst.u3.all[i]; } } /* Alter the per-proto part of the tuple (depending on maniptype), to * give a unique tuple in the given range if possible. * * Per-protocol part of tuple is initialized to the incoming packet. */ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { unsigned int range_size, min, max, i, attempts; __be16 *keyptr; u16 off; switch (tuple->dst.protonum) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: /* id is same for either direction... */ keyptr = &tuple->src.u.icmp.id; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { min = 0; range_size = 65536; } else { min = ntohs(range->min_proto.icmp.id); range_size = ntohs(range->max_proto.icmp.id) - ntohs(range->min_proto.icmp.id) + 1; } goto find_free_id; #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) case IPPROTO_GRE: /* If there is no master conntrack we are not PPTP, do not change tuples */ if (!ct->master) return; if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.gre.key; else keyptr = &tuple->dst.u.gre.key; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { min = 1; range_size = 65535; } else { min = ntohs(range->min_proto.gre.key); range_size = ntohs(range->max_proto.gre.key) - min + 1; } goto find_free_id; #endif case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_TCP: case IPPROTO_SCTP: case IPPROTO_DCCP: if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.all; else keyptr = &tuple->dst.u.all; break; default: return; } /* If no range specified... */ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { /* If it's dst rewrite, can't change port */ if (maniptype == NF_NAT_MANIP_DST) return; if (ntohs(*keyptr) < 1024) { /* Loose convention: >> 512 is credential passing */ if (ntohs(*keyptr) < 512) { min = 1; range_size = 511 - min + 1; } else { min = 600; range_size = 1023 - min + 1; } } else { min = 1024; range_size = 65535 - 1024 + 1; } } else { min = ntohs(range->min_proto.all); max = ntohs(range->max_proto.all); if (unlikely(max < min)) swap(max, min); range_size = max - min + 1; } find_free_id: if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) off = (ntohs(*keyptr) - ntohs(range->base_proto.all)); else if ((range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL) || maniptype != NF_NAT_MANIP_DST) off = get_random_u16(); else off = 0; attempts = range_size; if (attempts > NF_NAT_MAX_ATTEMPTS) attempts = NF_NAT_MAX_ATTEMPTS; /* We are in softirq; doing a search of the entire range risks * soft lockup when all tuples are already used. * * If we can't find any free port from first offset, pick a new * one and try again, with ever smaller search window. */ another_round: for (i = 0; i < attempts; i++, off++) { *keyptr = htons(min + off % range_size); if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i)) return; } if (attempts >= range_size || attempts < 16) return; attempts /= 2; off = get_random_u16(); goto another_round; } /* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, * we change the source to map into the range. For NF_INET_PRE_ROUTING * and NF_INET_LOCAL_OUT, we change the destination to map into the * range. It might not be possible to get a unique tuple, but we try. * At worst (or if we race), we will end up with a final duplicate in * __nf_conntrack_confirm and drop the packet. */ static void get_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig_tuple, const struct nf_nat_range2 *range, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { const struct nf_conntrack_zone *zone; struct net *net = nf_ct_net(ct); zone = nf_ct_zone(ct); /* 1) If this srcip/proto/src-proto-part is currently mapped, * and that same mapping gives a unique tuple within the given * range, use that. * * This is only required for source (ie. NAT/masq) mappings. * So far, we don't do local source mappings, so multiple * manips not an issue. */ if (maniptype == NF_NAT_MANIP_SRC && !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { /* try the original tuple first */ if (nf_in_range(orig_tuple, range)) { if (!nf_nat_used_tuple_new(orig_tuple, ct)) { *tuple = *orig_tuple; return; } } else if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { pr_debug("get_unique_tuple: Found current src map\n"); if (!nf_nat_used_tuple(tuple, ct)) return; } } /* 2) Select the least-used IP/proto combination in the given range */ *tuple = *orig_tuple; find_best_ips_proto(zone, tuple, range, ct, maniptype); /* 3) The per-protocol part of the manip is made to map into * the range to make a unique tuple. */ /* Only bother mapping if it's not already in range and unique */ if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && l4proto_in_range(tuple, maniptype, &range->min_proto, &range->max_proto) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) return; } else if (!nf_nat_used_tuple(tuple, ct)) { return; } } /* Last chance: get protocol to try to obtain unique tuple. */ nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct); } struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct) { struct nf_conn_nat *nat = nfct_nat(ct); if (nat) return nat; if (!nf_ct_is_confirmed(ct)) nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); return nat; } EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype) { struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; /* Can't setup nat info for confirmed ct. */ if (nf_ct_is_confirmed(ct)) return NF_ACCEPT; WARN_ON(maniptype != NF_NAT_MANIP_SRC && maniptype != NF_NAT_MANIP_DST); if (WARN_ON(nf_nat_initialized(ct, maniptype))) return NF_DROP; /* What we've got will look like inverse of reply. Normally * this is what is in the conntrack, except for prior * manipulations (future optimization: if num_manips == 0, * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ nf_ct_invert_tuple(&curr_tuple, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { struct nf_conntrack_tuple reply; /* Alter conntrack table so will recognize replies. */ nf_ct_invert_tuple(&reply, &new_tuple); nf_conntrack_alter_reply(ct, &reply); /* Non-atomic: we own this at the moment. */ if (maniptype == NF_NAT_MANIP_SRC) ct->status |= IPS_SRC_NAT; else ct->status |= IPS_DST_NAT; if (nfct_help(ct) && !nfct_seqadj(ct)) if (!nfct_seqadj_ext_add(ct)) return NF_DROP; } if (maniptype == NF_NAT_MANIP_SRC) { unsigned int srchash; spinlock_t *lock; srchash = hash_by_src(net, nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); spin_unlock_bh(lock); } /* It's done. */ if (maniptype == NF_NAT_MANIP_DST) ct->status |= IPS_DST_NAT_DONE; else ct->status |= IPS_SRC_NAT_DONE; return NF_ACCEPT; } EXPORT_SYMBOL(nf_nat_setup_info); static unsigned int __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) { /* Force range to this IP; let proto decide mapping for * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). * Use reply in case it's already been mangled (eg local packet). */ union nf_inet_addr ip = (manip == NF_NAT_MANIP_SRC ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); struct nf_nat_range2 range = { .flags = NF_NAT_RANGE_MAP_IPS, .min_addr = ip, .max_addr = ip, }; return nf_nat_setup_info(ct, &range, manip); } unsigned int nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) { return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum)); } EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding); /* Do packet manipulations according to nf_nat_setup_info. */ unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, struct sk_buff *skb) { enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); unsigned int verdict = NF_ACCEPT; unsigned long statusbit; if (mtype == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply dir. */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; /* Non-atomic: these bits don't change. */ if (ct->status & statusbit) verdict = nf_nat_manip_pkt(skb, ct, mtype, dir); return verdict; } EXPORT_SYMBOL_GPL(nf_nat_packet); static bool in_vrf_postrouting(const struct nf_hook_state *state) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (state->hook == NF_INET_POST_ROUTING && netif_is_l3_master(state->out)) return true; #endif return false; } unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; /* maniptype == SRC for postrouting. */ enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); ct = nf_ct_get(skb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would * have dropped it. Hence it's the user's responsibilty to * packet filter it out, or implement conntrack/NAT for that * protocol. 8) --RR */ if (!ct || in_vrf_postrouting(state)) return NF_ACCEPT; nat = nfct_nat(ct); switch (ctinfo) { case IP_CT_RELATED: case IP_CT_RELATED_REPLY: /* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. */ if (!nf_nat_initialized(ct, maniptype)) { struct nf_nat_lookup_hook_priv *lpriv = priv; struct nf_hook_entries *e = rcu_dereference(lpriv->entries); unsigned int ret; int i; if (!e) goto null_bind; for (i = 0; i < e->num_hook_entries; i++) { ret = e->hooks[i].hook(e->hooks[i].priv, skb, state); if (ret != NF_ACCEPT) return ret; if (nf_nat_initialized(ct, maniptype)) goto do_nat; } null_bind: ret = nf_nat_alloc_null_binding(ct, state->hook); if (ret != NF_ACCEPT) return ret; } else { pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n", maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", ct, ct->status); if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } break; default: /* ESTABLISHED */ WARN_ON(ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY); if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } do_nat: return nf_nat_packet(ct, ctinfo, state->hook, skb); oif_changed: nf_ct_kill_acct(ct, ctinfo, skb); return NF_DROP; } EXPORT_SYMBOL_GPL(nf_nat_inet_fn); struct nf_nat_proto_clean { u8 l3proto; u8 l4proto; }; /* kill conntracks with affected NAT section */ static int nf_nat_proto_remove(struct nf_conn *i, void *data) { const struct nf_nat_proto_clean *clean = data; if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) || (clean->l4proto && nf_ct_protonum(i) != clean->l4proto)) return 0; return i->status & IPS_NAT_MASK ? 1 : 0; } static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { unsigned int h; h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); hlist_del_rcu(&ct->nat_bysource); spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); } static int nf_nat_proto_clean(struct nf_conn *ct, void *data) { if (nf_nat_proto_remove(ct, data)) return 1; /* This module is being removed and conntrack has nat null binding. * Remove it from bysource hash, as the table will be freed soon. * * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() * will delete entry from already-freed table. */ if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status)) nf_nat_cleanup_conntrack(ct); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. */ return 0; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, }; static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_PROTONAT_PORT_MIN]) { range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); range->max_proto.all = range->min_proto.all; range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[CTA_PROTONAT_PORT_MAX]) { range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } return 0; } static int nfnetlink_parse_nat_proto(struct nlattr *attr, const struct nf_conn *ct, struct nf_nat_range2 *range) { struct nlattr *tb[CTA_PROTONAT_MAX+1]; int err; err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy, NULL); if (err < 0) return err; return nf_nat_l4proto_nlattr_to_range(tb, range); } static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { [CTA_NAT_V4_MINIP] = { .type = NLA_U32 }, [CTA_NAT_V4_MAXIP] = { .type = NLA_U32 }, [CTA_NAT_V6_MINIP] = { .len = sizeof(struct in6_addr) }, [CTA_NAT_V6_MAXIP] = { .len = sizeof(struct in6_addr) }, [CTA_NAT_PROTO] = { .type = NLA_NESTED }, }; static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_NAT_V4_MINIP]) { range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); range->flags |= NF_NAT_RANGE_MAP_IPS; } range->max_addr.ip = nla_get_be32_default(tb[CTA_NAT_V4_MAXIP], range->min_addr.ip); return 0; } static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_NAT_V6_MINIP]) { nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], sizeof(struct in6_addr)); range->flags |= NF_NAT_RANGE_MAP_IPS; } if (tb[CTA_NAT_V6_MAXIP]) nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP], sizeof(struct in6_addr)); else range->max_addr = range->min_addr; return 0; } static int nfnetlink_parse_nat(const struct nlattr *nat, const struct nf_conn *ct, struct nf_nat_range2 *range) { struct nlattr *tb[CTA_NAT_MAX+1]; int err; memset(range, 0, sizeof(*range)); err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat, nat_nla_policy, NULL); if (err < 0) return err; switch (nf_ct_l3num(ct)) { case NFPROTO_IPV4: err = nf_nat_ipv4_nlattr_to_range(tb, range); break; case NFPROTO_IPV6: err = nf_nat_ipv6_nlattr_to_range(tb, range); break; default: err = -EPROTONOSUPPORT; break; } if (err) return err; if (!tb[CTA_NAT_PROTO]) return 0; return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); } /* This function is called under rcu_read_lock() */ static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { struct nf_nat_range2 range; int err; /* Should not happen, restricted to creating new conntracks * via ctnetlink. */ if (WARN_ON_ONCE(nf_nat_initialized(ct, manip))) return -EEXIST; /* No NAT information has been passed, allocate the null-binding */ if (attr == NULL) return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0; err = nfnetlink_parse_nat(attr, ct, &range); if (err < 0) return err; return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; } #else static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { return -EOPNOTSUPP; } #endif static struct nf_ct_helper_expectfn follow_master_nat = { .name = "nat-follow-master", .expectfn = nf_nat_follow_master, }; int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count) { struct nat_net *nat_net = net_generic(net, nat_net_id); struct nf_nat_hooks_net *nat_proto_net; struct nf_nat_lookup_hook_priv *priv; unsigned int hooknum = ops->hooknum; struct nf_hook_ops *nat_ops; int i, ret; if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net))) return -EINVAL; nat_proto_net = &nat_net->nat_proto_net[pf]; for (i = 0; i < ops_count; i++) { if (orig_nat_ops[i].hooknum == hooknum) { hooknum = i; break; } } if (WARN_ON_ONCE(i == ops_count)) return -EINVAL; mutex_lock(&nf_nat_proto_mutex); if (!nat_proto_net->nat_hook_ops) { WARN_ON(nat_proto_net->users != 0); nat_ops = kmemdup_array(orig_nat_ops, ops_count, sizeof(*orig_nat_ops), GFP_KERNEL); if (!nat_ops) { mutex_unlock(&nf_nat_proto_mutex); return -ENOMEM; } for (i = 0; i < ops_count; i++) { priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (priv) { nat_ops[i].priv = priv; continue; } mutex_unlock(&nf_nat_proto_mutex); while (i) kfree(nat_ops[--i].priv); kfree(nat_ops); return -ENOMEM; } ret = nf_register_net_hooks(net, nat_ops, ops_count); if (ret < 0) { mutex_unlock(&nf_nat_proto_mutex); for (i = 0; i < ops_count; i++) kfree(nat_ops[i].priv); kfree(nat_ops); return ret; } nat_proto_net->nat_hook_ops = nat_ops; } nat_ops = nat_proto_net->nat_hook_ops; priv = nat_ops[hooknum].priv; if (WARN_ON_ONCE(!priv)) { mutex_unlock(&nf_nat_proto_mutex); return -EOPNOTSUPP; } ret = nf_hook_entries_insert_raw(&priv->entries, ops); if (ret == 0) nat_proto_net->users++; mutex_unlock(&nf_nat_proto_mutex); return ret; } void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, unsigned int ops_count) { struct nat_net *nat_net = net_generic(net, nat_net_id); struct nf_nat_hooks_net *nat_proto_net; struct nf_nat_lookup_hook_priv *priv; struct nf_hook_ops *nat_ops; int hooknum = ops->hooknum; int i; if (pf >= ARRAY_SIZE(nat_net->nat_proto_net)) return; nat_proto_net = &nat_net->nat_proto_net[pf]; mutex_lock(&nf_nat_proto_mutex); if (WARN_ON(nat_proto_net->users == 0)) goto unlock; nat_proto_net->users--; nat_ops = nat_proto_net->nat_hook_ops; for (i = 0; i < ops_count; i++) { if (nat_ops[i].hooknum == hooknum) { hooknum = i; break; } } if (WARN_ON_ONCE(i == ops_count)) goto unlock; priv = nat_ops[hooknum].priv; nf_hook_entries_delete_raw(&priv->entries, ops); if (nat_proto_net->users == 0) { nf_unregister_net_hooks(net, nat_ops, ops_count); for (i = 0; i < ops_count; i++) { priv = nat_ops[i].priv; kfree_rcu(priv, rcu_head); } nat_proto_net->nat_hook_ops = NULL; kfree(nat_ops); } unlock: mutex_unlock(&nf_nat_proto_mutex); } static struct pernet_operations nat_net_ops = { .id = &nat_net_id, .size = sizeof(struct nat_net), }; static const struct nf_nat_hook nat_hook = { .parse_nat_setup = nfnetlink_parse_nat_setup, #ifdef CONFIG_XFRM .decode_session = __nf_nat_decode_session, #endif .remove_nat_bysrc = nf_nat_cleanup_conntrack, }; static int __init nf_nat_init(void) { int ret, i; /* Leave them the same for the moment. */ nf_nat_htable_size = nf_conntrack_htable_size; if (nf_nat_htable_size < CONNTRACK_LOCKS) nf_nat_htable_size = CONNTRACK_LOCKS; nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); if (!nf_nat_bysource) return -ENOMEM; for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_nat_locks[i]); ret = register_pernet_subsys(&nat_net_ops); if (ret < 0) { kvfree(nf_nat_bysource); return ret; } nf_ct_helper_expectfn_register(&follow_master_nat); WARN_ON(nf_nat_hook != NULL); RCU_INIT_POINTER(nf_nat_hook, &nat_hook); ret = register_nf_nat_bpf(); if (ret < 0) { RCU_INIT_POINTER(nf_nat_hook, NULL); nf_ct_helper_expectfn_unregister(&follow_master_nat); synchronize_net(); unregister_pernet_subsys(&nat_net_ops); kvfree(nf_nat_bysource); } return ret; } static void __exit nf_nat_cleanup(void) { struct nf_nat_proto_clean clean = {}; nf_ct_iterate_destroy(nf_nat_proto_clean, &clean); nf_ct_helper_expectfn_unregister(&follow_master_nat); RCU_INIT_POINTER(nf_nat_hook, NULL); synchronize_net(); kvfree(nf_nat_bysource); unregister_pernet_subsys(&nat_net_ops); } MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Network address translation core"); module_init(nf_nat_init); module_exit(nf_nat_cleanup);
39 203 81 88 998 268 40 50 40 50 1121 768 589 2867 686 1121 722 768 589 400 396 3156 13 453 181 2633 482 178 2597 438 606 210 4 2090 2107 3 134 92 39 31 567 4 5 166 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FS_H #define _LINUX_FS_H #include <linux/linkage.h> #include <linux/wait_bit.h> #include <linux/kdev_t.h> #include <linux/dcache.h> #include <linux/path.h> #include <linux/stat.h> #include <linux/cache.h> #include <linux/list.h> #include <linux/list_lru.h> #include <linux/llist.h> #include <linux/radix-tree.h> #include <linux/xarray.h> #include <linux/rbtree.h> #include <linux/init.h> #include <linux/pid.h> #include <linux/bug.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/mm_types.h> #include <linux/capability.h> #include <linux/semaphore.h> #include <linux/fcntl.h> #include <linux/rculist_bl.h> #include <linux/atomic.h> #include <linux/shrinker.h> #include <linux/migrate_mode.h> #include <linux/uidgid.h> #include <linux/lockdep.h> #include <linux/percpu-rwsem.h> #include <linux/workqueue.h> #include <linux/delayed_call.h> #include <linux/uuid.h> #include <linux/errseq.h> #include <linux/ioprio.h> #include <linux/fs_types.h> #include <linux/build_bug.h> #include <linux/stddef.h> #include <linux/mount.h> #include <linux/cred.h> #include <linux/mnt_idmapping.h> #include <linux/slab.h> #include <linux/maple_tree.h> #include <linux/rw_hint.h> #include <linux/file_ref.h> #include <linux/unicode.h> #include <asm/byteorder.h> #include <uapi/linux/fs.h> struct backing_dev_info; struct bdi_writeback; struct bio; struct io_comp_batch; struct export_operations; struct fiemap_extent_info; struct hd_geometry; struct iovec; struct kiocb; struct kobject; struct pipe_inode_info; struct poll_table_struct; struct kstatfs; struct vm_area_struct; struct vfsmount; struct cred; struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; struct fscrypt_inode_info; struct fscrypt_operations; struct fsverity_info; struct fsverity_operations; struct fsnotify_mark_connector; struct fsnotify_sb_info; struct fs_context; struct fs_parameter_spec; struct fileattr; struct iomap_ops; extern void __init inode_init(void); extern void __init inode_init_early(void); extern void __init files_init(void); extern void __init files_maxfiles_init(void); extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; typedef __kernel_rwf_t rwf_t; struct buffer_head; typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private); #define MAY_EXEC 0x00000001 #define MAY_WRITE 0x00000002 #define MAY_READ 0x00000004 #define MAY_APPEND 0x00000008 #define MAY_ACCESS 0x00000010 #define MAY_OPEN 0x00000020 #define MAY_CHDIR 0x00000040 /* called from RCU mode, don't block */ #define MAY_NOT_BLOCK 0x00000080 /* * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond * to O_WRONLY and O_RDWR via the strange trick in do_dentry_open() */ /* file is open for reading */ #define FMODE_READ ((__force fmode_t)(1 << 0)) /* file is open for writing */ #define FMODE_WRITE ((__force fmode_t)(1 << 1)) /* file is seekable */ #define FMODE_LSEEK ((__force fmode_t)(1 << 2)) /* file can be accessed using pread */ #define FMODE_PREAD ((__force fmode_t)(1 << 3)) /* file can be accessed using pwrite */ #define FMODE_PWRITE ((__force fmode_t)(1 << 4)) /* File is opened for execution with sys_execve / sys_uselib */ #define FMODE_EXEC ((__force fmode_t)(1 << 5)) /* File writes are restricted (block device specific) */ #define FMODE_WRITE_RESTRICTED ((__force fmode_t)(1 << 6)) /* File supports atomic writes */ #define FMODE_CAN_ATOMIC_WRITE ((__force fmode_t)(1 << 7)) /* FMODE_* bit 8 */ /* 32bit hashes as llseek() offset (for directories) */ #define FMODE_32BITHASH ((__force fmode_t)(1 << 9)) /* 64bit hashes as llseek() offset (for directories) */ #define FMODE_64BITHASH ((__force fmode_t)(1 << 10)) /* * Don't update ctime and mtime. * * Currently a special hack for the XFS open_by_handle ioctl, but we'll * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. */ #define FMODE_NOCMTIME ((__force fmode_t)(1 << 11)) /* Expect random access pattern */ #define FMODE_RANDOM ((__force fmode_t)(1 << 12)) /* FMODE_* bit 13 */ /* File is opened with O_PATH; almost nothing can be done with it */ #define FMODE_PATH ((__force fmode_t)(1 << 14)) /* File needs atomic accesses to f_pos */ #define FMODE_ATOMIC_POS ((__force fmode_t)(1 << 15)) /* Write access to underlying fs */ #define FMODE_WRITER ((__force fmode_t)(1 << 16)) /* Has read method(s) */ #define FMODE_CAN_READ ((__force fmode_t)(1 << 17)) /* Has write method(s) */ #define FMODE_CAN_WRITE ((__force fmode_t)(1 << 18)) #define FMODE_OPENED ((__force fmode_t)(1 << 19)) #define FMODE_CREATED ((__force fmode_t)(1 << 20)) /* File is stream-like */ #define FMODE_STREAM ((__force fmode_t)(1 << 21)) /* File supports DIRECT IO */ #define FMODE_CAN_ODIRECT ((__force fmode_t)(1 << 22)) #define FMODE_NOREUSE ((__force fmode_t)(1 << 23)) /* File is embedded in backing_file object */ #define FMODE_BACKING ((__force fmode_t)(1 << 24)) /* * Together with FMODE_NONOTIFY_PERM defines which fsnotify events shouldn't be * generated (see below) */ #define FMODE_NONOTIFY ((__force fmode_t)(1 << 25)) /* * Together with FMODE_NONOTIFY defines which fsnotify events shouldn't be * generated (see below) */ #define FMODE_NONOTIFY_PERM ((__force fmode_t)(1 << 26)) /* File is capable of returning -EAGAIN if I/O will block */ #define FMODE_NOWAIT ((__force fmode_t)(1 << 27)) /* File represents mount that needs unmounting */ #define FMODE_NEED_UNMOUNT ((__force fmode_t)(1 << 28)) /* File does not contribute to nr_files count */ #define FMODE_NOACCOUNT ((__force fmode_t)(1 << 29)) /* * The two FMODE_NONOTIFY* define which fsnotify events should not be generated * for a file. These are the possible values of (f->f_mode & * FMODE_FSNOTIFY_MASK) and their meaning: * * FMODE_NONOTIFY - suppress all (incl. non-permission) events. * FMODE_NONOTIFY_PERM - suppress permission (incl. pre-content) events. * FMODE_NONOTIFY | FMODE_NONOTIFY_PERM - suppress only pre-content events. */ #define FMODE_FSNOTIFY_MASK \ (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM) #define FMODE_FSNOTIFY_NONE(mode) \ ((mode & FMODE_FSNOTIFY_MASK) == FMODE_NONOTIFY) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS #define FMODE_FSNOTIFY_PERM(mode) \ ((mode & FMODE_FSNOTIFY_MASK) == 0 || \ (mode & FMODE_FSNOTIFY_MASK) == (FMODE_NONOTIFY | FMODE_NONOTIFY_PERM)) #define FMODE_FSNOTIFY_HSM(mode) \ ((mode & FMODE_FSNOTIFY_MASK) == 0) #else #define FMODE_FSNOTIFY_PERM(mode) 0 #define FMODE_FSNOTIFY_HSM(mode) 0 #endif /* * Attribute flags. These should be or-ed together to figure out what * has been changed! */ #define ATTR_MODE (1 << 0) #define ATTR_UID (1 << 1) #define ATTR_GID (1 << 2) #define ATTR_SIZE (1 << 3) #define ATTR_ATIME (1 << 4) #define ATTR_MTIME (1 << 5) #define ATTR_CTIME (1 << 6) #define ATTR_ATIME_SET (1 << 7) #define ATTR_MTIME_SET (1 << 8) #define ATTR_FORCE (1 << 9) /* Not a change, but a change it */ #define ATTR_KILL_SUID (1 << 11) #define ATTR_KILL_SGID (1 << 12) #define ATTR_FILE (1 << 13) #define ATTR_KILL_PRIV (1 << 14) #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ #define ATTR_TIMES_SET (1 << 16) #define ATTR_TOUCH (1 << 17) #define ATTR_DELEG (1 << 18) /* Delegated attrs. Don't break write delegations */ /* * Whiteout is represented by a char device. The following constants define the * mode and device number to use. */ #define WHITEOUT_MODE 0 #define WHITEOUT_DEV 0 /* * This is the Inode Attributes structure, used for notify_change(). It * uses the above definitions as flags, to know which values have changed. * Also, in this manner, a Filesystem can look at only the values it cares * about. Basically, these are the attributes that the VFS layer can * request to change from the FS layer. * * Derek Atkins <warlord@MIT.EDU> 94-10-20 */ struct iattr { unsigned int ia_valid; umode_t ia_mode; /* * The two anonymous unions wrap structures with the same member. * * Filesystems raising FS_ALLOW_IDMAP need to use ia_vfs{g,u}id which * are a dedicated type requiring the filesystem to use the dedicated * helpers. Other filesystem can continue to use ia_{g,u}id until they * have been ported. * * They always contain the same value. In other words FS_ALLOW_IDMAP * pass down the same value on idmapped mounts as they would on regular * mounts. */ union { kuid_t ia_uid; vfsuid_t ia_vfsuid; }; union { kgid_t ia_gid; vfsgid_t ia_vfsgid; }; loff_t ia_size; struct timespec64 ia_atime; struct timespec64 ia_mtime; struct timespec64 ia_ctime; /* * Not an attribute, but an auxiliary info for filesystems wanting to * implement an ftruncate() like method. NOTE: filesystem should * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL). */ struct file *ia_file; }; /* * Includes for diskquotas. */ #include <linux/quota.h> /* * Maximum number of layers of fs stack. Needs to be limited to * prevent kernel stack overflow */ #define FILESYSTEM_MAX_STACK_DEPTH 2 /** * enum positive_aop_returns - aop return codes with specific semantics * * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has * completed, that the page is still locked, and * should be considered active. The VM uses this hint * to return the page to the active list -- it won't * be a candidate for writeback again in the near * future. Other callers must be careful to unlock * the page if they get this return. Returned by * writepage(); * * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has * unlocked it and the page might have been truncated. * The caller should back up to acquiring a new page and * trying again. The aop will be taking reasonable * precautions not to livelock. If the caller held a page * reference, it should drop it before retrying. Returned * by read_folio(). * * address_space_operation functions return these large constants to indicate * special semantics to the caller. These are much larger than the bytes in a * page to allow for functions that return the number of bytes operated on in a * given page. */ enum positive_aop_returns { AOP_WRITEPAGE_ACTIVATE = 0x80000, AOP_TRUNCATED_PAGE = 0x80001, }; /* * oh the beauties of C type declarations. */ struct page; struct address_space; struct writeback_control; struct readahead_control; /* Match RWF_* bits to IOCB bits */ #define IOCB_HIPRI (__force int) RWF_HIPRI #define IOCB_DSYNC (__force int) RWF_DSYNC #define IOCB_SYNC (__force int) RWF_SYNC #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC #define IOCB_DONTCACHE (__force int) RWF_DONTCACHE /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) #define IOCB_DIRECT (1 << 17) #define IOCB_WRITE (1 << 18) /* iocb->ki_waitq is valid */ #define IOCB_WAITQ (1 << 19) #define IOCB_NOIO (1 << 20) /* can use bio alloc cache */ #define IOCB_ALLOC_CACHE (1 << 21) /* * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the * iocb completion can be passed back to the owner for execution from a safe * context rather than needing to be punted through a workqueue. If this * flag is set, the bio completion handling may set iocb->dio_complete to a * handler function and iocb->private to context information for that handler. * The issuer should call the handler with that context information from task * context to complete the processing of the iocb. Note that while this * provides a task context for the dio_complete() callback, it should only be * used on the completion side for non-IO generating completions. It's fine to * call blocking functions from this callback, but they should not wait for * unrelated IO (like cache flushing, new IO generation, etc). */ #define IOCB_DIO_CALLER_COMP (1 << 22) /* kiocb is a read or write operation submitted by fs/aio.c. */ #define IOCB_AIO_RW (1 << 23) #define IOCB_HAS_METADATA (1 << 24) /* for use in trace events */ #define TRACE_IOCB_STRINGS \ { IOCB_HIPRI, "HIPRI" }, \ { IOCB_DSYNC, "DSYNC" }, \ { IOCB_SYNC, "SYNC" }, \ { IOCB_NOWAIT, "NOWAIT" }, \ { IOCB_APPEND, "APPEND" }, \ { IOCB_ATOMIC, "ATOMIC" }, \ { IOCB_DONTCACHE, "DONTCACHE" }, \ { IOCB_EVENTFD, "EVENTFD"}, \ { IOCB_DIRECT, "DIRECT" }, \ { IOCB_WRITE, "WRITE" }, \ { IOCB_WAITQ, "WAITQ" }, \ { IOCB_NOIO, "NOIO" }, \ { IOCB_ALLOC_CACHE, "ALLOC_CACHE" }, \ { IOCB_DIO_CALLER_COMP, "CALLER_COMP" } struct kiocb { struct file *ki_filp; loff_t ki_pos; void (*ki_complete)(struct kiocb *iocb, long ret); void *private; int ki_flags; u16 ki_ioprio; /* See linux/ioprio.h */ union { /* * Only used for async buffered reads, where it denotes the * page waitqueue associated with completing the read. Valid * IFF IOCB_WAITQ is set. */ struct wait_page_queue *ki_waitq; /* * Can be used for O_DIRECT IO, where the completion handling * is punted back to the issuer of the IO. May only be set * if IOCB_DIO_CALLER_COMP is set by the issuer, and the issuer * must then check for presence of this handler when ki_complete * is invoked. The data passed in to this handler must be * assigned to ->private when dio_complete is assigned. */ ssize_t (*dio_complete)(void *data); }; }; static inline bool is_sync_kiocb(struct kiocb *kiocb) { return kiocb->ki_complete == NULL; } struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); int (*read_folio)(struct file *, struct folio *); /* Write back some dirty pages from this mapping. */ int (*writepages)(struct address_space *, struct writeback_control *); /* Mark a folio dirty. Return true if this dirtied it */ bool (*dirty_folio)(struct address_space *, struct folio *); void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata); /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); void (*invalidate_folio) (struct folio *, size_t offset, size_t len); bool (*release_folio)(struct folio *, gfp_t); void (*free_folio)(struct folio *folio); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* * migrate the contents of a folio to the specified target. If * migrate_mode is MIGRATE_ASYNC, it must not block. */ int (*migrate_folio)(struct address_space *, struct folio *dst, struct folio *src, enum migrate_mode); int (*launder_folio)(struct folio *); bool (*is_partially_uptodate) (struct folio *, size_t from, size_t count); void (*is_dirty_writeback) (struct folio *, bool *dirty, bool *wb); int (*error_remove_folio)(struct address_space *, struct folio *); /* swapfile support */ int (*swap_activate)(struct swap_info_struct *sis, struct file *file, sector_t *span); void (*swap_deactivate)(struct file *file); int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; extern const struct address_space_operations empty_aops; /** * struct address_space - Contents of a cacheable, mappable object. * @host: Owner, either the inode or the block_device. * @i_pages: Cached pages. * @invalidate_lock: Guards coherency between page cache contents and * file offset->disk block mappings in the filesystem during invalidates. * It is also used to block modification of page cache contents through * memory mappings. * @gfp_mask: Memory allocation flags to use for allocating pages. * @i_mmap_writable: Number of VM_SHARED, VM_MAYWRITE mappings. * @nr_thps: Number of THPs in the pagecache (non-shmem only). * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. * @nrpages: Number of page entries, protected by the i_pages lock. * @writeback_index: Writeback starts here. * @a_ops: Methods. * @flags: Error bits and flags (AS_*). * @wb_err: The most recent error which has occurred. * @i_private_lock: For use by the owner of the address_space. * @i_private_list: For use by the owner of the address_space. * @i_private_data: For use by the owner of the address_space. */ struct address_space { struct inode *host; struct xarray i_pages; struct rw_semaphore invalidate_lock; gfp_t gfp_mask; atomic_t i_mmap_writable; #ifdef CONFIG_READ_ONLY_THP_FOR_FS /* number of thp, only for non-shmem files */ atomic_t nr_thps; #endif struct rb_root_cached i_mmap; unsigned long nrpages; pgoff_t writeback_index; const struct address_space_operations *a_ops; unsigned long flags; errseq_t wb_err; spinlock_t i_private_lock; struct list_head i_private_list; struct rw_semaphore i_mmap_rwsem; void * i_private_data; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but * must be enforced here for CRIS, to let the least significant bit * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. */ /* XArray tags, for tagging dirty and writeback pages in the pagecache. */ #define PAGECACHE_TAG_DIRTY XA_MARK_0 #define PAGECACHE_TAG_WRITEBACK XA_MARK_1 #define PAGECACHE_TAG_TOWRITE XA_MARK_2 /* * Returns true if any of the pages in the mapping are marked with the tag. */ static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag) { return xa_marked(&mapping->i_pages, tag); } static inline void i_mmap_lock_write(struct address_space *mapping) { down_write(&mapping->i_mmap_rwsem); } static inline int i_mmap_trylock_write(struct address_space *mapping) { return down_write_trylock(&mapping->i_mmap_rwsem); } static inline void i_mmap_unlock_write(struct address_space *mapping) { up_write(&mapping->i_mmap_rwsem); } static inline int i_mmap_trylock_read(struct address_space *mapping) { return down_read_trylock(&mapping->i_mmap_rwsem); } static inline void i_mmap_lock_read(struct address_space *mapping) { down_read(&mapping->i_mmap_rwsem); } static inline void i_mmap_unlock_read(struct address_space *mapping) { up_read(&mapping->i_mmap_rwsem); } static inline void i_mmap_assert_locked(struct address_space *mapping) { lockdep_assert_held(&mapping->i_mmap_rwsem); } static inline void i_mmap_assert_write_locked(struct address_space *mapping) { lockdep_assert_held_write(&mapping->i_mmap_rwsem); } /* * Might pages of this file be mapped into userspace? */ static inline int mapping_mapped(struct address_space *mapping) { return !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root); } /* * Might pages of this file have been modified in userspace? * Note that i_mmap_writable counts all VM_SHARED, VM_MAYWRITE vmas: do_mmap * marks vma as VM_SHARED if it is shared, and the file was opened for * writing i.e. vma may be mprotected writable even if now readonly. * * If i_mmap_writable is negative, no new writable mappings are allowed. You * can only deny writable mappings, if none exists right now. */ static inline int mapping_writably_mapped(struct address_space *mapping) { return atomic_read(&mapping->i_mmap_writable) > 0; } static inline int mapping_map_writable(struct address_space *mapping) { return atomic_inc_unless_negative(&mapping->i_mmap_writable) ? 0 : -EPERM; } static inline void mapping_unmap_writable(struct address_space *mapping) { atomic_dec(&mapping->i_mmap_writable); } static inline int mapping_deny_writable(struct address_space *mapping) { return atomic_dec_unless_positive(&mapping->i_mmap_writable) ? 0 : -EBUSY; } static inline void mapping_allow_writable(struct address_space *mapping) { atomic_inc(&mapping->i_mmap_writable); } /* * Use sequence counter to get consistent i_size on 32-bit processors. */ #if BITS_PER_LONG==32 && defined(CONFIG_SMP) #include <linux/seqlock.h> #define __NEED_I_SIZE_ORDERED #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) #else #define i_size_ordered_init(inode) do { } while (0) #endif struct posix_acl; #define ACL_NOT_CACHED ((void *)(-1)) /* * ACL_DONT_CACHE is for stacked filesystems, that rely on underlying fs to * cache the ACL. This also means that ->get_inode_acl() can be called in RCU * mode with the LOOKUP_RCU flag. */ #define ACL_DONT_CACHE ((void *)(-3)) static inline struct posix_acl * uncached_acl_sentinel(struct task_struct *task) { return (void *)task + 1; } static inline bool is_uncached_acl(struct posix_acl *acl) { return (long)acl & 1; } #define IOP_FASTPERM 0x0001 #define IOP_LOOKUP 0x0002 #define IOP_NOFOLLOW 0x0004 #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 #define IOP_MGTIME 0x0020 #define IOP_CACHED_LINK 0x0040 /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning * of the 'struct inode' */ struct inode { umode_t i_mode; unsigned short i_opflags; kuid_t i_uid; kgid_t i_gid; unsigned int i_flags; #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif const struct inode_operations *i_op; struct super_block *i_sb; struct address_space *i_mapping; #ifdef CONFIG_SECURITY void *i_security; #endif /* Stat data, not accessed from path walking */ unsigned long i_ino; /* * Filesystems may only read i_nlink directly. They shall use the * following functions for modification: * * (set|clear|inc|drop)_nlink * inode_(inc|dec)_link_count */ union { const unsigned int i_nlink; unsigned int __i_nlink; }; dev_t i_rdev; loff_t i_size; time64_t i_atime_sec; time64_t i_mtime_sec; time64_t i_ctime_sec; u32 i_atime_nsec; u32 i_mtime_nsec; u32 i_ctime_nsec; u32 i_generation; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; u8 i_blkbits; enum rw_hint i_write_hint; blkcnt_t i_blocks; #ifdef __NEED_I_SIZE_ORDERED seqcount_t i_size_seqcount; #endif /* Misc */ u32 i_state; /* 32-bit hole */ struct rw_semaphore i_rwsem; unsigned long dirtied_when; /* jiffies of first dirtying */ unsigned long dirtied_time_when; struct hlist_node i_hash; struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ /* foreign inode detection, see wbc_detach_inode() */ int i_wb_frn_winner; u16 i_wb_frn_avg_time; u16 i_wb_frn_history; #endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; struct list_head i_wb_list; /* backing dev writeback list */ union { struct hlist_head i_dentry; struct rcu_head i_rcu; }; atomic64_t i_version; atomic64_t i_sequence; /* see futex */ atomic_t i_count; atomic_t i_dio_count; atomic_t i_writecount; #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING) atomic_t i_readcount; /* struct files open RO */ #endif union { const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ void (*free_inode)(struct inode *); }; struct file_lock_context *i_flctx; struct address_space i_data; union { struct list_head i_devices; int i_linklen; }; union { struct pipe_inode_info *i_pipe; struct cdev *i_cdev; char *i_link; unsigned i_dir_seq; }; #ifdef CONFIG_FSNOTIFY __u32 i_fsnotify_mask; /* all events this inode cares about */ /* 32-bit hole reserved for expanding i_fsnotify_mask */ struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_inode_info *i_crypt_info; #endif #ifdef CONFIG_FS_VERITY struct fsverity_info *i_verity_info; #endif void *i_private; /* fs or device private pointer */ } __randomize_layout; static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) { int testlen; /* * TODO: patch it into a debug-only check if relevant macros show up. * In the meantime, since we are suffering strlen even on production kernels * to find the right length, do a fixup if the wrong value got passed. */ testlen = strlen(link); if (testlen != linklen) { WARN_ONCE(1, "bad length passed for symlink [%s] (got %d, expected %d)", link, linklen, testlen); linklen = testlen; } inode->i_link = link; inode->i_linklen = linklen; inode->i_opflags |= IOP_CACHED_LINK; } /* * Get bit address from inode->i_state to use with wait_var_event() * infrastructre. */ #define inode_state_wait_address(inode, bit) ((char *)&(inode)->i_state + (bit)) struct wait_queue_head *inode_bit_waitqueue(struct wait_bit_queue_entry *wqe, struct inode *inode, u32 bit); static inline void inode_wake_up_bit(struct inode *inode, u32 bit) { /* Caller is responsible for correct memory barriers. */ wake_up_var(inode_state_wait_address(inode, bit)); } struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode); static inline unsigned int i_blocksize(const struct inode *node) { return (1 << node->i_blkbits); } static inline int inode_unhashed(struct inode *inode) { return hlist_unhashed(&inode->i_hash); } /* * __mark_inode_dirty expects inodes to be hashed. Since we don't * want special inodes in the fileset inode space, we make them * appear hashed, but do not put on any lists. hlist_del() * will work fine and require no locking. */ static inline void inode_fake_hash(struct inode *inode) { hlist_add_fake(&inode->i_hash); } /* * inode->i_mutex nesting subclasses for the lock validator: * * 0: the object of the current VFS operation * 1: parent * 2: child/target * 3: xattr * 4: second non-directory * 5: second parent (when locking independent directories in rename) * * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two * non-directories at once. * * The locking order between these classes is * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory */ enum inode_i_mutex_lock_class { I_MUTEX_NORMAL, I_MUTEX_PARENT, I_MUTEX_CHILD, I_MUTEX_XATTR, I_MUTEX_NONDIR2, I_MUTEX_PARENT2, }; static inline void inode_lock(struct inode *inode) { down_write(&inode->i_rwsem); } static inline void inode_unlock(struct inode *inode) { up_write(&inode->i_rwsem); } static inline void inode_lock_shared(struct inode *inode) { down_read(&inode->i_rwsem); } static inline void inode_unlock_shared(struct inode *inode) { up_read(&inode->i_rwsem); } static inline int inode_trylock(struct inode *inode) { return down_write_trylock(&inode->i_rwsem); } static inline int inode_trylock_shared(struct inode *inode) { return down_read_trylock(&inode->i_rwsem); } static inline int inode_is_locked(struct inode *inode) { return rwsem_is_locked(&inode->i_rwsem); } static inline void inode_lock_nested(struct inode *inode, unsigned subclass) { down_write_nested(&inode->i_rwsem, subclass); } static inline void inode_lock_shared_nested(struct inode *inode, unsigned subclass) { down_read_nested(&inode->i_rwsem, subclass); } static inline void filemap_invalidate_lock(struct address_space *mapping) { down_write(&mapping->invalidate_lock); } static inline void filemap_invalidate_unlock(struct address_space *mapping) { up_write(&mapping->invalidate_lock); } static inline void filemap_invalidate_lock_shared(struct address_space *mapping) { down_read(&mapping->invalidate_lock); } static inline int filemap_invalidate_trylock_shared( struct address_space *mapping) { return down_read_trylock(&mapping->invalidate_lock); } static inline void filemap_invalidate_unlock_shared( struct address_space *mapping) { up_read(&mapping->invalidate_lock); } void lock_two_nondirectories(struct inode *, struct inode*); void unlock_two_nondirectories(struct inode *, struct inode*); void filemap_invalidate_lock_two(struct address_space *mapping1, struct address_space *mapping2); void filemap_invalidate_unlock_two(struct address_space *mapping1, struct address_space *mapping2); /* * NOTE: in a 32bit arch with a preemptable kernel and * an UP compile the i_size_read/write must be atomic * with respect to the local cpu (unlike with preempt disabled), * but they don't need to be atomic with respect to other cpus like in * true SMP (so they need either to either locally disable irq around * the read or for example on x86 they can be still implemented as a * cmpxchg8b without the need of the lock prefix). For SMP compiles * and 64bit archs it makes no difference if preempt is enabled or not. */ static inline loff_t i_size_read(const struct inode *inode) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) loff_t i_size; unsigned int seq; do { seq = read_seqcount_begin(&inode->i_size_seqcount); i_size = inode->i_size; } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); return i_size; #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) loff_t i_size; preempt_disable(); i_size = inode->i_size; preempt_enable(); return i_size; #else /* Pairs with smp_store_release() in i_size_write() */ return smp_load_acquire(&inode->i_size); #endif } /* * NOTE: unlike i_size_read(), i_size_write() does need locking around it * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount * can be lost, resulting in subsequent i_size_read() calls spinning forever. */ static inline void i_size_write(struct inode *inode, loff_t i_size) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) preempt_disable(); write_seqcount_begin(&inode->i_size_seqcount); inode->i_size = i_size; write_seqcount_end(&inode->i_size_seqcount); preempt_enable(); #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) preempt_disable(); inode->i_size = i_size; preempt_enable(); #else /* * Pairs with smp_load_acquire() in i_size_read() to ensure * changes related to inode size (such as page contents) are * visible before we see the changed inode size. */ smp_store_release(&inode->i_size, i_size); #endif } static inline unsigned iminor(const struct inode *inode) { return MINOR(inode->i_rdev); } static inline unsigned imajor(const struct inode *inode) { return MAJOR(inode->i_rdev); } struct fown_struct { struct file *file; /* backpointer for security modules */ rwlock_t lock; /* protects pid, uid, euid fields */ struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ kuid_t uid, euid; /* uid/euid of process setting the owner */ int signum; /* posix.1b rt signal to be delivered on IO */ }; /** * struct file_ra_state - Track a file's readahead state. * @start: Where the most recent readahead started. * @size: Number of pages read in the most recent readahead. * @async_size: Numer of pages that were/are not needed immediately * and so were/are genuinely "ahead". Start next readahead when * the first of these pages is accessed. * @ra_pages: Maximum size of a readahead request, copied from the bdi. * @mmap_miss: How many mmap accesses missed in the page cache. * @prev_pos: The last byte in the most recent read request. * * When this structure is passed to ->readahead(), the "most recent" * readahead means the current readahead. */ struct file_ra_state { pgoff_t start; unsigned int size; unsigned int async_size; unsigned int ra_pages; unsigned int mmap_miss; loff_t prev_pos; }; /* * Check if @index falls in the readahead windows. */ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) { return (index >= ra->start && index < ra->start + ra->size); } /** * struct file - Represents a file * @f_ref: reference count * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context. * @f_mode: FMODE_* flags often used in hotpaths * @f_op: file operations * @f_mapping: Contents of a cacheable, mappable object. * @private_data: filesystem or driver specific data * @f_inode: cached inode * @f_flags: file flags * @f_iocb_flags: iocb flags * @f_cred: stashed credentials of creator/opener * @f_path: path of the file * @f_pos_lock: lock protecting file position * @f_pipe: specific to pipes * @f_pos: file position * @f_security: LSM security context of this file * @f_owner: file owner * @f_wb_err: writeback error * @f_sb_err: per sb writeback errors * @f_ep: link of all epoll hooks for this file * @f_task_work: task work entry point * @f_llist: work queue entrypoint * @f_ra: file's readahead state * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.) */ struct file { file_ref_t f_ref; spinlock_t f_lock; fmode_t f_mode; const struct file_operations *f_op; struct address_space *f_mapping; void *private_data; struct inode *f_inode; unsigned int f_flags; unsigned int f_iocb_flags; const struct cred *f_cred; /* --- cacheline 1 boundary (64 bytes) --- */ struct path f_path; union { /* regular files (with FMODE_ATOMIC_POS) and directories */ struct mutex f_pos_lock; /* pipes */ u64 f_pipe; }; loff_t f_pos; #ifdef CONFIG_SECURITY void *f_security; #endif /* --- cacheline 2 boundary (128 bytes) --- */ struct fown_struct *f_owner; errseq_t f_wb_err; errseq_t f_sb_err; #ifdef CONFIG_EPOLL struct hlist_head *f_ep; #endif union { struct callback_head f_task_work; struct llist_node f_llist; struct file_ra_state f_ra; freeptr_t f_freeptr; }; /* --- cacheline 3 boundary (192 bytes) --- */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ struct file_handle { __u32 handle_bytes; int handle_type; /* file identifier */ unsigned char f_handle[] __counted_by(handle_bytes); }; static inline struct file *get_file(struct file *f) { file_ref_inc(&f->f_ref); return f; } struct file *get_file_rcu(struct file __rcu **f); struct file *get_file_active(struct file **f); #define file_count(f) file_ref_read(&(f)->f_ref) #define MAX_NON_LFS ((1UL<<31) - 1) /* Page cache limit. The filesystems should put that into their s_maxbytes limits, otherwise bad things can happen in VM. */ #if BITS_PER_LONG==32 #define MAX_LFS_FILESIZE ((loff_t)ULONG_MAX << PAGE_SHIFT) #elif BITS_PER_LONG==64 #define MAX_LFS_FILESIZE ((loff_t)LLONG_MAX) #endif /* legacy typedef, should eventually be removed */ typedef void *fl_owner_t; struct file_lock; struct file_lease; /* The following constant reflects the upper bound of the file/locking space */ #ifndef OFFSET_MAX #define OFFSET_MAX type_max(loff_t) #define OFFT_OFFSET_MAX type_max(off_t) #endif int file_f_owner_allocate(struct file *file); static inline struct fown_struct *file_f_owner(const struct file *file) { return READ_ONCE(file->f_owner); } extern void send_sigio(struct fown_struct *fown, int fd, int band); static inline struct inode *file_inode(const struct file *f) { return f->f_inode; } /* * file_dentry() is a relic from the days that overlayfs was using files with a * "fake" path, meaning, f_path on overlayfs and f_inode on underlying fs. * In those days, file_dentry() was needed to get the underlying fs dentry that * matches f_inode. * Files with "fake" path should not exist nowadays, so use an assertion to make * sure that file_dentry() was not papering over filesystem bugs. */ static inline struct dentry *file_dentry(const struct file *file) { struct dentry *dentry = file->f_path.dentry; WARN_ON_ONCE(d_inode(dentry) != file_inode(file)); return dentry; } struct fasync_struct { rwlock_t fa_lock; int magic; int fa_fd; struct fasync_struct *fa_next; /* singly linked list */ struct file *fa_file; struct rcu_head fa_rcu; }; #define FASYNC_MAGIC 0x4601 /* SMP safe fasync helpers: */ extern int fasync_helper(int, struct file *, int, struct fasync_struct **); extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *); extern int fasync_remove_entry(struct file *, struct fasync_struct **); extern struct fasync_struct *fasync_alloc(void); extern void fasync_free(struct fasync_struct *); /* can be called from interrupts */ extern void kill_fasync(struct fasync_struct **, int, int); extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force); extern int f_setown(struct file *filp, int who, int force); extern void f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); extern int send_sigurg(struct file *file); /* * sb->s_flags. Note that these mirror the equivalent MS_* flags where * represented in both. */ #define SB_RDONLY BIT(0) /* Mount read-only */ #define SB_NOSUID BIT(1) /* Ignore suid and sgid bits */ #define SB_NODEV BIT(2) /* Disallow access to device special files */ #define SB_NOEXEC BIT(3) /* Disallow program execution */ #define SB_SYNCHRONOUS BIT(4) /* Writes are synced at once */ #define SB_MANDLOCK BIT(6) /* Allow mandatory locks on an FS */ #define SB_DIRSYNC BIT(7) /* Directory modifications are synchronous */ #define SB_NOATIME BIT(10) /* Do not update access times. */ #define SB_NODIRATIME BIT(11) /* Do not update directory access times */ #define SB_SILENT BIT(15) #define SB_POSIXACL BIT(16) /* Supports POSIX ACLs */ #define SB_INLINECRYPT BIT(17) /* Use blk-crypto for encrypted files */ #define SB_KERNMOUNT BIT(22) /* this is a kern_mount call */ #define SB_I_VERSION BIT(23) /* Update inode I_version field */ #define SB_LAZYTIME BIT(25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ #define SB_DEAD BIT(21) #define SB_DYING BIT(24) #define SB_SUBMOUNT BIT(26) #define SB_FORCE BIT(27) #define SB_NOSEC BIT(28) #define SB_BORN BIT(29) #define SB_ACTIVE BIT(30) #define SB_NOUSER BIT(31) /* These flags relate to encoding and casefolding */ #define SB_ENC_STRICT_MODE_FL (1 << 0) #define sb_has_strict_encoding(sb) \ (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL) /* * Umount options */ #define MNT_FORCE 0x00000001 /* Attempt to forcibily umount */ #define MNT_DETACH 0x00000002 /* Just detach from the tree */ #define MNT_EXPIRE 0x00000004 /* Mark for expiry */ #define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ #define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ /* sb->s_iflags */ #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ #define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ #define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ #define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ /* sb->s_iflags to limit user namespace mounts */ #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 #define SB_I_EVM_HMAC_UNSUPPORTED 0x00000080 #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ #define SB_I_PERSB_BDI 0x00000200 /* has a per-sb bdi */ #define SB_I_TS_EXPIRY_WARNED 0x00000400 /* warned about timestamp range expiry */ #define SB_I_RETIRED 0x00000800 /* superblock shouldn't be reused */ #define SB_I_NOUMASK 0x00001000 /* VFS does not apply umask */ #define SB_I_NOIDMAP 0x00002000 /* No idmapped mounts on this superblock */ #define SB_I_ALLOW_HSM 0x00004000 /* Allow HSM events on this superblock */ /* Possible states of 'frozen' field */ enum { SB_UNFROZEN = 0, /* FS is unfrozen */ SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop * internal threads if needed) */ SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ }; #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) struct sb_writers { unsigned short frozen; /* Is sb frozen? */ int freeze_kcount; /* How many kernel freeze requests? */ int freeze_ucount; /* How many userspace freeze requests? */ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; struct super_block { struct list_head s_list; /* Keep this first */ dev_t s_dev; /* search index; _not_ kdev_t */ unsigned char s_blocksize_bits; unsigned long s_blocksize; loff_t s_maxbytes; /* Max file size */ struct file_system_type *s_type; const struct super_operations *s_op; const struct dquot_operations *dq_op; const struct quotactl_ops *s_qcop; const struct export_operations *s_export_op; unsigned long s_flags; unsigned long s_iflags; /* internal SB_I_* flags */ unsigned long s_magic; struct dentry *s_root; struct rw_semaphore s_umount; int s_count; atomic_t s_active; #ifdef CONFIG_SECURITY void *s_security; #endif const struct xattr_handler * const *s_xattr; #ifdef CONFIG_FS_ENCRYPTION const struct fscrypt_operations *s_cop; struct fscrypt_keyring *s_master_keys; /* master crypto keys in use */ #endif #ifdef CONFIG_FS_VERITY const struct fsverity_operations *s_vop; #endif #if IS_ENABLED(CONFIG_UNICODE) struct unicode_map *s_encoding; __u16 s_encoding_flags; #endif struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ struct file *s_bdev_file; struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; struct hlist_node s_instances; unsigned int s_quota_types; /* Bitmask of supported quota types */ struct quota_info s_dquot; /* Diskquota specific options */ struct sb_writers s_writers; /* * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and * s_fsnotify_info together for cache efficiency. They are frequently * accessed and rarely modified. */ void *s_fs_info; /* Filesystem private info */ /* Granularity of c/m/atime in ns (cannot be worse than a second) */ u32 s_time_gran; /* Time limits for c/m/atime in seconds */ time64_t s_time_min; time64_t s_time_max; #ifdef CONFIG_FSNOTIFY u32 s_fsnotify_mask; struct fsnotify_sb_info *s_fsnotify_info; #endif /* * q: why are s_id and s_sysfs_name not the same? both are human * readable strings that identify the filesystem * a: s_id is allowed to change at runtime; it's used in log messages, * and we want to when a device starts out as single device (s_id is dev * name) but then a device is hot added and we have to switch to * identifying it by UUID * but s_sysfs_name is a handle for programmatic access, and can't * change at runtime */ char s_id[32]; /* Informational name */ uuid_t s_uuid; /* UUID */ u8 s_uuid_len; /* Default 16, possibly smaller for weird filesystems */ /* if set, fs shows up under sysfs at /sys/fs/$FSTYP/s_sysfs_name */ char s_sysfs_name[UUID_STRING_LEN + 1]; unsigned int s_max_links; /* * The next field is for VFS *only*. No filesystems have any business * even looking at it. You had been warned. */ struct mutex s_vfs_rename_mutex; /* Kludge */ /* * Filesystem subtype. If non-empty the filesystem type field * in /proc/mounts will be "type.subtype" */ const char *s_subtype; const struct dentry_operations *s_d_op; /* default d_op for dentries */ struct shrinker *s_shrink; /* per-sb shrinker handle */ /* Number of inodes with nlink == 0 but still referenced */ atomic_long_t s_remove_count; /* Read-only state of the superblock is being changed */ int s_readonly_remount; /* per-sb errseq_t for reporting writeback errors via syncfs */ errseq_t s_wb_err; /* AIO completions deferred from interrupt context */ struct workqueue_struct *s_dio_done_wq; struct hlist_head s_pins; /* * Owning user namespace and default context in which to * interpret filesystem uids, gids, quotas, device nodes, * xattrs and security labels. */ struct user_namespace *s_user_ns; /* * The list_lru structure is essentially just a pointer to a table * of per-node lru lists, each of which has its own spinlock. * There is no need to put them into separate cachelines. */ struct list_lru s_dentry_lru; struct list_lru s_inode_lru; struct rcu_head rcu; struct work_struct destroy_work; struct mutex s_sync_lock; /* sync serialisation lock */ /* * Indicates how deep in a filesystem stack this SB is */ int s_stack_depth; /* s_inode_list_lock protects s_inodes */ spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; struct list_head s_inodes; /* all inodes */ spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ } __randomize_layout; static inline struct user_namespace *i_user_ns(const struct inode *inode) { return inode->i_sb->s_user_ns; } /* Helper functions so that in most cases filesystems will * not need to deal directly with kuid_t and kgid_t and can * instead deal with the raw numeric values that are stored * in the filesystem. */ static inline uid_t i_uid_read(const struct inode *inode) { return from_kuid(i_user_ns(inode), inode->i_uid); } static inline gid_t i_gid_read(const struct inode *inode) { return from_kgid(i_user_ns(inode), inode->i_gid); } static inline void i_uid_write(struct inode *inode, uid_t uid) { inode->i_uid = make_kuid(i_user_ns(inode), uid); } static inline void i_gid_write(struct inode *inode, gid_t gid) { inode->i_gid = make_kgid(i_user_ns(inode), gid); } /** * i_uid_into_vfsuid - map an inode's i_uid down according to an idmapping * @idmap: idmap of the mount the inode was found from * @inode: inode to map * * Return: whe inode's i_uid mapped down according to @idmap. * If the inode's i_uid has no mapping INVALID_VFSUID is returned. */ static inline vfsuid_t i_uid_into_vfsuid(struct mnt_idmap *idmap, const struct inode *inode) { return make_vfsuid(idmap, i_user_ns(inode), inode->i_uid); } /** * i_uid_needs_update - check whether inode's i_uid needs to be updated * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Check whether the $inode's i_uid field needs to be updated taking idmapped * mounts into account if the filesystem supports it. * * Return: true if @inode's i_uid field needs to be updated, false if not. */ static inline bool i_uid_needs_update(struct mnt_idmap *idmap, const struct iattr *attr, const struct inode *inode) { return ((attr->ia_valid & ATTR_UID) && !vfsuid_eq(attr->ia_vfsuid, i_uid_into_vfsuid(idmap, inode))); } /** * i_uid_update - update @inode's i_uid field * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Safely update @inode's i_uid field translating the vfsuid of any idmapped * mount into the filesystem kuid. */ static inline void i_uid_update(struct mnt_idmap *idmap, const struct iattr *attr, struct inode *inode) { if (attr->ia_valid & ATTR_UID) inode->i_uid = from_vfsuid(idmap, i_user_ns(inode), attr->ia_vfsuid); } /** * i_gid_into_vfsgid - map an inode's i_gid down according to an idmapping * @idmap: idmap of the mount the inode was found from * @inode: inode to map * * Return: the inode's i_gid mapped down according to @idmap. * If the inode's i_gid has no mapping INVALID_VFSGID is returned. */ static inline vfsgid_t i_gid_into_vfsgid(struct mnt_idmap *idmap, const struct inode *inode) { return make_vfsgid(idmap, i_user_ns(inode), inode->i_gid); } /** * i_gid_needs_update - check whether inode's i_gid needs to be updated * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Check whether the $inode's i_gid field needs to be updated taking idmapped * mounts into account if the filesystem supports it. * * Return: true if @inode's i_gid field needs to be updated, false if not. */ static inline bool i_gid_needs_update(struct mnt_idmap *idmap, const struct iattr *attr, const struct inode *inode) { return ((attr->ia_valid & ATTR_GID) && !vfsgid_eq(attr->ia_vfsgid, i_gid_into_vfsgid(idmap, inode))); } /** * i_gid_update - update @inode's i_gid field * @idmap: idmap of the mount the inode was found from * @attr: the new attributes of @inode * @inode: the inode to update * * Safely update @inode's i_gid field translating the vfsgid of any idmapped * mount into the filesystem kgid. */ static inline void i_gid_update(struct mnt_idmap *idmap, const struct iattr *attr, struct inode *inode) { if (attr->ia_valid & ATTR_GID) inode->i_gid = from_vfsgid(idmap, i_user_ns(inode), attr->ia_vfsgid); } /** * inode_fsuid_set - initialize inode's i_uid field with callers fsuid * @inode: inode to initialize * @idmap: idmap of the mount the inode was found from * * Initialize the i_uid field of @inode. If the inode was found/created via * an idmapped mount map the caller's fsuid according to @idmap. */ static inline void inode_fsuid_set(struct inode *inode, struct mnt_idmap *idmap) { inode->i_uid = mapped_fsuid(idmap, i_user_ns(inode)); } /** * inode_fsgid_set - initialize inode's i_gid field with callers fsgid * @inode: inode to initialize * @idmap: idmap of the mount the inode was found from * * Initialize the i_gid field of @inode. If the inode was found/created via * an idmapped mount map the caller's fsgid according to @idmap. */ static inline void inode_fsgid_set(struct inode *inode, struct mnt_idmap *idmap) { inode->i_gid = mapped_fsgid(idmap, i_user_ns(inode)); } /** * fsuidgid_has_mapping() - check whether caller's fsuid/fsgid is mapped * @sb: the superblock we want a mapping in * @idmap: idmap of the relevant mount * * Check whether the caller's fsuid and fsgid have a valid mapping in the * s_user_ns of the superblock @sb. If the caller is on an idmapped mount map * the caller's fsuid and fsgid according to the @idmap first. * * Return: true if fsuid and fsgid is mapped, false if not. */ static inline bool fsuidgid_has_mapping(struct super_block *sb, struct mnt_idmap *idmap) { struct user_namespace *fs_userns = sb->s_user_ns; kuid_t kuid; kgid_t kgid; kuid = mapped_fsuid(idmap, fs_userns); if (!uid_valid(kuid)) return false; kgid = mapped_fsgid(idmap, fs_userns); if (!gid_valid(kgid)) return false; return kuid_has_mapping(fs_userns, kuid) && kgid_has_mapping(fs_userns, kgid); } struct timespec64 current_time(struct inode *inode); struct timespec64 inode_set_ctime_current(struct inode *inode); struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update); static inline time64_t inode_get_atime_sec(const struct inode *inode) { return inode->i_atime_sec; } static inline long inode_get_atime_nsec(const struct inode *inode) { return inode->i_atime_nsec; } static inline struct timespec64 inode_get_atime(const struct inode *inode) { struct timespec64 ts = { .tv_sec = inode_get_atime_sec(inode), .tv_nsec = inode_get_atime_nsec(inode) }; return ts; } static inline struct timespec64 inode_set_atime_to_ts(struct inode *inode, struct timespec64 ts) { inode->i_atime_sec = ts.tv_sec; inode->i_atime_nsec = ts.tv_nsec; return ts; } static inline struct timespec64 inode_set_atime(struct inode *inode, time64_t sec, long nsec) { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; return inode_set_atime_to_ts(inode, ts); } static inline time64_t inode_get_mtime_sec(const struct inode *inode) { return inode->i_mtime_sec; } static inline long inode_get_mtime_nsec(const struct inode *inode) { return inode->i_mtime_nsec; } static inline struct timespec64 inode_get_mtime(const struct inode *inode) { struct timespec64 ts = { .tv_sec = inode_get_mtime_sec(inode), .tv_nsec = inode_get_mtime_nsec(inode) }; return ts; } static inline struct timespec64 inode_set_mtime_to_ts(struct inode *inode, struct timespec64 ts) { inode->i_mtime_sec = ts.tv_sec; inode->i_mtime_nsec = ts.tv_nsec; return ts; } static inline struct timespec64 inode_set_mtime(struct inode *inode, time64_t sec, long nsec) { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; return inode_set_mtime_to_ts(inode, ts); } /* * Multigrain timestamps * * Conditionally use fine-grained ctime and mtime timestamps when there * are users actively observing them via getattr. The primary use-case * for this is NFS clients that use the ctime to distinguish between * different states of the file, and that are often fooled by multiple * operations that occur in the same coarse-grained timer tick. */ #define I_CTIME_QUERIED ((u32)BIT(31)) static inline time64_t inode_get_ctime_sec(const struct inode *inode) { return inode->i_ctime_sec; } static inline long inode_get_ctime_nsec(const struct inode *inode) { return inode->i_ctime_nsec & ~I_CTIME_QUERIED; } static inline struct timespec64 inode_get_ctime(const struct inode *inode) { struct timespec64 ts = { .tv_sec = inode_get_ctime_sec(inode), .tv_nsec = inode_get_ctime_nsec(inode) }; return ts; } struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts); /** * inode_set_ctime - set the ctime in the inode * @inode: inode in which to set the ctime * @sec: tv_sec value to set * @nsec: tv_nsec value to set * * Set the ctime in @inode to { @sec, @nsec } */ static inline struct timespec64 inode_set_ctime(struct inode *inode, time64_t sec, long nsec) { struct timespec64 ts = { .tv_sec = sec, .tv_nsec = nsec }; return inode_set_ctime_to_ts(inode, ts); } struct timespec64 simple_inode_init_ts(struct inode *inode); /* * Snapshotting support. */ /* * These are internal functions, please use sb_start_{write,pagefault,intwrite} * instead. */ static inline void __sb_end_write(struct super_block *sb, int level) { percpu_up_read(sb->s_writers.rw_sem + level-1); } static inline void __sb_start_write(struct super_block *sb, int level) { percpu_down_read(sb->s_writers.rw_sem + level - 1); } static inline bool __sb_start_write_trylock(struct super_block *sb, int level) { return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); } #define __sb_writers_acquired(sb, lev) \ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) #define __sb_writers_release(sb, lev) \ percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], _THIS_IP_) /** * __sb_write_started - check if sb freeze level is held * @sb: the super we write to * @level: the freeze level * * * > 0 - sb freeze level is held * * 0 - sb freeze level is not held * * < 0 - !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN */ static inline int __sb_write_started(const struct super_block *sb, int level) { return lockdep_is_held_type(sb->s_writers.rw_sem + level - 1, 1); } /** * sb_write_started - check if SB_FREEZE_WRITE is held * @sb: the super we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. */ static inline bool sb_write_started(const struct super_block *sb) { return __sb_write_started(sb, SB_FREEZE_WRITE); } /** * sb_write_not_started - check if SB_FREEZE_WRITE is not held * @sb: the super we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. */ static inline bool sb_write_not_started(const struct super_block *sb) { return __sb_write_started(sb, SB_FREEZE_WRITE) <= 0; } /** * file_write_started - check if SB_FREEZE_WRITE is held * @file: the file we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. * May be false positive with !S_ISREG, because file_start_write() has * no effect on !S_ISREG. */ static inline bool file_write_started(const struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; return sb_write_started(file_inode(file)->i_sb); } /** * file_write_not_started - check if SB_FREEZE_WRITE is not held * @file: the file we write to * * May be false positive with !CONFIG_LOCKDEP/LOCK_STATE_UNKNOWN. * May be false positive with !S_ISREG, because file_start_write() has * no effect on !S_ISREG. */ static inline bool file_write_not_started(const struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; return sb_write_not_started(file_inode(file)->i_sb); } /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to * * Decrement number of writers to the filesystem. Wake up possible waiters * wanting to freeze the filesystem. */ static inline void sb_end_write(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_WRITE); } /** * sb_end_pagefault - drop write access to a superblock from a page fault * @sb: the super we wrote to * * Decrement number of processes handling write page fault to the filesystem. * Wake up possible waiters wanting to freeze the filesystem. */ static inline void sb_end_pagefault(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_PAGEFAULT); } /** * sb_end_intwrite - drop write access to a superblock for internal fs purposes * @sb: the super we wrote to * * Decrement fs-internal number of writers to the filesystem. Wake up possible * waiters wanting to freeze the filesystem. */ static inline void sb_end_intwrite(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_FS); } /** * sb_start_write - get write access to a superblock * @sb: the super we write to * * When a process wants to write data or metadata to a file system (i.e. dirty * a page or an inode), it should embed the operation in a sb_start_write() - * sb_end_write() pair to get exclusion against file system freezing. This * function increments number of writers preventing freezing. If the file * system is already frozen, the function waits until the file system is * thawed. * * Since freeze protection behaves as a lock, users have to preserve * ordering of freeze protection and other filesystem locks. Generally, * freeze protection should be the outermost lock. In particular, we have: * * sb_start_write * -> i_mutex (write path, truncate, directory ops, ...) * -> s_umount (freeze_super, thaw_super) */ static inline void sb_start_write(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_WRITE); } static inline bool sb_start_write_trylock(struct super_block *sb) { return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); } /** * sb_start_pagefault - get write access to a superblock from a page fault * @sb: the super we write to * * When a process starts handling write page fault, it should embed the * operation into sb_start_pagefault() - sb_end_pagefault() pair to get * exclusion against file system freezing. This is needed since the page fault * is going to dirty a page. This function increments number of running page * faults preventing freezing. If the file system is already frozen, the * function waits until the file system is thawed. * * Since page fault freeze protection behaves as a lock, users have to preserve * ordering of freeze protection and other filesystem locks. It is advised to * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault * handling code implies lock dependency: * * mmap_lock * -> sb_start_pagefault */ static inline void sb_start_pagefault(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_PAGEFAULT); } /** * sb_start_intwrite - get write access to a superblock for internal fs purposes * @sb: the super we write to * * This is the third level of protection against filesystem freezing. It is * free for use by a filesystem. The only requirement is that it must rank * below sb_start_pagefault. * * For example filesystem can call sb_start_intwrite() when starting a * transaction which somewhat eases handling of freezing for internal sources * of filesystem changes (internal fs threads, discarding preallocation on file * close, etc.). */ static inline void sb_start_intwrite(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_FS); } static inline bool sb_start_intwrite_trylock(struct super_block *sb) { return __sb_start_write_trylock(sb, SB_FREEZE_FS); } bool inode_owner_or_capable(struct mnt_idmap *idmap, const struct inode *inode); /* * VFS helper functions.. */ int vfs_create(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, bool); int vfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, umode_t); int vfs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); int vfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); int vfs_link(struct dentry *, struct mnt_idmap *, struct inode *, struct dentry *, struct inode **); int vfs_rmdir(struct mnt_idmap *, struct inode *, struct dentry *); int vfs_unlink(struct mnt_idmap *, struct inode *, struct dentry *, struct inode **); /** * struct renamedata - contains all information required for renaming * @old_mnt_idmap: idmap of the old mount the inode was found from * @old_dir: parent of source * @old_dentry: source * @new_mnt_idmap: idmap of the new mount the inode was found from * @new_dir: parent of destination * @new_dentry: destination * @delegated_inode: returns an inode needing a delegation break * @flags: rename flags */ struct renamedata { struct mnt_idmap *old_mnt_idmap; struct inode *old_dir; struct dentry *old_dentry; struct mnt_idmap *new_mnt_idmap; struct inode *new_dir; struct dentry *new_dentry; struct inode **delegated_inode; unsigned int flags; } __randomize_layout; int vfs_rename(struct renamedata *); static inline int vfs_whiteout(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { return vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); } struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred); struct file *kernel_file_open(const struct path *path, int flags, const struct cred *cred); int vfs_mkobj(struct dentry *, umode_t, int (*f)(struct dentry *, umode_t, void *), void *); int vfs_fchown(struct file *file, uid_t user, gid_t group); int vfs_fchmod(struct file *file, umode_t mode); int vfs_utimes(const struct path *path, struct timespec64 *times); extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #else #define compat_ptr_ioctl NULL #endif /* * VFS file helper functions. */ void inode_init_owner(struct mnt_idmap *idmap, struct inode *inode, const struct inode *dir, umode_t mode); extern bool may_open_dev(const struct path *path); umode_t mode_strip_sgid(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode); bool in_group_or_capable(struct mnt_idmap *idmap, const struct inode *inode, vfsgid_t vfsgid); /* * This is the "filldir" function type, used by readdir() to let * the kernel specify what kind of dirent layout it wants to have. * This allows the kernel to read directories into kernel space or * to have different dirent layouts depending on the binary type. * Return 'true' to keep going and 'false' if there are no more entries. */ struct dir_context; typedef bool (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64, unsigned); struct dir_context { filldir_t actor; loff_t pos; }; /* * These flags let !MMU mmap() govern direct device mapping vs immediate * copying more easily for MAP_PRIVATE, especially for ROM filesystems. * * NOMMU_MAP_COPY: Copy can be mapped (MAP_PRIVATE) * NOMMU_MAP_DIRECT: Can be mapped directly (MAP_SHARED) * NOMMU_MAP_READ: Can be mapped for reading * NOMMU_MAP_WRITE: Can be mapped for writing * NOMMU_MAP_EXEC: Can be mapped for execution */ #define NOMMU_MAP_COPY 0x00000001 #define NOMMU_MAP_DIRECT 0x00000008 #define NOMMU_MAP_READ VM_MAYREAD #define NOMMU_MAP_WRITE VM_MAYWRITE #define NOMMU_MAP_EXEC VM_MAYEXEC #define NOMMU_VMFLAGS \ (NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC) /* * These flags control the behavior of the remap_file_range function pointer. * If it is called with len == 0 that means "remap to end of source file". * See Documentation/filesystems/vfs.rst for more details about this call. * * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate) * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request */ #define REMAP_FILE_DEDUP (1 << 0) #define REMAP_FILE_CAN_SHORTEN (1 << 1) /* * These flags signal that the caller is ok with altering various aspects of * the behavior of the remap operation. The changes must be made by the * implementation; the vfs remap helper functions can take advantage of them. * Flags in this category exist to preserve the quirky behavior of the hoisted * btrfs clone/dedupe ioctls. */ #define REMAP_FILE_ADVISORY (REMAP_FILE_CAN_SHORTEN) /* * These flags control the behavior of vfs_copy_file_range(). * They are not available to the user via syscall. * * COPY_FILE_SPLICE: call splice direct instead of fs clone/copy ops */ #define COPY_FILE_SPLICE (1 << 0) struct iov_iter; struct io_uring_cmd; struct offset_ctx; typedef unsigned int __bitwise fop_flags_t; struct file_operations { struct module *owner; fop_flags_t fop_flags; loff_t (*llseek) (struct file *, loff_t, int); ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iopoll)(struct kiocb *kiocb, struct io_comp_batch *, unsigned int flags); int (*iterate_shared) (struct file *, struct dir_context *); __poll_t (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); int (*fsync) (struct file *, loff_t, loff_t, int datasync); int (*fasync) (int, struct file *, int); int (*lock) (struct file *, int, struct file_lock *); unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); int (*flock) (struct file *, int, struct file_lock *); ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); void (*splice_eof)(struct file *file); int (*setlease)(struct file *, int, struct file_lease **, void **); long (*fallocate)(struct file *file, int mode, loff_t offset, loff_t len); void (*show_fdinfo)(struct seq_file *m, struct file *f); #ifndef CONFIG_MMU unsigned (*mmap_capabilities)(struct file *); #endif ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int); loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); int (*fadvise)(struct file *, loff_t, loff_t, int); int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags); int (*uring_cmd_iopoll)(struct io_uring_cmd *, struct io_comp_batch *, unsigned int poll_flags); } __randomize_layout; /* Supports async buffered reads */ #define FOP_BUFFER_RASYNC ((__force fop_flags_t)(1 << 0)) /* Supports async buffered writes */ #define FOP_BUFFER_WASYNC ((__force fop_flags_t)(1 << 1)) /* Supports synchronous page faults for mappings */ #define FOP_MMAP_SYNC ((__force fop_flags_t)(1 << 2)) /* Supports non-exclusive O_DIRECT writes from multiple threads */ #define FOP_DIO_PARALLEL_WRITE ((__force fop_flags_t)(1 << 3)) /* Contains huge pages */ #define FOP_HUGE_PAGES ((__force fop_flags_t)(1 << 4)) /* Treat loff_t as unsigned (e.g., /dev/mem) */ #define FOP_UNSIGNED_OFFSET ((__force fop_flags_t)(1 << 5)) /* Supports asynchronous lock callbacks */ #define FOP_ASYNC_LOCK ((__force fop_flags_t)(1 << 6)) /* File system supports uncached read/write buffered IO */ #define FOP_DONTCACHE ((__force fop_flags_t)(1 << 7)) /* Wrap a directory iterator that needs exclusive inode access */ int wrap_directory_iterator(struct file *, struct dir_context *, int (*) (struct file *, struct dir_context *)); #define WRAP_DIR_ITER(x) \ static int shared_##x(struct file *file , struct dir_context *ctx) \ { return wrap_directory_iterator(file, ctx, x); } struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *); int (*permission) (struct mnt_idmap *, struct inode *, int); struct posix_acl * (*get_inode_acl)(struct inode *, int, bool); int (*readlink) (struct dentry *, char __user *,int); int (*create) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t, bool); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *, const char *); int (*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t); int (*rmdir) (struct inode *,struct dentry *); int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t,dev_t); int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); int (*setattr) (struct mnt_idmap *, struct dentry *, struct iattr *); int (*getattr) (struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); int (*update_time)(struct inode *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode); int (*tmpfile) (struct mnt_idmap *, struct inode *, struct file *, umode_t); struct posix_acl *(*get_acl)(struct mnt_idmap *, struct dentry *, int); int (*set_acl)(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); int (*fileattr_set)(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int (*fileattr_get)(struct dentry *dentry, struct fileattr *fa); struct offset_ctx *(*get_offset_ctx)(struct inode *inode); } ____cacheline_aligned; static inline int call_mmap(struct file *file, struct vm_area_struct *vma) { return file->f_op->mmap(file, vma); } extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t vfs_copy_file_range(struct file *, loff_t , struct file *, loff_t, size_t, unsigned int); int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write); int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, const struct iomap_ops *dax_read_ops); int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *count, unsigned int remap_flags); extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags); extern int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same); extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags); /** * enum freeze_holder - holder of the freeze * @FREEZE_HOLDER_KERNEL: kernel wants to freeze or thaw filesystem * @FREEZE_HOLDER_USERSPACE: userspace wants to freeze or thaw filesystem * @FREEZE_MAY_NEST: whether nesting freeze and thaw requests is allowed * * Indicate who the owner of the freeze or thaw request is and whether * the freeze needs to be exclusive or can nest. * Without @FREEZE_MAY_NEST, multiple freeze and thaw requests from the * same holder aren't allowed. It is however allowed to hold a single * @FREEZE_HOLDER_USERSPACE and a single @FREEZE_HOLDER_KERNEL freeze at * the same time. This is relied upon by some filesystems during online * repair or similar. */ enum freeze_holder { FREEZE_HOLDER_KERNEL = (1U << 0), FREEZE_HOLDER_USERSPACE = (1U << 1), FREEZE_MAY_NEST = (1U << 2), }; struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); void (*free_inode)(struct inode *); void (*dirty_inode) (struct inode *, int flags); int (*write_inode) (struct inode *, struct writeback_control *wbc); int (*drop_inode) (struct inode *); void (*evict_inode) (struct inode *); void (*put_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); int (*freeze_super) (struct super_block *, enum freeze_holder who); int (*freeze_fs) (struct super_block *); int (*thaw_super) (struct super_block *, enum freeze_holder who); int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*umount_begin) (struct super_block *); int (*show_options)(struct seq_file *, struct dentry *); int (*show_devname)(struct seq_file *, struct dentry *); int (*show_path)(struct seq_file *, struct dentry *); int (*show_stats)(struct seq_file *, struct dentry *); #ifdef CONFIG_QUOTA ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); struct dquot __rcu **(*get_dquots)(struct inode *); #endif long (*nr_cached_objects)(struct super_block *, struct shrink_control *); long (*free_cached_objects)(struct super_block *, struct shrink_control *); void (*shutdown)(struct super_block *sb); }; /* * Inode flags - they have no relation to superblock flags now */ #define S_SYNC (1 << 0) /* Writes are synced at once */ #define S_NOATIME (1 << 1) /* Do not update access times */ #define S_APPEND (1 << 2) /* Append-only file */ #define S_IMMUTABLE (1 << 3) /* Immutable file */ #define S_DEAD (1 << 4) /* removed, but still open directory */ #define S_NOQUOTA (1 << 5) /* Inode is not counted to quota */ #define S_DIRSYNC (1 << 6) /* Directory modifications are synchronous */ #define S_NOCMTIME (1 << 7) /* Do not update file c/mtime */ #define S_SWAPFILE (1 << 8) /* Do not truncate: swapon got its bmaps */ #define S_PRIVATE (1 << 9) /* Inode is fs-internal */ #define S_IMA (1 << 10) /* Inode has an associated IMA struct */ #define S_AUTOMOUNT (1 << 11) /* Automount/referral quasi-directory */ #define S_NOSEC (1 << 12) /* no suid or xattr security attributes */ #ifdef CONFIG_FS_DAX #define S_DAX (1 << 13) /* Direct Access, avoiding the page cache */ #else #define S_DAX 0 /* Make all the DAX code disappear */ #endif #define S_ENCRYPTED (1 << 14) /* Encrypted file (using fs/crypto/) */ #define S_CASEFOLD (1 << 15) /* Casefolded file */ #define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */ #define S_KERNEL_FILE (1 << 17) /* File is in use by the kernel (eg. fs/cachefiles) */ /* * Note that nosuid etc flags are inode-specific: setting some file-system * flags just means all the inodes inherit those flags by default. It might be * possible to override it selectively if you really wanted to with some * ioctl() that is not currently implemented. * * Exception: SB_RDONLY is always applied to the entire file system. * * Unfortunately, it is possible to change a filesystems flags with it mounted * with files in use. This means that all of the inodes will not have their * i_flags updated. Hence, i_flags no longer inherit the superblock mount * flags, so these have to be checked separately. -- rmk@arm.uk.linux.org */ #define __IS_FLG(inode, flg) ((inode)->i_sb->s_flags & (flg)) static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags & SB_RDONLY; } #define IS_RDONLY(inode) sb_rdonly((inode)->i_sb) #define IS_SYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS) || \ ((inode)->i_flags & S_SYNC)) #define IS_DIRSYNC(inode) (__IS_FLG(inode, SB_SYNCHRONOUS|SB_DIRSYNC) || \ ((inode)->i_flags & (S_SYNC|S_DIRSYNC))) #define IS_MANDLOCK(inode) __IS_FLG(inode, SB_MANDLOCK) #define IS_NOATIME(inode) __IS_FLG(inode, SB_RDONLY|SB_NOATIME) #define IS_I_VERSION(inode) __IS_FLG(inode, SB_I_VERSION) #define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA) #define IS_APPEND(inode) ((inode)->i_flags & S_APPEND) #define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE) #ifdef CONFIG_FS_POSIX_ACL #define IS_POSIXACL(inode) __IS_FLG(inode, SB_POSIXACL) #else #define IS_POSIXACL(inode) 0 #endif #define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD) #define IS_NOCMTIME(inode) ((inode)->i_flags & S_NOCMTIME) #ifdef CONFIG_SWAP #define IS_SWAPFILE(inode) ((inode)->i_flags & S_SWAPFILE) #else #define IS_SWAPFILE(inode) ((void)(inode), 0U) #endif #define IS_PRIVATE(inode) ((inode)->i_flags & S_PRIVATE) #define IS_IMA(inode) ((inode)->i_flags & S_IMA) #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) #define IS_DAX(inode) ((inode)->i_flags & S_DAX) #define IS_ENCRYPTED(inode) ((inode)->i_flags & S_ENCRYPTED) #define IS_CASEFOLDED(inode) ((inode)->i_flags & S_CASEFOLD) #define IS_VERITY(inode) ((inode)->i_flags & S_VERITY) #define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ (inode)->i_rdev == WHITEOUT_DEV) static inline bool HAS_UNMAPPED_ID(struct mnt_idmap *idmap, struct inode *inode) { return !vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)); } static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) { *kiocb = (struct kiocb) { .ki_filp = filp, .ki_flags = filp->f_iocb_flags, .ki_ioprio = get_current_ioprio(), }; } static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src, struct file *filp) { *kiocb = (struct kiocb) { .ki_filp = filp, .ki_flags = kiocb_src->ki_flags, .ki_ioprio = kiocb_src->ki_ioprio, .ki_pos = kiocb_src->ki_pos, }; } /* * Inode state bits. Protected by inode->i_lock * * Four bits determine the dirty state of the inode: I_DIRTY_SYNC, * I_DIRTY_DATASYNC, I_DIRTY_PAGES, and I_DIRTY_TIME. * * Four bits define the lifetime of an inode. Initially, inodes are I_NEW, * until that flag is cleared. I_WILL_FREE, I_FREEING and I_CLEAR are set at * various stages of removing an inode. * * Two bits are used for locking and completion notification, I_NEW and I_SYNC. * * I_DIRTY_SYNC Inode is dirty, but doesn't have to be written on * fdatasync() (unless I_DIRTY_DATASYNC is also set). * Timestamp updates are the usual cause. * I_DIRTY_DATASYNC Data-related inode changes pending. We keep track of * these changes separately from I_DIRTY_SYNC so that we * don't have to write inode on fdatasync() when only * e.g. the timestamps have changed. * I_DIRTY_PAGES Inode has dirty pages. Inode itself may be clean. * I_DIRTY_TIME The inode itself has dirty timestamps, and the * lazytime mount option is enabled. We keep track of this * separately from I_DIRTY_SYNC in order to implement * lazytime. This gets cleared if I_DIRTY_INODE * (I_DIRTY_SYNC and/or I_DIRTY_DATASYNC) gets set. But * I_DIRTY_TIME can still be set if I_DIRTY_SYNC is already * in place because writeback might already be in progress * and we don't want to lose the time update * I_NEW Serves as both a mutex and completion notification. * New inodes set I_NEW. If two processes both create * the same inode, one of them will release its inode and * wait for I_NEW to be released before returning. * Inodes in I_WILL_FREE, I_FREEING or I_CLEAR state can * also cause waiting on I_NEW, without I_NEW actually * being set. find_inode() uses this to prevent returning * nearly-dead inodes. * I_WILL_FREE Must be set when calling write_inode_now() if i_count * is zero. I_FREEING must be set when I_WILL_FREE is * cleared. * I_FREEING Set when inode is about to be freed but still has dirty * pages or buffers attached or the inode itself is still * dirty. * I_CLEAR Added by clear_inode(). In this state the inode is * clean and can be destroyed. Inode keeps I_FREEING. * * Inodes that are I_WILL_FREE, I_FREEING or I_CLEAR are * prohibited for many purposes. iget() must wait for * the inode to be completely released, then create it * anew. Other functions will just ignore such inodes, * if appropriate. I_NEW is used for waiting. * * I_SYNC Writeback of inode is running. The bit is set during * data writeback, and cleared with a wakeup on the bit * address once it is done. The bit is also used to pin * the inode in memory for flusher thread. * * I_REFERENCED Marks the inode as recently references on the LRU list. * * I_WB_SWITCH Cgroup bdi_writeback switching in progress. Used to * synchronize competing switching instances and to tell * wb stat updates to grab the i_pages lock. See * inode_switch_wbs_work_fn() for details. * * I_OVL_INUSE Used by overlayfs to get exclusive ownership on upper * and work dirs among overlayfs mounts. * * I_CREATING New object's inode in the middle of setting up. * * I_DONTCACHE Evict inode as soon as it is not used anymore. * * I_SYNC_QUEUED Inode is queued in b_io or b_more_io writeback lists. * Used to detect that mark_inode_dirty() should not move * inode between dirty lists. * * I_PINNING_FSCACHE_WB Inode is pinning an fscache object for writeback. * * I_LRU_ISOLATING Inode is pinned being isolated from LRU without holding * i_count. * * Q: What is the difference between I_WILL_FREE and I_FREEING? * * __I_{SYNC,NEW,LRU_ISOLATING} are used to derive unique addresses to wait * upon. There's one free address left. */ #define __I_NEW 0 #define I_NEW (1 << __I_NEW) #define __I_SYNC 1 #define I_SYNC (1 << __I_SYNC) #define __I_LRU_ISOLATING 2 #define I_LRU_ISOLATING (1 << __I_LRU_ISOLATING) #define I_DIRTY_SYNC (1 << 3) #define I_DIRTY_DATASYNC (1 << 4) #define I_DIRTY_PAGES (1 << 5) #define I_WILL_FREE (1 << 6) #define I_FREEING (1 << 7) #define I_CLEAR (1 << 8) #define I_REFERENCED (1 << 9) #define I_LINKABLE (1 << 10) #define I_DIRTY_TIME (1 << 11) #define I_WB_SWITCH (1 << 12) #define I_OVL_INUSE (1 << 13) #define I_CREATING (1 << 14) #define I_DONTCACHE (1 << 15) #define I_SYNC_QUEUED (1 << 16) #define I_PINNING_NETFS_WB (1 << 17) #define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC) #define I_DIRTY (I_DIRTY_INODE | I_DIRTY_PAGES) #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME) extern void __mark_inode_dirty(struct inode *, int); static inline void mark_inode_dirty(struct inode *inode) { __mark_inode_dirty(inode, I_DIRTY); } static inline void mark_inode_dirty_sync(struct inode *inode) { __mark_inode_dirty(inode, I_DIRTY_SYNC); } /* * Returns true if the given inode itself only has dirty timestamps (its pages * may still be dirty) and isn't currently being allocated or freed. * Filesystems should call this if when writing an inode when lazytime is * enabled, they want to opportunistically write the timestamps of other inodes * located very nearby on-disk, e.g. in the same inode block. This returns true * if the given inode is in need of such an opportunistic update. Requires * i_lock, or at least later re-checking under i_lock. */ static inline bool inode_is_dirtytime_only(struct inode *inode) { return (inode->i_state & (I_DIRTY_TIME | I_NEW | I_FREEING | I_WILL_FREE)) == I_DIRTY_TIME; } extern void inc_nlink(struct inode *inode); extern void drop_nlink(struct inode *inode); extern void clear_nlink(struct inode *inode); extern void set_nlink(struct inode *inode, unsigned int nlink); static inline void inode_inc_link_count(struct inode *inode) { inc_nlink(inode); mark_inode_dirty(inode); } static inline void inode_dec_link_count(struct inode *inode) { drop_nlink(inode); mark_inode_dirty(inode); } enum file_time_flags { S_ATIME = 1, S_MTIME = 2, S_CTIME = 4, S_VERSION = 8, }; extern bool atime_needs_update(const struct path *, struct inode *); extern void touch_atime(const struct path *); int inode_update_time(struct inode *inode, int flags); static inline void file_accessed(struct file *file) { if (!(file->f_flags & O_NOATIME)) touch_atime(&file->f_path); } extern int file_modified(struct file *file); int kiocb_modified(struct kiocb *iocb); int sync_inode_metadata(struct inode *inode, int wait); struct file_system_type { const char *name; int fs_flags; #define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 #define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */ #define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */ #define FS_ALLOW_IDMAP 32 /* FS has been updated to handle vfs idmappings. */ #define FS_MGTIME 64 /* FS uses multigrain timestamps */ #define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */ int (*init_fs_context)(struct fs_context *); const struct fs_parameter_spec *parameters; struct dentry *(*mount) (struct file_system_type *, int, const char *, void *); void (*kill_sb) (struct super_block *); struct module *owner; struct file_system_type * next; struct hlist_head fs_supers; struct lock_class_key s_lock_key; struct lock_class_key s_umount_key; struct lock_class_key s_vfs_rename_key; struct lock_class_key s_writers_key[SB_FREEZE_LEVELS]; struct lock_class_key i_lock_key; struct lock_class_key i_mutex_key; struct lock_class_key invalidate_lock_key; struct lock_class_key i_mutex_dir_key; }; #define MODULE_ALIAS_FS(NAME) MODULE_ALIAS("fs-" NAME) /** * is_mgtime: is this inode using multigrain timestamps * @inode: inode to test for multigrain timestamps * * Return true if the inode uses multigrain timestamps, false otherwise. */ static inline bool is_mgtime(const struct inode *inode) { return inode->i_opflags & IOP_MGTIME; } extern struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_single(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)); extern struct dentry *mount_subtree(struct vfsmount *mnt, const char *path); void retire_super(struct super_block *sb); void generic_shutdown_super(struct super_block *sb); void kill_block_super(struct super_block *sb); void kill_anon_super(struct super_block *sb); void kill_litter_super(struct super_block *sb); void deactivate_super(struct super_block *sb); void deactivate_locked_super(struct super_block *sb); int set_anon_super(struct super_block *s, void *data); int set_anon_super_fc(struct super_block *s, struct fs_context *fc); int get_anon_bdev(dev_t *); void free_anon_bdev(dev_t); struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)); struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data); struct super_block *sget_dev(struct fs_context *fc, dev_t dev); /* Alas, no aliases. Too much hassle with bringing module.h everywhere */ #define fops_get(fops) ({ \ const struct file_operations *_fops = (fops); \ (((_fops) && try_module_get((_fops)->owner) ? (_fops) : NULL)); \ }) #define fops_put(fops) ({ \ const struct file_operations *_fops = (fops); \ if (_fops) \ module_put((_fops)->owner); \ }) /* * This one is to be used *ONLY* from ->open() instances. * fops must be non-NULL, pinned down *and* module dependencies * should be sufficient to pin the caller down as well. */ #define replace_fops(f, fops) \ do { \ struct file *__file = (f); \ fops_put(__file->f_op); \ BUG_ON(!(__file->f_op = (fops))); \ } while(0) extern int register_filesystem(struct file_system_type *); extern int unregister_filesystem(struct file_system_type *); extern int vfs_statfs(const struct path *, struct kstatfs *); extern int user_statfs(const char __user *, struct kstatfs *); extern int fd_statfs(int, struct kstatfs *); int freeze_super(struct super_block *super, enum freeze_holder who); int thaw_super(struct super_block *super, enum freeze_holder who); extern __printf(2, 3) int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); extern int super_setup_bdi(struct super_block *sb); static inline void super_set_uuid(struct super_block *sb, const u8 *uuid, unsigned len) { if (WARN_ON(len > sizeof(sb->s_uuid))) len = sizeof(sb->s_uuid); sb->s_uuid_len = len; memcpy(&sb->s_uuid, uuid, len); } /* set sb sysfs name based on sb->s_bdev */ static inline void super_set_sysfs_name_bdev(struct super_block *sb) { snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pg", sb->s_bdev); } /* set sb sysfs name based on sb->s_uuid */ static inline void super_set_sysfs_name_uuid(struct super_block *sb) { WARN_ON(sb->s_uuid_len != sizeof(sb->s_uuid)); snprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), "%pU", sb->s_uuid.b); } /* set sb sysfs name based on sb->s_id */ static inline void super_set_sysfs_name_id(struct super_block *sb) { strscpy(sb->s_sysfs_name, sb->s_id, sizeof(sb->s_sysfs_name)); } /* try to use something standard before you use this */ __printf(2, 3) static inline void super_set_sysfs_name_generic(struct super_block *sb, const char *fmt, ...) { va_list args; va_start(args, fmt); vsnprintf(sb->s_sysfs_name, sizeof(sb->s_sysfs_name), fmt, args); va_end(args); } extern int current_umask(void); extern void ihold(struct inode * inode); extern void iput(struct inode *); int inode_update_timestamps(struct inode *inode, int flags); int generic_update_time(struct inode *, int); /* /sys/fs */ extern struct kobject *fs_kobj; #define MAX_RW_COUNT (INT_MAX & PAGE_MASK) /* fs/open.c */ struct audit_names; struct filename { const char *name; /* pointer to actual string */ const __user char *uptr; /* original userland pointer */ atomic_t refcnt; struct audit_names *aname; const char iname[]; }; static_assert(offsetof(struct filename, iname) % sizeof(long) == 0); static inline struct mnt_idmap *file_mnt_idmap(const struct file *file) { return mnt_idmap(file->f_path.mnt); } /** * is_idmapped_mnt - check whether a mount is mapped * @mnt: the mount to check * * If @mnt has an non @nop_mnt_idmap attached to it then @mnt is mapped. * * Return: true if mount is mapped, false if not. */ static inline bool is_idmapped_mnt(const struct vfsmount *mnt) { return mnt_idmap(mnt) != &nop_mnt_idmap; } extern long vfs_truncate(const struct path *, loff_t); int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); extern int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode); extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(const struct path *, const char *, int, umode_t); static inline struct file *file_open_root_mnt(struct vfsmount *mnt, const char *name, int flags, umode_t mode) { return file_open_root(&(struct path){.mnt = mnt, .dentry = mnt->mnt_root}, name, flags, mode); } struct file *dentry_open(const struct path *path, int flags, const struct cred *creds); struct file *dentry_open_nonotify(const struct path *path, int flags, const struct cred *cred); struct file *dentry_create(const struct path *path, int flags, umode_t mode, const struct cred *cred); struct path *backing_file_user_path(struct file *f); /* * When mmapping a file on a stackable filesystem (e.g., overlayfs), the file * stored in ->vm_file is a backing file whose f_inode is on the underlying * filesystem. When the mapped file path and inode number are displayed to * user (e.g. via /proc/<pid>/maps), these helpers should be used to get the * path and inode number to display to the user, which is the path of the fd * that user has requested to map and the inode number that would be returned * by fstat() on that same fd. */ /* Get the path to display in /proc/<pid>/maps */ static inline const struct path *file_user_path(struct file *f) { if (unlikely(f->f_mode & FMODE_BACKING)) return backing_file_user_path(f); return &f->f_path; } /* Get the inode whose inode number to display in /proc/<pid>/maps */ static inline const struct inode *file_user_inode(struct file *f) { if (unlikely(f->f_mode & FMODE_BACKING)) return d_inode(backing_file_user_path(f)->dentry); return file_inode(f); } static inline struct file *file_clone_open(struct file *file) { return dentry_open(&file->f_path, file->f_flags, file->f_cred); } extern int filp_close(struct file *, fl_owner_t id); extern struct filename *getname_flags(const char __user *, int); extern struct filename *getname_uflags(const char __user *, int); extern struct filename *getname(const char __user *); extern struct filename *getname_kernel(const char *); extern struct filename *__getname_maybe_null(const char __user *); static inline struct filename *getname_maybe_null(const char __user *name, int flags) { if (!(flags & AT_EMPTY_PATH)) return getname(name); if (!name) return NULL; return __getname_maybe_null(name); } extern void putname(struct filename *name); extern int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)); extern int finish_no_open(struct file *file, struct dentry *dentry); /* Helper for the simple case when original dentry is used */ static inline int finish_open_simple(struct file *file, int error) { if (error) return error; return finish_open(file, file->f_path.dentry, NULL); } /* fs/dcache.c */ extern void __init vfs_caches_init_early(void); extern void __init vfs_caches_init(void); extern struct kmem_cache *names_cachep; #define __getname() kmem_cache_alloc(names_cachep, GFP_KERNEL) #define __putname(name) kmem_cache_free(names_cachep, (void *)(name)) extern struct super_block *blockdev_superblock; static inline bool sb_is_blkdev_sb(struct super_block *sb) { return IS_ENABLED(CONFIG_BLOCK) && sb == blockdev_superblock; } void emergency_thaw_all(void); extern int sync_filesystem(struct super_block *); extern const struct file_operations def_blk_fops; extern const struct file_operations def_chr_fops; /* fs/char_dev.c */ #define CHRDEV_MAJOR_MAX 512 /* Marks the bottom of the first segment of free char majors */ #define CHRDEV_MAJOR_DYN_END 234 /* Marks the top and bottom of the second segment of free char majors */ #define CHRDEV_MAJOR_DYN_EXT_START 511 #define CHRDEV_MAJOR_DYN_EXT_END 384 extern int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *); extern int register_chrdev_region(dev_t, unsigned, const char *); extern int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops); extern void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name); extern void unregister_chrdev_region(dev_t, unsigned); extern void chrdev_show(struct seq_file *,off_t); static inline int register_chrdev(unsigned int major, const char *name, const struct file_operations *fops) { return __register_chrdev(major, 0, 256, name, fops); } static inline void unregister_chrdev(unsigned int major, const char *name) { __unregister_chrdev(major, 0, 256, name); } extern void init_special_inode(struct inode *, umode_t, dev_t); /* Invalid inode operations -- fs/bad_inode.c */ extern void make_bad_inode(struct inode *); extern bool is_bad_inode(struct inode *); extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, loff_t lend); extern int __must_check file_check_and_advance_wb_err(struct file *file); extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start, loff_t end); static inline int file_write_and_wait(struct file *file) { return file_write_and_wait_range(file, 0, LLONG_MAX); } extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes, unsigned int flags); static inline bool iocb_is_dsync(const struct kiocb *iocb) { return (iocb->ki_flags & IOCB_DSYNC) || IS_SYNC(iocb->ki_filp->f_mapping->host); } /* * Sync the bytes written if this was a synchronous write. Expect ki_pos * to already be updated for the write, and will return either the amount * of bytes passed in, or an error if syncing the file failed. */ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) { if (iocb_is_dsync(iocb)) { int ret = vfs_fsync_range(iocb->ki_filp, iocb->ki_pos - count, iocb->ki_pos - 1, (iocb->ki_flags & IOCB_SYNC) ? 0 : 1); if (ret) return ret; } else if (iocb->ki_flags & IOCB_DONTCACHE) { struct address_space *mapping = iocb->ki_filp->f_mapping; filemap_fdatawrite_range_kick(mapping, iocb->ki_pos, iocb->ki_pos + count); } return count; } extern void emergency_sync(void); extern void emergency_remount(void); #ifdef CONFIG_BLOCK extern int bmap(struct inode *inode, sector_t *block); #else static inline int bmap(struct inode *inode, sector_t *block) { return -EINVAL; } #endif int notify_change(struct mnt_idmap *, struct dentry *, struct iattr *, struct inode **); int inode_permission(struct mnt_idmap *, struct inode *, int); int generic_permission(struct mnt_idmap *, struct inode *, int); static inline int file_permission(struct file *file, int mask) { return inode_permission(file_mnt_idmap(file), file_inode(file), mask); } static inline int path_permission(const struct path *path, int mask) { return inode_permission(mnt_idmap(path->mnt), d_inode(path->dentry), mask); } int __check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode); static inline bool execute_ok(struct inode *inode) { return (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode); } static inline bool inode_wrong_type(const struct inode *inode, umode_t mode) { return (inode->i_mode ^ mode) & S_IFMT; } /** * file_start_write - get write access to a superblock for regular file io * @file: the file we want to write to * * This is a variant of sb_start_write() which is a noop on non-regualr file. * Should be matched with a call to file_end_write(). */ static inline void file_start_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return; sb_start_write(file_inode(file)->i_sb); } static inline bool file_start_write_trylock(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return true; return sb_start_write_trylock(file_inode(file)->i_sb); } /** * file_end_write - drop write access to a superblock of a regular file * @file: the file we wrote to * * Should be matched with a call to file_start_write(). */ static inline void file_end_write(struct file *file) { if (!S_ISREG(file_inode(file)->i_mode)) return; sb_end_write(file_inode(file)->i_sb); } /** * kiocb_start_write - get write access to a superblock for async file io * @iocb: the io context we want to submit the write with * * This is a variant of sb_start_write() for async io submission. * Should be matched with a call to kiocb_end_write(). */ static inline void kiocb_start_write(struct kiocb *iocb) { struct inode *inode = file_inode(iocb->ki_filp); sb_start_write(inode->i_sb); /* * Fool lockdep by telling it the lock got released so that it * doesn't complain about the held lock when we return to userspace. */ __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); } /** * kiocb_end_write - drop write access to a superblock after async file io * @iocb: the io context we sumbitted the write with * * Should be matched with a call to kiocb_start_write(). */ static inline void kiocb_end_write(struct kiocb *iocb) { struct inode *inode = file_inode(iocb->ki_filp); /* * Tell lockdep we inherited freeze protection from submission thread. */ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); sb_end_write(inode->i_sb); } /* * This is used for regular files where some users -- especially the * currently executed binary in a process, previously handled via * VM_DENYWRITE -- cannot handle concurrent write (and maybe mmap * read-write shared) accesses. * * get_write_access() gets write permission for a file. * put_write_access() releases this write permission. * deny_write_access() denies write access to a file. * allow_write_access() re-enables write access to a file. * * The i_writecount field of an inode can have the following values: * 0: no write access, no denied write access * < 0: (-i_writecount) users that denied write access to the file. * > 0: (i_writecount) users that have write access to the file. * * Normally we operate on that counter with atomic_{inc,dec} and it's safe * except for the cases where we don't hold i_writecount yet. Then we need to * use {get,deny}_write_access() - these functions check the sign and refuse * to do the change if sign is wrong. */ static inline int get_write_access(struct inode *inode) { return atomic_inc_unless_negative(&inode->i_writecount) ? 0 : -ETXTBSY; } static inline int deny_write_access(struct file *file) { struct inode *inode = file_inode(file); return atomic_dec_unless_positive(&inode->i_writecount) ? 0 : -ETXTBSY; } static inline void put_write_access(struct inode * inode) { atomic_dec(&inode->i_writecount); } static inline void allow_write_access(struct file *file) { if (file) atomic_inc(&file_inode(file)->i_writecount); } /* * Do not prevent write to executable file when watched by pre-content events. * * Note that FMODE_FSNOTIFY_HSM mode is set depending on pre-content watches at * the time of file open and remains constant for entire lifetime of the file, * so if pre-content watches are added post execution or removed before the end * of the execution, it will not cause i_writecount reference leak. */ static inline int exe_file_deny_write_access(struct file *exe_file) { if (unlikely(FMODE_FSNOTIFY_HSM(exe_file->f_mode))) return 0; return deny_write_access(exe_file); } static inline void exe_file_allow_write_access(struct file *exe_file) { if (unlikely(!exe_file || FMODE_FSNOTIFY_HSM(exe_file->f_mode))) return; allow_write_access(exe_file); } static inline void file_set_fsnotify_mode(struct file *file, fmode_t mode) { file->f_mode &= ~FMODE_FSNOTIFY_MASK; file->f_mode |= mode; } static inline bool inode_is_open_for_write(const struct inode *inode) { return atomic_read(&inode->i_writecount) > 0; } #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING) static inline void i_readcount_dec(struct inode *inode) { BUG_ON(atomic_dec_return(&inode->i_readcount) < 0); } static inline void i_readcount_inc(struct inode *inode) { atomic_inc(&inode->i_readcount); } #else static inline void i_readcount_dec(struct inode *inode) { return; } static inline void i_readcount_inc(struct inode *inode) { return; } #endif extern int do_pipe_flags(int *, int); extern ssize_t kernel_read(struct file *, void *, size_t, loff_t *); ssize_t __kernel_read(struct file *file, void *buf, size_t count, loff_t *pos); extern ssize_t kernel_write(struct file *, const void *, size_t, loff_t *); extern ssize_t __kernel_write(struct file *, const void *, size_t, loff_t *); extern struct file * open_exec(const char *); /* fs/dcache.c -- generic fs support functions */ extern bool is_subdir(struct dentry *, struct dentry *); extern bool path_is_under(const struct path *, const struct path *); extern char *file_path(struct file *, char *, int); /** * is_dot_dotdot - returns true only if @name is "." or ".." * @name: file name to check * @len: length of file name, in bytes */ static inline bool is_dot_dotdot(const char *name, size_t len) { return len && unlikely(name[0] == '.') && (len == 1 || (len == 2 && name[1] == '.')); } #include <linux/err.h> /* needed for stackable file system support */ extern loff_t default_llseek(struct file *file, loff_t offset, int whence); extern loff_t vfs_llseek(struct file *file, loff_t offset, int whence); extern int inode_init_always_gfp(struct super_block *, struct inode *, gfp_t); static inline int inode_init_always(struct super_block *sb, struct inode *inode) { return inode_init_always_gfp(sb, inode, GFP_NOFS); } extern void inode_init_once(struct inode *); extern void address_space_init_once(struct address_space *mapping); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); extern int inode_needs_sync(struct inode *inode); extern int generic_delete_inode(struct inode *inode); static inline int generic_drop_inode(struct inode *inode) { return !inode->i_nlink || inode_unhashed(inode); } extern void d_mark_dontcache(struct inode *inode); extern struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data); extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data); extern struct inode *ilookup(struct super_block *sb, unsigned long ino); extern struct inode *inode_insert5(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data); struct inode *iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); struct inode *iget5_locked_rcu(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *); extern struct inode * iget_locked(struct super_block *, unsigned long); extern struct inode *find_inode_nowait(struct super_block *, unsigned long, int (*match)(struct inode *, unsigned long, void *), void *data); extern struct inode *find_inode_rcu(struct super_block *, unsigned long, int (*)(struct inode *, void *), void *); extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long); extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); extern int insert_inode_locked(struct inode *); #ifdef CONFIG_DEBUG_LOCK_ALLOC extern void lockdep_annotate_inode_mutex_key(struct inode *inode); #else static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { }; #endif extern void unlock_new_inode(struct inode *); extern void discard_new_inode(struct inode *); extern unsigned int get_next_ino(void); extern void evict_inodes(struct super_block *sb); void dump_mapping(const struct address_space *); /* * Userspace may rely on the inode number being non-zero. For example, glibc * simply ignores files with zero i_ino in unlink() and other places. * * As an additional complication, if userspace was compiled with * _FILE_OFFSET_BITS=32 on a 64-bit kernel we'll only end up reading out the * lower 32 bits, so we need to check that those aren't zero explicitly. With * _FILE_OFFSET_BITS=64, this may cause some harmless false-negatives, but * better safe than sorry. */ static inline bool is_zero_ino(ino_t ino) { return (u32)ino == 0; } /* * inode->i_lock must be held */ static inline void __iget(struct inode *inode) { atomic_inc(&inode->i_count); } extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void __destroy_inode(struct inode *); extern struct inode *new_inode_pseudo(struct super_block *sb); extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); extern int file_remove_privs_flags(struct file *file, unsigned int flags); extern int file_remove_privs(struct file *); int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode); /* * This must be used for allocating filesystems specific inodes to set * up the inode reclaim context correctly. */ #define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &_sb->s_inode_lru, _gfp) extern void __insert_inode_hash(struct inode *, unsigned long hashval); static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } extern void __remove_inode_hash(struct inode *); static inline void remove_inode_hash(struct inode *inode) { if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) __remove_inode_hash(inode); } extern void inode_sb_list_add(struct inode *inode); extern void inode_add_lru(struct inode *inode); extern int sb_set_blocksize(struct super_block *, int); extern int sb_min_blocksize(struct super_block *, int); extern int generic_file_mmap(struct file *, struct vm_area_struct *); extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *); extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); int generic_write_checks_count(struct kiocb *iocb, loff_t *count); extern int generic_write_check_limits(struct file *file, loff_t pos, loff_t *count); extern int generic_file_rw_checks(struct file *file_in, struct file *file_out); ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *to, ssize_t already_read); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *); ssize_t generic_perform_write(struct kiocb *, struct iov_iter *); ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, ssize_t direct_written, ssize_t buffered_written); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags); ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, rwf_t flags); ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb, struct iov_iter *iter); ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb, struct iov_iter *iter); /* fs/splice.c */ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); ssize_t copy_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); extern ssize_t iter_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); extern loff_t noop_llseek(struct file *file, loff_t offset, int whence); extern loff_t vfs_setpos(struct file *file, loff_t offset, loff_t maxsize); extern loff_t generic_file_llseek(struct file *file, loff_t offset, int whence); extern loff_t generic_file_llseek_size(struct file *file, loff_t offset, int whence, loff_t maxsize, loff_t eof); loff_t generic_llseek_cookie(struct file *file, loff_t offset, int whence, u64 *cookie); extern loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t size); extern loff_t no_seek_end_llseek_size(struct file *, loff_t, int, loff_t); extern loff_t no_seek_end_llseek(struct file *, loff_t, int); int rw_verify_area(int, struct file *, const loff_t *, size_t); extern int generic_file_open(struct inode * inode, struct file * filp); extern int nonseekable_open(struct inode * inode, struct file * filp); extern int stream_open(struct inode * inode, struct file * filp); #ifdef CONFIG_BLOCK typedef void (dio_submit_t)(struct bio *bio, struct inode *inode, loff_t file_offset); enum { /* need locking between buffered and direct access */ DIO_LOCKING = 0x01, /* filesystem does not support filling holes */ DIO_SKIP_HOLES = 0x02, }; ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, get_block_t get_block, dio_iodone_t end_io, int flags); static inline ssize_t blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct iov_iter *iter, get_block_t get_block) { return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, get_block, NULL, DIO_LOCKING | DIO_SKIP_HOLES); } #endif bool inode_dio_finished(const struct inode *inode); void inode_dio_wait(struct inode *inode); void inode_dio_wait_interruptible(struct inode *inode); /** * inode_dio_begin - signal start of a direct I/O requests * @inode: inode the direct I/O happens on * * This is called once we've finished processing a direct I/O request, * and is used to wake up callers waiting for direct I/O to be quiesced. */ static inline void inode_dio_begin(struct inode *inode) { atomic_inc(&inode->i_dio_count); } /** * inode_dio_end - signal finish of a direct I/O requests * @inode: inode the direct I/O happens on * * This is called once we've finished processing a direct I/O request, * and is used to wake up callers waiting for direct I/O to be quiesced. */ static inline void inode_dio_end(struct inode *inode) { if (atomic_dec_and_test(&inode->i_dio_count)) wake_up_var(&inode->i_dio_count); } extern void inode_set_flags(struct inode *inode, unsigned int flags, unsigned int mask); extern const struct file_operations generic_ro_fops; #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) extern int readlink_copy(char __user *, int, const char *, int); extern int page_readlink(struct dentry *, char __user *, int); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); extern void page_put_link(void *); extern int page_symlink(struct inode *inode, const char *symname, int len); extern const struct inode_operations page_symlink_inode_operations; extern void kfree_link(void *); void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode); void generic_fillattr(struct mnt_idmap *, u32, struct inode *, struct kstat *); void generic_fill_statx_attr(struct inode *inode, struct kstat *stat); void generic_fill_statx_atomic_writes(struct kstat *stat, unsigned int unit_min, unsigned int unit_max); extern int vfs_getattr_nosec(const struct path *, struct kstat *, u32, unsigned int); extern int vfs_getattr(const struct path *, struct kstat *, u32, unsigned int); void __inode_add_bytes(struct inode *inode, loff_t bytes); void inode_add_bytes(struct inode *inode, loff_t bytes); void __inode_sub_bytes(struct inode *inode, loff_t bytes); void inode_sub_bytes(struct inode *inode, loff_t bytes); static inline loff_t __inode_get_bytes(struct inode *inode) { return (((loff_t)inode->i_blocks) << 9) + inode->i_bytes; } loff_t inode_get_bytes(struct inode *inode); void inode_set_bytes(struct inode *inode, loff_t bytes); const char *simple_get_link(struct dentry *, struct inode *, struct delayed_call *); extern const struct inode_operations simple_symlink_inode_operations; extern int iterate_dir(struct file *, struct dir_context *); int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags); int vfs_fstat(int fd, struct kstat *stat); static inline int vfs_stat(const char __user *filename, struct kstat *stat) { return vfs_fstatat(AT_FDCWD, filename, stat, 0); } static inline int vfs_lstat(const char __user *name, struct kstat *stat) { return vfs_fstatat(AT_FDCWD, name, stat, AT_SYMLINK_NOFOLLOW); } extern const char *vfs_get_link(struct dentry *, struct delayed_call *); extern int vfs_readlink(struct dentry *, char __user *, int); extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); extern void iterate_supers(void (*)(struct super_block *, void *), void *); extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); extern int dcache_dir_open(struct inode *, struct file *); extern int dcache_dir_close(struct inode *, struct file *); extern loff_t dcache_dir_lseek(struct file *, loff_t, int); extern int dcache_readdir(struct file *, struct dir_context *); extern int simple_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern int simple_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern int simple_statfs(struct dentry *, struct kstatfs *); extern int simple_open(struct inode *inode, struct file *file); extern int simple_link(struct dentry *, struct inode *, struct dentry *); extern int simple_unlink(struct inode *, struct dentry *); extern int simple_rmdir(struct inode *, struct dentry *); void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); extern int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); extern int simple_rename(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); extern void simple_recursive_removal(struct dentry *, void (*callback)(struct dentry *)); extern int noop_fsync(struct file *, loff_t, loff_t, int); extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int simple_empty(struct dentry *); extern int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata); extern const struct address_space_operations ram_aops; extern int always_delete_dentry(const struct dentry *); extern struct inode *alloc_anon_inode(struct super_block *); extern int simple_nosetlease(struct file *, int, struct file_lease **, void **); extern const struct dentry_operations simple_dentry_operations; extern struct dentry *simple_lookup(struct inode *, struct dentry *, unsigned int flags); extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); extern const struct file_operations simple_dir_operations; extern const struct inode_operations simple_dir_inode_operations; extern void make_empty_dir_inode(struct inode *inode); extern bool is_empty_dir_inode(struct inode *inode); struct tree_descr { const char *name; const struct file_operations *ops; int mode; }; struct dentry *d_alloc_name(struct dentry *, const char *); extern int simple_fill_super(struct super_block *, unsigned long, const struct tree_descr *); extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); extern void simple_release_fs(struct vfsmount **mount, int *count); extern ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available); extern ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); struct offset_ctx { struct maple_tree mt; unsigned long next_offset; }; void simple_offset_init(struct offset_ctx *octx); int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry); void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry); int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); int simple_offset_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); void simple_offset_destroy(struct offset_ctx *octx); extern const struct file_operations simple_offset_dir_operations; extern int __generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); extern void generic_set_sb_d_ops(struct super_block *sb); extern int generic_ci_match(const struct inode *parent, const struct qstr *name, const struct qstr *folded_name, const u8 *de_name, u32 de_name_len); #if IS_ENABLED(CONFIG_UNICODE) int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str); int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name); /** * generic_ci_validate_strict_name - Check if a given name is suitable * for a directory * * This functions checks if the proposed filename is valid for the * parent directory. That means that only valid UTF-8 filenames will be * accepted for casefold directories from filesystems created with the * strict encoding flag. That also means that any name will be * accepted for directories that doesn't have casefold enabled, or * aren't being strict with the encoding. * * @dir: inode of the directory where the new file will be created * @name: name of the new file * * Return: * * True: if the filename is suitable for this directory. It can be * true if a given name is not suitable for a strict encoding * directory, but the directory being used isn't strict * * False if the filename isn't suitable for this directory. This only * happens when a directory is casefolded and the filesystem is strict * about its encoding. */ static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name) { if (!IS_CASEFOLDED(dir) || !sb_has_strict_encoding(dir->i_sb)) return true; /* * A casefold dir must have a encoding set, unless the filesystem * is corrupted */ if (WARN_ON_ONCE(!dir->i_sb->s_encoding)) return true; return !utf8_validate(dir->i_sb->s_encoding, name); } #else static inline bool generic_ci_validate_strict_name(struct inode *dir, struct qstr *name) { return true; } #endif static inline bool sb_has_encoding(const struct super_block *sb) { #if IS_ENABLED(CONFIG_UNICODE) return !!sb->s_encoding; #else return false; #endif } int may_setattr(struct mnt_idmap *idmap, struct inode *inode, unsigned int ia_valid); int setattr_prepare(struct mnt_idmap *, struct dentry *, struct iattr *); extern int inode_newsize_ok(const struct inode *, loff_t offset); void setattr_copy(struct mnt_idmap *, struct inode *inode, const struct iattr *attr); extern int file_update_time(struct file *file); static inline bool vma_is_dax(const struct vm_area_struct *vma) { return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); } static inline bool vma_is_fsdax(struct vm_area_struct *vma) { struct inode *inode; if (!IS_ENABLED(CONFIG_FS_DAX) || !vma->vm_file) return false; if (!vma_is_dax(vma)) return false; inode = file_inode(vma->vm_file); if (S_ISCHR(inode->i_mode)) return false; /* device-dax */ return true; } static inline int iocb_flags(struct file *file) { int res = 0; if (file->f_flags & O_APPEND) res |= IOCB_APPEND; if (file->f_flags & O_DIRECT) res |= IOCB_DIRECT; if (file->f_flags & O_DSYNC) res |= IOCB_DSYNC; if (file->f_flags & __O_SYNC) res |= IOCB_SYNC; return res; } static inline int kiocb_set_rw_flags(struct kiocb *ki, rwf_t flags, int rw_type) { int kiocb_flags = 0; /* make sure there's no overlap between RWF and private IOCB flags */ BUILD_BUG_ON((__force int) RWF_SUPPORTED & IOCB_EVENTFD); if (!flags) return 0; if (unlikely(flags & ~RWF_SUPPORTED)) return -EOPNOTSUPP; if (unlikely((flags & RWF_APPEND) && (flags & RWF_NOAPPEND))) return -EINVAL; if (flags & RWF_NOWAIT) { if (!(ki->ki_filp->f_mode & FMODE_NOWAIT)) return -EOPNOTSUPP; } if (flags & RWF_ATOMIC) { if (rw_type != WRITE) return -EOPNOTSUPP; if (!(ki->ki_filp->f_mode & FMODE_CAN_ATOMIC_WRITE)) return -EOPNOTSUPP; } if (flags & RWF_DONTCACHE) { /* file system must support it */ if (!(ki->ki_filp->f_op->fop_flags & FOP_DONTCACHE)) return -EOPNOTSUPP; /* DAX mappings not supported */ if (IS_DAX(ki->ki_filp->f_mapping->host)) return -EOPNOTSUPP; } kiocb_flags |= (__force int) (flags & RWF_SUPPORTED); if (flags & RWF_SYNC) kiocb_flags |= IOCB_DSYNC; if ((flags & RWF_NOAPPEND) && (ki->ki_flags & IOCB_APPEND)) { if (IS_APPEND(file_inode(ki->ki_filp))) return -EPERM; ki->ki_flags &= ~IOCB_APPEND; } ki->ki_flags |= kiocb_flags; return 0; } /* Transaction based IO helpers */ /* * An argresp is stored in an allocated page and holds the * size of the argument or response, along with its content */ struct simple_transaction_argresp { ssize_t size; char data[]; }; #define SIMPLE_TRANSACTION_LIMIT (PAGE_SIZE - sizeof(struct simple_transaction_argresp)) char *simple_transaction_get(struct file *file, const char __user *buf, size_t size); ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos); int simple_transaction_release(struct inode *inode, struct file *file); void simple_transaction_set(struct file *file, size_t n); /* * simple attribute files * * These attributes behave similar to those in sysfs: * * Writing to an attribute immediately sets a value, an open file can be * written to multiple times. * * Reading from an attribute creates a buffer from the value that might get * read with multiple read calls. When the attribute has been read * completely, no further read calls are possible until the file is opened * again. * * All attributes contain a text representation of a numeric value * that are accessed with the get() and set() functions. */ #define DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, __is_signed) \ static int __fops ## _open(struct inode *inode, struct file *file) \ { \ __simple_attr_check_format(__fmt, 0ull); \ return simple_attr_open(inode, file, __get, __set, __fmt); \ } \ static const struct file_operations __fops = { \ .owner = THIS_MODULE, \ .open = __fops ## _open, \ .release = simple_attr_release, \ .read = simple_attr_read, \ .write = (__is_signed) ? simple_attr_write_signed : simple_attr_write, \ .llseek = generic_file_llseek, \ } #define DEFINE_SIMPLE_ATTRIBUTE(__fops, __get, __set, __fmt) \ DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, false) #define DEFINE_SIMPLE_ATTRIBUTE_SIGNED(__fops, __get, __set, __fmt) \ DEFINE_SIMPLE_ATTRIBUTE_XSIGNED(__fops, __get, __set, __fmt, true) static inline __printf(1, 2) void __simple_attr_check_format(const char *fmt, ...) { /* don't do anything, just let the compiler check the arguments; */ } int simple_attr_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), const char *fmt); int simple_attr_release(struct inode *inode, struct file *file); ssize_t simple_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos); ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos); struct ctl_table; int __init list_bdev_fs_names(char *buf, size_t size); #define __FMODE_EXEC ((__force int) FMODE_EXEC) #define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE]) #define OPEN_FMODE(flag) ((__force fmode_t)((flag + 1) & O_ACCMODE)) static inline bool is_sxid(umode_t mode) { return mode & (S_ISUID | S_ISGID); } static inline int check_sticky(struct mnt_idmap *idmap, struct inode *dir, struct inode *inode) { if (!(dir->i_mode & S_ISVTX)) return 0; return __check_sticky(idmap, dir, inode); } static inline void inode_has_no_xattr(struct inode *inode) { if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & SB_NOSEC)) inode->i_flags |= S_NOSEC; } static inline bool is_root_inode(struct inode *inode) { return inode == inode->i_sb->s_root->d_inode; } static inline bool dir_emit(struct dir_context *ctx, const char *name, int namelen, u64 ino, unsigned type) { return ctx->actor(ctx, name, namelen, ctx->pos, ino, type); } static inline bool dir_emit_dot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, ".", 1, ctx->pos, file->f_path.dentry->d_inode->i_ino, DT_DIR); } static inline bool dir_emit_dotdot(struct file *file, struct dir_context *ctx) { return ctx->actor(ctx, "..", 2, ctx->pos, d_parent_ino(file->f_path.dentry), DT_DIR); } static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx) { if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) return false; ctx->pos = 1; } if (ctx->pos == 1) { if (!dir_emit_dotdot(file, ctx)) return false; ctx->pos = 2; } return true; } static inline bool dir_relax(struct inode *inode) { inode_unlock(inode); inode_lock(inode); return !IS_DEADDIR(inode); } static inline bool dir_relax_shared(struct inode *inode) { inode_unlock_shared(inode); inode_lock_shared(inode); return !IS_DEADDIR(inode); } extern bool path_noexec(const struct path *path); extern void inode_nohighmem(struct inode *inode); /* mm/fadvise.c */ extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice); extern int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice); static inline bool vfs_empty_path(int dfd, const char __user *path) { char c; if (dfd < 0) return false; /* We now allow NULL to be used for empty path. */ if (!path) return true; if (unlikely(get_user(c, path))) return false; return !c; } int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); #endif /* _LINUX_FS_H */
682 157 6 240 241 172 73 159 162 20 142 2 2 161 161 6 90 43 162 161 6 6 6 20 141 161 48 10 147 162 682 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2002 International Business Machines, Corp. * * This file is part of the SCTP kernel implementation * * These functions are the methods for accessing the SCTP inqueue. * * An SCTP inqueue is a queue into which you push SCTP packets * (which might be bundles or fragments of chunks) and out of which you * pop SCTP whole chunks. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <linux/interrupt.h> #include <linux/slab.h> /* Initialize an SCTP inqueue. */ void sctp_inq_init(struct sctp_inq *queue) { INIT_LIST_HEAD(&queue->in_chunk_list); queue->in_progress = NULL; /* Create a task for delivering data. */ INIT_WORK(&queue->immediate, NULL); } /* Properly release the chunk which is being worked on. */ static inline void sctp_inq_chunk_free(struct sctp_chunk *chunk) { if (chunk->head_skb) chunk->skb = chunk->head_skb; sctp_chunk_free(chunk); } /* Release the memory associated with an SCTP inqueue. */ void sctp_inq_free(struct sctp_inq *queue) { struct sctp_chunk *chunk, *tmp; /* Empty the queue. */ list_for_each_entry_safe(chunk, tmp, &queue->in_chunk_list, list) { list_del_init(&chunk->list); sctp_chunk_free(chunk); } /* If there is a packet which is currently being worked on, * free it as well. */ if (queue->in_progress) { sctp_inq_chunk_free(queue->in_progress); queue->in_progress = NULL; } } /* Put a new packet in an SCTP inqueue. * We assume that packet->sctp_hdr is set and in host byte order. */ void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *chunk) { /* Directly call the packet handling routine. */ if (chunk->rcvr->dead) { sctp_chunk_free(chunk); return; } /* We are now calling this either from the soft interrupt * or from the backlog processing. * Eventually, we should clean up inqueue to not rely * on the BH related data structures. */ list_add_tail(&chunk->list, &q->in_chunk_list); if (chunk->asoc) chunk->asoc->stats.ipackets++; q->immediate.func(&q->immediate); } /* Peek at the next chunk on the inqeue. */ struct sctp_chunkhdr *sctp_inq_peek(struct sctp_inq *queue) { struct sctp_chunk *chunk; struct sctp_chunkhdr *ch = NULL; chunk = queue->in_progress; /* If there is no more chunks in this packet, say so */ if (chunk->singleton || chunk->end_of_packet || chunk->pdiscard) return NULL; ch = (struct sctp_chunkhdr *)chunk->chunk_end; return ch; } /* Extract a chunk from an SCTP inqueue. * * WARNING: If you need to put the chunk on another queue, you need to * make a shallow copy (clone) of it. */ struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) { struct sctp_chunk *chunk; struct sctp_chunkhdr *ch = NULL; /* The assumption is that we are safe to process the chunks * at this time. */ chunk = queue->in_progress; if (chunk) { /* There is a packet that we have been working on. * Any post processing work to do before we move on? */ if (chunk->singleton || chunk->end_of_packet || chunk->pdiscard) { if (chunk->head_skb == chunk->skb) { chunk->skb = skb_shinfo(chunk->skb)->frag_list; goto new_skb; } if (chunk->skb->next) { chunk->skb = chunk->skb->next; goto new_skb; } sctp_inq_chunk_free(chunk); chunk = queue->in_progress = NULL; } else { /* Nothing to do. Next chunk in the packet, please. */ ch = (struct sctp_chunkhdr *)chunk->chunk_end; /* Force chunk->skb->data to chunk->chunk_end. */ skb_pull(chunk->skb, chunk->chunk_end - chunk->skb->data); /* We are guaranteed to pull a SCTP header. */ } } /* Do we need to take the next packet out of the queue to process? */ if (!chunk) { struct list_head *entry; next_chunk: /* Is the queue empty? */ entry = sctp_list_dequeue(&queue->in_chunk_list); if (!entry) return NULL; chunk = list_entry(entry, struct sctp_chunk, list); if (skb_is_gso(chunk->skb) && skb_is_gso_sctp(chunk->skb)) { /* GSO-marked skbs but without frags, handle * them normally */ if (skb_shinfo(chunk->skb)->frag_list) chunk->head_skb = chunk->skb; /* skbs with "cover letter" */ if (chunk->head_skb && chunk->skb->data_len == chunk->skb->len) chunk->skb = skb_shinfo(chunk->skb)->frag_list; if (WARN_ON(!chunk->skb)) { __SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS); sctp_chunk_free(chunk); goto next_chunk; } } if (chunk->asoc) sock_rps_save_rxhash(chunk->asoc->base.sk, chunk->skb); queue->in_progress = chunk; new_skb: /* This is the first chunk in the packet. */ ch = (struct sctp_chunkhdr *)chunk->skb->data; chunk->singleton = 1; chunk->data_accepted = 0; chunk->pdiscard = 0; chunk->auth = 0; chunk->has_asconf = 0; chunk->end_of_packet = 0; if (chunk->head_skb) { struct sctp_input_cb *cb = SCTP_INPUT_CB(chunk->skb), *head_cb = SCTP_INPUT_CB(chunk->head_skb); cb->chunk = head_cb->chunk; cb->af = head_cb->af; } } chunk->chunk_hdr = ch; chunk->chunk_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); skb_pull(chunk->skb, sizeof(*ch)); chunk->subh.v = NULL; /* Subheader is no longer valid. */ if (chunk->chunk_end + sizeof(*ch) <= skb_tail_pointer(chunk->skb)) { /* This is not a singleton */ chunk->singleton = 0; } else if (chunk->chunk_end > skb_tail_pointer(chunk->skb)) { /* Discard inside state machine. */ chunk->pdiscard = 1; chunk->chunk_end = skb_tail_pointer(chunk->skb); } else { /* We are at the end of the packet, so mark the chunk * in case we need to send a SACK. */ chunk->end_of_packet = 1; } pr_debug("+++sctp_inq_pop+++ chunk:%p[%s], length:%d, skb->len:%d\n", chunk, sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)), ntohs(chunk->chunk_hdr->length), chunk->skb->len); return chunk; } /* Set a top-half handler. * * Originally, we the top-half handler was scheduled as a BH. We now * call the handler directly in sctp_inq_push() at a time that * we know we are lock safe. * The intent is that this routine will pull stuff out of the * inqueue and process it. */ void sctp_inq_set_th_handler(struct sctp_inq *q, work_func_t callback) { INIT_WORK(&q->immediate, callback); }
2341 4525 4490 36 57 1753 1683 116 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 #ifndef _LINUX_JHASH_H #define _LINUX_JHASH_H /* jhash.h: Jenkins hash support. * * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net) * * https://burtleburtle.net/bob/hash/ * * These are the credits from Bob's sources: * * lookup3.c, by Bob Jenkins, May 2006, Public Domain. * * These are functions for producing 32-bit hashes for hash table lookup. * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() * are externally useful functions. Routines to test the hash are included * if SELF_TEST is defined. You can use this free for any purpose. It's in * the public domain. It has no warranty. * * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@netfilter.org) * * I've modified Bob's hash to be useful in the Linux kernel, and * any bugs present are my fault. * Jozsef */ #include <linux/bitops.h> #include <linux/unaligned/packed_struct.h> /* Best hash sizes are of power of two */ #define jhash_size(n) ((u32)1<<(n)) /* Mask the hash value, i.e (value & jhash_mask(n)) instead of (value % n) */ #define jhash_mask(n) (jhash_size(n)-1) /* __jhash_mix - mix 3 32-bit values reversibly. */ #define __jhash_mix(a, b, c) \ { \ a -= c; a ^= rol32(c, 4); c += b; \ b -= a; b ^= rol32(a, 6); a += c; \ c -= b; c ^= rol32(b, 8); b += a; \ a -= c; a ^= rol32(c, 16); c += b; \ b -= a; b ^= rol32(a, 19); a += c; \ c -= b; c ^= rol32(b, 4); b += a; \ } /* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */ #define __jhash_final(a, b, c) \ { \ c ^= b; c -= rol32(b, 14); \ a ^= c; a -= rol32(c, 11); \ b ^= a; b -= rol32(a, 25); \ c ^= b; c -= rol32(b, 16); \ a ^= c; a -= rol32(c, 4); \ b ^= a; b -= rol32(a, 14); \ c ^= b; c -= rol32(b, 24); \ } /* An arbitrary initial parameter */ #define JHASH_INITVAL 0xdeadbeef /* jhash - hash an arbitrary key * @k: sequence of bytes as key * @length: the length of the key * @initval: the previous hash, or an arbitrary value * * The generic version, hashes an arbitrary sequence of bytes. * No alignment or length assumptions are made about the input key. * * Returns the hash value of the key. The result depends on endianness. */ static inline u32 jhash(const void *key, u32 length, u32 initval) { u32 a, b, c; const u8 *k = key; /* Set up the internal state */ a = b = c = JHASH_INITVAL + length + initval; /* All but the last block: affect some 32 bits of (a,b,c) */ while (length > 12) { a += __get_unaligned_cpu32(k); b += __get_unaligned_cpu32(k + 4); c += __get_unaligned_cpu32(k + 8); __jhash_mix(a, b, c); length -= 12; k += 12; } /* Last block: affect all 32 bits of (c) */ switch (length) { case 12: c += (u32)k[11]<<24; fallthrough; case 11: c += (u32)k[10]<<16; fallthrough; case 10: c += (u32)k[9]<<8; fallthrough; case 9: c += k[8]; fallthrough; case 8: b += (u32)k[7]<<24; fallthrough; case 7: b += (u32)k[6]<<16; fallthrough; case 6: b += (u32)k[5]<<8; fallthrough; case 5: b += k[4]; fallthrough; case 4: a += (u32)k[3]<<24; fallthrough; case 3: a += (u32)k[2]<<16; fallthrough; case 2: a += (u32)k[1]<<8; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); break; case 0: /* Nothing left to add */ break; } return c; } /* jhash2 - hash an array of u32's * @k: the key which must be an array of u32's * @length: the number of u32's in the key * @initval: the previous hash, or an arbitrary value * * Returns the hash value of the key. */ static inline u32 jhash2(const u32 *k, u32 length, u32 initval) { u32 a, b, c; /* Set up the internal state */ a = b = c = JHASH_INITVAL + (length<<2) + initval; /* Handle most of the key */ while (length > 3) { a += k[0]; b += k[1]; c += k[2]; __jhash_mix(a, b, c); length -= 3; k += 3; } /* Handle the last 3 u32's */ switch (length) { case 3: c += k[2]; fallthrough; case 2: b += k[1]; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); break; case 0: /* Nothing left to add */ break; } return c; } /* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */ static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) { a += initval; b += initval; c += initval; __jhash_final(a, b, c); return c; } static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval) { return __jhash_nwords(a, b, c, initval + JHASH_INITVAL + (3 << 2)); } static inline u32 jhash_2words(u32 a, u32 b, u32 initval) { return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); } static inline u32 jhash_1word(u32 a, u32 initval) { return __jhash_nwords(a, 0, 0, initval + JHASH_INITVAL + (1 << 2)); } #endif /* _LINUX_JHASH_H */
430 429 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/exec.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * #!-checking implemented by tytso. */ /* * Demand-loading implemented 01.12.91 - no need to read anything but * the header into memory. The inode of the executable is put into * "current->executable", and page faults do the actual loading. Clean. * * Once more I can proudly say that linux stood up to being changed: it * was less than 2 hours work to get demand-loading completely implemented. * * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead, * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary * formats. */ #include <linux/kernel_read_file.h> #include <linux/slab.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/mm.h> #include <linux/stat.h> #include <linux/fcntl.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/signal.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/perf_event.h> #include <linux/highmem.h> #include <linux/spinlock.h> #include <linux/key.h> #include <linux/personality.h> #include <linux/binfmts.h> #include <linux/utsname.h> #include <linux/pid_namespace.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/audit.h> #include <linux/kmod.h> #include <linux/fsnotify.h> #include <linux/fs_struct.h> #include <linux/oom.h> #include <linux/compat.h> #include <linux/vmalloc.h> #include <linux/io_uring.h> #include <linux/syscall_user_dispatch.h> #include <linux/coredump.h> #include <linux/time_namespace.h> #include <linux/user_events.h> #include <linux/rseq.h> #include <linux/ksm.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> #include <trace/events/task.h> #include "internal.h" #include <trace/events/sched.h> static int bprm_creds_from_file(struct linux_binprm *bprm); int suid_dumpable = 0; static LIST_HEAD(formats); static DEFINE_RWLOCK(binfmt_lock); void __register_binfmt(struct linux_binfmt * fmt, int insert) { write_lock(&binfmt_lock); insert ? list_add(&fmt->lh, &formats) : list_add_tail(&fmt->lh, &formats); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(__register_binfmt); void unregister_binfmt(struct linux_binfmt * fmt) { write_lock(&binfmt_lock); list_del(&fmt->lh); write_unlock(&binfmt_lock); } EXPORT_SYMBOL(unregister_binfmt); static inline void put_binfmt(struct linux_binfmt * fmt) { module_put(fmt->module); } bool path_noexec(const struct path *path) { return (path->mnt->mnt_flags & MNT_NOEXEC) || (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); } #ifdef CONFIG_USELIB /* * Note that a shared library must be both readable and executable due to * security reasons. * * Also note that we take the address to load from the file itself. */ SYSCALL_DEFINE1(uselib, const char __user *, library) { struct linux_binfmt *fmt; struct file *file; struct filename *tmp = getname(library); int error = PTR_ERR(tmp); static const struct open_flags uselib_flags = { .open_flag = O_LARGEFILE | O_RDONLY, .acc_mode = MAY_READ | MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; if (IS_ERR(tmp)) goto out; file = do_filp_open(AT_FDCWD, tmp, &uselib_flags); putname(tmp); error = PTR_ERR(file); if (IS_ERR(file)) goto out; /* * Check do_open_execat() for an explanation. */ error = -EACCES; if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || path_noexec(&file->f_path)) goto exit; error = -ENOEXEC; read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { if (!fmt->load_shlib) continue; if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); error = fmt->load_shlib(file); read_lock(&binfmt_lock); put_binfmt(fmt); if (error != -ENOEXEC) break; } read_unlock(&binfmt_lock); exit: fput(file); out: return error; } #endif /* #ifdef CONFIG_USELIB */ #ifdef CONFIG_MMU /* * The nascent bprm->mm is not visible until exec_mmap() but it can * use a lot of memory, account these pages in current->mm temporary * for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we * change the counter back via acct_arg_size(0). */ static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { struct mm_struct *mm = current->mm; long diff = (long)(pages - bprm->vma_pages); if (!mm || !diff) return; bprm->vma_pages = pages; add_mm_counter(mm, MM_ANONPAGES, diff); } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; struct vm_area_struct *vma = bprm->vma; struct mm_struct *mm = bprm->mm; int ret; /* * Avoid relying on expanding the stack down in GUP (which * does not work for STACK_GROWSUP anyway), and just do it * ahead of time. */ if (!mmap_read_lock_maybe_expand(mm, vma, pos, write)) return NULL; /* * We are doing an exec(). 'current' is the process * doing the exec and 'mm' is the new process's mm. */ ret = get_user_pages_remote(mm, pos, 1, write ? FOLL_WRITE : 0, &page, NULL); mmap_read_unlock(mm); if (ret <= 0) return NULL; if (write) acct_arg_size(bprm, vma_pages(vma)); return page; } static void put_arg_page(struct page *page) { put_page(page); } static void free_arg_pages(struct linux_binprm *bprm) { } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { flush_cache_page(bprm->vma, pos, page_to_pfn(page)); } static int __bprm_mm_init(struct linux_binprm *bprm) { int err; struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; bprm->vma = vma = vm_area_alloc(mm); if (!vma) return -ENOMEM; vma_set_anonymous(vma); if (mmap_write_lock_killable(mm)) { err = -EINTR; goto err_free; } /* * Need to be called with mmap write lock * held, to avoid race with ksmd. */ err = ksm_execve(mm); if (err) goto err_ksm; /* * Place the stack at the largest stack address the architecture * supports. Later, we'll move this to an appropriate place. We don't * use STACK_TOP because that can depend on attributes which aren't * configured yet. */ BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); if (err) goto err; mm->stack_vm = mm->total_vm = 1; mmap_write_unlock(mm); bprm->p = vma->vm_end - sizeof(void *); return 0; err: ksm_exit(mm); err_ksm: mmap_write_unlock(mm); err_free: bprm->vma = NULL; vm_area_free(vma); return err; } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= MAX_ARG_STRLEN; } #else static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages) { } static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, int write) { struct page *page; page = bprm->page[pos / PAGE_SIZE]; if (!page && write) { page = alloc_page(GFP_HIGHUSER|__GFP_ZERO); if (!page) return NULL; bprm->page[pos / PAGE_SIZE] = page; } return page; } static void put_arg_page(struct page *page) { } static void free_arg_page(struct linux_binprm *bprm, int i) { if (bprm->page[i]) { __free_page(bprm->page[i]); bprm->page[i] = NULL; } } static void free_arg_pages(struct linux_binprm *bprm) { int i; for (i = 0; i < MAX_ARG_PAGES; i++) free_arg_page(bprm, i); } static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, struct page *page) { } static int __bprm_mm_init(struct linux_binprm *bprm) { bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); return 0; } static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= bprm->p; } #endif /* CONFIG_MMU */ /* * Create a new mm_struct and populate it with a temporary stack * vm_area_struct. We don't have enough context at this point to set the stack * flags, permissions, and offset, so we use temporary values. We'll update * them later in setup_arg_pages(). */ static int bprm_mm_init(struct linux_binprm *bprm) { int err; struct mm_struct *mm = NULL; bprm->mm = mm = mm_alloc(); err = -ENOMEM; if (!mm) goto err; /* Save current stack limit for all calculations made during exec. */ task_lock(current->group_leader); bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; task_unlock(current->group_leader); err = __bprm_mm_init(bprm); if (err) goto err; return 0; err: if (mm) { bprm->mm = NULL; mmdrop(mm); } return err; } struct user_arg_ptr { #ifdef CONFIG_COMPAT bool is_compat; #endif union { const char __user *const __user *native; #ifdef CONFIG_COMPAT const compat_uptr_t __user *compat; #endif } ptr; }; static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr) { const char __user *native; #ifdef CONFIG_COMPAT if (unlikely(argv.is_compat)) { compat_uptr_t compat; if (get_user(compat, argv.ptr.compat + nr)) return ERR_PTR(-EFAULT); return compat_ptr(compat); } #endif if (get_user(native, argv.ptr.native + nr)) return ERR_PTR(-EFAULT); return native; } /* * count() counts the number of strings in array ARGV. */ static int count(struct user_arg_ptr argv, int max) { int i = 0; if (argv.ptr.native != NULL) { for (;;) { const char __user *p = get_user_arg_ptr(argv, i); if (!p) break; if (IS_ERR(p)) return -EFAULT; if (i >= max) return -E2BIG; ++i; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } } return i; } static int count_strings_kernel(const char *const *argv) { int i; if (!argv) return 0; for (i = 0; argv[i]; ++i) { if (i >= MAX_ARG_STRINGS) return -E2BIG; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return i; } static inline int bprm_set_stack_limit(struct linux_binprm *bprm, unsigned long limit) { #ifdef CONFIG_MMU /* Avoid a pathological bprm->p. */ if (bprm->p < limit) return -E2BIG; bprm->argmin = bprm->p - limit; #endif return 0; } static inline bool bprm_hit_stack_limit(struct linux_binprm *bprm) { #ifdef CONFIG_MMU return bprm->p < bprm->argmin; #else return false; #endif } /* * Calculate bprm->argmin from: * - _STK_LIM * - ARG_MAX * - bprm->rlim_stack.rlim_cur * - bprm->argc * - bprm->envc * - bprm->p */ static int bprm_stack_limits(struct linux_binprm *bprm) { unsigned long limit, ptr_size; /* * Limit to 1/4 of the max stack size or 3/4 of _STK_LIM * (whichever is smaller) for the argv+env strings. * This ensures that: * - the remaining binfmt code will not run out of stack space, * - the program will have a reasonable amount of stack left * to work from. */ limit = _STK_LIM / 4 * 3; limit = min(limit, bprm->rlim_stack.rlim_cur / 4); /* * We've historically supported up to 32 pages (ARG_MAX) * of argument strings even with small stacks */ limit = max_t(unsigned long, limit, ARG_MAX); /* Reject totally pathological counts. */ if (bprm->argc < 0 || bprm->envc < 0) return -E2BIG; /* * We must account for the size of all the argv and envp pointers to * the argv and envp strings, since they will also take up space in * the stack. They aren't stored until much later when we can't * signal to the parent that the child has run out of stack space. * Instead, calculate it here so it's possible to fail gracefully. * * In the case of argc = 0, make sure there is space for adding a * empty string (which will bump argc to 1), to ensure confused * userspace programs don't start processing from argv[1], thinking * argc can never be 0, to keep them from walking envp by accident. * See do_execveat_common(). */ if (check_add_overflow(max(bprm->argc, 1), bprm->envc, &ptr_size) || check_mul_overflow(ptr_size, sizeof(void *), &ptr_size)) return -E2BIG; if (limit <= ptr_size) return -E2BIG; limit -= ptr_size; return bprm_set_stack_limit(bprm, limit); } /* * 'copy_strings()' copies argument/environment strings from the old * processes's memory to the new process's stack. The call to get_user_pages() * ensures the destination page is created and not swapped out. */ static int copy_strings(int argc, struct user_arg_ptr argv, struct linux_binprm *bprm) { struct page *kmapped_page = NULL; char *kaddr = NULL; unsigned long kpos = 0; int ret; while (argc-- > 0) { const char __user *str; int len; unsigned long pos; ret = -EFAULT; str = get_user_arg_ptr(argv, argc); if (IS_ERR(str)) goto out; len = strnlen_user(str, MAX_ARG_STRLEN); if (!len) goto out; ret = -E2BIG; if (!valid_arg_len(bprm, len)) goto out; /* We're going to work our way backwards. */ pos = bprm->p; str += len; bprm->p -= len; if (bprm_hit_stack_limit(bprm)) goto out; while (len > 0) { int offset, bytes_to_copy; if (fatal_signal_pending(current)) { ret = -ERESTARTNOHAND; goto out; } cond_resched(); offset = pos % PAGE_SIZE; if (offset == 0) offset = PAGE_SIZE; bytes_to_copy = offset; if (bytes_to_copy > len) bytes_to_copy = len; offset -= bytes_to_copy; pos -= bytes_to_copy; str -= bytes_to_copy; len -= bytes_to_copy; if (!kmapped_page || kpos != (pos & PAGE_MASK)) { struct page *page; page = get_arg_page(bprm, pos, 1); if (!page) { ret = -E2BIG; goto out; } if (kmapped_page) { flush_dcache_page(kmapped_page); kunmap_local(kaddr); put_arg_page(kmapped_page); } kmapped_page = page; kaddr = kmap_local_page(kmapped_page); kpos = pos & PAGE_MASK; flush_arg_page(bprm, kpos, kmapped_page); } if (copy_from_user(kaddr+offset, str, bytes_to_copy)) { ret = -EFAULT; goto out; } } } ret = 0; out: if (kmapped_page) { flush_dcache_page(kmapped_page); kunmap_local(kaddr); put_arg_page(kmapped_page); } return ret; } /* * Copy and argument/environment string from the kernel to the processes stack. */ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) { int len = strnlen(arg, MAX_ARG_STRLEN) + 1 /* terminating NUL */; unsigned long pos = bprm->p; if (len == 0) return -EFAULT; if (!valid_arg_len(bprm, len)) return -E2BIG; /* We're going to work our way backwards. */ arg += len; bprm->p -= len; if (bprm_hit_stack_limit(bprm)) return -E2BIG; while (len > 0) { unsigned int bytes_to_copy = min_t(unsigned int, len, min_not_zero(offset_in_page(pos), PAGE_SIZE)); struct page *page; pos -= bytes_to_copy; arg -= bytes_to_copy; len -= bytes_to_copy; page = get_arg_page(bprm, pos, 1); if (!page) return -E2BIG; flush_arg_page(bprm, pos & PAGE_MASK, page); memcpy_to_page(page, offset_in_page(pos), arg, bytes_to_copy); put_arg_page(page); } return 0; } EXPORT_SYMBOL(copy_string_kernel); static int copy_strings_kernel(int argc, const char *const *argv, struct linux_binprm *bprm) { while (argc-- > 0) { int ret = copy_string_kernel(argv[argc], bprm); if (ret < 0) return ret; if (fatal_signal_pending(current)) return -ERESTARTNOHAND; cond_resched(); } return 0; } #ifdef CONFIG_MMU /* * Finalizes the stack vm_area_struct. The flags and permissions are updated, * the stack is optionally relocated, and some extra space is added. */ int setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int executable_stack) { unsigned long ret; unsigned long stack_shift; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = bprm->vma; struct vm_area_struct *prev = NULL; unsigned long vm_flags; unsigned long stack_base; unsigned long stack_size; unsigned long stack_expand; unsigned long rlim_stack; struct mmu_gather tlb; struct vma_iterator vmi; #ifdef CONFIG_STACK_GROWSUP /* Limit stack size */ stack_base = bprm->rlim_stack.rlim_max; stack_base = calc_max_stack_size(stack_base); /* Add space for stack randomization. */ if (current->flags & PF_RANDOMIZE) stack_base += (STACK_RND_MASK << PAGE_SHIFT); /* Make sure we didn't let the argument array grow too large. */ if (vma->vm_end - vma->vm_start > stack_base) return -ENOMEM; stack_base = PAGE_ALIGN(stack_top - stack_base); stack_shift = vma->vm_start - stack_base; mm->arg_start = bprm->p - stack_shift; bprm->p = vma->vm_end - stack_shift; #else stack_top = arch_align_stack(stack_top); stack_top = PAGE_ALIGN(stack_top); if (unlikely(stack_top < mmap_min_addr) || unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr)) return -ENOMEM; stack_shift = vma->vm_end - stack_top; bprm->p -= stack_shift; mm->arg_start = bprm->p; #endif if (bprm->loader) bprm->loader -= stack_shift; bprm->exec -= stack_shift; if (mmap_write_lock_killable(mm)) return -EINTR; vm_flags = VM_STACK_FLAGS; /* * Adjust stack execute permissions; explicitly enable for * EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone * (arch default) otherwise. */ if (unlikely(executable_stack == EXSTACK_ENABLE_X)) vm_flags |= VM_EXEC; else if (executable_stack == EXSTACK_DISABLE_X) vm_flags &= ~VM_EXEC; vm_flags |= mm->def_flags; vm_flags |= VM_STACK_INCOMPLETE_SETUP; vma_iter_init(&vmi, mm, vma->vm_start); tlb_gather_mmu(&tlb, mm); ret = mprotect_fixup(&vmi, &tlb, vma, &prev, vma->vm_start, vma->vm_end, vm_flags); tlb_finish_mmu(&tlb); if (ret) goto out_unlock; BUG_ON(prev != vma); if (unlikely(vm_flags & VM_EXEC)) { pr_warn_once("process '%pD4' started with executable stack\n", bprm->file); } /* Move stack pages down in memory. */ if (stack_shift) { /* * During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once * the binfmt code determines where the new stack should reside, we shift it to * its final location. */ ret = relocate_vma_down(vma, stack_shift); if (ret) goto out_unlock; } /* mprotect_fixup is overkill to remove the temporary stack flags */ vm_flags_clear(vma, VM_STACK_INCOMPLETE_SETUP); stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */ stack_size = vma->vm_end - vma->vm_start; /* * Align this down to a page boundary as expand_stack * will align it up. */ rlim_stack = bprm->rlim_stack.rlim_cur & PAGE_MASK; stack_expand = min(rlim_stack, stack_size + stack_expand); #ifdef CONFIG_STACK_GROWSUP stack_base = vma->vm_start + stack_expand; #else stack_base = vma->vm_end - stack_expand; #endif current->mm->start_stack = bprm->p; ret = expand_stack_locked(vma, stack_base); if (ret) ret = -EFAULT; out_unlock: mmap_write_unlock(mm); return ret; } EXPORT_SYMBOL(setup_arg_pages); #else /* * Transfer the program arguments and environment from the holding pages * onto the stack. The provided stack pointer is adjusted accordingly. */ int transfer_args_to_stack(struct linux_binprm *bprm, unsigned long *sp_location) { unsigned long index, stop, sp; int ret = 0; stop = bprm->p >> PAGE_SHIFT; sp = *sp_location; for (index = MAX_ARG_PAGES - 1; index >= stop; index--) { unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0; char *src = kmap_local_page(bprm->page[index]) + offset; sp -= PAGE_SIZE - offset; if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0) ret = -EFAULT; kunmap_local(src); if (ret) goto out; } bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE; *sp_location = sp; out: return ret; } EXPORT_SYMBOL(transfer_args_to_stack); #endif /* CONFIG_MMU */ /* * On success, caller must call do_close_execat() on the returned * struct file to close it. */ static struct file *do_open_execat(int fd, struct filename *name, int flags) { int err; struct file *file __free(fput) = NULL; struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, .acc_mode = MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH | AT_EXECVE_CHECK)) != 0) return ERR_PTR(-EINVAL); if (flags & AT_SYMLINK_NOFOLLOW) open_exec_flags.lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) open_exec_flags.lookup_flags |= LOOKUP_EMPTY; file = do_filp_open(fd, name, &open_exec_flags); if (IS_ERR(file)) return file; /* * In the past the regular type check was here. It moved to may_open() in * 633fb6ac3980 ("exec: move S_ISREG() check earlier"). Since then it is * an invariant that all non-regular files error out before we get here. */ if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || path_noexec(&file->f_path)) return ERR_PTR(-EACCES); err = exe_file_deny_write_access(file); if (err) return ERR_PTR(err); return no_free_ptr(file); } /** * open_exec - Open a path name for execution * * @name: path name to open with the intent of executing it. * * Returns ERR_PTR on failure or allocated struct file on success. * * As this is a wrapper for the internal do_open_execat(), callers * must call exe_file_allow_write_access() before fput() on release. Also see * do_close_execat(). */ struct file *open_exec(const char *name) { struct filename *filename = getname_kernel(name); struct file *f = ERR_CAST(filename); if (!IS_ERR(filename)) { f = do_open_execat(AT_FDCWD, filename, 0); putname(filename); } return f; } EXPORT_SYMBOL(open_exec); #if defined(CONFIG_BINFMT_FLAT) || defined(CONFIG_BINFMT_ELF_FDPIC) ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); if (res > 0) flush_icache_user_range(addr, addr + len); return res; } EXPORT_SYMBOL(read_code); #endif /* * Maps the mm_struct mm into the current task struct. * On success, this function returns with exec_update_lock * held for writing. */ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; exec_mm_release(tsk, old_mm); ret = down_write_killable(&tsk->signal->exec_update_lock); if (ret) return ret; if (old_mm) { /* * If there is a pending fatal signal perhaps a signal * whose default action is to create a coredump get * out and die instead of going through with the exec. */ ret = mmap_read_lock_killable(old_mm); if (ret) { up_write(&tsk->signal->exec_update_lock); return ret; } } task_lock(tsk); membarrier_exec_mmap(mm); local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; mm_init_cid(mm, tsk); /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for * lazy tlb mm refcounting when these are updated by context * switches. Not all architectures can handle irqs off over * activate_mm yet. */ if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); lru_gen_add_mm(mm); task_unlock(tsk); lru_gen_use_mm(mm); if (old_mm) { mmap_read_unlock(old_mm); BUG_ON(active_mm != old_mm); setmax_mm_hiwater_rss(&tsk->signal->maxrss, old_mm); mm_update_next_owner(old_mm); mmput(old_mm); return 0; } mmdrop_lazy_tlb(active_mm); return 0; } static int de_thread(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; struct sighand_struct *oldsighand = tsk->sighand; spinlock_t *lock = &oldsighand->siglock; if (thread_group_empty(tsk)) goto no_thread_group; /* * Kill all other threads in the thread group. */ spin_lock_irq(lock); if ((sig->flags & SIGNAL_GROUP_EXIT) || sig->group_exec_task) { /* * Another group action in progress, just * return so that the signal is processed. */ spin_unlock_irq(lock); return -EAGAIN; } sig->group_exec_task = tsk; sig->notify_count = zap_other_threads(tsk); if (!thread_group_leader(tsk)) sig->notify_count--; while (sig->notify_count) { __set_current_state(TASK_KILLABLE); spin_unlock_irq(lock); schedule(); if (__fatal_signal_pending(tsk)) goto killed; spin_lock_irq(lock); } spin_unlock_irq(lock); /* * At this point all other threads have exited, all we have to * do is to wait for the thread group leader to become inactive, * and to assume its PID: */ if (!thread_group_leader(tsk)) { struct task_struct *leader = tsk->group_leader; for (;;) { cgroup_threadgroup_change_begin(tsk); write_lock_irq(&tasklist_lock); /* * Do this under tasklist_lock to ensure that * exit_notify() can't miss ->group_exec_task */ sig->notify_count = -1; if (likely(leader->exit_state)) break; __set_current_state(TASK_KILLABLE); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); schedule(); if (__fatal_signal_pending(tsk)) goto killed; } /* * The only record we have of the real-time age of a * process, regardless of execs it's done, is start_time. * All the past CPU time is accumulated in signal_struct * from sister threads now dead. But in this non-leader * exec, nothing survives from the original leader thread, * whose birth marks the true age of this process now. * When we take on its identity by switching to its PID, we * also take its birthdate (always earlier than our own). */ tsk->start_time = leader->start_time; tsk->start_boottime = leader->start_boottime; BUG_ON(!same_thread_group(leader, tsk)); /* * An exec() starts a new thread group with the * TGID of the previous thread group. Rehash the * two threads with a switched PID, and release * the former thread group leader: */ /* Become a process group leader with the old leader's pid. * The old leader becomes a thread of the this thread group. */ exchange_tids(tsk, leader); transfer_pid(leader, tsk, PIDTYPE_TGID); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); list_replace_rcu(&leader->tasks, &tsk->tasks); list_replace_init(&leader->sibling, &tsk->sibling); tsk->group_leader = tsk; leader->group_leader = tsk; tsk->exit_signal = SIGCHLD; leader->exit_signal = -1; BUG_ON(leader->exit_state != EXIT_ZOMBIE); leader->exit_state = EXIT_DEAD; /* * We are going to release_task()->ptrace_unlink() silently, * the tracer can sleep in do_wait(). EXIT_DEAD guarantees * the tracer won't block again waiting for this thread. */ if (unlikely(leader->ptrace)) __wake_up_parent(leader, leader->parent); write_unlock_irq(&tasklist_lock); cgroup_threadgroup_change_end(tsk); release_task(leader); } sig->group_exec_task = NULL; sig->notify_count = 0; no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; BUG_ON(!thread_group_leader(tsk)); return 0; killed: /* protects against exit_notify() and __exit_signal() */ read_lock(&tasklist_lock); sig->group_exec_task = NULL; sig->notify_count = 0; read_unlock(&tasklist_lock); return -EAGAIN; } /* * This function makes sure the current process has its own signal table, * so that flush_signal_handlers can later reset the handlers without * disturbing other processes. (Other processes might share the signal * table via the CLONE_SIGHAND option to clone().) */ static int unshare_sighand(struct task_struct *me) { struct sighand_struct *oldsighand = me->sighand; if (refcount_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; /* * This ->sighand is shared with the CLONE_SIGHAND * but not CLONE_THREAD task, switch to the new one. */ newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); if (!newsighand) return -ENOMEM; refcount_set(&newsighand->count, 1); write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); memcpy(newsighand->action, oldsighand->action, sizeof(newsighand->action)); rcu_assign_pointer(me->sighand, newsighand); spin_unlock(&oldsighand->siglock); write_unlock_irq(&tasklist_lock); __cleanup_sighand(oldsighand); } return 0; } /* * This is unlocked -- the string will always be NUL-terminated, but * may show overlapping contents if racing concurrent reads. */ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { size_t len = min(strlen(buf), sizeof(tsk->comm) - 1); trace_task_rename(tsk, buf); memcpy(tsk->comm, buf, len); memset(&tsk->comm[len], 0, sizeof(tsk->comm) - len); perf_event_comm(tsk, exec); } /* * Calling this is the point of no return. None of the failures will be * seen by userspace since either the process is already taking a fatal * signal (via de_thread() or coredump), or will have SEGV raised * (after exec_mmap()) by search_binary_handler (see below). */ int begin_new_exec(struct linux_binprm * bprm) { struct task_struct *me = current; int retval; /* Once we are committed compute the creds */ retval = bprm_creds_from_file(bprm); if (retval) return retval; /* * This tracepoint marks the point before flushing the old exec where * the current task is still unchanged, but errors are fatal (point of * no return). The later "sched_process_exec" tracepoint is called after * the current task has successfully switched to the new exec. */ trace_sched_prepare_exec(current, bprm); /* * Ensure all future errors are fatal. */ bprm->point_of_no_return = true; /* * Make this the only thread in the thread group. */ retval = de_thread(me); if (retval) goto out; /* * Cancel any io_uring activity across execve */ io_uring_task_cancel(); /* Ensure the files table is not shared. */ retval = unshare_files(); if (retval) goto out; /* * Must be called _before_ exec_mmap() as bprm->mm is * not visible until then. Doing it here also ensures * we don't race against replace_mm_exe_file(). */ retval = set_mm_exe_file(bprm->mm, bprm->file); if (retval) goto out; /* If the binary is not readable then enforce mm->dumpable=0 */ would_dump(bprm, bprm->file); if (bprm->have_execfd) would_dump(bprm, bprm->executable); /* * Release all of the old mmap stuff */ acct_arg_size(bprm, 0); retval = exec_mmap(bprm->mm); if (retval) goto out; bprm->mm = NULL; retval = exec_task_namespaces(); if (retval) goto out_unlock; #ifdef CONFIG_POSIX_TIMERS spin_lock_irq(&me->sighand->siglock); posix_cpu_timers_exit(me); spin_unlock_irq(&me->sighand->siglock); exit_itimers(me); flush_itimer_signals(); #endif /* * Make the signal table private. */ retval = unshare_sighand(me); if (retval) goto out_unlock; me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); me->personality &= ~bprm->per_clear; clear_syscall_work_syscall_user_dispatch(me); /* * We have to apply CLOEXEC before we change whether the process is * dumpable (in setup_new_exec) to avoid a race with a process in userspace * trying to access the should-be-closed file descriptors of a process * undergoing exec(2). */ do_close_on_exec(me->files); if (bprm->secureexec) { /* Make sure parent cannot signal privileged process. */ me->pdeath_signal = 0; /* * For secureexec, reset the stack limit to sane default to * avoid bad behavior from the prior rlimits. This has to * happen before arch_pick_mmap_layout(), which examines * RLIMIT_STACK, but after the point of no return to avoid * needing to clean up the change on failure. */ if (bprm->rlim_stack.rlim_cur > _STK_LIM) bprm->rlim_stack.rlim_cur = _STK_LIM; } me->sas_ss_sp = me->sas_ss_size = 0; /* * Figure out dumpability. Note that this checking only of current * is wrong, but userspace depends on it. This should be testing * bprm->secureexec instead. */ if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || !(uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid()))) set_dumpable(current->mm, suid_dumpable); else set_dumpable(current->mm, SUID_DUMP_USER); perf_event_exec(); /* * If the original filename was empty, alloc_bprm() made up a path * that will probably not be useful to admins running ps or similar. * Let's fix it up to be something reasonable. */ if (bprm->comm_from_dentry) { /* * Hold RCU lock to keep the name from being freed behind our back. * Use acquire semantics to make sure the terminating NUL from * __d_alloc() is seen. * * Note, we're deliberately sloppy here. We don't need to care about * detecting a concurrent rename and just want a terminated name. */ rcu_read_lock(); __set_task_comm(me, smp_load_acquire(&bprm->file->f_path.dentry->d_name.name), true); rcu_read_unlock(); } else { __set_task_comm(me, kbasename(bprm->filename), true); } /* An exec changes our domain. We are no longer part of the thread group */ WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1); flush_signal_handlers(me, 0); retval = set_cred_ucounts(bprm->cred); if (retval < 0) goto out_unlock; /* * install the new credentials for this executable */ security_bprm_committing_creds(bprm); commit_creds(bprm->cred); bprm->cred = NULL; /* * Disable monitoring for regular users * when executing setuid binaries. Must * wait until new credentials are committed * by commit_creds() above */ if (get_dumpable(me->mm) != SUID_DUMP_USER) perf_event_exit_task(me); /* * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's * credentials; any time after this it may be unlocked. */ security_bprm_committed_creds(bprm); /* Pass the opened binary to the interpreter. */ if (bprm->have_execfd) { retval = get_unused_fd_flags(0); if (retval < 0) goto out_unlock; fd_install(retval, bprm->executable); bprm->executable = NULL; bprm->execfd = retval; } return 0; out_unlock: up_write(&me->signal->exec_update_lock); if (!bprm->cred) mutex_unlock(&me->signal->cred_guard_mutex); out: return retval; } EXPORT_SYMBOL(begin_new_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { struct inode *inode = file_inode(file); struct mnt_idmap *idmap = file_mnt_idmap(file); if (inode_permission(idmap, inode, MAY_READ) < 0) { struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; /* Ensure mm->user_ns contains the executable */ user_ns = old = bprm->mm->user_ns; while ((user_ns != &init_user_ns) && !privileged_wrt_inode_uidgid(user_ns, idmap, inode)) user_ns = user_ns->parent; if (old != user_ns) { bprm->mm->user_ns = get_user_ns(user_ns); put_user_ns(old); } } } EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { /* Setup things that can depend upon the personality */ struct task_struct *me = current; arch_pick_mmap_layout(me->mm, &bprm->rlim_stack); arch_setup_new_exec(); /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on * some architectures like powerpc */ me->mm->task_size = TASK_SIZE; up_write(&me->signal->exec_update_lock); mutex_unlock(&me->signal->cred_guard_mutex); } EXPORT_SYMBOL(setup_new_exec); /* Runs immediately before start_thread() takes over. */ void finalize_exec(struct linux_binprm *bprm) { /* Store any stack rlimit changes before starting thread. */ task_lock(current->group_leader); current->signal->rlim[RLIMIT_STACK] = bprm->rlim_stack; task_unlock(current->group_leader); } EXPORT_SYMBOL(finalize_exec); /* * Prepare credentials and lock ->cred_guard_mutex. * setup_new_exec() commits the new creds and drops the lock. * Or, if exec fails before, free_bprm() should release ->cred * and unlock. */ static int prepare_bprm_creds(struct linux_binprm *bprm) { if (mutex_lock_interruptible(&current->signal->cred_guard_mutex)) return -ERESTARTNOINTR; bprm->cred = prepare_exec_creds(); if (likely(bprm->cred)) return 0; mutex_unlock(&current->signal->cred_guard_mutex); return -ENOMEM; } /* Matches do_open_execat() */ static void do_close_execat(struct file *file) { if (!file) return; exe_file_allow_write_access(file); fput(file); } static void free_bprm(struct linux_binprm *bprm) { if (bprm->mm) { acct_arg_size(bprm, 0); mmput(bprm->mm); } free_arg_pages(bprm); if (bprm->cred) { mutex_unlock(&current->signal->cred_guard_mutex); abort_creds(bprm->cred); } do_close_execat(bprm->file); if (bprm->executable) fput(bprm->executable); /* If a binfmt changed the interp, free it. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); kfree(bprm->fdpath); kfree(bprm); } static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags) { struct linux_binprm *bprm; struct file *file; int retval = -ENOMEM; file = do_open_execat(fd, filename, flags); if (IS_ERR(file)) return ERR_CAST(file); bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); if (!bprm) { do_close_execat(file); return ERR_PTR(-ENOMEM); } bprm->file = file; if (fd == AT_FDCWD || filename->name[0] == '/') { bprm->filename = filename->name; } else { if (filename->name[0] == '\0') { bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); bprm->comm_from_dentry = 1; } else { bprm->fdpath = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", fd, filename->name); } if (!bprm->fdpath) goto out_free; /* * Record that a name derived from an O_CLOEXEC fd will be * inaccessible after exec. This allows the code in exec to * choose to fail when the executable is not mmaped into the * interpreter and an open file descriptor is not passed to * the interpreter. This makes for a better user experience * than having the interpreter start and then immediately fail * when it finds the executable is inaccessible. */ if (get_close_on_exec(fd)) bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE; bprm->filename = bprm->fdpath; } bprm->interp = bprm->filename; /* * At this point, security_file_open() has already been called (with * __FMODE_EXEC) and access control checks for AT_EXECVE_CHECK will * stop just after the security_bprm_creds_for_exec() call in * bprm_execve(). Indeed, the kernel should not try to parse the * content of the file with exec_binprm() nor change the calling * thread, which means that the following security functions will not * be called: * - security_bprm_check() * - security_bprm_creds_from_file() * - security_bprm_committing_creds() * - security_bprm_committed_creds() */ bprm->is_check = !!(flags & AT_EXECVE_CHECK); retval = bprm_mm_init(bprm); if (!retval) return bprm; out_free: free_bprm(bprm); return ERR_PTR(retval); } int bprm_change_interp(const char *interp, struct linux_binprm *bprm) { /* If a binfmt changed the interp, free it first. */ if (bprm->interp != bprm->filename) kfree(bprm->interp); bprm->interp = kstrdup(interp, GFP_KERNEL); if (!bprm->interp) return -ENOMEM; return 0; } EXPORT_SYMBOL(bprm_change_interp); /* * determine how safe it is to execute the proposed program * - the caller must hold ->cred_guard_mutex to protect against * PTRACE_ATTACH or seccomp thread-sync */ static void check_unsafe_exec(struct linux_binprm *bprm) { struct task_struct *p = current, *t; unsigned n_fs; if (p->ptrace) bprm->unsafe |= LSM_UNSAFE_PTRACE; /* * This isn't strictly necessary, but it makes it harder for LSMs to * mess up. */ if (task_no_new_privs(current)) bprm->unsafe |= LSM_UNSAFE_NO_NEW_PRIVS; /* * If another task is sharing our fs, we cannot safely * suid exec because the differently privileged task * will be able to manipulate the current directory, etc. * It would be nice to force an unshare instead... */ n_fs = 1; spin_lock(&p->fs->lock); rcu_read_lock(); for_other_threads(p, t) { if (t->fs == p->fs) n_fs++; } rcu_read_unlock(); /* "users" and "in_exec" locked for copy_fs() */ if (p->fs->users > n_fs) bprm->unsafe |= LSM_UNSAFE_SHARE; else p->fs->in_exec = 1; spin_unlock(&p->fs->lock); } static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) { /* Handle suid and sgid on files */ struct mnt_idmap *idmap; struct inode *inode = file_inode(file); unsigned int mode; vfsuid_t vfsuid; vfsgid_t vfsgid; int err; if (!mnt_may_suid(file->f_path.mnt)) return; if (task_no_new_privs(current)) return; mode = READ_ONCE(inode->i_mode); if (!(mode & (S_ISUID|S_ISGID))) return; idmap = file_mnt_idmap(file); /* Be careful if suid/sgid is set */ inode_lock(inode); /* Atomically reload and check mode/uid/gid now that lock held. */ mode = inode->i_mode; vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid = i_gid_into_vfsgid(idmap, inode); err = inode_permission(idmap, inode, MAY_EXEC); inode_unlock(inode); /* Did the exec bit vanish out from under us? Give up. */ if (err) return; /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!vfsuid_has_mapping(bprm->cred->user_ns, vfsuid) || !vfsgid_has_mapping(bprm->cred->user_ns, vfsgid)) return; if (mode & S_ISUID) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->euid = vfsuid_into_kuid(vfsuid); } if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { bprm->per_clear |= PER_CLEAR_ON_SETID; bprm->cred->egid = vfsgid_into_kgid(vfsgid); } } /* * Compute brpm->cred based upon the final binary. */ static int bprm_creds_from_file(struct linux_binprm *bprm) { /* Compute creds based on which file? */ struct file *file = bprm->execfd_creds ? bprm->executable : bprm->file; bprm_fill_uid(bprm, file); return security_bprm_creds_from_file(bprm, file); } /* * Fill the binprm structure from the inode. * Read the first BINPRM_BUF_SIZE bytes * * This may be called multiple times for binary chains (scripts for example). */ static int prepare_binprm(struct linux_binprm *bprm) { loff_t pos = 0; memset(bprm->buf, 0, BINPRM_BUF_SIZE); return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos); } /* * Arguments are '\0' separated strings found at the location bprm->p * points to; chop off the first by relocating brpm->p to right after * the first '\0' encountered. */ int remove_arg_zero(struct linux_binprm *bprm) { unsigned long offset; char *kaddr; struct page *page; if (!bprm->argc) return 0; do { offset = bprm->p & ~PAGE_MASK; page = get_arg_page(bprm, bprm->p, 0); if (!page) return -EFAULT; kaddr = kmap_local_page(page); for (; offset < PAGE_SIZE && kaddr[offset]; offset++, bprm->p++) ; kunmap_local(kaddr); put_arg_page(page); } while (offset == PAGE_SIZE); bprm->p++; bprm->argc--; return 0; } EXPORT_SYMBOL(remove_arg_zero); /* * cycle the list of binary formats handler, until one recognizes the image */ static int search_binary_handler(struct linux_binprm *bprm) { struct linux_binfmt *fmt; int retval; retval = prepare_binprm(bprm); if (retval < 0) return retval; retval = security_bprm_check(bprm); if (retval) return retval; read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); retval = fmt->load_binary(bprm); read_lock(&binfmt_lock); put_binfmt(fmt); if (bprm->point_of_no_return || (retval != -ENOEXEC)) { read_unlock(&binfmt_lock); return retval; } } read_unlock(&binfmt_lock); return -ENOEXEC; } /* binfmt handlers will call back into begin_new_exec() on success. */ static int exec_binprm(struct linux_binprm *bprm) { pid_t old_pid, old_vpid; int ret, depth; /* Need to fetch pid before load_binary changes it */ old_pid = current->pid; rcu_read_lock(); old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); rcu_read_unlock(); /* This allows 4 levels of binfmt rewrites before failing hard. */ for (depth = 0;; depth++) { struct file *exec; if (depth > 5) return -ELOOP; ret = search_binary_handler(bprm); if (ret < 0) return ret; if (!bprm->interpreter) break; exec = bprm->file; bprm->file = bprm->interpreter; bprm->interpreter = NULL; exe_file_allow_write_access(exec); if (unlikely(bprm->have_execfd)) { if (bprm->executable) { fput(exec); return -ENOEXEC; } bprm->executable = exec; } else fput(exec); } audit_bprm(bprm); trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); proc_exec_connector(current); return 0; } static int bprm_execve(struct linux_binprm *bprm) { int retval; retval = prepare_bprm_creds(bprm); if (retval) return retval; /* * Check for unsafe execution states before exec_binprm(), which * will call back into begin_new_exec(), into bprm_creds_from_file(), * where setuid-ness is evaluated. */ check_unsafe_exec(bprm); current->in_execve = 1; sched_mm_cid_before_execve(current); sched_exec(); /* Set the unchanging part of bprm->cred */ retval = security_bprm_creds_for_exec(bprm); if (retval || bprm->is_check) goto out; retval = exec_binprm(bprm); if (retval < 0) goto out; sched_mm_cid_after_execve(current); /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; rseq_execve(current); user_events_execve(current); acct_update_integrals(current); task_numa_free(current, false); return retval; out: /* * If past the point of no return ensure the code never * returns to the userspace process. Use an existing fatal * signal if present otherwise terminate the process with * SIGSEGV. */ if (bprm->point_of_no_return && !fatal_signal_pending(current)) force_fatal_sig(SIGSEGV); sched_mm_cid_after_execve(current); current->fs->in_exec = 0; current->in_execve = 0; return retval; } static int do_execveat_common(int fd, struct filename *filename, struct user_arg_ptr argv, struct user_arg_ptr envp, int flags) { struct linux_binprm *bprm; int retval; if (IS_ERR(filename)) return PTR_ERR(filename); /* * We move the actual failure in case of RLIMIT_NPROC excess from * set*uid() to execve() because too many poorly written programs * don't check setuid() return code. Here we additionally recheck * whether NPROC limit is still exceeded. */ if ((current->flags & PF_NPROC_EXCEEDED) && is_rlimit_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { retval = -EAGAIN; goto out_ret; } /* We're below the limit (still or again), so we don't want to make * further execve() calls fail. */ current->flags &= ~PF_NPROC_EXCEEDED; bprm = alloc_bprm(fd, filename, flags); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; } retval = count(argv, MAX_ARG_STRINGS); if (retval < 0) goto out_free; bprm->argc = retval; retval = count(envp, MAX_ARG_STRINGS); if (retval < 0) goto out_free; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) goto out_free; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out_free; bprm->exec = bprm->p; retval = copy_strings(bprm->envc, envp, bprm); if (retval < 0) goto out_free; retval = copy_strings(bprm->argc, argv, bprm); if (retval < 0) goto out_free; /* * When argv is empty, add an empty string ("") as argv[0] to * ensure confused userspace programs that start processing * from argv[1] won't end up walking envp. See also * bprm_stack_limits(). */ if (bprm->argc == 0) { retval = copy_string_kernel("", bprm); if (retval < 0) goto out_free; bprm->argc = 1; pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", current->comm, bprm->filename); } retval = bprm_execve(bprm); out_free: free_bprm(bprm); out_ret: putname(filename); return retval; } int kernel_execve(const char *kernel_filename, const char *const *argv, const char *const *envp) { struct filename *filename; struct linux_binprm *bprm; int fd = AT_FDCWD; int retval; /* It is non-sense for kernel threads to call execve */ if (WARN_ON_ONCE(current->flags & PF_KTHREAD)) return -EINVAL; filename = getname_kernel(kernel_filename); if (IS_ERR(filename)) return PTR_ERR(filename); bprm = alloc_bprm(fd, filename, 0); if (IS_ERR(bprm)) { retval = PTR_ERR(bprm); goto out_ret; } retval = count_strings_kernel(argv); if (WARN_ON_ONCE(retval == 0)) retval = -EINVAL; if (retval < 0) goto out_free; bprm->argc = retval; retval = count_strings_kernel(envp); if (retval < 0) goto out_free; bprm->envc = retval; retval = bprm_stack_limits(bprm); if (retval < 0) goto out_free; retval = copy_string_kernel(bprm->filename, bprm); if (retval < 0) goto out_free; bprm->exec = bprm->p; retval = copy_strings_kernel(bprm->envc, envp, bprm); if (retval < 0) goto out_free; retval = copy_strings_kernel(bprm->argc, argv, bprm); if (retval < 0) goto out_free; retval = bprm_execve(bprm); out_free: free_bprm(bprm); out_ret: putname(filename); return retval; } static int do_execve(struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } static int do_execveat(int fd, struct filename *filename, const char __user *const __user *__argv, const char __user *const __user *__envp, int flags) { struct user_arg_ptr argv = { .ptr.native = __argv }; struct user_arg_ptr envp = { .ptr.native = __envp }; return do_execveat_common(fd, filename, argv, envp, flags); } #ifdef CONFIG_COMPAT static int compat_do_execve(struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(AT_FDCWD, filename, argv, envp, 0); } static int compat_do_execveat(int fd, struct filename *filename, const compat_uptr_t __user *__argv, const compat_uptr_t __user *__envp, int flags) { struct user_arg_ptr argv = { .is_compat = true, .ptr.compat = __argv, }; struct user_arg_ptr envp = { .is_compat = true, .ptr.compat = __envp, }; return do_execveat_common(fd, filename, argv, envp, flags); } #endif void set_binfmt(struct linux_binfmt *new) { struct mm_struct *mm = current->mm; if (mm->binfmt) module_put(mm->binfmt->module); mm->binfmt = new; if (new) __module_get(new->module); } EXPORT_SYMBOL(set_binfmt); /* * set_dumpable stores three-value SUID_DUMP_* into mm->flags. */ void set_dumpable(struct mm_struct *mm, int value) { if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) return; set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value); } SYSCALL_DEFINE3(execve, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp) { return do_execve(getname(filename), argv, envp); } SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const char __user *const __user *, argv, const char __user *const __user *, envp, int, flags) { return do_execveat(fd, getname_uflags(filename, flags), argv, envp, flags); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(execve, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp) { return compat_do_execve(getname(filename), argv, envp); } COMPAT_SYSCALL_DEFINE5(execveat, int, fd, const char __user *, filename, const compat_uptr_t __user *, argv, const compat_uptr_t __user *, envp, int, flags) { return compat_do_execveat(fd, getname_uflags(filename, flags), argv, envp, flags); } #endif #ifdef CONFIG_SYSCTL static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!error) validate_coredump_safety(); return error; } static const struct ctl_table fs_exec_sysctls[] = { { .procname = "suid_dumpable", .data = &suid_dumpable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_coredump, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, }; static int __init init_fs_exec_sysctls(void) { register_sysctl_init("fs", fs_exec_sysctls); return 0; } fs_initcall(init_fs_exec_sysctls); #endif /* CONFIG_SYSCTL */ #ifdef CONFIG_EXEC_KUNIT_TEST #include "tests/exec_kunit.c" #endif
31 31 31 3 31 23 12 31 8 8 8 2 8 6 2 1 1 1 1 1 1 2 2 2 1 1 1 1 8 8 8 1 1 1 1 1 1 5 5 5 2 3 1 5 1 1 1 2 2 1 1 1 1 24 23 2 23 24 24 16 5 21 2 2 2 1 23 1 23 2 23 2 23 22 2 21 20 1 23 23 21 2 2 2 2 2 19 4 21 3 3 72 51 52 51 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 // SPDX-License-Identifier: GPL-2.0-only /* * Off-channel operation helpers * * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi> * Copyright 2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2019, 2022-2024 Intel Corporation */ #include <linux/export.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" /* * Tell our hardware to disable PS. * Optionally inform AP that we will go to sleep so that it will buffer * the frames while we are doing off-channel work. This is optional * because we *may* be doing work on-operating channel, and want our * hardware unconditionally awake, but still let the AP send us normal frames. */ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; bool offchannel_ps_enabled = false; /* FIXME: what to do when local->pspolling is true? */ del_timer_sync(&local->dynamic_ps_timer); del_timer_sync(&ifmgd->bcn_mon_timer); del_timer_sync(&ifmgd->conn_mon_timer); wiphy_work_cancel(local->hw.wiphy, &local->dynamic_ps_enable_work); if (local->hw.conf.flags & IEEE80211_CONF_PS) { offchannel_ps_enabled = true; local->hw.conf.flags &= ~IEEE80211_CONF_PS; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); } if (!offchannel_ps_enabled || !ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK)) /* * If power save was enabled, no need to send a nullfunc * frame because AP knows that we are sleeping. But if the * hardware is creating the nullfunc frame for power save * status (ie. IEEE80211_HW_PS_NULLFUNC_STACK is not * enabled) and power save was enabled, the firmware just * sent a null frame with power save disabled. So we need * to send a new nullfunc frame to inform the AP that we * are again sleeping. */ ieee80211_send_nullfunc(local, sdata, true); } /* inform AP that we are awake again */ static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; if (!local->ps_sdata) ieee80211_send_nullfunc(local, sdata, false); else if (local->hw.conf.dynamic_ps_timeout > 0) { /* * the dynamic_ps_timer had been running before leaving the * operating channel, restart the timer now and send a nullfunc * frame to inform the AP that we are awake so that AP sends * the buffered packets (if any). */ ieee80211_send_nullfunc(local, sdata, false); mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout)); } ieee80211_sta_reset_beacon_monitor(sdata); ieee80211_sta_reset_conn_monitor(sdata); } void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(!local->emulate_chanctx)) return; /* * notify the AP about us leaving the channel and stop all * STA interfaces. */ /* * Stop queues and transmit all frames queued by the driver * before sending nullfunc to enable powersave at the AP. */ ieee80211_stop_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL, false); ieee80211_flush_queues(local, NULL, false); list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || sdata->vif.type == NL80211_IFTYPE_NAN) continue; if (sdata->vif.type != NL80211_IFTYPE_MONITOR) set_bit(SDATA_STATE_OFFCHANNEL, &sdata->state); /* Check to see if we should disable beaconing. */ if (sdata->vif.bss_conf.enable_beacon) { set_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); sdata->vif.bss_conf.enable_beacon = false; ieee80211_link_info_change_notify( sdata, &sdata->deflink, BSS_CHANGED_BEACON_ENABLED); } if (sdata->vif.type == NL80211_IFTYPE_STATION && sdata->u.mgd.associated) ieee80211_offchannel_ps_enable(sdata); } } void ieee80211_offchannel_return(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(!local->emulate_chanctx)) return; list_for_each_entry(sdata, &local->interfaces, list) { if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE) continue; if (sdata->vif.type != NL80211_IFTYPE_MONITOR) clear_bit(SDATA_STATE_OFFCHANNEL, &sdata->state); if (!ieee80211_sdata_running(sdata)) continue; /* Tell AP we're back */ if (sdata->vif.type == NL80211_IFTYPE_STATION && sdata->u.mgd.associated) ieee80211_offchannel_ps_disable(sdata); if (test_and_clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state)) { sdata->vif.bss_conf.enable_beacon = true; ieee80211_link_info_change_notify( sdata, &sdata->deflink, BSS_CHANGED_BEACON_ENABLED); } } ieee80211_wake_queues_by_reason(&local->hw, IEEE80211_MAX_QUEUE_MAP, IEEE80211_QUEUE_STOP_REASON_OFFCHANNEL, false); } static void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc) { /* was never transmitted */ if (roc->frame) { cfg80211_mgmt_tx_status(&roc->sdata->wdev, roc->mgmt_tx_cookie, roc->frame->data, roc->frame->len, false, GFP_KERNEL); ieee80211_free_txskb(&roc->sdata->local->hw, roc->frame); } if (!roc->mgmt_tx_cookie) cfg80211_remain_on_channel_expired(&roc->sdata->wdev, roc->cookie, roc->chan, GFP_KERNEL); else cfg80211_tx_mgmt_expired(&roc->sdata->wdev, roc->mgmt_tx_cookie, roc->chan, GFP_KERNEL); list_del(&roc->list); kfree(roc); } static unsigned long ieee80211_end_finished_rocs(struct ieee80211_local *local, unsigned long now) { struct ieee80211_roc_work *roc, *tmp; long remaining_dur_min = LONG_MAX; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { long remaining; if (!roc->started) break; remaining = roc->start_time + msecs_to_jiffies(roc->duration) - now; /* In case of HW ROC, it is possible that the HW finished the * ROC session before the actual requested time. In such a case * end the ROC session (disregarding the remaining time). */ if (roc->abort || roc->hw_begun || remaining <= 0) ieee80211_roc_notify_destroy(roc); else remaining_dur_min = min(remaining_dur_min, remaining); } return remaining_dur_min; } static bool ieee80211_recalc_sw_work(struct ieee80211_local *local, unsigned long now) { long dur = ieee80211_end_finished_rocs(local, now); if (dur == LONG_MAX) return false; wiphy_delayed_work_queue(local->hw.wiphy, &local->roc_work, dur); return true; } static void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc, unsigned long start_time) { if (WARN_ON(roc->notified)) return; roc->start_time = start_time; roc->started = true; if (roc->mgmt_tx_cookie) { if (!WARN_ON(!roc->frame)) { ieee80211_tx_skb_tid_band(roc->sdata, roc->frame, 7, roc->chan->band); roc->frame = NULL; } } else { cfg80211_ready_on_channel(&roc->sdata->wdev, roc->cookie, roc->chan, roc->req_duration, GFP_KERNEL); } roc->notified = true; } static void ieee80211_hw_roc_start(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, hw_roc_start); struct ieee80211_roc_work *roc; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry(roc, &local->roc_list, list) { if (!roc->started) break; roc->hw_begun = true; ieee80211_handle_roc_started(roc, local->hw_roc_start_time); } } void ieee80211_ready_on_channel(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); local->hw_roc_start_time = jiffies; trace_api_ready_on_channel(local); wiphy_work_queue(hw->wiphy, &local->hw_roc_start); } EXPORT_SYMBOL_GPL(ieee80211_ready_on_channel); static void _ieee80211_start_next_roc(struct ieee80211_local *local) { struct ieee80211_roc_work *roc, *tmp; enum ieee80211_roc_type type; u32 min_dur, max_dur; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(list_empty(&local->roc_list))) return; roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work, list); if (WARN_ON(roc->started)) return; min_dur = roc->duration; max_dur = roc->duration; type = roc->type; list_for_each_entry(tmp, &local->roc_list, list) { if (tmp == roc) continue; if (tmp->sdata != roc->sdata || tmp->chan != roc->chan) break; max_dur = max(tmp->duration, max_dur); min_dur = min(tmp->duration, min_dur); type = max(tmp->type, type); } if (local->ops->remain_on_channel) { int ret = drv_remain_on_channel(local, roc->sdata, roc->chan, max_dur, type); if (ret) { wiphy_warn(local->hw.wiphy, "failed to start next HW ROC (%d)\n", ret); /* * queue the work struct again to avoid recursion * when multiple failures occur */ list_for_each_entry(tmp, &local->roc_list, list) { if (tmp->sdata != roc->sdata || tmp->chan != roc->chan) break; tmp->started = true; tmp->abort = true; } wiphy_work_queue(local->hw.wiphy, &local->hw_roc_done); return; } /* we'll notify about the start once the HW calls back */ list_for_each_entry(tmp, &local->roc_list, list) { if (tmp->sdata != roc->sdata || tmp->chan != roc->chan) break; tmp->started = true; } } else { /* If actually operating on the desired channel (with at least * 20 MHz channel width) don't stop all the operations but still * treat it as though the ROC operation started properly, so * other ROC operations won't interfere with this one. * * Note: scan can't run, tmp_channel is what we use, so this * must be the currently active channel. */ roc->on_channel = roc->chan == local->hw.conf.chandef.chan && local->hw.conf.chandef.width != NL80211_CHAN_WIDTH_5 && local->hw.conf.chandef.width != NL80211_CHAN_WIDTH_10; /* start this ROC */ ieee80211_recalc_idle(local); if (!roc->on_channel) { ieee80211_offchannel_stop_vifs(local); local->tmp_channel = roc->chan; ieee80211_hw_conf_chan(local); } wiphy_delayed_work_queue(local->hw.wiphy, &local->roc_work, msecs_to_jiffies(min_dur)); /* tell userspace or send frame(s) */ list_for_each_entry(tmp, &local->roc_list, list) { if (tmp->sdata != roc->sdata || tmp->chan != roc->chan) break; tmp->on_channel = roc->on_channel; ieee80211_handle_roc_started(tmp, jiffies); } } } void ieee80211_start_next_roc(struct ieee80211_local *local) { struct ieee80211_roc_work *roc; lockdep_assert_wiphy(local->hw.wiphy); if (list_empty(&local->roc_list)) { ieee80211_run_deferred_scan(local); return; } /* defer roc if driver is not started (i.e. during reconfig) */ if (local->in_reconfig) return; roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work, list); if (WARN_ON_ONCE(roc->started)) return; if (local->ops->remain_on_channel) { _ieee80211_start_next_roc(local); } else { /* delay it a bit */ wiphy_delayed_work_queue(local->hw.wiphy, &local->roc_work, round_jiffies_relative(HZ / 2)); } } void ieee80211_reconfig_roc(struct ieee80211_local *local) { struct ieee80211_roc_work *roc, *tmp; /* * In the software implementation can just continue with the * interruption due to reconfig, roc_work is still queued if * needed. */ if (!local->ops->remain_on_channel) return; /* flush work so nothing from the driver is still pending */ wiphy_work_flush(local->hw.wiphy, &local->hw_roc_start); wiphy_work_flush(local->hw.wiphy, &local->hw_roc_done); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (!roc->started) break; if (!roc->hw_begun) { /* it didn't start in HW yet, so we can restart it */ roc->started = false; continue; } /* otherwise destroy it and tell userspace */ ieee80211_roc_notify_destroy(roc); } ieee80211_start_next_roc(local); } static void __ieee80211_roc_work(struct ieee80211_local *local) { struct ieee80211_roc_work *roc; bool on_channel; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(local->ops->remain_on_channel)) return; roc = list_first_entry_or_null(&local->roc_list, struct ieee80211_roc_work, list); if (!roc) return; if (!roc->started) { WARN_ON(!local->emulate_chanctx); _ieee80211_start_next_roc(local); } else { on_channel = roc->on_channel; if (ieee80211_recalc_sw_work(local, jiffies)) return; /* careful - roc pointer became invalid during recalc */ if (!on_channel) { ieee80211_flush_queues(local, NULL, false); local->tmp_channel = NULL; ieee80211_hw_conf_chan(local); ieee80211_offchannel_return(local); } ieee80211_recalc_idle(local); ieee80211_start_next_roc(local); } } static void ieee80211_roc_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, roc_work.work); lockdep_assert_wiphy(local->hw.wiphy); __ieee80211_roc_work(local); } static void ieee80211_hw_roc_done(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, hw_roc_done); lockdep_assert_wiphy(local->hw.wiphy); ieee80211_end_finished_rocs(local, jiffies); /* if there's another roc, start it now */ ieee80211_start_next_roc(local); } void ieee80211_remain_on_channel_expired(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); trace_api_remain_on_channel_expired(local); wiphy_work_queue(hw->wiphy, &local->hw_roc_done); } EXPORT_SYMBOL_GPL(ieee80211_remain_on_channel_expired); static bool ieee80211_coalesce_hw_started_roc(struct ieee80211_local *local, struct ieee80211_roc_work *new_roc, struct ieee80211_roc_work *cur_roc) { unsigned long now = jiffies; unsigned long remaining; if (WARN_ON(!cur_roc->started)) return false; /* if it was scheduled in the hardware, but not started yet, * we can only combine if the older one had a longer duration */ if (!cur_roc->hw_begun && new_roc->duration > cur_roc->duration) return false; remaining = cur_roc->start_time + msecs_to_jiffies(cur_roc->duration) - now; /* if it doesn't fit entirely, schedule a new one */ if (new_roc->duration > jiffies_to_msecs(remaining)) return false; /* add just after the current one so we combine their finish later */ list_add(&new_roc->list, &cur_roc->list); /* if the existing one has already begun then let this one also * begin, otherwise they'll both be marked properly by the work * struct that runs once the driver notifies us of the beginning */ if (cur_roc->hw_begun) { new_roc->hw_begun = true; ieee80211_handle_roc_started(new_roc, now); } return true; } static int ieee80211_start_roc_work(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct ieee80211_channel *channel, unsigned int duration, u64 *cookie, struct sk_buff *txskb, enum ieee80211_roc_type type) { struct ieee80211_roc_work *roc, *tmp; bool queued = false, combine_started = true; int ret; lockdep_assert_wiphy(local->hw.wiphy); if (channel->freq_offset) /* this may work, but is untested */ return -EOPNOTSUPP; if (!local->emulate_chanctx && !local->ops->remain_on_channel) return -EOPNOTSUPP; roc = kzalloc(sizeof(*roc), GFP_KERNEL); if (!roc) return -ENOMEM; /* * If the duration is zero, then the driver * wouldn't actually do anything. Set it to * 10 for now. * * TODO: cancel the off-channel operation * when we get the SKB's TX status and * the wait time was zero before. */ if (!duration) duration = 10; roc->chan = channel; roc->duration = duration; roc->req_duration = duration; roc->frame = txskb; roc->type = type; roc->sdata = sdata; /* * cookie is either the roc cookie (for normal roc) * or the SKB (for mgmt TX) */ if (!txskb) { roc->cookie = ieee80211_mgmt_tx_cookie(local); *cookie = roc->cookie; } else { roc->mgmt_tx_cookie = *cookie; } /* if there's no need to queue, handle it immediately */ if (list_empty(&local->roc_list) && !local->scanning && !ieee80211_is_radar_required(local)) { /* if not HW assist, just queue & schedule work */ if (!local->ops->remain_on_channel) { list_add_tail(&roc->list, &local->roc_list); wiphy_delayed_work_queue(local->hw.wiphy, &local->roc_work, 0); } else { /* otherwise actually kick it off here * (for error handling) */ ret = drv_remain_on_channel(local, sdata, channel, duration, type); if (ret) { kfree(roc); return ret; } roc->started = true; list_add_tail(&roc->list, &local->roc_list); } return 0; } /* otherwise handle queueing */ list_for_each_entry(tmp, &local->roc_list, list) { if (tmp->chan != channel || tmp->sdata != sdata) continue; /* * Extend this ROC if possible: If it hasn't started, add * just after the new one to combine. */ if (!tmp->started) { list_add(&roc->list, &tmp->list); queued = true; break; } if (!combine_started) continue; if (!local->ops->remain_on_channel) { /* If there's no hardware remain-on-channel, and * doing so won't push us over the maximum r-o-c * we allow, then we can just add the new one to * the list and mark it as having started now. * If it would push over the limit, don't try to * combine with other started ones (that haven't * been running as long) but potentially sort it * with others that had the same fate. */ unsigned long now = jiffies; u32 elapsed = jiffies_to_msecs(now - tmp->start_time); struct wiphy *wiphy = local->hw.wiphy; u32 max_roc = wiphy->max_remain_on_channel_duration; if (elapsed + roc->duration > max_roc) { combine_started = false; continue; } list_add(&roc->list, &tmp->list); queued = true; roc->on_channel = tmp->on_channel; ieee80211_handle_roc_started(roc, now); ieee80211_recalc_sw_work(local, now); break; } queued = ieee80211_coalesce_hw_started_roc(local, roc, tmp); if (queued) break; /* if it wasn't queued, perhaps it can be combined with * another that also couldn't get combined previously, * but no need to check for already started ones, since * that can't work. */ combine_started = false; } if (!queued) list_add_tail(&roc->list, &local->roc_list); return 0; } int ieee80211_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev, struct ieee80211_channel *chan, unsigned int duration, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); return ieee80211_start_roc_work(local, sdata, chan, duration, cookie, NULL, IEEE80211_ROC_TYPE_NORMAL); } static int ieee80211_cancel_roc(struct ieee80211_local *local, u64 cookie, bool mgmt_tx) { struct ieee80211_roc_work *roc, *tmp, *found = NULL; int ret; lockdep_assert_wiphy(local->hw.wiphy); if (!cookie) return -ENOENT; wiphy_work_flush(local->hw.wiphy, &local->hw_roc_start); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (!mgmt_tx && roc->cookie != cookie) continue; else if (mgmt_tx && roc->mgmt_tx_cookie != cookie) continue; found = roc; break; } if (!found) { return -ENOENT; } if (!found->started) { ieee80211_roc_notify_destroy(found); goto out_unlock; } if (local->ops->remain_on_channel) { ret = drv_cancel_remain_on_channel(local, roc->sdata); if (WARN_ON_ONCE(ret)) { return ret; } /* * We could be racing against the notification from the driver: * + driver is handling the notification on CPU0 * + user space is cancelling the remain on channel and * schedules the hw_roc_done worker. * * Now hw_roc_done might start to run after the next roc will * start and mac80211 will think that this second roc has * ended prematurely. * Cancel the work to make sure that all the pending workers * have completed execution. * Note that this assumes that by the time the driver returns * from drv_cancel_remain_on_channel, it has completed all * the processing of related notifications. */ wiphy_work_cancel(local->hw.wiphy, &local->hw_roc_done); /* TODO: * if multiple items were combined here then we really shouldn't * cancel them all - we should wait for as much time as needed * for the longest remaining one, and only then cancel ... */ list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (!roc->started) break; if (roc == found) found = NULL; ieee80211_roc_notify_destroy(roc); } /* that really must not happen - it was started */ WARN_ON(found); ieee80211_start_next_roc(local); } else { /* go through work struct to return to the operating channel */ found->abort = true; wiphy_delayed_work_queue(local->hw.wiphy, &local->roc_work, 0); } out_unlock: return 0; } int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = sdata->local; return ieee80211_cancel_roc(local, cookie, false); } int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); struct ieee80211_local *local = sdata->local; struct sk_buff *skb; struct sta_info *sta = NULL; const struct ieee80211_mgmt *mgmt = (void *)params->buf; bool need_offchan = false; bool mlo_sta = false; int link_id = -1; u32 flags; int ret; u8 *data; lockdep_assert_wiphy(local->hw.wiphy); if (params->dont_wait_for_ack) flags = IEEE80211_TX_CTL_NO_ACK; else flags = IEEE80211_TX_INTFL_NL80211_FRAME_TX | IEEE80211_TX_CTL_REQ_TX_STATUS; if (params->no_cck) flags |= IEEE80211_TX_CTL_NO_CCK_RATE; switch (sdata->vif.type) { case NL80211_IFTYPE_ADHOC: if (!sdata->vif.cfg.ibss_joined) need_offchan = true; #ifdef CONFIG_MAC80211_MESH fallthrough; case NL80211_IFTYPE_MESH_POINT: if (ieee80211_vif_is_mesh(&sdata->vif) && !sdata->u.mesh.mesh_id_len) need_offchan = true; #endif fallthrough; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_P2P_GO: if (sdata->vif.type != NL80211_IFTYPE_ADHOC && !ieee80211_vif_is_mesh(&sdata->vif) && !sdata->bss->active) need_offchan = true; rcu_read_lock(); sta = sta_info_get_bss(sdata, mgmt->da); mlo_sta = sta && sta->sta.mlo; if (!ieee80211_is_action(mgmt->frame_control) || mgmt->u.action.category == WLAN_CATEGORY_PUBLIC || mgmt->u.action.category == WLAN_CATEGORY_SELF_PROTECTED || mgmt->u.action.category == WLAN_CATEGORY_SPECTRUM_MGMT) { rcu_read_unlock(); break; } if (!sta) { rcu_read_unlock(); return -ENOLINK; } if (params->link_id >= 0 && !(sta->sta.valid_links & BIT(params->link_id))) { rcu_read_unlock(); return -ENOLINK; } link_id = params->link_id; rcu_read_unlock(); break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: if (!sdata->u.mgd.associated || (params->offchan && params->wait && local->ops->remain_on_channel && memcmp(sdata->vif.cfg.ap_addr, mgmt->bssid, ETH_ALEN))) { need_offchan = true; } else if (sdata->u.mgd.associated && ether_addr_equal(sdata->vif.cfg.ap_addr, mgmt->da)) { sta = sta_info_get_bss(sdata, mgmt->da); mlo_sta = sta && sta->sta.mlo; } break; case NL80211_IFTYPE_P2P_DEVICE: need_offchan = true; break; case NL80211_IFTYPE_NAN: default: return -EOPNOTSUPP; } /* configurations requiring offchan cannot work if no channel has been * specified */ if (need_offchan && !params->chan) return -EINVAL; /* Check if the operating channel is the requested channel */ if (!params->chan && mlo_sta) { need_offchan = false; } else if (!need_offchan) { struct ieee80211_chanctx_conf *chanctx_conf = NULL; int i; rcu_read_lock(); /* Check all the links first */ for (i = 0; i < ARRAY_SIZE(sdata->vif.link_conf); i++) { struct ieee80211_bss_conf *conf; conf = rcu_dereference(sdata->vif.link_conf[i]); if (!conf) continue; chanctx_conf = rcu_dereference(conf->chanctx_conf); if (!chanctx_conf) continue; if (mlo_sta && params->chan == chanctx_conf->def.chan && ether_addr_equal(sdata->vif.addr, mgmt->sa)) { link_id = i; break; } if (ether_addr_equal(conf->addr, mgmt->sa)) { /* If userspace requested Tx on a specific link * use the same link id if the link bss is matching * the requested chan. */ if (sdata->vif.valid_links && params->link_id >= 0 && params->link_id == i && params->chan == chanctx_conf->def.chan) link_id = i; break; } chanctx_conf = NULL; } if (chanctx_conf) { need_offchan = params->chan && (params->chan != chanctx_conf->def.chan); } else { need_offchan = true; } rcu_read_unlock(); } if (need_offchan && !params->offchan) { ret = -EBUSY; goto out_unlock; } skb = dev_alloc_skb(local->hw.extra_tx_headroom + params->len); if (!skb) { ret = -ENOMEM; goto out_unlock; } skb_reserve(skb, local->hw.extra_tx_headroom); data = skb_put_data(skb, params->buf, params->len); /* Update CSA counters */ if (sdata->vif.bss_conf.csa_active && (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_MESH_POINT || sdata->vif.type == NL80211_IFTYPE_ADHOC) && params->n_csa_offsets) { int i; struct beacon_data *beacon = NULL; rcu_read_lock(); if (sdata->vif.type == NL80211_IFTYPE_AP) beacon = rcu_dereference(sdata->deflink.u.ap.beacon); else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) beacon = rcu_dereference(sdata->u.ibss.presp); else if (ieee80211_vif_is_mesh(&sdata->vif)) beacon = rcu_dereference(sdata->u.mesh.beacon); if (beacon) for (i = 0; i < params->n_csa_offsets; i++) data[params->csa_offsets[i]] = beacon->cntdwn_current_counter; rcu_read_unlock(); } IEEE80211_SKB_CB(skb)->flags = flags; IEEE80211_SKB_CB(skb)->control.flags |= IEEE80211_TX_CTRL_DONT_USE_RATE_MASK; skb->dev = sdata->dev; if (!params->dont_wait_for_ack) { /* make a copy to preserve the frame contents * in case of encryption. */ ret = ieee80211_attach_ack_skb(local, skb, cookie, GFP_KERNEL); if (ret) { kfree_skb(skb); goto out_unlock; } } else { /* Assign a dummy non-zero cookie, it's not sent to * userspace in this case but we rely on its value * internally in the need_offchan case to distinguish * mgmt-tx from remain-on-channel. */ *cookie = 0xffffffff; } if (!need_offchan) { ieee80211_tx_skb_tid(sdata, skb, 7, link_id); ret = 0; goto out_unlock; } IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_TX_OFFCHAN | IEEE80211_TX_INTFL_OFFCHAN_TX_OK; if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) IEEE80211_SKB_CB(skb)->hw_queue = local->hw.offchannel_tx_hw_queue; /* This will handle all kinds of coalescing and immediate TX */ ret = ieee80211_start_roc_work(local, sdata, params->chan, params->wait, cookie, skb, IEEE80211_ROC_TYPE_MGMT_TX); if (ret) ieee80211_free_txskb(&local->hw, skb); out_unlock: return ret; } int ieee80211_mgmt_tx_cancel_wait(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie) { struct ieee80211_local *local = wiphy_priv(wiphy); return ieee80211_cancel_roc(local, cookie, true); } void ieee80211_roc_setup(struct ieee80211_local *local) { wiphy_work_init(&local->hw_roc_start, ieee80211_hw_roc_start); wiphy_work_init(&local->hw_roc_done, ieee80211_hw_roc_done); wiphy_delayed_work_init(&local->roc_work, ieee80211_roc_work); INIT_LIST_HEAD(&local->roc_list); } void ieee80211_roc_purge(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { struct ieee80211_roc_work *roc, *tmp; bool work_to_do = false; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (sdata && roc->sdata != sdata) continue; if (roc->started) { if (local->ops->remain_on_channel) { /* can race, so ignore return value */ drv_cancel_remain_on_channel(local, roc->sdata); ieee80211_roc_notify_destroy(roc); } else { roc->abort = true; work_to_do = true; } } else { ieee80211_roc_notify_destroy(roc); } } if (work_to_do) __ieee80211_roc_work(local); }
5 5 3 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 // SPDX-License-Identifier: GPL-2.0-or-later /* * The ChaCha stream cipher (RFC7539) * * Copyright (C) 2015 Martin Willi */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/module.h> #include <crypto/algapi.h> // for crypto_xor_cpy #include <crypto/chacha.h> void chacha_crypt_generic(u32 *state, u8 *dst, const u8 *src, unsigned int bytes, int nrounds) { /* aligned to potentially speed up crypto_xor() */ u8 stream[CHACHA_BLOCK_SIZE] __aligned(sizeof(long)); while (bytes >= CHACHA_BLOCK_SIZE) { chacha_block_generic(state, stream, nrounds); crypto_xor_cpy(dst, src, stream, CHACHA_BLOCK_SIZE); bytes -= CHACHA_BLOCK_SIZE; dst += CHACHA_BLOCK_SIZE; src += CHACHA_BLOCK_SIZE; } if (bytes) { chacha_block_generic(state, stream, nrounds); crypto_xor_cpy(dst, src, stream, bytes); } } EXPORT_SYMBOL(chacha_crypt_generic); MODULE_DESCRIPTION("ChaCha stream cipher (RFC7539)"); MODULE_LICENSE("GPL");
71 50 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> /* * The default broadcast address of an interface is QST-0; the default address * is LINUX-1. The null address is defined as a callsign of all spaces with * an SSID of zero. */ const ax25_address ax25_bcast = {{'Q' << 1, 'S' << 1, 'T' << 1, ' ' << 1, ' ' << 1, ' ' << 1, 0 << 1}}; const ax25_address ax25_defaddr = {{'L' << 1, 'I' << 1, 'N' << 1, 'U' << 1, 'X' << 1, ' ' << 1, 1 << 1}}; const ax25_address null_ax25_address = {{' ' << 1, ' ' << 1, ' ' << 1, ' ' << 1, ' ' << 1, ' ' << 1, 0 << 1}}; EXPORT_SYMBOL_GPL(ax25_bcast); EXPORT_SYMBOL_GPL(ax25_defaddr); EXPORT_SYMBOL(null_ax25_address); /* * ax25 -> ascii conversion */ char *ax2asc(char *buf, const ax25_address *a) { char c, *s; int n; for (n = 0, s = buf; n < 6; n++) { c = (a->ax25_call[n] >> 1) & 0x7F; if (c != ' ') *s++ = c; } *s++ = '-'; if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) { *s++ = '1'; n -= 10; } *s++ = n + '0'; *s++ = '\0'; if (*buf == '\0' || *buf == '-') return "*"; return buf; } EXPORT_SYMBOL(ax2asc); /* * ascii -> ax25 conversion */ void asc2ax(ax25_address *addr, const char *callsign) { const char *s; int n; for (s = callsign, n = 0; n < 6; n++) { if (*s != '\0' && *s != '-') addr->ax25_call[n] = *s++; else addr->ax25_call[n] = ' '; addr->ax25_call[n] <<= 1; addr->ax25_call[n] &= 0xFE; } if (*s++ == '\0') { addr->ax25_call[6] = 0x00; return; } addr->ax25_call[6] = *s++ - '0'; if (*s != '\0') { addr->ax25_call[6] *= 10; addr->ax25_call[6] += *s++ - '0'; } addr->ax25_call[6] <<= 1; addr->ax25_call[6] &= 0x1E; } EXPORT_SYMBOL(asc2ax); /* * Compare two ax.25 addresses */ int ax25cmp(const ax25_address *a, const ax25_address *b) { int ct = 0; while (ct < 6) { if ((a->ax25_call[ct] & 0xFE) != (b->ax25_call[ct] & 0xFE)) /* Clean off repeater bits */ return 1; ct++; } if ((a->ax25_call[ct] & 0x1E) == (b->ax25_call[ct] & 0x1E)) /* SSID without control bit */ return 0; return 2; /* Partial match */ } EXPORT_SYMBOL(ax25cmp); /* * Compare two AX.25 digipeater paths. */ int ax25digicmp(const ax25_digi *digi1, const ax25_digi *digi2) { int i; if (digi1->ndigi != digi2->ndigi) return 1; if (digi1->lastrepeat != digi2->lastrepeat) return 1; for (i = 0; i < digi1->ndigi; i++) if (ax25cmp(&digi1->calls[i], &digi2->calls[i]) != 0) return 1; return 0; } /* * Given an AX.25 address pull of to, from, digi list, command/response and the start of data * */ const unsigned char *ax25_addr_parse(const unsigned char *buf, int len, ax25_address *src, ax25_address *dest, ax25_digi *digi, int *flags, int *dama) { int d = 0; if (len < 14) return NULL; if (flags != NULL) { *flags = 0; if (buf[6] & AX25_CBIT) *flags = AX25_COMMAND; if (buf[13] & AX25_CBIT) *flags = AX25_RESPONSE; } if (dama != NULL) *dama = ~buf[13] & AX25_DAMA_FLAG; /* Copy to, from */ if (dest != NULL) memcpy(dest, buf + 0, AX25_ADDR_LEN); if (src != NULL) memcpy(src, buf + 7, AX25_ADDR_LEN); buf += 2 * AX25_ADDR_LEN; len -= 2 * AX25_ADDR_LEN; digi->lastrepeat = -1; digi->ndigi = 0; while (!(buf[-1] & AX25_EBIT)) { if (d >= AX25_MAX_DIGIS) return NULL; if (len < AX25_ADDR_LEN) return NULL; memcpy(&digi->calls[d], buf, AX25_ADDR_LEN); digi->ndigi = d + 1; if (buf[6] & AX25_HBIT) { digi->repeated[d] = 1; digi->lastrepeat = d; } else { digi->repeated[d] = 0; } buf += AX25_ADDR_LEN; len -= AX25_ADDR_LEN; d++; } return buf; } /* * Assemble an AX.25 header from the bits */ int ax25_addr_build(unsigned char *buf, const ax25_address *src, const ax25_address *dest, const ax25_digi *d, int flag, int modulus) { int len = 0; int ct = 0; memcpy(buf, dest, AX25_ADDR_LEN); buf[6] &= ~(AX25_EBIT | AX25_CBIT); buf[6] |= AX25_SSSID_SPARE; if (flag == AX25_COMMAND) buf[6] |= AX25_CBIT; buf += AX25_ADDR_LEN; len += AX25_ADDR_LEN; memcpy(buf, src, AX25_ADDR_LEN); buf[6] &= ~(AX25_EBIT | AX25_CBIT); buf[6] &= ~AX25_SSSID_SPARE; if (modulus == AX25_MODULUS) buf[6] |= AX25_SSSID_SPARE; else buf[6] |= AX25_ESSID_SPARE; if (flag == AX25_RESPONSE) buf[6] |= AX25_CBIT; /* * Fast path the normal digiless path */ if (d == NULL || d->ndigi == 0) { buf[6] |= AX25_EBIT; return 2 * AX25_ADDR_LEN; } buf += AX25_ADDR_LEN; len += AX25_ADDR_LEN; while (ct < d->ndigi) { memcpy(buf, &d->calls[ct], AX25_ADDR_LEN); if (d->repeated[ct]) buf[6] |= AX25_HBIT; else buf[6] &= ~AX25_HBIT; buf[6] &= ~AX25_EBIT; buf[6] |= AX25_SSSID_SPARE; buf += AX25_ADDR_LEN; len += AX25_ADDR_LEN; ct++; } buf[-1] |= AX25_EBIT; return len; } int ax25_addr_size(const ax25_digi *dp) { if (dp == NULL) return 2 * AX25_ADDR_LEN; return AX25_ADDR_LEN * (2 + dp->ndigi); } /* * Reverse Digipeat List. May not pass both parameters as same struct */ void ax25_digi_invert(const ax25_digi *in, ax25_digi *out) { int ct; out->ndigi = in->ndigi; out->lastrepeat = in->ndigi - in->lastrepeat - 2; /* Invert the digipeaters */ for (ct = 0; ct < in->ndigi; ct++) { out->calls[ct] = in->calls[in->ndigi - ct - 1]; if (ct <= out->lastrepeat) { out->calls[ct].ax25_call[6] |= AX25_HBIT; out->repeated[ct] = 1; } else { out->calls[ct].ax25_call[6] &= ~AX25_HBIT; out->repeated[ct] = 0; } } }
2184 2181 4 10 10 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 /* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2004 Voltaire, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "ipoib.h" #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/vmalloc.h> #include <linux/if_arp.h> /* For ARPHRD_xxx */ #include <linux/ip.h> #include <linux/in.h> #include <linux/jhash.h> #include <net/arp.h> #include <net/addrconf.h> #include <net/pkt_sched.h> #include <linux/inetdevice.h> #include <rdma/ib_cache.h> MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE; int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE; module_param_named(send_queue_size, ipoib_sendq_size, int, 0444); MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue"); module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444); MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue"); #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG int ipoib_debug_level; module_param_named(debug_level, ipoib_debug_level, int, 0644); MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0"); #endif struct ipoib_path_iter { struct net_device *dev; struct ipoib_path path; }; static const u8 ipv4_bcast_addr[] = { 0x00, 0xff, 0xff, 0xff, 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff }; struct workqueue_struct *ipoib_workqueue; struct ib_sa_client ipoib_sa_client; static int ipoib_add_one(struct ib_device *device); static void ipoib_remove_one(struct ib_device *device, void *client_data); static void ipoib_neigh_reclaim(struct rcu_head *rp); static struct net_device *ipoib_get_net_dev_by_params( struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data); static int ipoib_set_mac(struct net_device *dev, void *addr); static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); static struct ib_client ipoib_client = { .name = "ipoib", .add = ipoib_add_one, .remove = ipoib_remove_one, .get_net_dev_by_params = ipoib_get_net_dev_by_params, }; #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG static int ipoib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netdev_notifier_info *ni = ptr; struct net_device *dev = ni->dev; if (dev->netdev_ops->ndo_open != ipoib_open) return NOTIFY_DONE; switch (event) { case NETDEV_REGISTER: ipoib_create_debug_files(dev); break; case NETDEV_CHANGENAME: ipoib_delete_debug_files(dev); ipoib_create_debug_files(dev); break; case NETDEV_UNREGISTER: ipoib_delete_debug_files(dev); break; } return NOTIFY_DONE; } #endif int ipoib_open(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg(priv, "bringing up interface\n"); netif_carrier_off(dev); set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); if (ipoib_ib_dev_open(dev)) { if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) return 0; goto err_disable; } ipoib_ib_dev_up(dev); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; flags = cpriv->dev->flags; if (flags & IFF_UP) continue; dev_change_flags(cpriv->dev, flags | IFF_UP, NULL); } up_read(&priv->vlan_rwsem); } else if (priv->parent) { struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); if (!test_bit(IPOIB_FLAG_ADMIN_UP, &ppriv->flags)) ipoib_dbg(priv, "parent device %s is not up, so child device may be not functioning.\n", ppriv->dev->name); } netif_start_queue(dev); return 0; err_disable: clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); return -EINVAL; } static int ipoib_stop(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg(priv, "stopping interface\n"); clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); netif_stop_queue(dev); ipoib_ib_dev_down(dev); ipoib_ib_dev_stop(dev); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; flags = cpriv->dev->flags; if (!(flags & IFF_UP)) continue; dev_change_flags(cpriv->dev, flags & ~IFF_UP, NULL); } up_read(&priv->vlan_rwsem); } return 0; } static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags)) features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO); return features; } static int ipoib_change_mtu(struct net_device *dev, int new_mtu) { struct ipoib_dev_priv *priv = ipoib_priv(dev); int ret = 0; /* dev->mtu > 2K ==> connected mode */ if (ipoib_cm_admin_enabled(dev)) { if (new_mtu > ipoib_cm_max_mtu(dev)) return -EINVAL; if (new_mtu > priv->mcast_mtu) ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n", priv->mcast_mtu); WRITE_ONCE(dev->mtu, new_mtu); return 0; } if (new_mtu < (ETH_MIN_MTU + IPOIB_ENCAP_LEN) || new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) return -EINVAL; priv->admin_mtu = new_mtu; if (priv->mcast_mtu < priv->admin_mtu) ipoib_dbg(priv, "MTU must be smaller than the underlying " "link layer MTU - 4 (%u)\n", priv->mcast_mtu); new_mtu = min(priv->mcast_mtu, priv->admin_mtu); if (priv->rn_ops->ndo_change_mtu) { bool carrier_status = netif_carrier_ok(dev); netif_carrier_off(dev); /* notify lower level on the real mtu */ ret = priv->rn_ops->ndo_change_mtu(dev, new_mtu); if (carrier_status) netif_carrier_on(dev); } else { WRITE_ONCE(dev->mtu, new_mtu); } return ret; } static void ipoib_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if (priv->rn_ops->ndo_get_stats64) priv->rn_ops->ndo_get_stats64(dev, stats); else netdev_stats_to_stats64(stats, &dev->stats); } /* Called with an RCU read lock taken */ static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr, struct net_device *dev) { struct net *net = dev_net(dev); struct in_device *in_dev; struct sockaddr_in *addr_in = (struct sockaddr_in *)addr; struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr; __be32 ret_addr; switch (addr->sa_family) { case AF_INET: in_dev = in_dev_get(dev); if (!in_dev) return false; ret_addr = inet_confirm_addr(net, in_dev, 0, addr_in->sin_addr.s_addr, RT_SCOPE_HOST); in_dev_put(in_dev); if (ret_addr) return true; break; case AF_INET6: if (IS_ENABLED(CONFIG_IPV6) && ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1)) return true; break; } return false; } /* * Find the master net_device on top of the given net_device. * @dev: base IPoIB net_device * * Returns the master net_device with a reference held, or the same net_device * if no master exists. */ static struct net_device *ipoib_get_master_net_dev(struct net_device *dev) { struct net_device *master; rcu_read_lock(); master = netdev_master_upper_dev_get_rcu(dev); dev_hold(master); rcu_read_unlock(); if (master) return master; dev_hold(dev); return dev; } struct ipoib_walk_data { const struct sockaddr *addr; struct net_device *result; }; static int ipoib_upper_walk(struct net_device *upper, struct netdev_nested_priv *priv) { struct ipoib_walk_data *data = (struct ipoib_walk_data *)priv->data; int ret = 0; if (ipoib_is_dev_match_addr_rcu(data->addr, upper)) { dev_hold(upper); data->result = upper; ret = 1; } return ret; } /** * ipoib_get_net_dev_match_addr - Find a net_device matching * the given address, which is an upper device of the given net_device. * * @addr: IP address to look for. * @dev: base IPoIB net_device * * If found, returns the net_device with a reference held. Otherwise return * NULL. */ static struct net_device *ipoib_get_net_dev_match_addr( const struct sockaddr *addr, struct net_device *dev) { struct netdev_nested_priv priv; struct ipoib_walk_data data = { .addr = addr, }; priv.data = (void *)&data; rcu_read_lock(); if (ipoib_is_dev_match_addr_rcu(addr, dev)) { dev_hold(dev); data.result = dev; goto out; } netdev_walk_all_upper_dev_rcu(dev, ipoib_upper_walk, &priv); out: rcu_read_unlock(); return data.result; } /* returns the number of IPoIB netdevs on top a given ipoib device matching a * pkey_index and address, if one exists. * * @found_net_dev: contains a matching net_device if the return value >= 1, * with a reference held. */ static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv, const union ib_gid *gid, u16 pkey_index, const struct sockaddr *addr, int nesting, struct net_device **found_net_dev) { struct ipoib_dev_priv *child_priv; struct net_device *net_dev = NULL; int matches = 0; if (priv->pkey_index == pkey_index && (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) { if (!addr) { net_dev = ipoib_get_master_net_dev(priv->dev); } else { /* Verify the net_device matches the IP address, as * IPoIB child devices currently share a GID. */ net_dev = ipoib_get_net_dev_match_addr(addr, priv->dev); } if (net_dev) { if (!*found_net_dev) *found_net_dev = net_dev; else dev_put(net_dev); ++matches; } } /* Check child interfaces */ down_read_nested(&priv->vlan_rwsem, nesting); list_for_each_entry(child_priv, &priv->child_intfs, list) { matches += ipoib_match_gid_pkey_addr(child_priv, gid, pkey_index, addr, nesting + 1, found_net_dev); if (matches > 1) break; } up_read(&priv->vlan_rwsem); return matches; } /* Returns the number of matching net_devs found (between 0 and 2). Also * return the matching net_device in the @net_dev parameter, holding a * reference to the net_device, if the number of matches >= 1 */ static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u32 port, u16 pkey_index, const union ib_gid *gid, const struct sockaddr *addr, struct net_device **net_dev) { struct ipoib_dev_priv *priv; int matches = 0; *net_dev = NULL; list_for_each_entry(priv, dev_list, list) { if (priv->port != port) continue; matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index, addr, 0, net_dev); if (matches > 1) break; } return matches; } static struct net_device *ipoib_get_net_dev_by_params( struct ib_device *dev, u32 port, u16 pkey, const union ib_gid *gid, const struct sockaddr *addr, void *client_data) { struct net_device *net_dev; struct list_head *dev_list = client_data; u16 pkey_index; int matches; int ret; if (!rdma_protocol_ib(dev, port)) return NULL; ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index); if (ret) return NULL; /* See if we can find a unique device matching the L2 parameters */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, NULL, &net_dev); switch (matches) { case 0: return NULL; case 1: return net_dev; } dev_put(net_dev); /* Couldn't find a unique device with L2 parameters only. Use L3 * address to uniquely match the net device */ matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index, gid, addr, &net_dev); switch (matches) { case 0: return NULL; default: dev_warn_ratelimited(&dev->dev, "duplicate IP address detected\n"); fallthrough; case 1: return net_dev; } } int ipoib_set_mode(struct net_device *dev, const char *buf) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if ((test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && !strcmp(buf, "connected\n")) || (!test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags) && !strcmp(buf, "datagram\n"))) { return 0; } /* flush paths if we switch modes so that connections are restarted */ if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) { set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); ipoib_warn(priv, "enabling connected mode " "will cause multicast packet drops\n"); netdev_update_features(dev); dev_set_mtu(dev, ipoib_cm_max_mtu(dev)); netif_set_real_num_tx_queues(dev, 1); rtnl_unlock(); priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM; ipoib_flush_paths(dev); return (!rtnl_trylock()) ? -EBUSY : 0; } if (!strcmp(buf, "datagram\n")) { clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags); netdev_update_features(dev); dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu)); netif_set_real_num_tx_queues(dev, dev->num_tx_queues); rtnl_unlock(); ipoib_flush_paths(dev); return (!rtnl_trylock()) ? -EBUSY : 0; } return -EINVAL; } struct ipoib_path *__path_find(struct net_device *dev, void *gid) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rb_node *n = priv->path_tree.rb_node; struct ipoib_path *path; int ret; while (n) { path = rb_entry(n, struct ipoib_path, rb_node); ret = memcmp(gid, path->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = n->rb_left; else if (ret > 0) n = n->rb_right; else return path; } return NULL; } static int __path_add(struct net_device *dev, struct ipoib_path *path) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rb_node **n = &priv->path_tree.rb_node; struct rb_node *pn = NULL; struct ipoib_path *tpath; int ret; while (*n) { pn = *n; tpath = rb_entry(pn, struct ipoib_path, rb_node); ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw, sizeof (union ib_gid)); if (ret < 0) n = &pn->rb_left; else if (ret > 0) n = &pn->rb_right; else return -EEXIST; } rb_link_node(&path->rb_node, pn, n); rb_insert_color(&path->rb_node, &priv->path_tree); list_add_tail(&path->list, &priv->path_list); return 0; } static void path_free(struct net_device *dev, struct ipoib_path *path) { struct sk_buff *skb; while ((skb = __skb_dequeue(&path->queue))) dev_kfree_skb_irq(skb); ipoib_dbg(ipoib_priv(dev), "%s\n", __func__); /* remove all neigh connected to this path */ ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); if (path->ah) ipoib_put_ah(path->ah); kfree(path); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev) { struct ipoib_path_iter *iter; iter = kmalloc(sizeof(*iter), GFP_KERNEL); if (!iter) return NULL; iter->dev = dev; memset(iter->path.pathrec.dgid.raw, 0, 16); if (ipoib_path_iter_next(iter)) { kfree(iter); return NULL; } return iter; } int ipoib_path_iter_next(struct ipoib_path_iter *iter) { struct ipoib_dev_priv *priv = ipoib_priv(iter->dev); struct rb_node *n; struct ipoib_path *path; int ret = 1; spin_lock_irq(&priv->lock); n = rb_first(&priv->path_tree); while (n) { path = rb_entry(n, struct ipoib_path, rb_node); if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw, sizeof (union ib_gid)) < 0) { iter->path = *path; ret = 0; break; } n = rb_next(n); } spin_unlock_irq(&priv->lock); return ret; } void ipoib_path_iter_read(struct ipoib_path_iter *iter, struct ipoib_path *path) { *path = iter->path; } #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */ void ipoib_mark_paths_invalid(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_path *path, *tp; spin_lock_irq(&priv->lock); list_for_each_entry_safe(path, tp, &priv->path_list, list) { ipoib_dbg(priv, "mark path LID 0x%08x GID %pI6 invalid\n", be32_to_cpu(sa_path_get_dlid(&path->pathrec)), path->pathrec.dgid.raw); if (path->ah) path->ah->valid = 0; } spin_unlock_irq(&priv->lock); } static void push_pseudo_header(struct sk_buff *skb, const char *daddr) { struct ipoib_pseudo_header *phdr; phdr = skb_push(skb, sizeof(*phdr)); memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); } void ipoib_flush_paths(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_path *path, *tp; LIST_HEAD(remove_list); unsigned long flags; netif_tx_lock_bh(dev); spin_lock_irqsave(&priv->lock, flags); list_splice_init(&priv->path_list, &remove_list); list_for_each_entry(path, &remove_list, list) rb_erase(&path->rb_node, &priv->path_tree); list_for_each_entry_safe(path, tp, &remove_list, list) { if (path->query) ib_sa_cancel_query(path->query_id, path->query); spin_unlock_irqrestore(&priv->lock, flags); netif_tx_unlock_bh(dev); wait_for_completion(&path->done); path_free(dev, path); netif_tx_lock_bh(dev); spin_lock_irqsave(&priv->lock, flags); } spin_unlock_irqrestore(&priv->lock, flags); netif_tx_unlock_bh(dev); } static void path_rec_completion(int status, struct sa_path_rec *pathrec, unsigned int num_prs, void *path_ptr) { struct ipoib_path *path = path_ptr; struct net_device *dev = path->dev; struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_ah *ah = NULL; struct ipoib_ah *old_ah = NULL; struct ipoib_neigh *neigh, *tn; struct sk_buff_head skqueue; struct sk_buff *skb; unsigned long flags; if (!status) ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n", be32_to_cpu(sa_path_get_dlid(pathrec)), pathrec->dgid.raw); else ipoib_dbg(priv, "PathRec status %d for GID %pI6\n", status, path->pathrec.dgid.raw); skb_queue_head_init(&skqueue); if (!status) { struct rdma_ah_attr av; if (!ib_init_ah_attr_from_path(priv->ca, priv->port, pathrec, &av, NULL)) { ah = ipoib_create_ah(dev, priv->pd, &av); rdma_destroy_ah_attr(&av); } } spin_lock_irqsave(&priv->lock, flags); if (!IS_ERR_OR_NULL(ah)) { /* * pathrec.dgid is used as the database key from the LLADDR, * it must remain unchanged even if the SA returns a different * GID to use in the AH. */ if (memcmp(pathrec->dgid.raw, path->pathrec.dgid.raw, sizeof(union ib_gid))) { ipoib_dbg( priv, "%s got PathRec for gid %pI6 while asked for %pI6\n", dev->name, pathrec->dgid.raw, path->pathrec.dgid.raw); memcpy(pathrec->dgid.raw, path->pathrec.dgid.raw, sizeof(union ib_gid)); } path->pathrec = *pathrec; old_ah = path->ah; path->ah = ah; ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n", ah, be32_to_cpu(sa_path_get_dlid(pathrec)), pathrec->sl); while ((skb = __skb_dequeue(&path->queue))) __skb_queue_tail(&skqueue, skb); list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) { if (neigh->ah) { WARN_ON(neigh->ah != old_ah); /* * Dropping the ah reference inside * priv->lock is safe here, because we * will hold one more reference from * the original value of path->ah (ie * old_ah). */ ipoib_put_ah(neigh->ah); } kref_get(&path->ah->ref); neigh->ah = path->ah; if (ipoib_cm_enabled(dev, neigh->daddr)) { if (!ipoib_cm_get(neigh)) ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); if (!ipoib_cm_get(neigh)) { ipoib_neigh_free(neigh); continue; } } while ((skb = __skb_dequeue(&neigh->queue))) __skb_queue_tail(&skqueue, skb); } path->ah->valid = 1; } path->query = NULL; complete(&path->done); spin_unlock_irqrestore(&priv->lock, flags); if (IS_ERR_OR_NULL(ah)) ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw); if (old_ah) ipoib_put_ah(old_ah); while ((skb = __skb_dequeue(&skqueue))) { int ret; skb->dev = dev; ret = dev_queue_xmit(skb); if (ret) ipoib_warn(priv, "%s: dev_queue_xmit failed to re-queue packet, ret:%d\n", __func__, ret); } } static void init_path_rec(struct ipoib_dev_priv *priv, struct ipoib_path *path, void *gid) { path->dev = priv->dev; if (rdma_cap_opa_ah(priv->ca, priv->port)) path->pathrec.rec_type = SA_PATH_REC_TYPE_OPA; else path->pathrec.rec_type = SA_PATH_REC_TYPE_IB; memcpy(path->pathrec.dgid.raw, gid, sizeof(union ib_gid)); path->pathrec.sgid = priv->local_gid; path->pathrec.pkey = cpu_to_be16(priv->pkey); path->pathrec.numb_path = 1; path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class; } static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_path *path; if (!priv->broadcast) return NULL; path = kzalloc(sizeof(*path), GFP_ATOMIC); if (!path) return NULL; skb_queue_head_init(&path->queue); INIT_LIST_HEAD(&path->neigh_list); init_path_rec(priv, path, gid); return path; } static int path_rec_start(struct net_device *dev, struct ipoib_path *path) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg(priv, "Start path record lookup for %pI6\n", path->pathrec.dgid.raw); init_completion(&path->done); path->query_id = ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port, &path->pathrec, IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_NUMB_PATH | IB_SA_PATH_REC_TRAFFIC_CLASS | IB_SA_PATH_REC_PKEY, 1000, GFP_ATOMIC, path_rec_completion, path, &path->query); if (path->query_id < 0) { ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id); path->query = NULL; complete(&path->done); return path->query_id; } return 0; } static void neigh_refresh_path(struct ipoib_neigh *neigh, u8 *daddr, struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_path *path; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); path = __path_find(dev, daddr + 4); if (!path) goto out; if (!path->query) path_rec_start(dev, path); out: spin_unlock_irqrestore(&priv->lock, flags); } static struct ipoib_neigh *neigh_add_path(struct sk_buff *skb, u8 *daddr, struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rdma_netdev *rn = netdev_priv(dev); struct ipoib_path *path; struct ipoib_neigh *neigh; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); neigh = ipoib_neigh_alloc(daddr, dev); if (!neigh) { spin_unlock_irqrestore(&priv->lock, flags); ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); return NULL; } /* To avoid race condition, make sure that the * neigh will be added only once. */ if (unlikely(!list_empty(&neigh->list))) { spin_unlock_irqrestore(&priv->lock, flags); return neigh; } path = __path_find(dev, daddr + 4); if (!path) { path = path_rec_create(dev, daddr + 4); if (!path) goto err_path; __path_add(dev, path); } list_add_tail(&neigh->list, &path->neigh_list); if (path->ah && path->ah->valid) { kref_get(&path->ah->ref); neigh->ah = path->ah; if (ipoib_cm_enabled(dev, neigh->daddr)) { if (!ipoib_cm_get(neigh)) ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh)); if (!ipoib_cm_get(neigh)) { ipoib_neigh_free(neigh); goto err_drop; } if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { push_pseudo_header(skb, neigh->daddr); __skb_queue_tail(&neigh->queue, skb); } else { ipoib_warn(priv, "queue length limit %d. Packet drop.\n", skb_queue_len(&neigh->queue)); goto err_drop; } } else { spin_unlock_irqrestore(&priv->lock, flags); path->ah->last_send = rn->send(dev, skb, path->ah->ah, IPOIB_QPN(daddr)); ipoib_neigh_put(neigh); return NULL; } } else { neigh->ah = NULL; if (!path->query && path_rec_start(dev, path)) goto err_path; if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { push_pseudo_header(skb, neigh->daddr); __skb_queue_tail(&neigh->queue, skb); } else { goto err_drop; } } spin_unlock_irqrestore(&priv->lock, flags); ipoib_neigh_put(neigh); return NULL; err_path: ipoib_neigh_free(neigh); err_drop: ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); spin_unlock_irqrestore(&priv->lock, flags); ipoib_neigh_put(neigh); return NULL; } static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, struct ipoib_pseudo_header *phdr) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rdma_netdev *rn = netdev_priv(dev); struct ipoib_path *path; unsigned long flags; spin_lock_irqsave(&priv->lock, flags); /* no broadcast means that all paths are (going to be) not valid */ if (!priv->broadcast) goto drop_and_unlock; path = __path_find(dev, phdr->hwaddr + 4); if (!path || !path->ah || !path->ah->valid) { if (!path) { path = path_rec_create(dev, phdr->hwaddr + 4); if (!path) goto drop_and_unlock; __path_add(dev, path); } else { /* * make sure there are no changes in the existing * path record */ init_path_rec(priv, path, phdr->hwaddr + 4); } if (!path->query && path_rec_start(dev, path)) { goto drop_and_unlock; } if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { push_pseudo_header(skb, phdr->hwaddr); __skb_queue_tail(&path->queue, skb); goto unlock; } else { goto drop_and_unlock; } } spin_unlock_irqrestore(&priv->lock, flags); ipoib_dbg(priv, "Send unicast ARP to %08x\n", be32_to_cpu(sa_path_get_dlid(&path->pathrec))); path->ah->last_send = rn->send(dev, skb, path->ah->ah, IPOIB_QPN(phdr->hwaddr)); return; drop_and_unlock: ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); unlock: spin_unlock_irqrestore(&priv->lock, flags); } static netdev_tx_t ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rdma_netdev *rn = netdev_priv(dev); struct ipoib_neigh *neigh; struct ipoib_pseudo_header *phdr; struct ipoib_header *header; unsigned long flags; phdr = (struct ipoib_pseudo_header *) skb->data; skb_pull(skb, sizeof(*phdr)); header = (struct ipoib_header *) skb->data; if (unlikely(phdr->hwaddr[4] == 0xff)) { /* multicast, arrange "if" according to probability */ if ((header->proto != htons(ETH_P_IP)) && (header->proto != htons(ETH_P_IPV6)) && (header->proto != htons(ETH_P_ARP)) && (header->proto != htons(ETH_P_RARP)) && (header->proto != htons(ETH_P_TIPC))) { /* ethertype not supported by IPoIB */ ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); return NETDEV_TX_OK; } /* Add in the P_Key for multicast*/ phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff; phdr->hwaddr[9] = priv->pkey & 0xff; neigh = ipoib_neigh_get(dev, phdr->hwaddr); if (likely(neigh)) goto send_using_neigh; ipoib_mcast_send(dev, phdr->hwaddr, skb); return NETDEV_TX_OK; } /* unicast, arrange "switch" according to probability */ switch (header->proto) { case htons(ETH_P_IP): case htons(ETH_P_IPV6): case htons(ETH_P_TIPC): neigh = ipoib_neigh_get(dev, phdr->hwaddr); if (unlikely(!neigh)) { neigh = neigh_add_path(skb, phdr->hwaddr, dev); if (likely(!neigh)) return NETDEV_TX_OK; } break; case htons(ETH_P_ARP): case htons(ETH_P_RARP): /* for unicast ARP and RARP should always perform path find */ unicast_arp_send(skb, dev, phdr); return NETDEV_TX_OK; default: /* ethertype not supported by IPoIB */ ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); return NETDEV_TX_OK; } send_using_neigh: /* note we now hold a ref to neigh */ if (ipoib_cm_get(neigh)) { if (ipoib_cm_up(neigh)) { ipoib_cm_send(dev, skb, ipoib_cm_get(neigh)); goto unref; } } else if (neigh->ah && neigh->ah->valid) { neigh->ah->last_send = rn->send(dev, skb, neigh->ah->ah, IPOIB_QPN(phdr->hwaddr)); goto unref; } else if (neigh->ah) { neigh_refresh_path(neigh, phdr->hwaddr, dev); } if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { push_pseudo_header(skb, phdr->hwaddr); spin_lock_irqsave(&priv->lock, flags); __skb_queue_tail(&neigh->queue, skb); spin_unlock_irqrestore(&priv->lock, flags); } else { ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); } unref: ipoib_neigh_put(neigh); return NETDEV_TX_OK; } static void ipoib_timeout(struct net_device *dev, unsigned int txqueue) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rdma_netdev *rn = netdev_priv(dev); if (rn->tx_timeout) { rn->tx_timeout(dev, txqueue); return; } ipoib_warn(priv, "transmit timeout: latency %d msecs\n", jiffies_to_msecs(jiffies - dev_trans_start(dev))); ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u, global_tx_head %u, global_tx_tail %u\n", netif_queue_stopped(dev), priv->tx_head, priv->tx_tail, priv->global_tx_head, priv->global_tx_tail); schedule_work(&priv->tx_timeout_work); } void ipoib_ib_tx_timeout_work(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, tx_timeout_work); int err; rtnl_lock(); if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) goto unlock; ipoib_stop(priv->dev); err = ipoib_open(priv->dev); if (err) { ipoib_warn(priv, "ipoib_open failed recovering from a tx_timeout, err(%d).\n", err); goto unlock; } netif_tx_wake_all_queues(priv->dev); unlock: rtnl_unlock(); } static int ipoib_hard_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { struct ipoib_header *header; header = skb_push(skb, sizeof(*header)); header->proto = htons(type); header->reserved = 0; /* * we don't rely on dst_entry structure, always stuff the * destination address into skb hard header so we can figure out where * to send the packet later. */ push_pseudo_header(skb, daddr); return IPOIB_HARD_LEN; } static void ipoib_set_mcast_list(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) { ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set"); return; } queue_work(priv->wq, &priv->restart_task); } static int ipoib_get_iflink(const struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); /* parent interface */ if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) return READ_ONCE(dev->ifindex); /* child/vlan interface */ return READ_ONCE(priv->parent->ifindex); } static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr) { /* * Use only the address parts that contributes to spreading * The subnet prefix is not used as one can not connect to * same remote port (GUID) using the same remote QPN via two * different subnets. */ /* qpn octets[1:4) & port GUID octets[12:20) */ u32 *d32 = (u32 *) daddr; u32 hv; hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0); return hv & htbl->mask; } struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; struct ipoib_neigh *neigh = NULL; u32 hash_val; rcu_read_lock_bh(); htbl = rcu_dereference_bh(ntbl->htbl); if (!htbl) goto out_unlock; hash_val = ipoib_addr_hash(htbl, daddr); for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]); neigh != NULL; neigh = rcu_dereference_bh(neigh->hnext)) { if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { /* found, take one ref on behalf of the caller */ if (!refcount_inc_not_zero(&neigh->refcnt)) { /* deleted */ neigh = NULL; goto out_unlock; } if (likely(skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)) neigh->alive = jiffies; goto out_unlock; } } out_unlock: rcu_read_unlock_bh(); return neigh; } static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) { struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; unsigned long neigh_obsolete; unsigned long dt; unsigned long flags; int i; LIST_HEAD(remove_list); spin_lock_irqsave(&priv->lock, flags); htbl = rcu_dereference_protected(ntbl->htbl, lockdep_is_held(&priv->lock)); if (!htbl) goto out_unlock; /* neigh is obsolete if it was idle for two GC periods */ dt = 2 * arp_tbl.gc_interval; neigh_obsolete = jiffies - dt; for (i = 0; i < htbl->size; i++) { struct ipoib_neigh *neigh; struct ipoib_neigh __rcu **np = &htbl->buckets[i]; while ((neigh = rcu_dereference_protected(*np, lockdep_is_held(&priv->lock))) != NULL) { /* was the neigh idle for two GC periods */ if (time_after(neigh_obsolete, neigh->alive)) { ipoib_check_and_add_mcast_sendonly(priv, neigh->daddr + 4, &remove_list); rcu_assign_pointer(*np, rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from path/mc list */ list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } else { np = &neigh->hnext; } } } out_unlock: spin_unlock_irqrestore(&priv->lock, flags); ipoib_mcast_remove_list(&remove_list); } static void ipoib_reap_neigh(struct work_struct *work) { struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, neigh_reap_task.work); __ipoib_reap_neigh(priv); queue_delayed_work(priv->wq, &priv->neigh_reap_task, arp_tbl.gc_interval); } static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr, struct net_device *dev) { struct ipoib_neigh *neigh; neigh = kzalloc(sizeof(*neigh), GFP_ATOMIC); if (!neigh) return NULL; neigh->dev = dev; memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr)); skb_queue_head_init(&neigh->queue); INIT_LIST_HEAD(&neigh->list); ipoib_cm_set(neigh, NULL); /* one ref on behalf of the caller */ refcount_set(&neigh->refcnt, 1); return neigh; } struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr, struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; struct ipoib_neigh *neigh; u32 hash_val; htbl = rcu_dereference_protected(ntbl->htbl, lockdep_is_held(&priv->lock)); if (!htbl) { neigh = NULL; goto out_unlock; } /* need to add a new neigh, but maybe some other thread succeeded? * recalc hash, maybe hash resize took place so we do a search */ hash_val = ipoib_addr_hash(htbl, daddr); for (neigh = rcu_dereference_protected(htbl->buckets[hash_val], lockdep_is_held(&priv->lock)); neigh != NULL; neigh = rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))) { if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) { /* found, take one ref on behalf of the caller */ if (!refcount_inc_not_zero(&neigh->refcnt)) { /* deleted */ neigh = NULL; break; } neigh->alive = jiffies; goto out_unlock; } } neigh = ipoib_neigh_ctor(daddr, dev); if (!neigh) goto out_unlock; /* one ref on behalf of the hash table */ refcount_inc(&neigh->refcnt); neigh->alive = jiffies; /* put in hash */ rcu_assign_pointer(neigh->hnext, rcu_dereference_protected(htbl->buckets[hash_val], lockdep_is_held(&priv->lock))); rcu_assign_pointer(htbl->buckets[hash_val], neigh); atomic_inc(&ntbl->entries); out_unlock: return neigh; } void ipoib_neigh_dtor(struct ipoib_neigh *neigh) { /* neigh reference count was dropprd to zero */ struct net_device *dev = neigh->dev; struct ipoib_dev_priv *priv = ipoib_priv(dev); struct sk_buff *skb; if (neigh->ah) ipoib_put_ah(neigh->ah); while ((skb = __skb_dequeue(&neigh->queue))) { ++dev->stats.tx_dropped; dev_kfree_skb_any(skb); } if (ipoib_cm_get(neigh)) ipoib_cm_destroy_tx(ipoib_cm_get(neigh)); ipoib_dbg(ipoib_priv(dev), "neigh free for %06x %pI6\n", IPOIB_QPN(neigh->daddr), neigh->daddr + 4); kfree(neigh); if (atomic_dec_and_test(&priv->ntbl.entries)) { if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags)) complete(&priv->ntbl.flushed); } } static void ipoib_neigh_reclaim(struct rcu_head *rp) { /* Called as a result of removal from hash table */ struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu); /* note TX context may hold another ref */ ipoib_neigh_put(neigh); } void ipoib_neigh_free(struct ipoib_neigh *neigh) { struct net_device *dev = neigh->dev; struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; struct ipoib_neigh __rcu **np; struct ipoib_neigh *n; u32 hash_val; htbl = rcu_dereference_protected(ntbl->htbl, lockdep_is_held(&priv->lock)); if (!htbl) return; hash_val = ipoib_addr_hash(htbl, neigh->daddr); np = &htbl->buckets[hash_val]; for (n = rcu_dereference_protected(*np, lockdep_is_held(&priv->lock)); n != NULL; n = rcu_dereference_protected(*np, lockdep_is_held(&priv->lock))) { if (n == neigh) { /* found */ rcu_assign_pointer(*np, rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from parent list */ list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); return; } else { np = &n->hnext; } } } static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv) { struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; struct ipoib_neigh __rcu **buckets; u32 size; clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); ntbl->htbl = NULL; htbl = kzalloc(sizeof(*htbl), GFP_KERNEL); if (!htbl) return -ENOMEM; size = roundup_pow_of_two(arp_tbl.gc_thresh3); buckets = kvcalloc(size, sizeof(*buckets), GFP_KERNEL); if (!buckets) { kfree(htbl); return -ENOMEM; } htbl->size = size; htbl->mask = (size - 1); htbl->buckets = buckets; RCU_INIT_POINTER(ntbl->htbl, htbl); htbl->ntbl = ntbl; atomic_set(&ntbl->entries, 0); /* start garbage collection */ queue_delayed_work(priv->wq, &priv->neigh_reap_task, arp_tbl.gc_interval); return 0; } static void neigh_hash_free_rcu(struct rcu_head *head) { struct ipoib_neigh_hash *htbl = container_of(head, struct ipoib_neigh_hash, rcu); struct ipoib_neigh __rcu **buckets = htbl->buckets; struct ipoib_neigh_table *ntbl = htbl->ntbl; kvfree(buckets); kfree(htbl); complete(&ntbl->deleted); } void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; unsigned long flags; int i; /* remove all neigh connected to a given path or mcast */ spin_lock_irqsave(&priv->lock, flags); htbl = rcu_dereference_protected(ntbl->htbl, lockdep_is_held(&priv->lock)); if (!htbl) goto out_unlock; for (i = 0; i < htbl->size; i++) { struct ipoib_neigh *neigh; struct ipoib_neigh __rcu **np = &htbl->buckets[i]; while ((neigh = rcu_dereference_protected(*np, lockdep_is_held(&priv->lock))) != NULL) { /* delete neighs belong to this parent */ if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) { rcu_assign_pointer(*np, rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from parent list */ list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } else { np = &neigh->hnext; } } } out_unlock: spin_unlock_irqrestore(&priv->lock, flags); } static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) { struct ipoib_neigh_table *ntbl = &priv->ntbl; struct ipoib_neigh_hash *htbl; unsigned long flags; int i, wait_flushed = 0; init_completion(&priv->ntbl.flushed); set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); spin_lock_irqsave(&priv->lock, flags); htbl = rcu_dereference_protected(ntbl->htbl, lockdep_is_held(&priv->lock)); if (!htbl) goto out_unlock; wait_flushed = atomic_read(&priv->ntbl.entries); if (!wait_flushed) goto free_htbl; for (i = 0; i < htbl->size; i++) { struct ipoib_neigh *neigh; struct ipoib_neigh __rcu **np = &htbl->buckets[i]; while ((neigh = rcu_dereference_protected(*np, lockdep_is_held(&priv->lock))) != NULL) { rcu_assign_pointer(*np, rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from path/mc list */ list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } } free_htbl: rcu_assign_pointer(ntbl->htbl, NULL); call_rcu(&htbl->rcu, neigh_hash_free_rcu); out_unlock: spin_unlock_irqrestore(&priv->lock, flags); if (wait_flushed) wait_for_completion(&priv->ntbl.flushed); } static void ipoib_neigh_hash_uninit(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_dbg(priv, "%s\n", __func__); init_completion(&priv->ntbl.deleted); cancel_delayed_work_sync(&priv->neigh_reap_task); ipoib_flush_neighs(priv); wait_for_completion(&priv->ntbl.deleted); } static void ipoib_napi_add(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); netif_napi_add_weight(dev, &priv->recv_napi, ipoib_rx_poll, IPOIB_NUM_WC); netif_napi_add_weight(dev, &priv->send_napi, ipoib_tx_poll, MAX_SEND_CQE); } static void ipoib_napi_del(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); netif_napi_del(&priv->recv_napi); netif_napi_del(&priv->send_napi); } static void ipoib_dev_uninit_default(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ipoib_transport_dev_cleanup(dev); ipoib_napi_del(dev); ipoib_cm_dev_cleanup(dev); kfree(priv->rx_ring); vfree(priv->tx_ring); priv->rx_ring = NULL; priv->tx_ring = NULL; } static int ipoib_dev_init_default(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); u8 addr_mod[3]; ipoib_napi_add(dev); /* Allocate RX/TX "rings" to hold queued skbs */ priv->rx_ring = kcalloc(ipoib_recvq_size, sizeof(*priv->rx_ring), GFP_KERNEL); if (!priv->rx_ring) goto out; priv->tx_ring = vzalloc(array_size(ipoib_sendq_size, sizeof(*priv->tx_ring))); if (!priv->tx_ring) { pr_warn("%s: failed to allocate TX ring (%d entries)\n", priv->ca->name, ipoib_sendq_size); goto out_rx_ring_cleanup; } /* priv->tx_head, tx_tail and global_tx_tail/head are already 0 */ if (ipoib_transport_dev_init(dev, priv->ca)) { pr_warn("%s: ipoib_transport_dev_init failed\n", priv->ca->name); goto out_tx_ring_cleanup; } /* after qp created set dev address */ addr_mod[0] = (priv->qp->qp_num >> 16) & 0xff; addr_mod[1] = (priv->qp->qp_num >> 8) & 0xff; addr_mod[2] = (priv->qp->qp_num) & 0xff; dev_addr_mod(priv->dev, 1, addr_mod, sizeof(addr_mod)); return 0; out_tx_ring_cleanup: vfree(priv->tx_ring); out_rx_ring_cleanup: kfree(priv->rx_ring); out: ipoib_napi_del(dev); return -ENOMEM; } static int ipoib_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if (!priv->rn_ops->ndo_eth_ioctl) return -EOPNOTSUPP; return priv->rn_ops->ndo_eth_ioctl(dev, ifr, cmd); } static int ipoib_dev_init(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); int ret = -ENOMEM; priv->qp = NULL; /* * the various IPoIB tasks assume they will never race against * themselves, so always use a single thread workqueue */ priv->wq = alloc_ordered_workqueue("ipoib_wq", WQ_MEM_RECLAIM); if (!priv->wq) { pr_warn("%s: failed to allocate device WQ\n", dev->name); goto out; } /* create pd, which used both for control and datapath*/ priv->pd = ib_alloc_pd(priv->ca, 0); if (IS_ERR(priv->pd)) { pr_warn("%s: failed to allocate PD\n", priv->ca->name); goto clean_wq; } ret = priv->rn_ops->ndo_init(dev); if (ret) { pr_warn("%s failed to init HW resource\n", dev->name); goto out_free_pd; } ret = ipoib_neigh_hash_init(priv); if (ret) { pr_warn("%s failed to init neigh hash\n", dev->name); goto out_dev_uninit; } if (dev->flags & IFF_UP) { if (ipoib_ib_dev_open(dev)) { pr_warn("%s failed to open device\n", dev->name); ret = -ENODEV; goto out_hash_uninit; } } return 0; out_hash_uninit: ipoib_neigh_hash_uninit(dev); out_dev_uninit: ipoib_ib_dev_cleanup(dev); out_free_pd: if (priv->pd) { ib_dealloc_pd(priv->pd); priv->pd = NULL; } clean_wq: if (priv->wq) { destroy_workqueue(priv->wq); priv->wq = NULL; } out: return ret; } /* * This must be called before doing an unregister_netdev on a parent device to * shutdown the IB event handler. */ static void ipoib_parent_unregister_pre(struct net_device *ndev) { struct ipoib_dev_priv *priv = ipoib_priv(ndev); /* * ipoib_set_mac checks netif_running before pushing work, clearing * running ensures the it will not add more work. */ rtnl_lock(); dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP, NULL); rtnl_unlock(); /* ipoib_event() cannot be running once this returns */ ib_unregister_event_handler(&priv->event_handler); /* * Work on the queue grabs the rtnl lock, so this cannot be done while * also holding it. */ flush_workqueue(ipoib_workqueue); } static void ipoib_set_dev_features(struct ipoib_dev_priv *priv) { priv->hca_caps = priv->ca->attrs.device_cap_flags; priv->kernel_caps = priv->ca->attrs.kernel_cap_flags; if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { priv->dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_RXCSUM; if (priv->kernel_caps & IBK_UD_TSO) priv->dev->hw_features |= NETIF_F_TSO; priv->dev->features |= priv->dev->hw_features; } } static int ipoib_parent_init(struct net_device *ndev) { struct ipoib_dev_priv *priv = ipoib_priv(ndev); struct ib_port_attr attr; int result; result = ib_query_port(priv->ca, priv->port, &attr); if (result) { pr_warn("%s: ib_query_port %d failed\n", priv->ca->name, priv->port); return result; } priv->max_ib_mtu = rdma_mtu_from_attr(priv->ca, priv->port, &attr); result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey); if (result) { pr_warn("%s: ib_query_pkey port %d failed (ret = %d)\n", priv->ca->name, priv->port, result); return result; } result = rdma_query_gid(priv->ca, priv->port, 0, &priv->local_gid); if (result) { pr_warn("%s: rdma_query_gid port %d failed (ret = %d)\n", priv->ca->name, priv->port, result); return result; } dev_addr_mod(priv->dev, 4, priv->local_gid.raw, sizeof(union ib_gid)); SET_NETDEV_DEV(priv->dev, priv->ca->dev.parent); priv->dev->dev_port = priv->port - 1; /* Let's set this one too for backwards compatibility. */ priv->dev->dev_id = priv->port - 1; return 0; } static void ipoib_child_init(struct net_device *ndev) { struct ipoib_dev_priv *priv = ipoib_priv(ndev); struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); priv->max_ib_mtu = ppriv->max_ib_mtu; set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags); if (memchr_inv(priv->dev->dev_addr, 0, INFINIBAND_ALEN)) memcpy(&priv->local_gid, priv->dev->dev_addr + 4, sizeof(priv->local_gid)); else { __dev_addr_set(priv->dev, ppriv->dev->dev_addr, INFINIBAND_ALEN); memcpy(&priv->local_gid, &ppriv->local_gid, sizeof(priv->local_gid)); } } static int ipoib_ndo_init(struct net_device *ndev) { struct ipoib_dev_priv *priv = ipoib_priv(ndev); int rc; struct rdma_netdev *rn = netdev_priv(ndev); if (priv->parent) { ipoib_child_init(ndev); } else { rc = ipoib_parent_init(ndev); if (rc) return rc; } /* MTU will be reset when mcast join happens */ ndev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); priv->mcast_mtu = priv->admin_mtu = ndev->mtu; rn->mtu = priv->mcast_mtu; ndev->max_mtu = IPOIB_CM_MTU; ndev->neigh_priv_len = sizeof(struct ipoib_neigh); /* * Set the full membership bit, so that we join the right * broadcast group, etc. */ priv->pkey |= 0x8000; ndev->broadcast[8] = priv->pkey >> 8; ndev->broadcast[9] = priv->pkey & 0xff; set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); ipoib_set_dev_features(priv); rc = ipoib_dev_init(ndev); if (rc) { pr_warn("%s: failed to initialize device: %s port %d (ret = %d)\n", priv->ca->name, priv->dev->name, priv->port, rc); return rc; } if (priv->parent) { struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); dev_hold(priv->parent); down_write(&ppriv->vlan_rwsem); list_add_tail(&priv->list, &ppriv->child_intfs); up_write(&ppriv->vlan_rwsem); } return 0; } static void ipoib_ndo_uninit(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); ASSERT_RTNL(); /* * ipoib_remove_one guarantees the children are removed before the * parent, and that is the only place where a parent can be removed. */ WARN_ON(!list_empty(&priv->child_intfs)); if (priv->parent) { struct ipoib_dev_priv *ppriv = ipoib_priv(priv->parent); down_write(&ppriv->vlan_rwsem); list_del(&priv->list); up_write(&ppriv->vlan_rwsem); } ipoib_neigh_hash_uninit(dev); ipoib_ib_dev_cleanup(dev); /* no more works over the priv->wq */ if (priv->wq) { /* See ipoib_mcast_carrier_on_task() */ WARN_ON(test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)); destroy_workqueue(priv->wq); priv->wq = NULL; } dev_put(priv->parent); } static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state) { struct ipoib_dev_priv *priv = ipoib_priv(dev); return ib_set_vf_link_state(priv->ca, vf, priv->port, link_state); } static int ipoib_get_vf_config(struct net_device *dev, int vf, struct ifla_vf_info *ivf) { struct ipoib_dev_priv *priv = ipoib_priv(dev); int err; err = ib_get_vf_config(priv->ca, vf, priv->port, ivf); if (err) return err; ivf->vf = vf; memcpy(ivf->mac, dev->dev_addr, dev->addr_len); return 0; } static int ipoib_set_vf_guid(struct net_device *dev, int vf, u64 guid, int type) { struct ipoib_dev_priv *priv = ipoib_priv(dev); if (type != IFLA_VF_IB_NODE_GUID && type != IFLA_VF_IB_PORT_GUID) return -EINVAL; return ib_set_vf_guid(priv->ca, vf, priv->port, guid, type); } static int ipoib_get_vf_guid(struct net_device *dev, int vf, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid) { struct ipoib_dev_priv *priv = ipoib_priv(dev); return ib_get_vf_guid(priv->ca, vf, priv->port, node_guid, port_guid); } static int ipoib_get_vf_stats(struct net_device *dev, int vf, struct ifla_vf_stats *vf_stats) { struct ipoib_dev_priv *priv = ipoib_priv(dev); return ib_get_vf_stats(priv->ca, vf, priv->port, vf_stats); } static const struct header_ops ipoib_header_ops = { .create = ipoib_hard_header, }; static const struct net_device_ops ipoib_netdev_ops_pf = { .ndo_init = ipoib_ndo_init, .ndo_uninit = ipoib_ndo_uninit, .ndo_open = ipoib_open, .ndo_stop = ipoib_stop, .ndo_change_mtu = ipoib_change_mtu, .ndo_fix_features = ipoib_fix_features, .ndo_start_xmit = ipoib_start_xmit, .ndo_tx_timeout = ipoib_timeout, .ndo_set_rx_mode = ipoib_set_mcast_list, .ndo_get_iflink = ipoib_get_iflink, .ndo_set_vf_link_state = ipoib_set_vf_link_state, .ndo_get_vf_config = ipoib_get_vf_config, .ndo_get_vf_stats = ipoib_get_vf_stats, .ndo_get_vf_guid = ipoib_get_vf_guid, .ndo_set_vf_guid = ipoib_set_vf_guid, .ndo_set_mac_address = ipoib_set_mac, .ndo_get_stats64 = ipoib_get_stats, .ndo_eth_ioctl = ipoib_ioctl, }; static const struct net_device_ops ipoib_netdev_ops_vf = { .ndo_init = ipoib_ndo_init, .ndo_uninit = ipoib_ndo_uninit, .ndo_open = ipoib_open, .ndo_stop = ipoib_stop, .ndo_change_mtu = ipoib_change_mtu, .ndo_fix_features = ipoib_fix_features, .ndo_start_xmit = ipoib_start_xmit, .ndo_tx_timeout = ipoib_timeout, .ndo_set_rx_mode = ipoib_set_mcast_list, .ndo_get_iflink = ipoib_get_iflink, .ndo_get_stats64 = ipoib_get_stats, .ndo_eth_ioctl = ipoib_ioctl, }; static const struct net_device_ops ipoib_netdev_default_pf = { .ndo_init = ipoib_dev_init_default, .ndo_uninit = ipoib_dev_uninit_default, .ndo_open = ipoib_ib_dev_open_default, .ndo_stop = ipoib_ib_dev_stop_default, }; void ipoib_setup_common(struct net_device *dev) { dev->header_ops = &ipoib_header_ops; dev->netdev_ops = &ipoib_netdev_default_pf; ipoib_set_ethtool_ops(dev); dev->watchdog_timeo = 10 * HZ; dev->flags |= IFF_BROADCAST | IFF_MULTICAST; dev->hard_header_len = IPOIB_HARD_LEN; dev->addr_len = INFINIBAND_ALEN; dev->type = ARPHRD_INFINIBAND; dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; dev->features = (NETIF_F_VLAN_CHALLENGED | NETIF_F_HIGHDMA); netif_keep_dst(dev); memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN); /* * unregister_netdev always frees the netdev, we use this mode * consistently to unify all the various unregister paths, including * those connected to rtnl_link_ops which require it. */ dev->needs_free_netdev = true; } static void ipoib_build_priv(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); priv->dev = dev; spin_lock_init(&priv->lock); init_rwsem(&priv->vlan_rwsem); mutex_init(&priv->mcast_mutex); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); INIT_LIST_HEAD(&priv->dead_ahs); INIT_LIST_HEAD(&priv->multicast_list); INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); INIT_WORK(&priv->reschedule_napi_work, ipoib_napi_schedule_work); INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal); INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy); INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task); INIT_WORK(&priv->tx_timeout_work, ipoib_ib_tx_timeout_work); INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah); INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh); } static struct net_device *ipoib_alloc_netdev(struct ib_device *hca, u32 port, const char *name) { struct net_device *dev; dev = rdma_alloc_netdev(hca, port, RDMA_NETDEV_IPOIB, name, NET_NAME_UNKNOWN, ipoib_setup_common); if (!IS_ERR(dev) || PTR_ERR(dev) != -EOPNOTSUPP) return dev; dev = alloc_netdev(sizeof(struct rdma_netdev), name, NET_NAME_UNKNOWN, ipoib_setup_common); if (!dev) return ERR_PTR(-ENOMEM); return dev; } int ipoib_intf_init(struct ib_device *hca, u32 port, const char *name, struct net_device *dev) { struct rdma_netdev *rn = netdev_priv(dev); struct ipoib_dev_priv *priv; int rc; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; priv->ca = hca; priv->port = port; rc = rdma_init_netdev(hca, port, RDMA_NETDEV_IPOIB, name, NET_NAME_UNKNOWN, ipoib_setup_common, dev); if (rc) { if (rc != -EOPNOTSUPP) goto out; rn->send = ipoib_send; rn->attach_mcast = ipoib_mcast_attach; rn->detach_mcast = ipoib_mcast_detach; rn->hca = hca; rc = netif_set_real_num_tx_queues(dev, 1); if (rc) goto out; rc = netif_set_real_num_rx_queues(dev, 1); if (rc) goto out; } priv->rn_ops = dev->netdev_ops; if (hca->attrs.kernel_cap_flags & IBK_VIRTUAL_FUNCTION) dev->netdev_ops = &ipoib_netdev_ops_vf; else dev->netdev_ops = &ipoib_netdev_ops_pf; rn->clnt_priv = priv; /* * Only the child register_netdev flows can handle priv_destructor * being set, so we force it to NULL here and handle manually until it * is safe to turn on. */ priv->next_priv_destructor = dev->priv_destructor; dev->priv_destructor = NULL; ipoib_build_priv(dev); return 0; out: kfree(priv); return rc; } struct net_device *ipoib_intf_alloc(struct ib_device *hca, u32 port, const char *name) { struct net_device *dev; int rc; dev = ipoib_alloc_netdev(hca, port, name); if (IS_ERR(dev)) return dev; rc = ipoib_intf_init(hca, port, name, dev); if (rc) { free_netdev(dev); return ERR_PTR(rc); } /* * Upon success the caller must ensure ipoib_intf_free is called or * register_netdevice succeed'd and priv_destructor is set to * ipoib_intf_free. */ return dev; } void ipoib_intf_free(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct rdma_netdev *rn = netdev_priv(dev); dev->priv_destructor = priv->next_priv_destructor; if (dev->priv_destructor) dev->priv_destructor(dev); /* * There are some error flows around register_netdev failing that may * attempt to call priv_destructor twice, prevent that from happening. */ dev->priv_destructor = NULL; /* unregister/destroy is very complicated. Make bugs more obvious. */ rn->clnt_priv = NULL; kfree(priv); } static ssize_t pkey_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); struct ipoib_dev_priv *priv = ipoib_priv(ndev); return sysfs_emit(buf, "0x%04x\n", priv->pkey); } static DEVICE_ATTR_RO(pkey); static ssize_t umcast_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); struct ipoib_dev_priv *priv = ipoib_priv(ndev); return sysfs_emit(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags)); } void ipoib_set_umcast(struct net_device *ndev, int umcast_val) { struct ipoib_dev_priv *priv = ipoib_priv(ndev); if (umcast_val > 0) { set_bit(IPOIB_FLAG_UMCAST, &priv->flags); ipoib_warn(priv, "ignoring multicast groups joined directly " "by userspace\n"); } else clear_bit(IPOIB_FLAG_UMCAST, &priv->flags); } static ssize_t umcast_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { unsigned long umcast_val = simple_strtoul(buf, NULL, 0); ipoib_set_umcast(to_net_dev(dev), umcast_val); return count; } static DEVICE_ATTR_RW(umcast); int ipoib_add_umcast_attr(struct net_device *dev) { return device_create_file(&dev->dev, &dev_attr_umcast); } static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid) { struct ipoib_dev_priv *child_priv; struct net_device *netdev = priv->dev; netif_addr_lock_bh(netdev); memcpy(&priv->local_gid.global.interface_id, &gid->global.interface_id, sizeof(gid->global.interface_id)); dev_addr_mod(netdev, 4, (u8 *)&priv->local_gid, sizeof(priv->local_gid)); clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); netif_addr_unlock_bh(netdev); if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { down_read(&priv->vlan_rwsem); list_for_each_entry(child_priv, &priv->child_intfs, list) set_base_guid(child_priv, gid); up_read(&priv->vlan_rwsem); } } static int ipoib_check_lladdr(struct net_device *dev, struct sockaddr_storage *ss) { union ib_gid *gid = (union ib_gid *)(ss->__data + 4); int ret = 0; netif_addr_lock_bh(dev); /* Make sure the QPN, reserved and subnet prefix match the current * lladdr, it also makes sure the lladdr is unicast. */ if (memcmp(dev->dev_addr, ss->__data, 4 + sizeof(gid->global.subnet_prefix)) || gid->global.interface_id == 0) ret = -EINVAL; netif_addr_unlock_bh(dev); return ret; } static int ipoib_set_mac(struct net_device *dev, void *addr) { struct ipoib_dev_priv *priv = ipoib_priv(dev); struct sockaddr_storage *ss = addr; int ret; if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev)) return -EBUSY; ret = ipoib_check_lladdr(dev, ss); if (ret) return ret; set_base_guid(priv, (union ib_gid *)(ss->__data + 4)); queue_work(ipoib_workqueue, &priv->flush_light); return 0; } static ssize_t create_child_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int pkey; int ret; if (sscanf(buf, "%i", &pkey) != 1) return -EINVAL; if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000) return -EINVAL; ret = ipoib_vlan_add(to_net_dev(dev), pkey); return ret ? ret : count; } static DEVICE_ATTR_WO(create_child); static ssize_t delete_child_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int pkey; int ret; if (sscanf(buf, "%i", &pkey) != 1) return -EINVAL; if (pkey < 0 || pkey > 0xffff) return -EINVAL; ret = ipoib_vlan_delete(to_net_dev(dev), pkey); return ret ? ret : count; } static DEVICE_ATTR_WO(delete_child); int ipoib_add_pkey_attr(struct net_device *dev) { return device_create_file(&dev->dev, &dev_attr_pkey); } /* * We erroneously exposed the iface's port number in the dev_id * sysfs field long after dev_port was introduced for that purpose[1], * and we need to stop everyone from relying on that. * Let's overload the shower routine for the dev_id file here * to gently bring the issue up. * * [1] https://www.spinics.net/lists/netdev/msg272123.html */ static ssize_t dev_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); /* * ndev->dev_port will be equal to 0 in old kernel prior to commit * 9b8b2a323008 ("IB/ipoib: Use dev_port to expose network interface * port numbers") Zero was chosen as special case for user space * applications to fallback and query dev_id to check if it has * different value or not. * * Don't print warning in such scenario. * * https://github.com/systemd/systemd/blob/master/src/udev/udev-builtin-net_id.c#L358 */ if (ndev->dev_port && ndev->dev_id == ndev->dev_port) netdev_info_once(ndev, "\"%s\" wants to know my dev_id. Should it look at dev_port instead? See Documentation/ABI/testing/sysfs-class-net for more info.\n", current->comm); return sysfs_emit(buf, "%#x\n", ndev->dev_id); } static DEVICE_ATTR_RO(dev_id); static int ipoib_intercept_dev_id_attr(struct net_device *dev) { device_remove_file(&dev->dev, &dev_attr_dev_id); return device_create_file(&dev->dev, &dev_attr_dev_id); } static struct net_device *ipoib_add_port(const char *format, struct ib_device *hca, u32 port) { struct rtnl_link_ops *ops = ipoib_get_link_ops(); struct rdma_netdev_alloc_params params; struct ipoib_dev_priv *priv; struct net_device *ndev; int result; ndev = ipoib_intf_alloc(hca, port, format); if (IS_ERR(ndev)) { pr_warn("%s, %d: ipoib_intf_alloc failed %ld\n", hca->name, port, PTR_ERR(ndev)); return ndev; } priv = ipoib_priv(ndev); INIT_IB_EVENT_HANDLER(&priv->event_handler, priv->ca, ipoib_event); ib_register_event_handler(&priv->event_handler); /* call event handler to ensure pkey in sync */ queue_work(ipoib_workqueue, &priv->flush_heavy); ndev->rtnl_link_ops = ipoib_get_link_ops(); result = register_netdev(ndev); if (result) { pr_warn("%s: couldn't register ipoib port %d; error %d\n", hca->name, port, result); ipoib_parent_unregister_pre(ndev); ipoib_intf_free(ndev); free_netdev(ndev); return ERR_PTR(result); } if (hca->ops.rdma_netdev_get_params) { int rc = hca->ops.rdma_netdev_get_params(hca, port, RDMA_NETDEV_IPOIB, &params); if (!rc && ops->priv_size < params.sizeof_priv) ops->priv_size = params.sizeof_priv; } /* * We cannot set priv_destructor before register_netdev because we * need priv to be always valid during the error flow to execute * ipoib_parent_unregister_pre(). Instead handle it manually and only * enter priv_destructor mode once we are completely registered. */ ndev->priv_destructor = ipoib_intf_free; if (ipoib_intercept_dev_id_attr(ndev)) goto sysfs_failed; if (ipoib_cm_add_mode_attr(ndev)) goto sysfs_failed; if (ipoib_add_pkey_attr(ndev)) goto sysfs_failed; if (ipoib_add_umcast_attr(ndev)) goto sysfs_failed; if (device_create_file(&ndev->dev, &dev_attr_create_child)) goto sysfs_failed; if (device_create_file(&ndev->dev, &dev_attr_delete_child)) goto sysfs_failed; return ndev; sysfs_failed: ipoib_parent_unregister_pre(ndev); unregister_netdev(ndev); return ERR_PTR(-ENOMEM); } static int ipoib_add_one(struct ib_device *device) { struct list_head *dev_list; struct net_device *dev; struct ipoib_dev_priv *priv; unsigned int p; int count = 0; dev_list = kmalloc(sizeof(*dev_list), GFP_KERNEL); if (!dev_list) return -ENOMEM; INIT_LIST_HEAD(dev_list); rdma_for_each_port (device, p) { if (!rdma_protocol_ib(device, p)) continue; dev = ipoib_add_port("ib%d", device, p); if (!IS_ERR(dev)) { priv = ipoib_priv(dev); list_add_tail(&priv->list, dev_list); count++; } } if (!count) { kfree(dev_list); return -EOPNOTSUPP; } ib_set_client_data(device, &ipoib_client, dev_list); return 0; } static void ipoib_remove_one(struct ib_device *device, void *client_data) { struct ipoib_dev_priv *priv, *tmp, *cpriv, *tcpriv; struct list_head *dev_list = client_data; list_for_each_entry_safe(priv, tmp, dev_list, list) { LIST_HEAD(head); ipoib_parent_unregister_pre(priv->dev); rtnl_lock(); list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) unregister_netdevice_queue(cpriv->dev, &head); unregister_netdevice_queue(priv->dev, &head); unregister_netdevice_many(&head); rtnl_unlock(); } kfree(dev_list); } #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG static struct notifier_block ipoib_netdev_notifier = { .notifier_call = ipoib_netdev_event, }; #endif static int __init ipoib_init_module(void) { int ret; ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size); ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE); ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE); ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size); ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE); ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE); #ifdef CONFIG_INFINIBAND_IPOIB_CM ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); ipoib_max_conn_qp = max(ipoib_max_conn_qp, 0); #endif /* * When copying small received packets, we only copy from the * linear data part of the SKB, so we rely on this condition. */ BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE); ipoib_register_debugfs(); /* * We create a global workqueue here that is used for all flush * operations. However, if you attempt to flush a workqueue * from a task on that same workqueue, it deadlocks the system. * We want to be able to flush the tasks associated with a * specific net device, so we also create a workqueue for each * netdevice. We queue up the tasks for that device only on * its private workqueue, and we only queue up flush events * on our global flush workqueue. This avoids the deadlocks. */ ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush", 0); if (!ipoib_workqueue) { ret = -ENOMEM; goto err_fs; } ib_sa_register_client(&ipoib_sa_client); ret = ib_register_client(&ipoib_client); if (ret) goto err_sa; ret = ipoib_netlink_init(); if (ret) goto err_client; #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG register_netdevice_notifier(&ipoib_netdev_notifier); #endif return 0; err_client: ib_unregister_client(&ipoib_client); err_sa: ib_sa_unregister_client(&ipoib_sa_client); destroy_workqueue(ipoib_workqueue); err_fs: ipoib_unregister_debugfs(); return ret; } static void __exit ipoib_cleanup_module(void) { #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG unregister_netdevice_notifier(&ipoib_netdev_notifier); #endif ipoib_netlink_fini(); ib_unregister_client(&ipoib_client); ib_sa_unregister_client(&ipoib_sa_client); ipoib_unregister_debugfs(); destroy_workqueue(ipoib_workqueue); } module_init(ipoib_init_module); module_exit(ipoib_cleanup_module);
9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 10 10 1 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ /* Fredy Neeser */ /* Greg Joyce <greg@opengridcomputing.com> */ /* Copyright (c) 2008-2019, IBM Corporation */ /* Copyright (c) 2017, Open Grid Computing, Inc. */ #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> #include <linux/inetdevice.h> #include <net/addrconf.h> #include <linux/workqueue.h> #include <net/sock.h> #include <net/tcp.h> #include <linux/inet.h> #include <linux/tcp.h> #include <trace/events/sock.h> #include <rdma/iw_cm.h> #include <rdma/ib_verbs.h> #include <rdma/ib_user_verbs.h> #include "siw.h" #include "siw_cm.h" /* * Set to any combination of * MPA_V2_RDMA_NO_RTR, MPA_V2_RDMA_READ_RTR, MPA_V2_RDMA_WRITE_RTR */ static __be16 rtr_type = MPA_V2_RDMA_READ_RTR | MPA_V2_RDMA_WRITE_RTR; static const bool relaxed_ird_negotiation = true; static void siw_cm_llp_state_change(struct sock *s); static void siw_cm_llp_data_ready(struct sock *s); static void siw_cm_llp_write_space(struct sock *s); static void siw_cm_llp_error_report(struct sock *s); static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, int status); static void siw_sk_assign_cm_upcalls(struct sock *sk) { struct siw_cep *cep = sk_to_cep(sk); write_lock_bh(&sk->sk_callback_lock); cep->sk_state_change = sk->sk_state_change; cep->sk_data_ready = sk->sk_data_ready; cep->sk_write_space = sk->sk_write_space; cep->sk_error_report = sk->sk_error_report; sk->sk_state_change = siw_cm_llp_state_change; sk->sk_data_ready = siw_cm_llp_data_ready; sk->sk_write_space = siw_cm_llp_write_space; sk->sk_error_report = siw_cm_llp_error_report; write_unlock_bh(&sk->sk_callback_lock); } static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep) { sk->sk_state_change = cep->sk_state_change; sk->sk_data_ready = cep->sk_data_ready; sk->sk_write_space = cep->sk_write_space; sk->sk_error_report = cep->sk_error_report; sk->sk_user_data = NULL; } static void siw_qp_socket_assoc(struct siw_cep *cep, struct siw_qp *qp) { struct socket *s = cep->sock; struct sock *sk = s->sk; write_lock_bh(&sk->sk_callback_lock); qp->attrs.sk = s; sk->sk_data_ready = siw_qp_llp_data_ready; sk->sk_write_space = siw_qp_llp_write_space; write_unlock_bh(&sk->sk_callback_lock); } static void siw_socket_disassoc(struct socket *s) { struct sock *sk = s->sk; struct siw_cep *cep; if (sk) { write_lock_bh(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (cep) { siw_sk_restore_upcalls(sk, cep); siw_cep_put(cep); } else { pr_warn("siw: cannot restore sk callbacks: no ep\n"); } write_unlock_bh(&sk->sk_callback_lock); } else { pr_warn("siw: cannot restore sk callbacks: no sk\n"); } } static void siw_rtr_data_ready(struct sock *sk) { struct siw_cep *cep; struct siw_qp *qp = NULL; read_descriptor_t rd_desc; trace_sk_data_ready(sk); read_lock(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (!cep) { WARN(1, "No connection endpoint\n"); goto out; } qp = sk_to_qp(sk); memset(&rd_desc, 0, sizeof(rd_desc)); rd_desc.arg.data = qp; rd_desc.count = 1; tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data); /* * Check if first frame was successfully processed. * Signal connection full establishment if yes. * Failed data processing would have already scheduled * connection drop. */ if (!qp->rx_stream.rx_suspend) siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); out: read_unlock(&sk->sk_callback_lock); if (qp) siw_qp_socket_assoc(cep, qp); } static void siw_sk_assign_rtr_upcalls(struct siw_cep *cep) { struct sock *sk = cep->sock->sk; write_lock_bh(&sk->sk_callback_lock); sk->sk_data_ready = siw_rtr_data_ready; sk->sk_write_space = siw_qp_llp_write_space; write_unlock_bh(&sk->sk_callback_lock); } static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s) { cep->sock = s; siw_cep_get(cep); s->sk->sk_user_data = cep; siw_sk_assign_cm_upcalls(s->sk); } static struct siw_cep *siw_cep_alloc(struct siw_device *sdev) { struct siw_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL); unsigned long flags; if (!cep) return NULL; INIT_LIST_HEAD(&cep->listenq); INIT_LIST_HEAD(&cep->devq); INIT_LIST_HEAD(&cep->work_freelist); kref_init(&cep->ref); cep->state = SIW_EPSTATE_IDLE; init_waitqueue_head(&cep->waitq); spin_lock_init(&cep->lock); cep->sdev = sdev; cep->enhanced_rdma_conn_est = false; spin_lock_irqsave(&sdev->lock, flags); list_add_tail(&cep->devq, &sdev->cep_list); spin_unlock_irqrestore(&sdev->lock, flags); siw_dbg_cep(cep, "new endpoint\n"); return cep; } static void siw_cm_free_work(struct siw_cep *cep) { struct list_head *w, *tmp; struct siw_cm_work *work; list_for_each_safe(w, tmp, &cep->work_freelist) { work = list_entry(w, struct siw_cm_work, list); list_del(&work->list); kfree(work); } } static void siw_cancel_mpatimer(struct siw_cep *cep) { spin_lock_bh(&cep->lock); if (cep->mpa_timer) { if (cancel_delayed_work(&cep->mpa_timer->work)) { siw_cep_put(cep); kfree(cep->mpa_timer); /* not needed again */ } cep->mpa_timer = NULL; } spin_unlock_bh(&cep->lock); } static void siw_put_work(struct siw_cm_work *work) { INIT_LIST_HEAD(&work->list); spin_lock_bh(&work->cep->lock); list_add(&work->list, &work->cep->work_freelist); spin_unlock_bh(&work->cep->lock); } static void siw_cep_set_inuse(struct siw_cep *cep) { unsigned long flags; retry: spin_lock_irqsave(&cep->lock, flags); if (cep->in_use) { spin_unlock_irqrestore(&cep->lock, flags); wait_event_interruptible(cep->waitq, !cep->in_use); if (signal_pending(current)) flush_signals(current); goto retry; } else { cep->in_use = 1; spin_unlock_irqrestore(&cep->lock, flags); } } static void siw_cep_set_free(struct siw_cep *cep) { unsigned long flags; spin_lock_irqsave(&cep->lock, flags); cep->in_use = 0; spin_unlock_irqrestore(&cep->lock, flags); wake_up(&cep->waitq); } static void __siw_cep_dealloc(struct kref *ref) { struct siw_cep *cep = container_of(ref, struct siw_cep, ref); struct siw_device *sdev = cep->sdev; unsigned long flags; WARN_ON(cep->listen_cep); /* kfree(NULL) is safe */ kfree(cep->mpa.pdata); spin_lock_bh(&cep->lock); if (!list_empty(&cep->work_freelist)) siw_cm_free_work(cep); spin_unlock_bh(&cep->lock); spin_lock_irqsave(&sdev->lock, flags); list_del(&cep->devq); spin_unlock_irqrestore(&sdev->lock, flags); siw_dbg_cep(cep, "free endpoint\n"); kfree(cep); } static struct siw_cm_work *siw_get_work(struct siw_cep *cep) { struct siw_cm_work *work = NULL; spin_lock_bh(&cep->lock); if (!list_empty(&cep->work_freelist)) { work = list_entry(cep->work_freelist.next, struct siw_cm_work, list); list_del_init(&work->list); } spin_unlock_bh(&cep->lock); return work; } static int siw_cm_alloc_work(struct siw_cep *cep, int num) { struct siw_cm_work *work; while (num--) { work = kmalloc(sizeof(*work), GFP_KERNEL); if (!work) { if (!(list_empty(&cep->work_freelist))) siw_cm_free_work(cep); return -ENOMEM; } work->cep = cep; INIT_LIST_HEAD(&work->list); list_add(&work->list, &cep->work_freelist); } return 0; } /* * siw_cm_upcall() * * Upcall to IWCM to inform about async connection events */ static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason, int status) { struct iw_cm_event event; struct iw_cm_id *id; memset(&event, 0, sizeof(event)); event.status = status; event.event = reason; if (reason == IW_CM_EVENT_CONNECT_REQUEST) { event.provider_data = cep; id = cep->listen_cep->cm_id; } else { id = cep->cm_id; } /* Signal IRD and ORD */ if (reason == IW_CM_EVENT_ESTABLISHED || reason == IW_CM_EVENT_CONNECT_REPLY) { /* Signal negotiated IRD/ORD values we will use */ event.ird = cep->ird; event.ord = cep->ord; } else if (reason == IW_CM_EVENT_CONNECT_REQUEST) { event.ird = cep->ord; event.ord = cep->ird; } /* Signal private data and address information */ if (reason == IW_CM_EVENT_CONNECT_REQUEST || reason == IW_CM_EVENT_CONNECT_REPLY) { u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len); if (pd_len) { /* * hand over MPA private data */ event.private_data_len = pd_len; event.private_data = cep->mpa.pdata; /* Hide MPA V2 IRD/ORD control */ if (cep->enhanced_rdma_conn_est) { event.private_data_len -= sizeof(struct mpa_v2_data); event.private_data += sizeof(struct mpa_v2_data); } } getname_local(cep->sock, &event.local_addr); getname_peer(cep->sock, &event.remote_addr); } siw_dbg_cep(cep, "[QP %u]: reason=%d, status=%d\n", cep->qp ? qp_id(cep->qp) : UINT_MAX, reason, status); return id->event_handler(id, &event); } static void siw_free_cm_id(struct siw_cep *cep) { if (!cep->cm_id) return; cep->cm_id->rem_ref(cep->cm_id); cep->cm_id = NULL; } static void siw_destroy_cep_sock(struct siw_cep *cep) { if (cep->sock) { siw_socket_disassoc(cep->sock); sock_release(cep->sock); cep->sock = NULL; } } /* * siw_qp_cm_drop() * * Drops established LLP connection if present and not already * scheduled for dropping. Called from user context, SQ workqueue * or receive IRQ. Caller signals if socket can be immediately * closed (basically, if not in IRQ). */ void siw_qp_cm_drop(struct siw_qp *qp, int schedule) { struct siw_cep *cep = qp->cep; qp->rx_stream.rx_suspend = 1; qp->tx_ctx.tx_suspend = 1; if (!qp->cep) return; if (schedule) { siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP); } else { siw_cep_set_inuse(cep); if (cep->state == SIW_EPSTATE_CLOSED) { siw_dbg_cep(cep, "already closed\n"); goto out; } siw_dbg_cep(cep, "immediate close, state %d\n", cep->state); siw_send_terminate(qp); if (cep->cm_id) { switch (cep->state) { case SIW_EPSTATE_AWAIT_MPAREP: siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); break; case SIW_EPSTATE_RDMA_MODE: siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); break; case SIW_EPSTATE_IDLE: case SIW_EPSTATE_LISTENING: case SIW_EPSTATE_CONNECTING: case SIW_EPSTATE_AWAIT_MPAREQ: case SIW_EPSTATE_RECVD_MPAREQ: case SIW_EPSTATE_CLOSED: default: break; } siw_free_cm_id(cep); siw_cep_put(cep); } cep->state = SIW_EPSTATE_CLOSED; siw_destroy_cep_sock(cep); if (cep->qp) { cep->qp = NULL; siw_qp_put(qp); } out: siw_cep_set_free(cep); } } void siw_cep_put(struct siw_cep *cep) { WARN_ON(kref_read(&cep->ref) < 1); kref_put(&cep->ref, __siw_cep_dealloc); } static void siw_cep_set_free_and_put(struct siw_cep *cep) { siw_cep_set_free(cep); siw_cep_put(cep); } void siw_cep_get(struct siw_cep *cep) { kref_get(&cep->ref); } /* * Expects params->pd_len in host byte order */ static int siw_send_mpareqrep(struct siw_cep *cep, const void *pdata, u8 pd_len) { struct socket *s = cep->sock; struct mpa_rr *rr = &cep->mpa.hdr; struct kvec iov[3]; struct msghdr msg; int rv; int iovec_num = 0; int mpa_len; memset(&msg, 0, sizeof(msg)); iov[iovec_num].iov_base = rr; iov[iovec_num].iov_len = sizeof(*rr); mpa_len = sizeof(*rr); if (cep->enhanced_rdma_conn_est) { iovec_num++; iov[iovec_num].iov_base = &cep->mpa.v2_ctrl; iov[iovec_num].iov_len = sizeof(cep->mpa.v2_ctrl); mpa_len += sizeof(cep->mpa.v2_ctrl); } if (pd_len) { iovec_num++; iov[iovec_num].iov_base = (char *)pdata; iov[iovec_num].iov_len = pd_len; mpa_len += pd_len; } if (cep->enhanced_rdma_conn_est) pd_len += sizeof(cep->mpa.v2_ctrl); rr->params.pd_len = cpu_to_be16(pd_len); rv = kernel_sendmsg(s, &msg, iov, iovec_num + 1, mpa_len); return rv < 0 ? rv : 0; } /* * Receive MPA Request/Reply header. * * Returns 0 if complete MPA Request/Reply header including * eventual private data was received. Returns -EAGAIN if * header was partially received or negative error code otherwise. * * Context: May be called in process context only */ static int siw_recv_mpa_rr(struct siw_cep *cep) { struct mpa_rr *hdr = &cep->mpa.hdr; struct socket *s = cep->sock; u16 pd_len; int rcvd, to_rcv; if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) { rcvd = ksock_recv(s, (char *)hdr + cep->mpa.bytes_rcvd, sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd, 0); if (rcvd <= 0) return -ECONNABORTED; cep->mpa.bytes_rcvd += rcvd; if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) return -EAGAIN; if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA) return -EPROTO; } pd_len = be16_to_cpu(hdr->params.pd_len); /* * At least the MPA Request/Reply header (frame not including * private data) has been received. * Receive (or continue receiving) any private data. */ to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr)); if (!to_rcv) { /* * We must have hdr->params.pd_len == 0 and thus received a * complete MPA Request/Reply frame. * Check against peer protocol violation. */ u32 word; rcvd = ksock_recv(s, (char *)&word, sizeof(word), MSG_DONTWAIT); if (rcvd == -EAGAIN) return 0; if (rcvd == 0) { siw_dbg_cep(cep, "peer EOF\n"); return -EPIPE; } if (rcvd < 0) { siw_dbg_cep(cep, "error: %d\n", rcvd); return rcvd; } siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd); return -EPROTO; } /* * At this point, we must have hdr->params.pd_len != 0. * A private data buffer gets allocated if hdr->params.pd_len != 0. */ if (!cep->mpa.pdata) { cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL); if (!cep->mpa.pdata) return -ENOMEM; } rcvd = ksock_recv( s, cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr), to_rcv + 4, MSG_DONTWAIT); if (rcvd < 0) return rcvd; if (rcvd > to_rcv) return -EPROTO; cep->mpa.bytes_rcvd += rcvd; if (to_rcv == rcvd) { siw_dbg_cep(cep, "%d bytes private data received\n", pd_len); return 0; } return -EAGAIN; } /* * siw_proc_mpareq() * * Read MPA Request from socket and signal new connection to IWCM * if success. Caller must hold lock on corresponding listening CEP. */ static int siw_proc_mpareq(struct siw_cep *cep) { struct mpa_rr *req; int version, rv; u16 pd_len; rv = siw_recv_mpa_rr(cep); if (rv) return rv; req = &cep->mpa.hdr; version = __mpa_rr_revision(req->params.bits); pd_len = be16_to_cpu(req->params.pd_len); if (version > MPA_REVISION_2) /* allow for 0, 1, and 2 only */ return -EPROTO; if (memcmp(req->key, MPA_KEY_REQ, 16)) return -EPROTO; /* Prepare for sending MPA reply */ memcpy(req->key, MPA_KEY_REP, 16); if (version == MPA_REVISION_2 && (req->params.bits & MPA_RR_FLAG_ENHANCED)) { /* * MPA version 2 must signal IRD/ORD values and P2P mode * in private data if header flag MPA_RR_FLAG_ENHANCED * is set. */ if (pd_len < sizeof(struct mpa_v2_data)) goto reject_conn; cep->enhanced_rdma_conn_est = true; } /* MPA Markers: currently not supported. Marker TX to be added. */ if (req->params.bits & MPA_RR_FLAG_MARKERS) goto reject_conn; if (req->params.bits & MPA_RR_FLAG_CRC) { /* * RFC 5044, page 27: CRC MUST be used if peer requests it. * siw specific: 'mpa_crc_strict' parameter to reject * connection with CRC if local CRC off enforced by * 'mpa_crc_strict' module parameter. */ if (!mpa_crc_required && mpa_crc_strict) goto reject_conn; /* Enable CRC if requested by module parameter */ if (mpa_crc_required) req->params.bits |= MPA_RR_FLAG_CRC; } if (cep->enhanced_rdma_conn_est) { struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata; /* * Peer requested ORD becomes requested local IRD, * peer requested IRD becomes requested local ORD. * IRD and ORD get limited by global maximum values. */ cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK; cep->ord = min(cep->ord, SIW_MAX_ORD_QP); cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK; cep->ird = min(cep->ird, SIW_MAX_IRD_QP); /* May get overwritten by locally negotiated values */ cep->mpa.v2_ctrl.ird = htons(cep->ird); cep->mpa.v2_ctrl.ord = htons(cep->ord); /* * Support for peer sent zero length Write or Read to * let local side enter RTS. Writes are preferred. * Sends would require pre-posting a Receive and are * not supported. * Propose zero length Write if none of Read and Write * is indicated. */ if (v2->ird & MPA_V2_PEER_TO_PEER) { cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER; if (v2->ord & MPA_V2_RDMA_WRITE_RTR) cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR; else if (v2->ord & MPA_V2_RDMA_READ_RTR) cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_READ_RTR; else cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR; } } cep->state = SIW_EPSTATE_RECVD_MPAREQ; /* Keep reference until IWCM accepts/rejects */ siw_cep_get(cep); rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0); if (rv) siw_cep_put(cep); return rv; reject_conn: siw_dbg_cep(cep, "reject: crc %d:%d:%d, m %d:%d\n", req->params.bits & MPA_RR_FLAG_CRC ? 1 : 0, mpa_crc_required, mpa_crc_strict, req->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0); req->params.bits &= ~MPA_RR_FLAG_MARKERS; req->params.bits |= MPA_RR_FLAG_REJECT; if (!mpa_crc_required && mpa_crc_strict) req->params.bits &= ~MPA_RR_FLAG_CRC; if (pd_len) kfree(cep->mpa.pdata); cep->mpa.pdata = NULL; siw_send_mpareqrep(cep, NULL, 0); return -EOPNOTSUPP; } static int siw_proc_mpareply(struct siw_cep *cep) { struct siw_qp_attrs qp_attrs; enum siw_qp_attr_mask qp_attr_mask; struct siw_qp *qp = cep->qp; struct mpa_rr *rep; int rv; u16 rep_ord; u16 rep_ird; bool ird_insufficient = false; enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR; rv = siw_recv_mpa_rr(cep); if (rv) goto out_err; siw_cancel_mpatimer(cep); rep = &cep->mpa.hdr; if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) { /* allow for 0, 1, and 2 only */ rv = -EPROTO; goto out_err; } if (memcmp(rep->key, MPA_KEY_REP, 16)) { siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, LLP_ECODE_INVALID_REQ_RESP, 0); siw_send_terminate(qp); rv = -EPROTO; goto out_err; } if (rep->params.bits & MPA_RR_FLAG_REJECT) { siw_dbg_cep(cep, "got mpa reject\n"); siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); return -ECONNRESET; } if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) { siw_dbg_cep(cep, "peer allows GSO on TX\n"); qp->tx_ctx.gso_seg_limit = 0; } if ((rep->params.bits & MPA_RR_FLAG_MARKERS) || (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) || (mpa_crc_strict && !mpa_crc_required && (rep->params.bits & MPA_RR_FLAG_CRC))) { siw_dbg_cep(cep, "reply unsupp: crc %d:%d:%d, m %d:%d\n", rep->params.bits & MPA_RR_FLAG_CRC ? 1 : 0, mpa_crc_required, mpa_crc_strict, rep->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0); siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED); return -EINVAL; } if (cep->enhanced_rdma_conn_est) { struct mpa_v2_data *v2; if (__mpa_rr_revision(rep->params.bits) < MPA_REVISION_2 || !(rep->params.bits & MPA_RR_FLAG_ENHANCED)) { /* * Protocol failure: The responder MUST reply with * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED. */ siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n", __mpa_rr_revision(rep->params.bits), rep->params.bits & MPA_RR_FLAG_ENHANCED ? 1 : 0); siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); return -EINVAL; } v2 = (struct mpa_v2_data *)cep->mpa.pdata; rep_ird = ntohs(v2->ird) & MPA_IRD_ORD_MASK; rep_ord = ntohs(v2->ord) & MPA_IRD_ORD_MASK; if (cep->ird < rep_ord && (relaxed_ird_negotiation == false || rep_ord > cep->sdev->attrs.max_ird)) { siw_dbg_cep(cep, "ird %d, rep_ord %d, max_ord %d\n", cep->ird, rep_ord, cep->sdev->attrs.max_ord); ird_insufficient = true; } if (cep->ord > rep_ird && relaxed_ird_negotiation == false) { siw_dbg_cep(cep, "ord %d, rep_ird %d\n", cep->ord, rep_ird); ird_insufficient = true; } /* * Always report negotiated peer values to user, * even if IRD/ORD negotiation failed */ cep->ird = rep_ord; cep->ord = rep_ird; if (ird_insufficient) { /* * If the initiator IRD is insuffient for the * responder ORD, send a TERM. */ siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, LLP_ECODE_INSUFFICIENT_IRD, 0); siw_send_terminate(qp); rv = -ENOMEM; goto out_err; } if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER) mpa_p2p_mode = cep->mpa.v2_ctrl_req.ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR); /* * Check if we requested P2P mode, and if peer agrees */ if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) { if ((mpa_p2p_mode & v2->ord) == 0) { /* * We requested RTR mode(s), but the peer * did not pick any mode we support. */ siw_dbg_cep(cep, "rtr mode: req %2x, got %2x\n", mpa_p2p_mode, v2->ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR)); siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA, LLP_ECODE_NO_MATCHING_RTR, 0); siw_send_terminate(qp); rv = -EPROTO; goto out_err; } mpa_p2p_mode = v2->ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR); } } memset(&qp_attrs, 0, sizeof(qp_attrs)); if (rep->params.bits & MPA_RR_FLAG_CRC) qp_attrs.flags = SIW_MPA_CRC; qp_attrs.irq_size = cep->ird; qp_attrs.orq_size = cep->ord; qp_attrs.sk = cep->sock; qp_attrs.state = SIW_QP_STATE_RTS; qp_attr_mask = SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE | SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA; /* Move socket RX/TX under QP control */ down_write(&qp->state_lock); if (qp->attrs.state > SIW_QP_STATE_RTR) { rv = -EINVAL; up_write(&qp->state_lock); goto out_err; } rv = siw_qp_modify(qp, &qp_attrs, qp_attr_mask); siw_qp_socket_assoc(cep, qp); up_write(&qp->state_lock); /* Send extra RDMA frame to trigger peer RTS if negotiated */ if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) { rv = siw_qp_mpa_rts(qp, mpa_p2p_mode); if (rv) goto out_err; } if (!rv) { rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0); if (!rv) cep->state = SIW_EPSTATE_RDMA_MODE; return 0; } out_err: if (rv != -EAGAIN) siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL); return rv; } /* * siw_accept_newconn - accept an incoming pending connection * */ static void siw_accept_newconn(struct siw_cep *cep) { struct socket *s = cep->sock; struct socket *new_s = NULL; struct siw_cep *new_cep = NULL; int rv = 0; /* debug only. should disappear */ if (cep->state != SIW_EPSTATE_LISTENING) goto error; new_cep = siw_cep_alloc(cep->sdev); if (!new_cep) goto error; /* * 4: Allocate a sufficient number of work elements * to allow concurrent handling of local + peer close * events, MPA header processing + MPA timeout. */ if (siw_cm_alloc_work(new_cep, 4) != 0) goto error; /* * Copy saved socket callbacks from listening CEP * and assign new socket with new CEP */ new_cep->sk_state_change = cep->sk_state_change; new_cep->sk_data_ready = cep->sk_data_ready; new_cep->sk_write_space = cep->sk_write_space; new_cep->sk_error_report = cep->sk_error_report; rv = kernel_accept(s, &new_s, O_NONBLOCK); if (rv != 0) { /* * Connection already aborted by peer..? */ siw_dbg_cep(cep, "kernel_accept() error: %d\n", rv); goto error; } new_cep->sock = new_s; siw_cep_get(new_cep); new_s->sk->sk_user_data = new_cep; if (siw_tcp_nagle == false) tcp_sock_set_nodelay(new_s->sk); new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ; rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT); if (rv) goto error; /* * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep. */ new_cep->listen_cep = cep; siw_cep_get(cep); if (atomic_read(&new_s->sk->sk_rmem_alloc)) { /* * MPA REQ already queued */ siw_dbg_cep(cep, "immediate mpa request\n"); siw_cep_set_inuse(new_cep); rv = siw_proc_mpareq(new_cep); if (rv != -EAGAIN) { siw_cep_put(cep); new_cep->listen_cep = NULL; if (rv) { siw_cancel_mpatimer(new_cep); siw_cep_set_free(new_cep); goto error; } } siw_cep_set_free(new_cep); } return; error: if (new_cep) siw_cep_put(new_cep); if (new_s) { siw_socket_disassoc(new_s); sock_release(new_s); new_cep->sock = NULL; } siw_dbg_cep(cep, "error %d\n", rv); } static void siw_cm_work_handler(struct work_struct *w) { struct siw_cm_work *work; struct siw_cep *cep; int release_cep = 0, rv = 0; work = container_of(w, struct siw_cm_work, work.work); cep = work->cep; siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n", cep->qp ? qp_id(cep->qp) : UINT_MAX, work->type, cep->state); siw_cep_set_inuse(cep); switch (work->type) { case SIW_CM_WORK_ACCEPT: siw_accept_newconn(cep); break; case SIW_CM_WORK_READ_MPAHDR: if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { if (cep->listen_cep) { siw_cep_set_inuse(cep->listen_cep); if (cep->listen_cep->state == SIW_EPSTATE_LISTENING) rv = siw_proc_mpareq(cep); else rv = -EFAULT; siw_cep_set_free(cep->listen_cep); if (rv != -EAGAIN) { siw_cep_put(cep->listen_cep); cep->listen_cep = NULL; if (rv) siw_cep_put(cep); } } } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { rv = siw_proc_mpareply(cep); } else { /* * CEP already moved out of MPA handshake. * any connection management already done. * silently ignore the mpa packet. */ if (cep->state == SIW_EPSTATE_RDMA_MODE) { cep->sock->sk->sk_data_ready(cep->sock->sk); siw_dbg_cep(cep, "already in RDMA mode"); } else { siw_dbg_cep(cep, "out of state: %d\n", cep->state); } } if (rv && rv != -EAGAIN) release_cep = 1; break; case SIW_CM_WORK_CLOSE_LLP: /* * QP scheduled LLP close */ if (cep->qp) siw_send_terminate(cep->qp); if (cep->cm_id) siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); release_cep = 1; break; case SIW_CM_WORK_PEER_CLOSE: if (cep->cm_id) { if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { /* * MPA reply not received, but connection drop */ siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET); } else if (cep->state == SIW_EPSTATE_RDMA_MODE) { /* * NOTE: IW_CM_EVENT_DISCONNECT is given just * to transition IWCM into CLOSING. */ siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0); siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0); } /* * for other states there is no connection * known to the IWCM. */ } else { if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) { /* * Wait for the ulp/CM to call accept/reject */ siw_dbg_cep(cep, "mpa req recvd, wait for ULP\n"); } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { /* * Socket close before MPA request received. */ if (cep->listen_cep) { siw_dbg_cep(cep, "no mpareq: drop listener\n"); siw_cep_put(cep->listen_cep); cep->listen_cep = NULL; } } } release_cep = 1; break; case SIW_CM_WORK_MPATIMEOUT: cep->mpa_timer = NULL; if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) { /* * MPA request timed out: * Hide any partially received private data and signal * timeout */ cep->mpa.hdr.params.pd_len = 0; if (cep->cm_id) siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ETIMEDOUT); release_cep = 1; } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) { /* * No MPA request received after peer TCP stream setup. */ if (cep->listen_cep) { siw_cep_put(cep->listen_cep); cep->listen_cep = NULL; } release_cep = 1; } break; default: WARN(1, "Undefined CM work type: %d\n", work->type); } if (release_cep) { siw_dbg_cep(cep, "release: timer=%s, QP[%u]\n", cep->mpa_timer ? "y" : "n", cep->qp ? qp_id(cep->qp) : UINT_MAX); siw_cancel_mpatimer(cep); cep->state = SIW_EPSTATE_CLOSED; if (cep->qp) { struct siw_qp *qp = cep->qp; /* * Serialize a potential race with application * closing the QP and calling siw_qp_cm_drop() */ siw_qp_get(qp); siw_cep_set_free(cep); siw_qp_llp_close(qp); siw_qp_put(qp); siw_cep_set_inuse(cep); cep->qp = NULL; siw_qp_put(qp); } if (cep->sock) { siw_socket_disassoc(cep->sock); sock_release(cep->sock); cep->sock = NULL; } if (cep->cm_id) { siw_free_cm_id(cep); siw_cep_put(cep); } } siw_cep_set_free(cep); siw_put_work(work); siw_cep_put(cep); } static struct workqueue_struct *siw_cm_wq; int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type) { struct siw_cm_work *work = siw_get_work(cep); unsigned long delay = 0; if (!work) { siw_dbg_cep(cep, "failed with no work available\n"); return -ENOMEM; } work->type = type; work->cep = cep; siw_cep_get(cep); INIT_DELAYED_WORK(&work->work, siw_cm_work_handler); if (type == SIW_CM_WORK_MPATIMEOUT) { cep->mpa_timer = work; if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) delay = MPAREQ_TIMEOUT; else delay = MPAREP_TIMEOUT; } siw_dbg_cep(cep, "[QP %u]: work type: %d, timeout %lu\n", cep->qp ? qp_id(cep->qp) : -1, type, delay); queue_delayed_work(siw_cm_wq, &work->work, delay); return 0; } static void siw_cm_llp_data_ready(struct sock *sk) { struct siw_cep *cep; trace_sk_data_ready(sk); read_lock(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (!cep) goto out; siw_dbg_cep(cep, "cep state: %d, socket state %d\n", cep->state, sk->sk_state); if (sk->sk_state != TCP_ESTABLISHED) goto out; switch (cep->state) { case SIW_EPSTATE_RDMA_MODE: case SIW_EPSTATE_LISTENING: break; case SIW_EPSTATE_AWAIT_MPAREQ: case SIW_EPSTATE_AWAIT_MPAREP: siw_cm_queue_work(cep, SIW_CM_WORK_READ_MPAHDR); break; default: siw_dbg_cep(cep, "unexpected data, state %d\n", cep->state); break; } out: read_unlock(&sk->sk_callback_lock); } static void siw_cm_llp_write_space(struct sock *sk) { struct siw_cep *cep = sk_to_cep(sk); if (cep) siw_dbg_cep(cep, "state: %d\n", cep->state); } static void siw_cm_llp_error_report(struct sock *sk) { struct siw_cep *cep = sk_to_cep(sk); if (cep) { siw_dbg_cep(cep, "error %d, socket state: %d, cep state: %d\n", sk->sk_err, sk->sk_state, cep->state); cep->sk_error_report(sk); } } static void siw_cm_llp_state_change(struct sock *sk) { struct siw_cep *cep; void (*orig_state_change)(struct sock *s); read_lock(&sk->sk_callback_lock); cep = sk_to_cep(sk); if (!cep) { /* endpoint already disassociated */ read_unlock(&sk->sk_callback_lock); return; } orig_state_change = cep->sk_state_change; siw_dbg_cep(cep, "state: %d\n", cep->state); switch (sk->sk_state) { case TCP_ESTABLISHED: /* * handle accepting socket as special case where only * new connection is possible */ siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT); break; case TCP_CLOSE: case TCP_CLOSE_WAIT: if (cep->qp) cep->qp->tx_ctx.tx_suspend = 1; siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE); break; default: siw_dbg_cep(cep, "unexpected socket state %d\n", sk->sk_state); } read_unlock(&sk->sk_callback_lock); orig_state_change(sk); } static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, struct sockaddr *raddr, bool afonly) { int rv, flags = 0; size_t size = laddr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); /* * Make address available again asap. */ sock_set_reuseaddr(s->sk); if (afonly) { rv = ip6_sock_set_v6only(s->sk); if (rv) return rv; } rv = s->ops->bind(s, laddr, size); if (rv < 0) return rv; rv = s->ops->connect(s, raddr, size, flags); return rv < 0 ? rv : 0; } int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params) { struct siw_device *sdev = to_siw_dev(id->device); struct siw_qp *qp; struct siw_cep *cep = NULL; struct socket *s = NULL; struct sockaddr *laddr = (struct sockaddr *)&id->local_addr, *raddr = (struct sockaddr *)&id->remote_addr; bool p2p_mode = peer_to_peer, v4 = true; u16 pd_len = params->private_data_len; int version = mpa_version, rv; if (pd_len > MPA_MAX_PRIVDATA) return -EINVAL; if (params->ird > sdev->attrs.max_ird || params->ord > sdev->attrs.max_ord) return -ENOMEM; if (laddr->sa_family == AF_INET6) v4 = false; else if (laddr->sa_family != AF_INET) return -EAFNOSUPPORT; /* * Respect any iwarp port mapping: Use mapped remote address * if valid. Local address must not be mapped, since siw * uses kernel TCP stack. */ if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) || to_sockaddr_in6(id->remote_addr).sin6_port != 0) raddr = (struct sockaddr *)&id->m_remote_addr; qp = siw_qp_id2obj(sdev, params->qpn); if (!qp) { WARN(1, "[QP %u] does not exist\n", params->qpn); rv = -EINVAL; goto error; } siw_dbg_qp(qp, "pd_len %d, laddr %pISp, raddr %pISp\n", pd_len, laddr, raddr); rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s); if (rv < 0) goto error; /* * NOTE: For simplification, connect() is called in blocking * mode. Might be reconsidered for async connection setup at * TCP level. */ rv = kernel_bindconnect(s, laddr, raddr, id->afonly); if (rv != 0) { siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv); goto error; } if (siw_tcp_nagle == false) tcp_sock_set_nodelay(s->sk); cep = siw_cep_alloc(sdev); if (!cep) { rv = -ENOMEM; goto error; } siw_cep_set_inuse(cep); /* Associate QP with CEP */ siw_cep_get(cep); qp->cep = cep; /* siw_qp_get(qp) already done by QP lookup */ cep->qp = qp; id->add_ref(id); cep->cm_id = id; /* * 4: Allocate a sufficient number of work elements * to allow concurrent handling of local + peer close * events, MPA header processing + MPA timeout. */ rv = siw_cm_alloc_work(cep, 4); if (rv != 0) { rv = -ENOMEM; goto error; } cep->ird = params->ird; cep->ord = params->ord; if (p2p_mode && cep->ord == 0) cep->ord = 1; cep->state = SIW_EPSTATE_CONNECTING; /* * Associate CEP with socket */ siw_cep_socket_assoc(cep, s); cep->state = SIW_EPSTATE_AWAIT_MPAREP; /* * Set MPA Request bits: CRC if required, no MPA Markers, * MPA Rev. according to module parameter 'mpa_version', Key 'Request'. */ cep->mpa.hdr.params.bits = 0; if (version > MPA_REVISION_2) { pr_warn("Setting MPA version to %u\n", MPA_REVISION_2); version = MPA_REVISION_2; /* Adjust also module parameter */ mpa_version = MPA_REVISION_2; } __mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version); if (try_gso) cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP; if (mpa_crc_required) cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC; /* * If MPA version == 2: * o Include ORD and IRD. * o Indicate peer-to-peer mode, if required by module * parameter 'peer_to_peer'. */ if (version == MPA_REVISION_2) { cep->enhanced_rdma_conn_est = true; cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED; cep->mpa.v2_ctrl.ird = htons(cep->ird); cep->mpa.v2_ctrl.ord = htons(cep->ord); if (p2p_mode) { cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER; cep->mpa.v2_ctrl.ord |= rtr_type; } /* Remember own P2P mode requested */ cep->mpa.v2_ctrl_req.ird = cep->mpa.v2_ctrl.ird; cep->mpa.v2_ctrl_req.ord = cep->mpa.v2_ctrl.ord; } memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, 16); rv = siw_send_mpareqrep(cep, params->private_data, pd_len); /* * Reset private data. */ cep->mpa.hdr.params.pd_len = 0; if (rv >= 0) { rv = siw_cm_queue_work(cep, SIW_CM_WORK_MPATIMEOUT); if (!rv) { siw_dbg_cep(cep, "[QP %u]: exit\n", qp_id(qp)); siw_cep_set_free(cep); return 0; } } error: siw_dbg(id->device, "failed: %d\n", rv); if (cep) { siw_socket_disassoc(s); sock_release(s); cep->sock = NULL; cep->qp = NULL; cep->cm_id = NULL; id->rem_ref(id); qp->cep = NULL; siw_cep_put(cep); cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); } else if (s) { sock_release(s); } if (qp) siw_qp_put(qp); return rv; } /* * siw_accept - Let SoftiWARP accept an RDMA connection request * * @id: New connection management id to be used for accepted * connection request * @params: Connection parameters provided by ULP for accepting connection * * Transition QP to RTS state, associate new CM id @id with accepted CEP * and get prepared for TCP input by installing socket callbacks. * Then send MPA Reply and generate the "connection established" event. * Socket callbacks must be installed before sending MPA Reply, because * the latter may cause a first RDMA message to arrive from the RDMA Initiator * side very quickly, at which time the socket callbacks must be ready. */ int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params) { struct siw_device *sdev = to_siw_dev(id->device); struct siw_cep *cep = (struct siw_cep *)id->provider_data; struct siw_qp *qp; struct siw_qp_attrs qp_attrs; int rv = -EINVAL, max_priv_data = MPA_MAX_PRIVDATA; bool wait_for_peer_rts = false; siw_cep_set_inuse(cep); siw_cep_put(cep); /* Free lingering inbound private data */ if (cep->mpa.hdr.params.pd_len) { cep->mpa.hdr.params.pd_len = 0; kfree(cep->mpa.pdata); cep->mpa.pdata = NULL; } siw_cancel_mpatimer(cep); if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { siw_dbg_cep(cep, "out of state\n"); rv = -ECONNRESET; goto free_cep; } qp = siw_qp_id2obj(sdev, params->qpn); if (!qp) { WARN(1, "[QP %d] does not exist\n", params->qpn); goto free_cep; } down_write(&qp->state_lock); if (qp->attrs.state > SIW_QP_STATE_RTR) goto error_unlock; siw_dbg_cep(cep, "[QP %d]\n", params->qpn); if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) { siw_dbg_cep(cep, "peer allows GSO on TX\n"); qp->tx_ctx.gso_seg_limit = 0; } if (params->ord > sdev->attrs.max_ord || params->ird > sdev->attrs.max_ird) { siw_dbg_cep( cep, "[QP %u]: ord %d (max %d), ird %d (max %d)\n", qp_id(qp), params->ord, sdev->attrs.max_ord, params->ird, sdev->attrs.max_ird); goto error_unlock; } if (cep->enhanced_rdma_conn_est) max_priv_data -= sizeof(struct mpa_v2_data); if (params->private_data_len > max_priv_data) { siw_dbg_cep( cep, "[QP %u]: private data length: %d (max %d)\n", qp_id(qp), params->private_data_len, max_priv_data); goto error_unlock; } if (cep->enhanced_rdma_conn_est) { if (params->ord > cep->ord) { if (relaxed_ird_negotiation) { params->ord = cep->ord; } else { cep->ird = params->ird; cep->ord = params->ord; goto error_unlock; } } if (params->ird < cep->ird) { if (relaxed_ird_negotiation && cep->ird <= sdev->attrs.max_ird) params->ird = cep->ird; else { rv = -ENOMEM; goto error_unlock; } } if (cep->mpa.v2_ctrl.ord & (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR)) wait_for_peer_rts = true; /* * Signal back negotiated IRD and ORD values */ cep->mpa.v2_ctrl.ord = htons(params->ord & MPA_IRD_ORD_MASK) | (cep->mpa.v2_ctrl.ord & ~MPA_V2_MASK_IRD_ORD); cep->mpa.v2_ctrl.ird = htons(params->ird & MPA_IRD_ORD_MASK) | (cep->mpa.v2_ctrl.ird & ~MPA_V2_MASK_IRD_ORD); } cep->ird = params->ird; cep->ord = params->ord; cep->cm_id = id; id->add_ref(id); memset(&qp_attrs, 0, sizeof(qp_attrs)); qp_attrs.orq_size = cep->ord; qp_attrs.irq_size = cep->ird; qp_attrs.sk = cep->sock; if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC) qp_attrs.flags = SIW_MPA_CRC; qp_attrs.state = SIW_QP_STATE_RTS; siw_dbg_cep(cep, "[QP%u]: moving to rts\n", qp_id(qp)); /* Associate QP with CEP */ siw_cep_get(cep); qp->cep = cep; /* siw_qp_get(qp) already done by QP lookup */ cep->qp = qp; cep->state = SIW_EPSTATE_RDMA_MODE; /* Move socket RX/TX under QP control */ rv = siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE | SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA); up_write(&qp->state_lock); if (rv) goto error; siw_dbg_cep(cep, "[QP %u]: send mpa reply, %d byte pdata\n", qp_id(qp), params->private_data_len); rv = siw_send_mpareqrep(cep, params->private_data, params->private_data_len); if (rv != 0) goto error; if (wait_for_peer_rts) { siw_sk_assign_rtr_upcalls(cep); } else { siw_qp_socket_assoc(cep, qp); rv = siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0); if (rv) goto error; } siw_cep_set_free(cep); return 0; error_unlock: up_write(&qp->state_lock); error: siw_destroy_cep_sock(cep); cep->state = SIW_EPSTATE_CLOSED; siw_free_cm_id(cep); if (qp->cep) { siw_cep_put(cep); qp->cep = NULL; } cep->qp = NULL; siw_qp_put(qp); free_cep: siw_cep_set_free_and_put(cep); return rv; } /* * siw_reject() * * Local connection reject case. Send private data back to peer, * close connection and dereference connection id. */ int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len) { struct siw_cep *cep = (struct siw_cep *)id->provider_data; siw_cep_set_inuse(cep); siw_cep_put(cep); siw_cancel_mpatimer(cep); if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) { siw_dbg_cep(cep, "out of state\n"); siw_cep_set_free_and_put(cep); /* put last reference */ return -ECONNRESET; } siw_dbg_cep(cep, "cep->state %d, pd_len %d\n", cep->state, pd_len); if (__mpa_rr_revision(cep->mpa.hdr.params.bits) >= MPA_REVISION_1) { cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */ siw_send_mpareqrep(cep, pdata, pd_len); } siw_destroy_cep_sock(cep); cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); return 0; } /* * siw_create_listen - Create resources for a listener's IWCM ID @id * * Starts listen on the socket address id->local_addr. * */ int siw_create_listen(struct iw_cm_id *id, int backlog) { struct socket *s; struct siw_cep *cep = NULL; struct net_device *ndev = NULL; struct siw_device *sdev = to_siw_dev(id->device); int addr_family = id->local_addr.ss_family; int rv = 0; if (addr_family != AF_INET && addr_family != AF_INET6) return -EAFNOSUPPORT; rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s); if (rv < 0) return rv; /* * Allow binding local port when still in TIME_WAIT from last close. */ sock_set_reuseaddr(s->sk); if (addr_family == AF_INET) { struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr); /* For wildcard addr, limit binding to current device only */ if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) { ndev = ib_device_get_netdev(id->device, SIW_PORT); if (ndev) { s->sk->sk_bound_dev_if = ndev->ifindex; } else { rv = -ENODEV; goto error; } } rv = s->ops->bind(s, (struct sockaddr *)laddr, sizeof(struct sockaddr_in)); } else { struct sockaddr_in6 *laddr = &to_sockaddr_in6(id->local_addr); if (id->afonly) { rv = ip6_sock_set_v6only(s->sk); if (rv) { siw_dbg(id->device, "ip6_sock_set_v6only erro: %d\n", rv); goto error; } } /* For wildcard addr, limit binding to current device only */ if (ipv6_addr_any(&laddr->sin6_addr)) { ndev = ib_device_get_netdev(id->device, SIW_PORT); if (ndev) { s->sk->sk_bound_dev_if = ndev->ifindex; } else { rv = -ENODEV; goto error; } } rv = s->ops->bind(s, (struct sockaddr *)laddr, sizeof(struct sockaddr_in6)); } if (rv) { siw_dbg(id->device, "socket bind error: %d\n", rv); goto error; } cep = siw_cep_alloc(sdev); if (!cep) { rv = -ENOMEM; goto error; } siw_cep_socket_assoc(cep, s); rv = siw_cm_alloc_work(cep, backlog); if (rv) { siw_dbg(id->device, "alloc_work error %d, backlog %d\n", rv, backlog); goto error; } rv = s->ops->listen(s, backlog); if (rv) { siw_dbg(id->device, "listen error %d\n", rv); goto error; } cep->cm_id = id; id->add_ref(id); /* * In case of a wildcard rdma_listen on a multi-homed device, * a listener's IWCM id is associated with more than one listening CEP. * * We currently use id->provider_data in three different ways: * * o For a listener's IWCM id, id->provider_data points to * the list_head of the list of listening CEPs. * Uses: siw_create_listen(), siw_destroy_listen() * * o For each accepted passive-side IWCM id, id->provider_data * points to the CEP itself. This is a consequence of * - siw_cm_upcall() setting event.provider_data = cep and * - the IWCM's cm_conn_req_handler() setting provider_data of the * new passive-side IWCM id equal to event.provider_data * Uses: siw_accept(), siw_reject() * * o For an active-side IWCM id, id->provider_data is not used at all. * */ if (!id->provider_data) { id->provider_data = kmalloc(sizeof(struct list_head), GFP_KERNEL); if (!id->provider_data) { rv = -ENOMEM; goto error; } INIT_LIST_HEAD((struct list_head *)id->provider_data); } list_add_tail(&cep->listenq, (struct list_head *)id->provider_data); cep->state = SIW_EPSTATE_LISTENING; dev_put(ndev); siw_dbg(id->device, "Listen at laddr %pISp\n", &id->local_addr); return 0; error: siw_dbg(id->device, "failed: %d\n", rv); if (cep) { siw_cep_set_inuse(cep); siw_free_cm_id(cep); cep->sock = NULL; siw_socket_disassoc(s); cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); } sock_release(s); dev_put(ndev); return rv; } static void siw_drop_listeners(struct iw_cm_id *id) { struct list_head *p, *tmp; /* * In case of a wildcard rdma_listen on a multi-homed device, * a listener's IWCM id is associated with more than one listening CEP. */ list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) { struct siw_cep *cep = list_entry(p, struct siw_cep, listenq); list_del(p); siw_dbg_cep(cep, "drop cep, state %d\n", cep->state); siw_cep_set_inuse(cep); siw_free_cm_id(cep); if (cep->sock) { siw_socket_disassoc(cep->sock); sock_release(cep->sock); cep->sock = NULL; } cep->state = SIW_EPSTATE_CLOSED; siw_cep_set_free_and_put(cep); } } int siw_destroy_listen(struct iw_cm_id *id) { if (!id->provider_data) { siw_dbg(id->device, "no cep(s)\n"); return 0; } siw_drop_listeners(id); kfree(id->provider_data); id->provider_data = NULL; return 0; } int siw_cm_init(void) { /* * create_single_workqueue for strict ordering */ siw_cm_wq = create_singlethread_workqueue("siw_cm_wq"); if (!siw_cm_wq) return -ENOMEM; return 0; } void siw_cm_exit(void) { if (siw_cm_wq) destroy_workqueue(siw_cm_wq); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 /* SPDX-License-Identifier: GPL-2.0 */ /* * fscrypt_private.h * * Copyright (C) 2015, Google, Inc. * * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar. * Heavily modified since then. */ #ifndef _FSCRYPT_PRIVATE_H #define _FSCRYPT_PRIVATE_H #include <linux/fscrypt.h> #include <linux/siphash.h> #include <crypto/hash.h> #include <linux/blk-crypto.h> #define CONST_STRLEN(str) (sizeof(str) - 1) #define FSCRYPT_FILE_NONCE_SIZE 16 /* * Minimum size of an fscrypt master key. Note: a longer key will be required * if ciphers with a 256-bit security strength are used. This is just the * absolute minimum, which applies when only 128-bit encryption is used. */ #define FSCRYPT_MIN_KEY_SIZE 16 #define FSCRYPT_CONTEXT_V1 1 #define FSCRYPT_CONTEXT_V2 2 /* Keep this in sync with include/uapi/linux/fscrypt.h */ #define FSCRYPT_MODE_MAX FSCRYPT_MODE_AES_256_HCTR2 struct fscrypt_context_v1 { u8 version; /* FSCRYPT_CONTEXT_V1 */ u8 contents_encryption_mode; u8 filenames_encryption_mode; u8 flags; u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE]; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; struct fscrypt_context_v2 { u8 version; /* FSCRYPT_CONTEXT_V2 */ u8 contents_encryption_mode; u8 filenames_encryption_mode; u8 flags; u8 log2_data_unit_size; u8 __reserved[3]; u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; /* * fscrypt_context - the encryption context of an inode * * This is the on-disk equivalent of an fscrypt_policy, stored alongside each * encrypted file usually in a hidden extended attribute. It contains the * fields from the fscrypt_policy, in order to identify the encryption algorithm * and key with which the file is encrypted. It also contains a nonce that was * randomly generated by fscrypt itself; this is used as KDF input or as a tweak * to cause different files to be encrypted differently. */ union fscrypt_context { u8 version; struct fscrypt_context_v1 v1; struct fscrypt_context_v2 v2; }; /* * Return the size expected for the given fscrypt_context based on its version * number, or 0 if the context version is unrecognized. */ static inline int fscrypt_context_size(const union fscrypt_context *ctx) { switch (ctx->version) { case FSCRYPT_CONTEXT_V1: BUILD_BUG_ON(sizeof(ctx->v1) != 28); return sizeof(ctx->v1); case FSCRYPT_CONTEXT_V2: BUILD_BUG_ON(sizeof(ctx->v2) != 40); return sizeof(ctx->v2); } return 0; } /* Check whether an fscrypt_context has a recognized version number and size */ static inline bool fscrypt_context_is_valid(const union fscrypt_context *ctx, int ctx_size) { return ctx_size >= 1 && ctx_size == fscrypt_context_size(ctx); } /* Retrieve the context's nonce, assuming the context was already validated */ static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx) { switch (ctx->version) { case FSCRYPT_CONTEXT_V1: return ctx->v1.nonce; case FSCRYPT_CONTEXT_V2: return ctx->v2.nonce; } WARN_ON_ONCE(1); return NULL; } union fscrypt_policy { u8 version; struct fscrypt_policy_v1 v1; struct fscrypt_policy_v2 v2; }; /* * Return the size expected for the given fscrypt_policy based on its version * number, or 0 if the policy version is unrecognized. */ static inline int fscrypt_policy_size(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return sizeof(policy->v1); case FSCRYPT_POLICY_V2: return sizeof(policy->v2); } return 0; } /* Return the contents encryption mode of a valid encryption policy */ static inline u8 fscrypt_policy_contents_mode(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return policy->v1.contents_encryption_mode; case FSCRYPT_POLICY_V2: return policy->v2.contents_encryption_mode; } BUG(); } /* Return the filenames encryption mode of a valid encryption policy */ static inline u8 fscrypt_policy_fnames_mode(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return policy->v1.filenames_encryption_mode; case FSCRYPT_POLICY_V2: return policy->v2.filenames_encryption_mode; } BUG(); } /* Return the flags (FSCRYPT_POLICY_FLAG*) of a valid encryption policy */ static inline u8 fscrypt_policy_flags(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return policy->v1.flags; case FSCRYPT_POLICY_V2: return policy->v2.flags; } BUG(); } static inline int fscrypt_policy_v2_du_bits(const struct fscrypt_policy_v2 *policy, const struct inode *inode) { return policy->log2_data_unit_size ?: inode->i_blkbits; } static inline int fscrypt_policy_du_bits(const union fscrypt_policy *policy, const struct inode *inode) { switch (policy->version) { case FSCRYPT_POLICY_V1: return inode->i_blkbits; case FSCRYPT_POLICY_V2: return fscrypt_policy_v2_du_bits(&policy->v2, inode); } BUG(); } /* * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. */ struct fscrypt_symlink_data { __le16 len; char encrypted_path[]; } __packed; /** * struct fscrypt_prepared_key - a key prepared for actual encryption/decryption * @tfm: crypto API transform object * @blk_key: key for blk-crypto * * Normally only one of the fields will be non-NULL. */ struct fscrypt_prepared_key { struct crypto_skcipher *tfm; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT struct blk_crypto_key *blk_key; #endif }; /* * fscrypt_inode_info - the "encryption key" for an inode * * When an encrypted file's key is made available, an instance of this struct is * allocated and stored in ->i_crypt_info. Once created, it remains until the * inode is evicted. */ struct fscrypt_inode_info { /* The key in a form prepared for actual encryption/decryption */ struct fscrypt_prepared_key ci_enc_key; /* True if ci_enc_key should be freed when this struct is freed */ u8 ci_owns_key : 1; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT /* * True if this inode will use inline encryption (blk-crypto) instead of * the traditional filesystem-layer encryption. */ u8 ci_inlinecrypt : 1; #endif /* True if ci_dirhash_key is initialized */ u8 ci_dirhash_key_initialized : 1; /* * log2 of the data unit size (granularity of contents encryption) of * this file. This is computable from ci_policy and ci_inode but is * cached here for efficiency. Only used for regular files. */ u8 ci_data_unit_bits; /* Cached value: log2 of number of data units per FS block */ u8 ci_data_units_per_block_bits; /* Hashed inode number. Only set for IV_INO_LBLK_32 */ u32 ci_hashed_ino; /* * Encryption mode used for this inode. It corresponds to either the * contents or filenames encryption mode, depending on the inode type. */ struct fscrypt_mode *ci_mode; /* Back-pointer to the inode */ struct inode *ci_inode; /* * The master key with which this inode was unlocked (decrypted). This * will be NULL if the master key was found in a process-subscribed * keyring rather than in the filesystem-level keyring. */ struct fscrypt_master_key *ci_master_key; /* * Link in list of inodes that were unlocked with the master key. * Only used when ->ci_master_key is set. */ struct list_head ci_master_key_link; /* * If non-NULL, then encryption is done using the master key directly * and ci_enc_key will equal ci_direct_key->dk_key. */ struct fscrypt_direct_key *ci_direct_key; /* * This inode's hash key for filenames. This is a 128-bit SipHash-2-4 * key. This is only set for directories that use a keyed dirhash over * the plaintext filenames -- currently just casefolded directories. */ siphash_key_t ci_dirhash_key; /* The encryption policy used by this inode */ union fscrypt_policy ci_policy; /* This inode's nonce, copied from the fscrypt_context */ u8 ci_nonce[FSCRYPT_FILE_NONCE_SIZE]; }; typedef enum { FS_DECRYPT = 0, FS_ENCRYPT, } fscrypt_direction_t; /* crypto.c */ extern struct kmem_cache *fscrypt_inode_info_cachep; int fscrypt_initialize(struct super_block *sb); int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci, fscrypt_direction_t rw, u64 index, struct page *src_page, struct page *dest_page, unsigned int len, unsigned int offs, gfp_t gfp_flags); struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); void __printf(3, 4) __cold fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...); #define fscrypt_warn(inode, fmt, ...) \ fscrypt_msg((inode), KERN_WARNING, fmt, ##__VA_ARGS__) #define fscrypt_err(inode, fmt, ...) \ fscrypt_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__) #define FSCRYPT_MAX_IV_SIZE 32 union fscrypt_iv { struct { /* zero-based index of data unit within the file */ __le64 index; /* per-file nonce; only set in DIRECT_KEY mode */ u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; u8 raw[FSCRYPT_MAX_IV_SIZE]; __le64 dun[FSCRYPT_MAX_IV_SIZE / sizeof(__le64)]; }; void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index, const struct fscrypt_inode_info *ci); /* * Return the number of bits used by the maximum file data unit index that is * possible on the given filesystem, using the given log2 data unit size. */ static inline int fscrypt_max_file_dun_bits(const struct super_block *sb, int du_bits) { return fls64(sb->s_maxbytes - 1) - du_bits; } /* fname.c */ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); /* hkdf.c */ struct fscrypt_hkdf { struct crypto_shash *hmac_tfm; }; int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, unsigned int master_key_size); /* * The list of contexts in which fscrypt uses HKDF. These values are used as * the first byte of the HKDF application-specific info string to guarantee that * info strings are never repeated between contexts. This ensures that all HKDF * outputs are unique and cryptographically isolated, i.e. knowledge of one * output doesn't reveal another. */ #define HKDF_CONTEXT_KEY_IDENTIFIER 1 /* info=<empty> */ #define HKDF_CONTEXT_PER_FILE_ENC_KEY 2 /* info=file_nonce */ #define HKDF_CONTEXT_DIRECT_KEY 3 /* info=mode_num */ #define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 /* info=mode_num||fs_uuid */ #define HKDF_CONTEXT_DIRHASH_KEY 5 /* info=file_nonce */ #define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6 /* info=mode_num||fs_uuid */ #define HKDF_CONTEXT_INODE_HASH_KEY 7 /* info=<empty> */ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, u8 *okm, unsigned int okmlen); void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci); static inline bool fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci) { return ci->ci_inlinecrypt; } int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_inode_info *ci); void fscrypt_destroy_inline_crypt_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key); /* * Check whether the crypto transform or blk-crypto key has been allocated in * @prep_key, depending on which encryption implementation the file will use. */ static inline bool fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, const struct fscrypt_inode_info *ci) { /* * The two smp_load_acquire()'s here pair with the smp_store_release()'s * in fscrypt_prepare_inline_crypt_key() and fscrypt_prepare_key(). * I.e., in some cases (namely, if this prep_key is a per-mode * encryption key) another task can publish blk_key or tfm concurrently, * executing a RELEASE barrier. We need to use smp_load_acquire() here * to safely ACQUIRE the memory the other task published. */ if (fscrypt_using_inline_encryption(ci)) return smp_load_acquire(&prep_key->blk_key) != NULL; return smp_load_acquire(&prep_key->tfm) != NULL; } #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ static inline int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci) { return 0; } static inline bool fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci) { return false; } static inline int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_inode_info *ci) { WARN_ON_ONCE(1); return -EOPNOTSUPP; } static inline void fscrypt_destroy_inline_crypt_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key) { } static inline bool fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, const struct fscrypt_inode_info *ci) { return smp_load_acquire(&prep_key->tfm) != NULL; } #endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ /* keyring.c */ /* * fscrypt_master_key_secret - secret key material of an in-use master key */ struct fscrypt_master_key_secret { /* * For v2 policy keys: HKDF context keyed by this master key. * For v1 policy keys: not set (hkdf.hmac_tfm == NULL). */ struct fscrypt_hkdf hkdf; /* * Size of the raw key in bytes. This remains set even if ->raw was * zeroized due to no longer being needed. I.e. we still remember the * size of the key even if we don't need to remember the key itself. */ u32 size; /* For v1 policy keys: the raw key. Wiped for v2 policy keys. */ u8 raw[FSCRYPT_MAX_KEY_SIZE]; } __randomize_layout; /* * fscrypt_master_key - an in-use master key * * This represents a master encryption key which has been added to the * filesystem. There are three high-level states that a key can be in: * * FSCRYPT_KEY_STATUS_PRESENT * Key is fully usable; it can be used to unlock inodes that are encrypted * with it (this includes being able to create new inodes). ->mk_present * indicates whether the key is in this state. ->mk_secret exists, the key * is in the keyring, and ->mk_active_refs > 0 due to ->mk_present. * * FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED * Removal of this key has been initiated, but some inodes that were * unlocked with it are still in-use. Like ABSENT, ->mk_secret is wiped, * and the key can no longer be used to unlock inodes. Unlike ABSENT, the * key is still in the keyring; ->mk_decrypted_inodes is nonempty; and * ->mk_active_refs > 0, being equal to the size of ->mk_decrypted_inodes. * * This state transitions to ABSENT if ->mk_decrypted_inodes becomes empty, * or to PRESENT if FS_IOC_ADD_ENCRYPTION_KEY is called again for this key. * * FSCRYPT_KEY_STATUS_ABSENT * Key is fully removed. The key is no longer in the keyring, * ->mk_decrypted_inodes is empty, ->mk_active_refs == 0, ->mk_secret is * wiped, and the key can no longer be used to unlock inodes. */ struct fscrypt_master_key { /* * Link in ->s_master_keys->key_hashtable. * Only valid if ->mk_active_refs > 0. */ struct hlist_node mk_node; /* Semaphore that protects ->mk_secret, ->mk_users, and ->mk_present */ struct rw_semaphore mk_sem; /* * Active and structural reference counts. An active ref guarantees * that the struct continues to exist, continues to be in the keyring * ->s_master_keys, and that any embedded subkeys (e.g. * ->mk_direct_keys) that have been prepared continue to exist. * A structural ref only guarantees that the struct continues to exist. * * There is one active ref associated with ->mk_present being true, and * one active ref for each inode in ->mk_decrypted_inodes. * * There is one structural ref associated with the active refcount being * nonzero. Finding a key in the keyring also takes a structural ref, * which is then held temporarily while the key is operated on. */ refcount_t mk_active_refs; refcount_t mk_struct_refs; struct rcu_head mk_rcu_head; /* * The secret key material. Wiped as soon as it is no longer needed; * for details, see the fscrypt_master_key struct comment. * * Locking: protected by ->mk_sem. */ struct fscrypt_master_key_secret mk_secret; /* * For v1 policy keys: an arbitrary key descriptor which was assigned by * userspace (->descriptor). * * For v2 policy keys: a cryptographic hash of this key (->identifier). */ struct fscrypt_key_specifier mk_spec; /* * Keyring which contains a key of type 'key_type_fscrypt_user' for each * user who has added this key. Normally each key will be added by just * one user, but it's possible that multiple users share a key, and in * that case we need to keep track of those users so that one user can't * remove the key before the others want it removed too. * * This is NULL for v1 policy keys; those can only be added by root. * * Locking: protected by ->mk_sem. (We don't just rely on the keyrings * subsystem semaphore ->mk_users->sem, as we need support for atomic * search+insert along with proper synchronization with other fields.) */ struct key *mk_users; /* * List of inodes that were unlocked using this key. This allows the * inodes to be evicted efficiently if the key is removed. */ struct list_head mk_decrypted_inodes; spinlock_t mk_decrypted_inodes_lock; /* * Per-mode encryption keys for the various types of encryption policies * that use them. Allocated and derived on-demand. */ struct fscrypt_prepared_key mk_direct_keys[FSCRYPT_MODE_MAX + 1]; struct fscrypt_prepared_key mk_iv_ino_lblk_64_keys[FSCRYPT_MODE_MAX + 1]; struct fscrypt_prepared_key mk_iv_ino_lblk_32_keys[FSCRYPT_MODE_MAX + 1]; /* Hash key for inode numbers. Initialized only when needed. */ siphash_key_t mk_ino_hash_key; bool mk_ino_hash_key_initialized; /* * Whether this key is in the "present" state, i.e. fully usable. For * details, see the fscrypt_master_key struct comment. * * Locking: protected by ->mk_sem, but can be read locklessly using * READ_ONCE(). Writers must use WRITE_ONCE() when concurrent readers * are possible. */ bool mk_present; } __randomize_layout; static inline const char *master_key_spec_type( const struct fscrypt_key_specifier *spec) { switch (spec->type) { case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: return "descriptor"; case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: return "identifier"; } return "[unknown]"; } static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) { switch (spec->type) { case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: return FSCRYPT_KEY_DESCRIPTOR_SIZE; case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: return FSCRYPT_KEY_IDENTIFIER_SIZE; } return 0; } void fscrypt_put_master_key(struct fscrypt_master_key *mk); void fscrypt_put_master_key_activeref(struct super_block *sb, struct fscrypt_master_key *mk); struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); int fscrypt_get_test_dummy_key_identifier( u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); int fscrypt_add_test_dummy_key(struct super_block *sb, struct fscrypt_key_specifier *key_spec); int fscrypt_verify_key_added(struct super_block *sb, const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); int __init fscrypt_init_keyring(void); /* keysetup.c */ struct fscrypt_mode { const char *friendly_name; const char *cipher_str; int keysize; /* key size in bytes */ int security_strength; /* security strength in bytes */ int ivsize; /* IV size in bytes */ int logged_cryptoapi_impl; int logged_blk_crypto_native; int logged_blk_crypto_fallback; enum blk_crypto_mode_num blk_crypto_mode; }; extern struct fscrypt_mode fscrypt_modes[]; int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_inode_info *ci); void fscrypt_destroy_prepared_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key); int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci, const u8 *raw_key); int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk); void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk); int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported); /** * fscrypt_require_key() - require an inode's encryption key * @inode: the inode we need the key for * * If the inode is encrypted, set up its encryption key if not already done. * Then require that the key be present and return -ENOKEY otherwise. * * No locks are needed, and the key will live as long as the struct inode --- so * it won't go away from under you. * * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code * if a problem occurred while setting up the encryption key. */ static inline int fscrypt_require_key(struct inode *inode) { if (IS_ENCRYPTED(inode)) { int err = fscrypt_get_encryption_info(inode, false); if (err) return err; if (!fscrypt_has_encryption_key(inode)) return -ENOKEY; } return 0; } /* keysetup_v1.c */ void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci, const u8 *raw_master_key); int fscrypt_setup_v1_file_key_via_subscribed_keyrings( struct fscrypt_inode_info *ci); /* policy.c */ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, const union fscrypt_policy *policy2); int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, struct fscrypt_key_specifier *key_spec); const union fscrypt_policy *fscrypt_get_dummy_policy(struct super_block *sb); bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, const struct inode *inode); int fscrypt_policy_from_context(union fscrypt_policy *policy_u, const union fscrypt_context *ctx_u, int ctx_size); const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir); #endif /* _FSCRYPT_PRIVATE_H */
6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 // SPDX-License-Identifier: GPL-2.0 #include <linux/init.h> #include <linux/async.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/delay.h> #include <linux/string.h> #include <linux/dirent.h> #include <linux/syscalls.h> #include <linux/utime.h> #include <linux/file.h> #include <linux/kstrtox.h> #include <linux/memblock.h> #include <linux/mm.h> #include <linux/namei.h> #include <linux/init_syscalls.h> #include <linux/umh.h> #include <linux/security.h> #include "do_mounts.h" static __initdata bool csum_present; static __initdata u32 io_csum; static ssize_t __init xwrite(struct file *file, const unsigned char *p, size_t count, loff_t *pos) { ssize_t out = 0; /* sys_write only can write MAX_RW_COUNT aka 2G-4K bytes at most */ while (count) { ssize_t rv = kernel_write(file, p, count, pos); if (rv < 0) { if (rv == -EINTR || rv == -EAGAIN) continue; return out ? out : rv; } else if (rv == 0) break; if (csum_present) { ssize_t i; for (i = 0; i < rv; i++) io_csum += p[i]; } p += rv; out += rv; count -= rv; } return out; } static __initdata char *message; static void __init error(char *x) { if (!message) message = x; } #define panic_show_mem(fmt, ...) \ ({ show_mem(); panic(fmt, ##__VA_ARGS__); }) /* link hash */ #define N_ALIGN(len) ((((len) + 1) & ~3) + 2) static __initdata struct hash { int ino, minor, major; umode_t mode; struct hash *next; char name[N_ALIGN(PATH_MAX)]; } *head[32]; static inline int hash(int major, int minor, int ino) { unsigned long tmp = ino + minor + (major << 3); tmp += tmp >> 5; return tmp & 31; } static char __init *find_link(int major, int minor, int ino, umode_t mode, char *name) { struct hash **p, *q; for (p = head + hash(major, minor, ino); *p; p = &(*p)->next) { if ((*p)->ino != ino) continue; if ((*p)->minor != minor) continue; if ((*p)->major != major) continue; if (((*p)->mode ^ mode) & S_IFMT) continue; return (*p)->name; } q = kmalloc(sizeof(struct hash), GFP_KERNEL); if (!q) panic_show_mem("can't allocate link hash entry"); q->major = major; q->minor = minor; q->ino = ino; q->mode = mode; strcpy(q->name, name); q->next = NULL; *p = q; return NULL; } static void __init free_hash(void) { struct hash **p, *q; for (p = head; p < head + 32; p++) { while (*p) { q = *p; *p = q->next; kfree(q); } } } #ifdef CONFIG_INITRAMFS_PRESERVE_MTIME static void __init do_utime(char *filename, time64_t mtime) { struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } }; init_utimes(filename, t); } static void __init do_utime_path(const struct path *path, time64_t mtime) { struct timespec64 t[2] = { { .tv_sec = mtime }, { .tv_sec = mtime } }; vfs_utimes(path, t); } static __initdata LIST_HEAD(dir_list); struct dir_entry { struct list_head list; time64_t mtime; char name[]; }; static void __init dir_add(const char *name, time64_t mtime) { size_t nlen = strlen(name) + 1; struct dir_entry *de; de = kmalloc(sizeof(struct dir_entry) + nlen, GFP_KERNEL); if (!de) panic_show_mem("can't allocate dir_entry buffer"); INIT_LIST_HEAD(&de->list); strscpy(de->name, name, nlen); de->mtime = mtime; list_add(&de->list, &dir_list); } static void __init dir_utime(void) { struct dir_entry *de, *tmp; list_for_each_entry_safe(de, tmp, &dir_list, list) { list_del(&de->list); do_utime(de->name, de->mtime); kfree(de); } } #else static void __init do_utime(char *filename, time64_t mtime) {} static void __init do_utime_path(const struct path *path, time64_t mtime) {} static void __init dir_add(const char *name, time64_t mtime) {} static void __init dir_utime(void) {} #endif static __initdata time64_t mtime; /* cpio header parsing */ static __initdata unsigned long ino, major, minor, nlink; static __initdata umode_t mode; static __initdata unsigned long body_len, name_len; static __initdata uid_t uid; static __initdata gid_t gid; static __initdata unsigned rdev; static __initdata u32 hdr_csum; static void __init parse_header(char *s) { unsigned long parsed[13]; char buf[9]; int i; buf[8] = '\0'; for (i = 0, s += 6; i < 13; i++, s += 8) { memcpy(buf, s, 8); parsed[i] = simple_strtoul(buf, NULL, 16); } ino = parsed[0]; mode = parsed[1]; uid = parsed[2]; gid = parsed[3]; nlink = parsed[4]; mtime = parsed[5]; /* breaks in y2106 */ body_len = parsed[6]; major = parsed[7]; minor = parsed[8]; rdev = new_encode_dev(MKDEV(parsed[9], parsed[10])); name_len = parsed[11]; hdr_csum = parsed[12]; } /* FSM */ static __initdata enum state { Start, Collect, GotHeader, SkipIt, GotName, CopyFile, GotSymlink, Reset } state, next_state; static __initdata char *victim; static unsigned long byte_count __initdata; static __initdata loff_t this_header, next_header; static inline void __init eat(unsigned n) { victim += n; this_header += n; byte_count -= n; } static __initdata char *collected; static long remains __initdata; static __initdata char *collect; static void __init read_into(char *buf, unsigned size, enum state next) { if (byte_count >= size) { collected = victim; eat(size); state = next; } else { collect = collected = buf; remains = size; next_state = next; state = Collect; } } static __initdata char *header_buf, *symlink_buf, *name_buf; static int __init do_start(void) { read_into(header_buf, 110, GotHeader); return 0; } static int __init do_collect(void) { unsigned long n = remains; if (byte_count < n) n = byte_count; memcpy(collect, victim, n); eat(n); collect += n; if ((remains -= n) != 0) return 1; state = next_state; return 0; } static int __init do_header(void) { if (!memcmp(collected, "070701", 6)) { csum_present = false; } else if (!memcmp(collected, "070702", 6)) { csum_present = true; } else { if (memcmp(collected, "070707", 6) == 0) error("incorrect cpio method used: use -H newc option"); else error("no cpio magic"); return 1; } parse_header(collected); next_header = this_header + N_ALIGN(name_len) + body_len; next_header = (next_header + 3) & ~3; state = SkipIt; if (name_len <= 0 || name_len > PATH_MAX) return 0; if (S_ISLNK(mode)) { if (body_len > PATH_MAX) return 0; collect = collected = symlink_buf; remains = N_ALIGN(name_len) + body_len; next_state = GotSymlink; state = Collect; return 0; } if (S_ISREG(mode) || !body_len) read_into(name_buf, N_ALIGN(name_len), GotName); return 0; } static int __init do_skip(void) { if (this_header + byte_count < next_header) { eat(byte_count); return 1; } else { eat(next_header - this_header); state = next_state; return 0; } } static int __init do_reset(void) { while (byte_count && *victim == '\0') eat(1); if (byte_count && (this_header & 3)) error("broken padding"); return 1; } static void __init clean_path(char *path, umode_t fmode) { struct kstat st; if (!init_stat(path, &st, AT_SYMLINK_NOFOLLOW) && (st.mode ^ fmode) & S_IFMT) { if (S_ISDIR(st.mode)) init_rmdir(path); else init_unlink(path); } } static int __init maybe_link(void) { if (nlink >= 2) { char *old = find_link(major, minor, ino, mode, collected); if (old) { clean_path(collected, 0); return (init_link(old, collected) < 0) ? -1 : 1; } } return 0; } static __initdata struct file *wfile; static __initdata loff_t wfile_pos; static int __init do_name(void) { state = SkipIt; next_state = Reset; /* name_len > 0 && name_len <= PATH_MAX checked in do_header */ if (collected[name_len - 1] != '\0') { pr_err("initramfs name without nulterm: %.*s\n", (int)name_len, collected); error("malformed archive"); return 1; } if (strcmp(collected, "TRAILER!!!") == 0) { free_hash(); return 0; } clean_path(collected, mode); if (S_ISREG(mode)) { int ml = maybe_link(); if (ml >= 0) { int openflags = O_WRONLY|O_CREAT|O_LARGEFILE; if (ml != 1) openflags |= O_TRUNC; wfile = filp_open(collected, openflags, mode); if (IS_ERR(wfile)) return 0; wfile_pos = 0; io_csum = 0; vfs_fchown(wfile, uid, gid); vfs_fchmod(wfile, mode); if (body_len) vfs_truncate(&wfile->f_path, body_len); state = CopyFile; } } else if (S_ISDIR(mode)) { init_mkdir(collected, mode); init_chown(collected, uid, gid, 0); init_chmod(collected, mode); dir_add(collected, mtime); } else if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { if (maybe_link() == 0) { init_mknod(collected, mode, rdev); init_chown(collected, uid, gid, 0); init_chmod(collected, mode); do_utime(collected, mtime); } } return 0; } static int __init do_copy(void) { if (byte_count >= body_len) { if (xwrite(wfile, victim, body_len, &wfile_pos) != body_len) error("write error"); do_utime_path(&wfile->f_path, mtime); fput(wfile); if (csum_present && io_csum != hdr_csum) error("bad data checksum"); eat(body_len); state = SkipIt; return 0; } else { if (xwrite(wfile, victim, byte_count, &wfile_pos) != byte_count) error("write error"); body_len -= byte_count; eat(byte_count); return 1; } } static int __init do_symlink(void) { if (collected[name_len - 1] != '\0') { pr_err("initramfs symlink without nulterm: %.*s\n", (int)name_len, collected); error("malformed archive"); return 1; } collected[N_ALIGN(name_len) + body_len] = '\0'; clean_path(collected, 0); init_symlink(collected + N_ALIGN(name_len), collected); init_chown(collected, uid, gid, AT_SYMLINK_NOFOLLOW); do_utime(collected, mtime); state = SkipIt; next_state = Reset; return 0; } static __initdata int (*actions[])(void) = { [Start] = do_start, [Collect] = do_collect, [GotHeader] = do_header, [SkipIt] = do_skip, [GotName] = do_name, [CopyFile] = do_copy, [GotSymlink] = do_symlink, [Reset] = do_reset, }; static long __init write_buffer(char *buf, unsigned long len) { byte_count = len; victim = buf; while (!actions[state]()) ; return len - byte_count; } static long __init flush_buffer(void *bufv, unsigned long len) { char *buf = bufv; long written; long origLen = len; if (message) return -1; while ((written = write_buffer(buf, len)) < len && !message) { char c = buf[written]; if (c == '0') { buf += written; len -= written; state = Start; } else if (c == 0) { buf += written; len -= written; state = Reset; } else error("junk within compressed archive"); } return origLen; } static unsigned long my_inptr __initdata; /* index of next byte to be processed in inbuf */ #include <linux/decompress/generic.h> static char * __init unpack_to_rootfs(char *buf, unsigned long len) { long written; decompress_fn decompress; const char *compress_name; static __initdata char msg_buf[64]; header_buf = kmalloc(110, GFP_KERNEL); symlink_buf = kmalloc(PATH_MAX + N_ALIGN(PATH_MAX) + 1, GFP_KERNEL); name_buf = kmalloc(N_ALIGN(PATH_MAX), GFP_KERNEL); if (!header_buf || !symlink_buf || !name_buf) panic_show_mem("can't allocate buffers"); state = Start; this_header = 0; message = NULL; while (!message && len) { loff_t saved_offset = this_header; if (*buf == '0' && !(this_header & 3)) { state = Start; written = write_buffer(buf, len); buf += written; len -= written; continue; } if (!*buf) { buf++; len--; this_header++; continue; } this_header = 0; decompress = decompress_method(buf, len, &compress_name); pr_debug("Detected %s compressed data\n", compress_name); if (decompress) { int res = decompress(buf, len, NULL, flush_buffer, NULL, &my_inptr, error); if (res) error("decompressor failed"); } else if (compress_name) { if (!message) { snprintf(msg_buf, sizeof msg_buf, "compression method %s not configured", compress_name); message = msg_buf; } } else error("invalid magic at start of compressed archive"); if (state != Reset) error("junk at the end of compressed archive"); this_header = saved_offset + my_inptr; buf += my_inptr; len -= my_inptr; } dir_utime(); kfree(name_buf); kfree(symlink_buf); kfree(header_buf); return message; } static int __initdata do_retain_initrd; static int __init retain_initrd_param(char *str) { if (*str) return 0; do_retain_initrd = 1; return 1; } __setup("retain_initrd", retain_initrd_param); #ifdef CONFIG_ARCH_HAS_KEEPINITRD static int __init keepinitrd_setup(char *__unused) { do_retain_initrd = 1; return 1; } __setup("keepinitrd", keepinitrd_setup); #endif static bool __initdata initramfs_async = true; static int __init initramfs_async_setup(char *str) { return kstrtobool(str, &initramfs_async) == 0; } __setup("initramfs_async=", initramfs_async_setup); extern char __initramfs_start[]; extern unsigned long __initramfs_size; #include <linux/initrd.h> #include <linux/kexec.h> static BIN_ATTR(initrd, 0440, sysfs_bin_attr_simple_read, NULL, 0); void __init reserve_initrd_mem(void) { phys_addr_t start; unsigned long size; /* Ignore the virtul address computed during device tree parsing */ initrd_start = initrd_end = 0; if (!phys_initrd_size) return; /* * Round the memory region to page boundaries as per free_initrd_mem() * This allows us to detect whether the pages overlapping the initrd * are in use, but more importantly, reserves the entire set of pages * as we don't want these pages allocated for other purposes. */ start = round_down(phys_initrd_start, PAGE_SIZE); size = phys_initrd_size + (phys_initrd_start - start); size = round_up(size, PAGE_SIZE); if (!memblock_is_region_memory(start, size)) { pr_err("INITRD: 0x%08llx+0x%08lx is not a memory region", (u64)start, size); goto disable; } if (memblock_is_region_reserved(start, size)) { pr_err("INITRD: 0x%08llx+0x%08lx overlaps in-use memory region\n", (u64)start, size); goto disable; } memblock_reserve(start, size); /* Now convert initrd to virtual addresses */ initrd_start = (unsigned long)__va(phys_initrd_start); initrd_end = initrd_start + phys_initrd_size; initrd_below_start_ok = 1; return; disable: pr_cont(" - disabling initrd\n"); initrd_start = 0; initrd_end = 0; } void __weak __init free_initrd_mem(unsigned long start, unsigned long end) { #ifdef CONFIG_ARCH_KEEP_MEMBLOCK unsigned long aligned_start = ALIGN_DOWN(start, PAGE_SIZE); unsigned long aligned_end = ALIGN(end, PAGE_SIZE); memblock_free((void *)aligned_start, aligned_end - aligned_start); #endif free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM, "initrd"); } #ifdef CONFIG_CRASH_RESERVE static bool __init kexec_free_initrd(void) { unsigned long crashk_start = (unsigned long)__va(crashk_res.start); unsigned long crashk_end = (unsigned long)__va(crashk_res.end); /* * If the initrd region is overlapped with crashkernel reserved region, * free only memory that is not part of crashkernel region. */ if (initrd_start >= crashk_end || initrd_end <= crashk_start) return false; /* * Initialize initrd memory region since the kexec boot does not do. */ memset((void *)initrd_start, 0, initrd_end - initrd_start); if (initrd_start < crashk_start) free_initrd_mem(initrd_start, crashk_start); if (initrd_end > crashk_end) free_initrd_mem(crashk_end, initrd_end); return true; } #else static inline bool kexec_free_initrd(void) { return false; } #endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_BLK_DEV_RAM static void __init populate_initrd_image(char *err) { ssize_t written; struct file *file; loff_t pos = 0; printk(KERN_INFO "rootfs image is not initramfs (%s); looks like an initrd\n", err); file = filp_open("/initrd.image", O_WRONLY|O_CREAT|O_LARGEFILE, 0700); if (IS_ERR(file)) return; written = xwrite(file, (char *)initrd_start, initrd_end - initrd_start, &pos); if (written != initrd_end - initrd_start) pr_err("/initrd.image: incomplete write (%zd != %ld)\n", written, initrd_end - initrd_start); fput(file); } #endif /* CONFIG_BLK_DEV_RAM */ static void __init do_populate_rootfs(void *unused, async_cookie_t cookie) { /* Load the built in initramfs */ char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size); if (err) panic_show_mem("%s", err); /* Failed to decompress INTERNAL initramfs */ if (!initrd_start || IS_ENABLED(CONFIG_INITRAMFS_FORCE)) goto done; if (IS_ENABLED(CONFIG_BLK_DEV_RAM)) printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n"); else printk(KERN_INFO "Unpacking initramfs...\n"); err = unpack_to_rootfs((char *)initrd_start, initrd_end - initrd_start); if (err) { #ifdef CONFIG_BLK_DEV_RAM populate_initrd_image(err); #else printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err); #endif } done: security_initramfs_populated(); /* * If the initrd region is overlapped with crashkernel reserved region, * free only memory that is not part of crashkernel region. */ if (!do_retain_initrd && initrd_start && !kexec_free_initrd()) { free_initrd_mem(initrd_start, initrd_end); } else if (do_retain_initrd && initrd_start) { bin_attr_initrd.size = initrd_end - initrd_start; bin_attr_initrd.private = (void *)initrd_start; if (sysfs_create_bin_file(firmware_kobj, &bin_attr_initrd)) pr_err("Failed to create initrd sysfs file"); } initrd_start = 0; initrd_end = 0; init_flush_fput(); } static ASYNC_DOMAIN_EXCLUSIVE(initramfs_domain); static async_cookie_t initramfs_cookie; void wait_for_initramfs(void) { if (!initramfs_cookie) { /* * Something before rootfs_initcall wants to access * the filesystem/initramfs. Probably a bug. Make a * note, avoid deadlocking the machine, and let the * caller's access fail as it used to. */ pr_warn_once("wait_for_initramfs() called before rootfs_initcalls\n"); return; } async_synchronize_cookie_domain(initramfs_cookie + 1, &initramfs_domain); } EXPORT_SYMBOL_GPL(wait_for_initramfs); static int __init populate_rootfs(void) { initramfs_cookie = async_schedule_domain(do_populate_rootfs, NULL, &initramfs_domain); usermodehelper_enable(); if (!initramfs_async) wait_for_initramfs(); return 0; } rootfs_initcall(populate_rootfs);
1 12 12 55 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* Bluetooth kernel library. */ #define pr_fmt(fmt) "Bluetooth: " fmt #include <linux/export.h> #include <net/bluetooth/bluetooth.h> /** * baswap() - Swaps the order of a bd address * @dst: Pointer to a bdaddr_t struct that will store the swapped * bd address. * @src: Pointer to the bdaddr_t struct to be swapped. * * This function reverses the byte order of a Bluetooth device * address. */ void baswap(bdaddr_t *dst, const bdaddr_t *src) { const unsigned char *s = (const unsigned char *)src; unsigned char *d = (unsigned char *)dst; unsigned int i; for (i = 0; i < 6; i++) d[i] = s[5 - i]; } EXPORT_SYMBOL(baswap); /** * bt_to_errno() - Bluetooth error codes to standard errno * @code: Bluetooth error code to be converted * * This function takes a Bluetooth error code as input and convets * it to an equivalent Unix/standard errno value. * * Return: * * If the bt error code is known, an equivalent Unix errno value * is returned. * If the given bt error code is not known, ENOSYS is returned. */ int bt_to_errno(__u16 code) { switch (code) { case 0: return 0; case 0x01: return EBADRQC; case 0x02: return ENOTCONN; case 0x03: return EIO; case 0x04: case 0x3c: return EHOSTDOWN; case 0x05: return EACCES; case 0x06: return EBADE; case 0x07: return ENOMEM; case 0x08: return ETIMEDOUT; case 0x09: return EMLINK; case 0x0a: return EMLINK; case 0x0b: return EALREADY; case 0x0c: return EBUSY; case 0x0d: case 0x0e: case 0x0f: return ECONNREFUSED; case 0x10: return ETIMEDOUT; case 0x11: case 0x27: case 0x29: case 0x20: return EOPNOTSUPP; case 0x12: return EINVAL; case 0x13: case 0x14: case 0x15: return ECONNRESET; case 0x16: return ECONNABORTED; case 0x17: return ELOOP; case 0x18: return EACCES; case 0x1a: return EPROTONOSUPPORT; case 0x1b: return ECONNREFUSED; case 0x19: case 0x1e: case 0x23: case 0x24: case 0x25: return EPROTO; default: return ENOSYS; } } EXPORT_SYMBOL(bt_to_errno); /** * bt_status() - Standard errno value to Bluetooth error code * @err: Unix/standard errno value to be converted * * This function converts a standard/Unix errno value to an * equivalent Bluetooth error code. * * Return: Bluetooth error code. * * If the given errno is not found, 0x1f is returned by default * which indicates an unspecified error. * For err >= 0, no conversion is performed, and the same value * is immediately returned. */ __u8 bt_status(int err) { if (err >= 0) return err; switch (err) { case -EBADRQC: return 0x01; case -ENOTCONN: return 0x02; case -EIO: return 0x03; case -EHOSTDOWN: return 0x04; case -EACCES: return 0x05; case -EBADE: return 0x06; case -ENOMEM: return 0x07; case -ETIMEDOUT: return 0x08; case -EMLINK: return 0x09; case -EALREADY: return 0x0b; case -EBUSY: return 0x0c; case -ECONNREFUSED: return 0x0d; case -EOPNOTSUPP: return 0x11; case -EINVAL: return 0x12; case -ECONNRESET: return 0x13; case -ECONNABORTED: return 0x16; case -ELOOP: return 0x17; case -EPROTONOSUPPORT: return 0x1a; case -EPROTO: return 0x19; default: return 0x1f; } } EXPORT_SYMBOL(bt_status); /** * bt_info() - Log Bluetooth information message * @format: Message's format string */ void bt_info(const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; pr_info("%pV", &vaf); va_end(args); } EXPORT_SYMBOL(bt_info); /** * bt_warn() - Log Bluetooth warning message * @format: Message's format string */ void bt_warn(const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; pr_warn("%pV", &vaf); va_end(args); } EXPORT_SYMBOL(bt_warn); /** * bt_err() - Log Bluetooth error message * @format: Message's format string */ void bt_err(const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; pr_err("%pV", &vaf); va_end(args); } EXPORT_SYMBOL(bt_err); #ifdef CONFIG_BT_FEATURE_DEBUG static bool debug_enable; void bt_dbg_set(bool enable) { debug_enable = enable; } bool bt_dbg_get(void) { return debug_enable; } /** * bt_dbg() - Log Bluetooth debugging message * @format: Message's format string */ void bt_dbg(const char *format, ...) { struct va_format vaf; va_list args; if (likely(!debug_enable)) return; va_start(args, format); vaf.fmt = format; vaf.va = &args; printk(KERN_DEBUG pr_fmt("%pV"), &vaf); va_end(args); } EXPORT_SYMBOL(bt_dbg); #endif /** * bt_warn_ratelimited() - Log rate-limited Bluetooth warning message * @format: Message's format string * * This functions works like bt_warn, but it uses rate limiting * to prevent the message from being logged too often. */ void bt_warn_ratelimited(const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; pr_warn_ratelimited("%pV", &vaf); va_end(args); } EXPORT_SYMBOL(bt_warn_ratelimited); /** * bt_err_ratelimited() - Log rate-limited Bluetooth error message * @format: Message's format string * * This functions works like bt_err, but it uses rate limiting * to prevent the message from being logged too often. */ void bt_err_ratelimited(const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; pr_err_ratelimited("%pV", &vaf); va_end(args); } EXPORT_SYMBOL(bt_err_ratelimited);
4 4 3 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_PKT_SCHED_H #define __NET_PKT_SCHED_H #include <linux/jiffies.h> #include <linux/ktime.h> #include <linux/if_vlan.h> #include <linux/netdevice.h> #include <net/sch_generic.h> #include <net/net_namespace.h> #include <uapi/linux/pkt_sched.h> #define DEFAULT_TX_QUEUE_LEN 1000 #define STAB_SIZE_LOG_MAX 30 struct qdisc_walker { int stop; int skip; int count; int (*fn)(struct Qdisc *, unsigned long cl, struct qdisc_walker *); }; #define qdisc_priv(q) \ _Generic(q, \ const struct Qdisc * : (const void *)&q->privdata, \ struct Qdisc * : (void *)&q->privdata) static inline struct Qdisc *qdisc_from_priv(void *priv) { return container_of(priv, struct Qdisc, privdata); } /* Timer resolution MUST BE < 10% of min_schedulable_packet_size/bandwidth Normal IP packet size ~ 512byte, hence: 0.5Kbyte/1Mbyte/sec = 0.5msec, so that we need 50usec timer for 10Mbit ethernet. 10msec resolution -> <50Kbit/sec. The result: [34]86 is not good choice for QoS router :-( The things are not so bad, because we may use artificial clock evaluated by integration of network data flow in the most critical places. */ typedef u64 psched_time_t; typedef long psched_tdiff_t; /* Avoid doing 64 bit divide */ #define PSCHED_SHIFT 6 #define PSCHED_TICKS2NS(x) ((s64)(x) << PSCHED_SHIFT) #define PSCHED_NS2TICKS(x) ((x) >> PSCHED_SHIFT) #define PSCHED_TICKS_PER_SEC PSCHED_NS2TICKS(NSEC_PER_SEC) #define PSCHED_PASTPERFECT 0 static inline psched_time_t psched_get_time(void) { return PSCHED_NS2TICKS(ktime_get_ns()); } struct qdisc_watchdog { struct hrtimer timer; struct Qdisc *qdisc; }; void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, clockid_t clockid); void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc); void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires, u64 delta_ns); static inline void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires) { return qdisc_watchdog_schedule_range_ns(wd, expires, 0ULL); } static inline void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires) { qdisc_watchdog_schedule_ns(wd, PSCHED_TICKS2NS(expires)); } void qdisc_watchdog_cancel(struct qdisc_watchdog *wd); extern struct Qdisc_ops pfifo_qdisc_ops; extern struct Qdisc_ops bfifo_qdisc_ops; extern struct Qdisc_ops pfifo_head_drop_qdisc_ops; int fifo_set_limit(struct Qdisc *q, unsigned int limit); struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, unsigned int limit, struct netlink_ext_ack *extack); int register_qdisc(struct Qdisc_ops *qops); void unregister_qdisc(struct Qdisc_ops *qops); #define NET_SCH_ALIAS_PREFIX "net-sch-" #define MODULE_ALIAS_NET_SCH(id) MODULE_ALIAS(NET_SCH_ALIAS_PREFIX id) void qdisc_get_default(char *id, size_t len); int qdisc_set_default(const char *id); void qdisc_hash_add(struct Qdisc *q, bool invisible); void qdisc_hash_del(struct Qdisc *q); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle); struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab, struct netlink_ext_ack *extack); void qdisc_put_rtab(struct qdisc_rate_table *tab); void qdisc_put_stab(struct qdisc_size_table *tab); void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc); bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, spinlock_t *root_lock, bool validate); void __qdisc_run(struct Qdisc *q); static inline void qdisc_run(struct Qdisc *q) { if (qdisc_run_begin(q)) { __qdisc_run(q); qdisc_run_end(q); } } extern const struct nla_policy rtm_tca_policy[TCA_MAX + 1]; /* Calculate maximal size of packet seen by hard_start_xmit routine of this device. */ static inline unsigned int psched_mtu(const struct net_device *dev) { return READ_ONCE(dev->mtu) + dev->hard_header_len; } static inline struct net *qdisc_net(struct Qdisc *q) { return dev_net(q->dev_queue->dev); } struct tc_query_caps_base { enum tc_setup_type type; void *caps; }; struct tc_cbs_qopt_offload { u8 enable; s32 queue; s32 hicredit; s32 locredit; s32 idleslope; s32 sendslope; }; struct tc_etf_qopt_offload { u8 enable; s32 queue; }; struct tc_mqprio_caps { bool validate_queue_counts:1; }; struct tc_mqprio_qopt_offload { /* struct tc_mqprio_qopt must always be the first element */ struct tc_mqprio_qopt qopt; struct netlink_ext_ack *extack; u16 mode; u16 shaper; u32 flags; u64 min_rate[TC_QOPT_MAX_QUEUE]; u64 max_rate[TC_QOPT_MAX_QUEUE]; unsigned long preemptible_tcs; }; struct tc_taprio_caps { bool supports_queue_max_sdu:1; bool gate_mask_per_txq:1; /* Device expects lower TXQ numbers to have higher priority over higher * TXQs, regardless of their TC mapping. DO NOT USE FOR NEW DRIVERS, * INSTEAD ENFORCE A PROPER TC:TXQ MAPPING COMING FROM USER SPACE. */ bool broken_mqprio:1; }; enum tc_taprio_qopt_cmd { TAPRIO_CMD_REPLACE, TAPRIO_CMD_DESTROY, TAPRIO_CMD_STATS, TAPRIO_CMD_QUEUE_STATS, }; /** * struct tc_taprio_qopt_stats - IEEE 802.1Qbv statistics * @window_drops: Frames that were dropped because they were too large to be * transmitted in any of the allotted time windows (open gates) for their * traffic class. * @tx_overruns: Frames still being transmitted by the MAC after the * transmission gate associated with their traffic class has closed. * Equivalent to `12.29.1.1.2 TransmissionOverrun` from 802.1Q-2018. */ struct tc_taprio_qopt_stats { u64 window_drops; u64 tx_overruns; }; struct tc_taprio_qopt_queue_stats { int queue; struct tc_taprio_qopt_stats stats; }; struct tc_taprio_sched_entry { u8 command; /* TC_TAPRIO_CMD_* */ /* The gate_mask in the offloading side refers to traffic classes */ u32 gate_mask; u32 interval; }; struct tc_taprio_qopt_offload { enum tc_taprio_qopt_cmd cmd; union { /* TAPRIO_CMD_STATS */ struct tc_taprio_qopt_stats stats; /* TAPRIO_CMD_QUEUE_STATS */ struct tc_taprio_qopt_queue_stats queue_stats; /* TAPRIO_CMD_REPLACE */ struct { struct tc_mqprio_qopt_offload mqprio; struct netlink_ext_ack *extack; ktime_t base_time; u64 cycle_time; u64 cycle_time_extension; u32 max_sdu[TC_MAX_QUEUE]; size_t num_entries; struct tc_taprio_sched_entry entries[]; }; }; }; #if IS_ENABLED(CONFIG_NET_SCH_TAPRIO) /* Reference counting */ struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload *offload); void taprio_offload_free(struct tc_taprio_qopt_offload *offload); #else /* Reference counting */ static inline struct tc_taprio_qopt_offload * taprio_offload_get(struct tc_taprio_qopt_offload *offload) { return NULL; } static inline void taprio_offload_free(struct tc_taprio_qopt_offload *offload) { } #endif /* Ensure skb_mstamp_ns, which might have been populated with the txtime, is * not mistaken for a software timestamp, because this will otherwise prevent * the dispatch of hardware timestamps to the socket. */ static inline void skb_txtime_consumed(struct sk_buff *skb) { skb->tstamp = ktime_set(0, 0); } static inline bool tc_qdisc_stats_dump(struct Qdisc *sch, unsigned long cl, struct qdisc_walker *arg) { if (arg->count >= arg->skip && arg->fn(sch, cl, arg) < 0) { arg->stop = 1; return false; } arg->count++; return true; } #endif
6 6 6 6 6 6 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/xattr.c * * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> * * Fix by Harrison Xing <harrison@mountainviewdata.com>. * Ext4 code with a lot of help from Eric Jarman <ejarman@acm.org>. * Extended attributes for symlinks and special files added per * suggestion of Luka Renko <luka.renko@hermes.si>. * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>, * Red Hat Inc. * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz * and Andreas Gruenbacher <agruen@suse.de>. */ /* * Extended attributes are stored directly in inodes (on file systems with * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl * field contains the block number if an inode uses an additional block. All * attributes must fit in the inode and one additional block. Blocks that * contain the identical set of attributes may be shared among several inodes. * Identical blocks are detected by keeping a cache of blocks that have * recently been accessed. * * The attributes in inodes and on blocks have a different header; the entries * are stored in the same format: * * +------------------+ * | header | * | entry 1 | | * | entry 2 | | growing downwards * | entry 3 | v * | four null bytes | * | . . . | * | value 1 | ^ * | value 3 | | growing upwards * | value 2 | | * +------------------+ * * The header is followed by multiple entry descriptors. In disk blocks, the * entry descriptors are kept sorted. In inodes, they are unsorted. The * attribute values are aligned to the end of the block in no specific order. * * Locking strategy * ---------------- * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem. * EA blocks are only changed if they are exclusive to an inode, so * holding xattr_sem also means that nothing but the EA block's reference * count can change. Multiple writers to the same block are synchronized * by the buffer lock. */ #include <linux/init.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/mbcache.h> #include <linux/quotaops.h> #include <linux/iversion.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" #include "acl.h" #ifdef EXT4_XATTR_DEBUG # define ea_idebug(inode, fmt, ...) \ printk(KERN_DEBUG "inode %s:%lu: " fmt "\n", \ inode->i_sb->s_id, inode->i_ino, ##__VA_ARGS__) # define ea_bdebug(bh, fmt, ...) \ printk(KERN_DEBUG "block %pg:%lu: " fmt "\n", \ bh->b_bdev, (unsigned long)bh->b_blocknr, ##__VA_ARGS__) #else # define ea_idebug(inode, fmt, ...) no_printk(fmt, ##__VA_ARGS__) # define ea_bdebug(bh, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif static void ext4_xattr_block_cache_insert(struct mb_cache *, struct buffer_head *); static struct buffer_head * ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *, struct mb_cache_entry **); static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, size_t value_count); static __le32 ext4_xattr_hash_entry_signed(char *name, size_t name_len, __le32 *value, size_t value_count); static void ext4_xattr_rehash(struct ext4_xattr_header *); static const struct xattr_handler * const ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, #endif [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, #endif [EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler, }; const struct xattr_handler * const ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, #endif &ext4_xattr_hurd_handler, NULL }; #define EA_BLOCK_CACHE(inode) (((struct ext4_sb_info *) \ inode->i_sb->s_fs_info)->s_ea_block_cache) #define EA_INODE_CACHE(inode) (((struct ext4_sb_info *) \ inode->i_sb->s_fs_info)->s_ea_inode_cache) static int ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, struct inode *inode); #ifdef CONFIG_LOCKDEP void ext4_xattr_inode_set_class(struct inode *ea_inode) { struct ext4_inode_info *ei = EXT4_I(ea_inode); lockdep_set_subclass(&ea_inode->i_rwsem, 1); (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */ lockdep_set_subclass(&ei->i_data_sem, I_DATA_SEM_EA); } #endif static __le32 ext4_xattr_block_csum(struct inode *inode, sector_t block_nr, struct ext4_xattr_header *hdr) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le64 dsk_block_nr = cpu_to_le64(block_nr); __u32 dummy_csum = 0; int offset = offsetof(struct ext4_xattr_header, h_checksum); csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset); csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset, EXT4_BLOCK_SIZE(inode->i_sb) - offset); return cpu_to_le32(csum); } static int ext4_xattr_block_csum_verify(struct inode *inode, struct buffer_head *bh) { struct ext4_xattr_header *hdr = BHDR(bh); int ret = 1; if (ext4_has_metadata_csum(inode->i_sb)) { lock_buffer(bh); ret = (hdr->h_checksum == ext4_xattr_block_csum(inode, bh->b_blocknr, hdr)); unlock_buffer(bh); } return ret; } static void ext4_xattr_block_csum_set(struct inode *inode, struct buffer_head *bh) { if (ext4_has_metadata_csum(inode->i_sb)) BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode, bh->b_blocknr, BHDR(bh)); } static inline const char *ext4_xattr_prefix(int name_index, struct dentry *dentry) { const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) handler = ext4_xattr_handler_map[name_index]; if (!xattr_handler_can_list(handler, dentry)) return NULL; return xattr_prefix(handler); } static int check_xattrs(struct inode *inode, struct buffer_head *bh, struct ext4_xattr_entry *entry, void *end, void *value_start, const char *function, unsigned int line) { struct ext4_xattr_entry *e = entry; int err = -EFSCORRUPTED; char *err_str; if (bh) { if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) { err_str = "invalid header"; goto errout; } if (buffer_verified(bh)) return 0; if (!ext4_xattr_block_csum_verify(inode, bh)) { err = -EFSBADCRC; err_str = "invalid checksum"; goto errout; } } else { struct ext4_xattr_ibody_header *header = value_start; header -= 1; if (end - (void *)header < sizeof(*header) + sizeof(u32)) { err_str = "in-inode xattr block too small"; goto errout; } if (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { err_str = "bad magic number in in-inode xattr"; goto errout; } } /* Find the end of the names list */ while (!IS_LAST_ENTRY(e)) { struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); if ((void *)next >= end) { err_str = "e_name out of bounds"; goto errout; } if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) { err_str = "bad e_name length"; goto errout; } e = next; } /* Check the values */ while (!IS_LAST_ENTRY(entry)) { u32 size = le32_to_cpu(entry->e_value_size); unsigned long ea_ino = le32_to_cpu(entry->e_value_inum); if (!ext4_has_feature_ea_inode(inode->i_sb) && ea_ino) { err_str = "ea_inode specified without ea_inode feature enabled"; goto errout; } if (ea_ino && ((ea_ino == EXT4_ROOT_INO) || !ext4_valid_inum(inode->i_sb, ea_ino))) { err_str = "invalid ea_ino"; goto errout; } if (size > EXT4_XATTR_SIZE_MAX) { err_str = "e_value size too large"; goto errout; } if (size != 0 && entry->e_value_inum == 0) { u16 offs = le16_to_cpu(entry->e_value_offs); void *value; /* * The value cannot overlap the names, and the value * with padding cannot extend beyond 'end'. Check both * the padded and unpadded sizes, since the size may * overflow to 0 when adding padding. */ if (offs > end - value_start) { err_str = "e_value out of bounds"; goto errout; } value = value_start + offs; if (value < (void *)e + sizeof(u32) || size > end - value || EXT4_XATTR_SIZE(size) > end - value) { err_str = "overlapping e_value "; goto errout; } } entry = EXT4_XATTR_NEXT(entry); } if (bh) set_buffer_verified(bh); return 0; errout: if (bh) __ext4_error_inode(inode, function, line, 0, -err, "corrupted xattr block %llu: %s", (unsigned long long) bh->b_blocknr, err_str); else __ext4_error_inode(inode, function, line, 0, -err, "corrupted in-inode xattr: %s", err_str); return err; } static inline int __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, const char *function, unsigned int line) { return check_xattrs(inode, bh, BFIRST(bh), bh->b_data + bh->b_size, bh->b_data, function, line); } #define ext4_xattr_check_block(inode, bh) \ __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) static inline int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) { return check_xattrs(inode, NULL, IFIRST(header), end, IFIRST(header), function, line); } #define xattr_check_inode(inode, header, end) \ __xattr_check_inode((inode), (header), (end), __func__, __LINE__) static int xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, void *end, int name_index, const char *name, int sorted) { struct ext4_xattr_entry *entry, *next; size_t name_len; int cmp = 1; if (name == NULL) return -EINVAL; name_len = strlen(name); for (entry = *pentry; !IS_LAST_ENTRY(entry); entry = next) { next = EXT4_XATTR_NEXT(entry); if ((void *) next >= end) { EXT4_ERROR_INODE(inode, "corrupted xattr entries"); return -EFSCORRUPTED; } cmp = name_index - entry->e_name_index; if (!cmp) cmp = name_len - entry->e_name_len; if (!cmp) cmp = memcmp(name, entry->e_name, name_len); if (cmp <= 0 && (sorted || cmp == 0)) break; } *pentry = entry; return cmp ? -ENODATA : 0; } static u32 ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) { return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size); } static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) { return ((u64) inode_get_ctime_sec(ea_inode) << 32) | (u32) inode_peek_iversion_raw(ea_inode); } static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count) { inode_set_ctime(ea_inode, (u32)(ref_count >> 32), 0); inode_set_iversion_raw(ea_inode, ref_count & 0xffffffff); } static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode) { return (u32) inode_get_atime_sec(ea_inode); } static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash) { inode_set_atime(ea_inode, hash, 0); } /* * Read the EA value from an inode. */ static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size) { int blocksize = 1 << ea_inode->i_blkbits; int bh_count = (size + blocksize - 1) >> ea_inode->i_blkbits; int tail_size = (size % blocksize) ?: blocksize; struct buffer_head *bhs_inline[8]; struct buffer_head **bhs = bhs_inline; int i, ret; if (bh_count > ARRAY_SIZE(bhs_inline)) { bhs = kmalloc_array(bh_count, sizeof(*bhs), GFP_NOFS); if (!bhs) return -ENOMEM; } ret = ext4_bread_batch(ea_inode, 0 /* block */, bh_count, true /* wait */, bhs); if (ret) goto free_bhs; for (i = 0; i < bh_count; i++) { /* There shouldn't be any holes in ea_inode. */ if (!bhs[i]) { ret = -EFSCORRUPTED; goto put_bhs; } memcpy((char *)buf + blocksize * i, bhs[i]->b_data, i < bh_count - 1 ? blocksize : tail_size); } ret = 0; put_bhs: for (i = 0; i < bh_count; i++) brelse(bhs[i]); free_bhs: if (bhs != bhs_inline) kfree(bhs); return ret; } #define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode_get_mtime_sec(inode))) static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, u32 ea_inode_hash, struct inode **ea_inode) { struct inode *inode; int err; /* * We have to check for this corruption early as otherwise * iget_locked() could wait indefinitely for the state of our * parent inode. */ if (parent->i_ino == ea_ino) { ext4_error(parent->i_sb, "Parent and EA inode have the same ino %lu", ea_ino); return -EFSCORRUPTED; } inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_EA_INODE); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error(parent->i_sb, "error while reading EA inode %lu err=%d", ea_ino, err); return err; } ext4_xattr_inode_set_class(inode); /* * Check whether this is an old Lustre-style xattr inode. Lustre * implementation does not have hash validation, rather it has a * backpointer from ea_inode to the parent inode. */ if (ea_inode_hash != ext4_xattr_inode_get_hash(inode) && EXT4_XATTR_INODE_GET_PARENT(inode) == parent->i_ino && inode->i_generation == parent->i_generation) { ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE); ext4_xattr_inode_set_ref(inode, 1); } else { inode_lock_nested(inode, I_MUTEX_XATTR); inode->i_flags |= S_NOQUOTA; inode_unlock(inode); } *ea_inode = inode; return 0; } /* Remove entry from mbcache when EA inode is getting evicted */ void ext4_evict_ea_inode(struct inode *inode) { struct mb_cache_entry *oe; if (!EA_INODE_CACHE(inode)) return; /* Wait for entry to get unused so that we can remove it */ while ((oe = mb_cache_entry_delete_or_get(EA_INODE_CACHE(inode), ext4_xattr_inode_get_hash(inode), inode->i_ino))) { mb_cache_entry_wait_unused(oe); mb_cache_entry_put(EA_INODE_CACHE(inode), oe); } } static int ext4_xattr_inode_verify_hashes(struct inode *ea_inode, struct ext4_xattr_entry *entry, void *buffer, size_t size) { u32 hash; /* Verify stored hash matches calculated hash. */ hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size); if (hash != ext4_xattr_inode_get_hash(ea_inode)) return -EFSCORRUPTED; if (entry) { __le32 e_hash, tmp_data; /* Verify entry hash. */ tmp_data = cpu_to_le32(hash); e_hash = ext4_xattr_hash_entry(entry->e_name, entry->e_name_len, &tmp_data, 1); /* All good? */ if (e_hash == entry->e_hash) return 0; /* * Not good. Maybe the entry hash was calculated * using the buggy signed char version? */ e_hash = ext4_xattr_hash_entry_signed(entry->e_name, entry->e_name_len, &tmp_data, 1); /* Still no match - bad */ if (e_hash != entry->e_hash) return -EFSCORRUPTED; /* Let people know about old hash */ pr_warn_once("ext4: filesystem with signed xattr name hash"); } return 0; } /* * Read xattr value from the EA inode. */ static int ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry, void *buffer, size_t size) { struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); struct inode *ea_inode; int err; err = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), le32_to_cpu(entry->e_hash), &ea_inode); if (err) { ea_inode = NULL; goto out; } if (i_size_read(ea_inode) != size) { ext4_warning_inode(ea_inode, "ea_inode file size=%llu entry size=%zu", i_size_read(ea_inode), size); err = -EFSCORRUPTED; goto out; } err = ext4_xattr_inode_read(ea_inode, buffer, size); if (err) goto out; if (!ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) { err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, size); if (err) { ext4_warning_inode(ea_inode, "EA inode hash validation failed"); goto out; } if (ea_inode_cache) mb_cache_entry_create(ea_inode_cache, GFP_NOFS, ext4_xattr_inode_get_hash(ea_inode), ea_inode->i_ino, true /* reusable */); } out: iput(ea_inode); return err; } static int ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { struct buffer_head *bh = NULL; struct ext4_xattr_entry *entry; size_t size; void *end; int error; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); if (!EXT4_I(inode)->i_file_acl) return -ENODATA; ea_idebug(inode, "reading block %llu", (unsigned long long)EXT4_I(inode)->i_file_acl); bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; ext4_xattr_block_cache_insert(ea_block_cache, bh); entry = BFIRST(bh); end = bh->b_data + bh->b_size; error = xattr_find_entry(inode, &entry, end, name_index, name, 1); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); error = -ERANGE; if (unlikely(size > EXT4_XATTR_SIZE_MAX)) goto cleanup; if (buffer) { if (size > buffer_size) goto cleanup; if (entry->e_value_inum) { error = ext4_xattr_inode_get(inode, entry, buffer, size); if (error) goto cleanup; } else { u16 offset = le16_to_cpu(entry->e_value_offs); void *p = bh->b_data + offset; if (unlikely(p + size > end)) goto cleanup; memcpy(buffer, p, size); } } error = size; cleanup: brelse(bh); return error; } int ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; struct ext4_inode *raw_inode; struct ext4_iloc iloc; size_t size; void *end; int error; if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return -ENODATA; error = ext4_get_inode_loc(inode, &iloc); if (error) return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; error = xattr_check_inode(inode, header, end); if (error) goto cleanup; entry = IFIRST(header); error = xattr_find_entry(inode, &entry, end, name_index, name, 0); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); error = -ERANGE; if (unlikely(size > EXT4_XATTR_SIZE_MAX)) goto cleanup; if (buffer) { if (size > buffer_size) goto cleanup; if (entry->e_value_inum) { error = ext4_xattr_inode_get(inode, entry, buffer, size); if (error) goto cleanup; } else { u16 offset = le16_to_cpu(entry->e_value_offs); void *p = (void *)IFIRST(header) + offset; if (unlikely(p + size > end)) goto cleanup; memcpy(buffer, p, size); } } error = size; cleanup: brelse(iloc.bh); return error; } /* * ext4_xattr_get() * * Copy an extended attribute into the buffer * provided, or compute the buffer size required. * Buffer is NULL to compute the size of the buffer required. * * Returns a negative error number on failure, or the number of bytes * used / required on success. */ int ext4_xattr_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { int error; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (strlen(name) > 255) return -ERANGE; down_read(&EXT4_I(inode)->xattr_sem); error = ext4_xattr_ibody_get(inode, name_index, name, buffer, buffer_size); if (error == -ENODATA) error = ext4_xattr_block_get(inode, name_index, name, buffer, buffer_size); up_read(&EXT4_I(inode)->xattr_sem); return error; } static int ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, char *buffer, size_t buffer_size) { size_t rest = buffer_size; for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { const char *prefix; prefix = ext4_xattr_prefix(entry->e_name_index, dentry); if (prefix) { size_t prefix_len = strlen(prefix); size_t size = prefix_len + entry->e_name_len + 1; if (buffer) { if (size > rest) return -ERANGE; memcpy(buffer, prefix, prefix_len); buffer += prefix_len; memcpy(buffer, entry->e_name, entry->e_name_len); buffer += entry->e_name_len; *buffer++ = 0; } rest -= size; } } return buffer_size - rest; /* total size */ } static int ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); if (!EXT4_I(inode)->i_file_acl) return 0; ea_idebug(inode, "reading block %llu", (unsigned long long)EXT4_I(inode)->i_file_acl); bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh); error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: brelse(bh); return error; } static int ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = d_inode(dentry); struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; void *end; int error; if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return 0; error = ext4_get_inode_loc(inode, &iloc); if (error) return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; error = xattr_check_inode(inode, header, end); if (error) goto cleanup; error = ext4_xattr_list_entries(dentry, IFIRST(header), buffer, buffer_size); cleanup: brelse(iloc.bh); return error; } /* * Inode operation listxattr() * * d_inode(dentry)->i_rwsem: don't care * * Copy a list of attribute names into the buffer * provided, or compute the buffer size required. * Buffer is NULL to compute the size of the buffer required. * * Returns a negative error number on failure, or the number of bytes * used / required on success. */ ssize_t ext4_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { int ret, ret2; down_read(&EXT4_I(d_inode(dentry))->xattr_sem); ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); if (ret < 0) goto errout; if (buffer) { buffer += ret; buffer_size -= ret; } ret = ext4_xattr_block_list(dentry, buffer, buffer_size); if (ret < 0) goto errout; ret += ret2; errout: up_read(&EXT4_I(d_inode(dentry))->xattr_sem); return ret; } /* * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is * not set, set it. */ static void ext4_xattr_update_super_block(handle_t *handle, struct super_block *sb) { if (ext4_has_feature_xattr(sb)) return; BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); if (ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, EXT4_JTR_NONE) == 0) { lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_xattr(sb); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } } int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) { struct ext4_iloc iloc = { .bh = NULL }; struct buffer_head *bh = NULL; struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; qsize_t ea_inode_refs = 0; void *end; int ret; lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem); if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { ret = ext4_get_inode_loc(inode, &iloc); if (ret) goto out; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; ret = xattr_check_inode(inode, header, end); if (ret) goto out; for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) if (entry->e_value_inum) ea_inode_refs++; } if (EXT4_I(inode)->i_file_acl) { bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { ret = PTR_ERR(bh); bh = NULL; goto out; } ret = ext4_xattr_check_block(inode, bh); if (ret) goto out; for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) if (entry->e_value_inum) ea_inode_refs++; } *usage = ea_inode_refs + 1; ret = 0; out: brelse(iloc.bh); brelse(bh); return ret; } static inline size_t round_up_cluster(struct inode *inode, size_t length) { struct super_block *sb = inode->i_sb; size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits + inode->i_blkbits); size_t mask = ~(cluster_size - 1); return (length + cluster_size - 1) & mask; } static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len) { int err; err = dquot_alloc_inode(inode); if (err) return err; err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len)); if (err) dquot_free_inode(inode); return err; } static void ext4_xattr_inode_free_quota(struct inode *parent, struct inode *ea_inode, size_t len) { if (ea_inode && ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) return; dquot_free_space_nodirty(parent, round_up_cluster(parent, len)); dquot_free_inode(parent); } int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, struct buffer_head *block_bh, size_t value_len, bool is_create) { int credits; int blocks; /* * 1) Owner inode update * 2) Ref count update on old xattr block * 3) new xattr block * 4) block bitmap update for new xattr block * 5) group descriptor for new xattr block * 6) block bitmap update for old xattr block * 7) group descriptor for old block * * 6 & 7 can happen if we have two racing threads T_a and T_b * which are each trying to set an xattr on inodes I_a and I_b * which were both initially sharing an xattr block. */ credits = 7; /* Quota updates. */ credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb); /* * In case of inline data, we may push out the data to a block, * so we need to reserve credits for this eventuality */ if (inode && ext4_has_inline_data(inode)) credits += ext4_writepage_trans_blocks(inode) + 1; /* We are done if ea_inode feature is not enabled. */ if (!ext4_has_feature_ea_inode(sb)) return credits; /* New ea_inode, inode map, block bitmap, group descriptor. */ credits += 4; /* Data blocks. */ blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits; /* Indirection block or one level of extent tree. */ blocks += 1; /* Block bitmap and group descriptor updates for each block. */ credits += blocks * 2; /* Blocks themselves. */ credits += blocks; if (!is_create) { /* Dereference ea_inode holding old xattr value. * Old ea_inode, inode map, block bitmap, group descriptor. */ credits += 4; /* Data blocks for old ea_inode. */ blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits; /* Indirection block or one level of extent tree for old * ea_inode. */ blocks += 1; /* Block bitmap and group descriptor updates for each block. */ credits += blocks * 2; } /* We may need to clone the existing xattr block in which case we need * to increment ref counts for existing ea_inodes referenced by it. */ if (block_bh) { struct ext4_xattr_entry *entry = BFIRST(block_bh); for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) if (entry->e_value_inum) /* Ref count update on ea_inode. */ credits += 1; } return credits; } static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, int ref_change) { struct ext4_iloc iloc; s64 ref_count; int ret; inode_lock_nested(ea_inode, I_MUTEX_XATTR); ret = ext4_reserve_inode_write(handle, ea_inode, &iloc); if (ret) goto out; ref_count = ext4_xattr_inode_get_ref(ea_inode); ref_count += ref_change; ext4_xattr_inode_set_ref(ea_inode, ref_count); if (ref_change > 0) { WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld", ea_inode->i_ino, ref_count); if (ref_count == 1) { WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u", ea_inode->i_ino, ea_inode->i_nlink); set_nlink(ea_inode, 1); ext4_orphan_del(handle, ea_inode); } } else { WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld", ea_inode->i_ino, ref_count); if (ref_count == 0) { WARN_ONCE(ea_inode->i_nlink != 1, "EA inode %lu i_nlink=%u", ea_inode->i_ino, ea_inode->i_nlink); clear_nlink(ea_inode); ext4_orphan_add(handle, ea_inode); } } ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc); if (ret) ext4_warning_inode(ea_inode, "ext4_mark_iloc_dirty() failed ret=%d", ret); out: inode_unlock(ea_inode); return ret; } static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode) { return ext4_xattr_inode_update_ref(handle, ea_inode, 1); } static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode) { return ext4_xattr_inode_update_ref(handle, ea_inode, -1); } static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent, struct ext4_xattr_entry *first) { struct inode *ea_inode; struct ext4_xattr_entry *entry; struct ext4_xattr_entry *failed_entry; unsigned int ea_ino; int err, saved_err; for (entry = first; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); err = ext4_xattr_inode_iget(parent, ea_ino, le32_to_cpu(entry->e_hash), &ea_inode); if (err) goto cleanup; err = ext4_xattr_inode_inc_ref(handle, ea_inode); if (err) { ext4_warning_inode(ea_inode, "inc ref error %d", err); iput(ea_inode); goto cleanup; } iput(ea_inode); } return 0; cleanup: saved_err = err; failed_entry = entry; for (entry = first; entry != failed_entry; entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); err = ext4_xattr_inode_iget(parent, ea_ino, le32_to_cpu(entry->e_hash), &ea_inode); if (err) { ext4_warning(parent->i_sb, "cleanup ea_ino %u iget error %d", ea_ino, err); continue; } err = ext4_xattr_inode_dec_ref(handle, ea_inode); if (err) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); iput(ea_inode); } return saved_err; } static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh, bool block_csum, bool dirty) { int error; if (bh && dirty) { if (block_csum) ext4_xattr_block_csum_set(inode, bh); error = ext4_handle_dirty_metadata(handle, NULL, bh); if (error) { ext4_warning(inode->i_sb, "Handle metadata (error %d)", error); return error; } } return 0; } static void ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, struct buffer_head *bh, struct ext4_xattr_entry *first, bool block_csum, struct ext4_xattr_inode_array **ea_inode_array, int extra_credits, bool skip_quota) { struct inode *ea_inode; struct ext4_xattr_entry *entry; bool dirty = false; unsigned int ea_ino; int err; int credits; /* One credit for dec ref on ea_inode, one for orphan list addition, */ credits = 2 + extra_credits; for (entry = first; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); err = ext4_xattr_inode_iget(parent, ea_ino, le32_to_cpu(entry->e_hash), &ea_inode); if (err) continue; err = ext4_expand_inode_array(ea_inode_array, ea_inode); if (err) { ext4_warning_inode(ea_inode, "Expand inode array err=%d", err); iput(ea_inode); continue; } err = ext4_journal_ensure_credits_fn(handle, credits, credits, ext4_free_metadata_revoke_credits(parent->i_sb, 1), ext4_xattr_restart_fn(handle, parent, bh, block_csum, dirty)); if (err < 0) { ext4_warning_inode(ea_inode, "Ensure credits err=%d", err); continue; } if (err > 0) { err = ext4_journal_get_write_access(handle, parent->i_sb, bh, EXT4_JTR_NONE); if (err) { ext4_warning_inode(ea_inode, "Re-get write access err=%d", err); continue; } } err = ext4_xattr_inode_dec_ref(handle, ea_inode); if (err) { ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d", err); continue; } if (!skip_quota) ext4_xattr_inode_free_quota(parent, ea_inode, le32_to_cpu(entry->e_value_size)); /* * Forget about ea_inode within the same transaction that * decrements the ref count. This avoids duplicate decrements in * case the rest of the work spills over to subsequent * transactions. */ entry->e_value_inum = 0; entry->e_value_size = 0; dirty = true; } if (dirty) { /* * Note that we are deliberately skipping csum calculation for * the final update because we do not expect any journal * restarts until xattr block is freed. */ err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) ext4_warning_inode(parent, "handle dirty metadata err=%d", err); } } /* * Release the xattr block BH: If the reference count is > 1, decrement it; * otherwise free the block. */ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, struct buffer_head *bh, struct ext4_xattr_inode_array **ea_inode_array, int extra_credits) { struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); u32 hash, ref; int error = 0; BUFFER_TRACE(bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (error) goto out; retry_ref: lock_buffer(bh); hash = le32_to_cpu(BHDR(bh)->h_hash); ref = le32_to_cpu(BHDR(bh)->h_refcount); if (ref == 1) { ea_bdebug(bh, "refcount now=0; freeing"); /* * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect freed block */ if (ea_block_cache) { struct mb_cache_entry *oe; oe = mb_cache_entry_delete_or_get(ea_block_cache, hash, bh->b_blocknr); if (oe) { unlock_buffer(bh); mb_cache_entry_wait_unused(oe); mb_cache_entry_put(ea_block_cache, oe); goto retry_ref; } } get_bh(bh); unlock_buffer(bh); if (ext4_has_feature_ea_inode(inode->i_sb)) ext4_xattr_inode_dec_ref_all(handle, inode, bh, BFIRST(bh), true /* block_csum */, ea_inode_array, extra_credits, true /* skip_quota */); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } else { ref--; BHDR(bh)->h_refcount = cpu_to_le32(ref); if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) { struct mb_cache_entry *ce; if (ea_block_cache) { ce = mb_cache_entry_get(ea_block_cache, hash, bh->b_blocknr); if (ce) { set_bit(MBE_REUSABLE_B, &ce->e_flags); mb_cache_entry_put(ea_block_cache, ce); } } } ext4_xattr_block_csum_set(inode, bh); /* * Beware of this ugliness: Releasing of xattr block references * from different inodes can race and so we have to protect * from a race where someone else frees the block (and releases * its journal_head) before we are done dirtying the buffer. In * nojournal mode this race is harmless and we actually cannot * call ext4_handle_dirty_metadata() with locked buffer as * that function can call sync_dirty_buffer() so for that case * we handle the dirtying after unlocking the buffer. */ if (ext4_handle_valid(handle)) error = ext4_handle_dirty_metadata(handle, inode, bh); unlock_buffer(bh); if (!ext4_handle_valid(handle)) error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); } out: ext4_std_error(inode->i_sb, error); return; } /* * Find the available free space for EAs. This also returns the total number of * bytes used by EA entries. */ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; } if (total) *total += EXT4_XATTR_LEN(last->e_name_len); } return (*min_offs - ((void *)last - base) - sizeof(__u32)); } /* * Write the value of the EA in an inode. */ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode, const void *buf, int bufsize) { struct buffer_head *bh = NULL; unsigned long block = 0; int blocksize = ea_inode->i_sb->s_blocksize; int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; int csize, wsize = 0; int ret = 0, ret2 = 0; int retries = 0; retry: while (ret >= 0 && ret < max_blocks) { struct ext4_map_blocks map; map.m_lblk = block += ret; map.m_len = max_blocks -= ret; ret = ext4_map_blocks(handle, ea_inode, &map, EXT4_GET_BLOCKS_CREATE); if (ret <= 0) { ext4_mark_inode_dirty(handle, ea_inode); if (ret == -ENOSPC && ext4_should_retry_alloc(ea_inode->i_sb, &retries)) { ret = 0; goto retry; } break; } } if (ret < 0) return ret; block = 0; while (wsize < bufsize) { brelse(bh); csize = (bufsize - wsize) > blocksize ? blocksize : bufsize - wsize; bh = ext4_getblk(handle, ea_inode, block, 0); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) { WARN_ON_ONCE(1); EXT4_ERROR_INODE(ea_inode, "ext4_getblk() return bh = NULL"); return -EFSCORRUPTED; } ret = ext4_journal_get_write_access(handle, ea_inode->i_sb, bh, EXT4_JTR_NONE); if (ret) goto out; memcpy(bh->b_data, buf, csize); /* * Zero out block tail to avoid writing uninitialized memory * to disk. */ if (csize < blocksize) memset(bh->b_data + csize, 0, blocksize - csize); set_buffer_uptodate(bh); ext4_handle_dirty_metadata(handle, ea_inode, bh); buf += csize; wsize += csize; block += 1; } inode_lock(ea_inode); i_size_write(ea_inode, wsize); ext4_update_i_disksize(ea_inode, wsize); inode_unlock(ea_inode); ret2 = ext4_mark_inode_dirty(handle, ea_inode); if (unlikely(ret2 && !ret)) ret = ret2; out: brelse(bh); return ret; } /* * Create an inode to store the value of a large EA. */ static struct inode *ext4_xattr_inode_create(handle_t *handle, struct inode *inode, u32 hash) { struct inode *ea_inode = NULL; uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) }; int err; if (inode->i_sb->s_root == NULL) { ext4_warning(inode->i_sb, "refuse to create EA inode when umounting"); WARN_ON(1); return ERR_PTR(-EINVAL); } /* * Let the next inode be the goal, so we try and allocate the EA inode * in the same group, or nearby one. */ ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, S_IFREG | 0600, NULL, inode->i_ino + 1, owner, EXT4_EA_INODE_FL); if (!IS_ERR(ea_inode)) { ea_inode->i_op = &ext4_file_inode_operations; ea_inode->i_fop = &ext4_file_operations; ext4_set_aops(ea_inode); ext4_xattr_inode_set_class(ea_inode); unlock_new_inode(ea_inode); ext4_xattr_inode_set_ref(ea_inode, 1); ext4_xattr_inode_set_hash(ea_inode, hash); err = ext4_mark_inode_dirty(handle, ea_inode); if (!err) err = ext4_inode_attach_jinode(ea_inode); if (err) { if (ext4_xattr_inode_dec_ref(handle, ea_inode)) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); iput(ea_inode); return ERR_PTR(err); } /* * Xattr inodes are shared therefore quota charging is performed * at a higher level. */ dquot_free_inode(ea_inode); dquot_drop(ea_inode); inode_lock(ea_inode); ea_inode->i_flags |= S_NOQUOTA; inode_unlock(ea_inode); } return ea_inode; } static struct inode * ext4_xattr_inode_cache_find(struct inode *inode, const void *value, size_t value_len, u32 hash) { struct inode *ea_inode; struct mb_cache_entry *ce; struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); void *ea_data; if (!ea_inode_cache) return NULL; ce = mb_cache_entry_find_first(ea_inode_cache, hash); if (!ce) return NULL; WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) && !(current->flags & PF_MEMALLOC_NOFS)); ea_data = kvmalloc(value_len, GFP_KERNEL); if (!ea_data) { mb_cache_entry_put(ea_inode_cache, ce); return NULL; } while (ce) { ea_inode = ext4_iget(inode->i_sb, ce->e_value, EXT4_IGET_EA_INODE); if (IS_ERR(ea_inode)) goto next_entry; ext4_xattr_inode_set_class(ea_inode); if (i_size_read(ea_inode) == value_len && !ext4_xattr_inode_read(ea_inode, ea_data, value_len) && !ext4_xattr_inode_verify_hashes(ea_inode, NULL, ea_data, value_len) && !memcmp(value, ea_data, value_len)) { mb_cache_entry_touch(ea_inode_cache, ce); mb_cache_entry_put(ea_inode_cache, ce); kvfree(ea_data); return ea_inode; } iput(ea_inode); next_entry: ce = mb_cache_entry_find_next(ea_inode_cache, ce); } kvfree(ea_data); return NULL; } /* * Add value of the EA in an inode. */ static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, const void *value, size_t value_len) { struct inode *ea_inode; u32 hash; int err; /* Account inode & space to quota even if sharing... */ err = ext4_xattr_inode_alloc_quota(inode, value_len); if (err) return ERR_PTR(err); hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len); ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash); if (ea_inode) { err = ext4_xattr_inode_inc_ref(handle, ea_inode); if (err) goto out_err; return ea_inode; } /* Create an inode for the EA value */ ea_inode = ext4_xattr_inode_create(handle, inode, hash); if (IS_ERR(ea_inode)) { ext4_xattr_inode_free_quota(inode, NULL, value_len); return ea_inode; } err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); if (err) { if (ext4_xattr_inode_dec_ref(handle, ea_inode)) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); goto out_err; } if (EA_INODE_CACHE(inode)) mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash, ea_inode->i_ino, true /* reusable */); return ea_inode; out_err: iput(ea_inode); ext4_xattr_inode_free_quota(inode, NULL, value_len); return ERR_PTR(err); } /* * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode * feature is enabled. */ #define EXT4_XATTR_BLOCK_RESERVE(inode) min(i_blocksize(inode)/8, 1024U) static int ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s, handle_t *handle, struct inode *inode, struct inode *new_ea_inode, bool is_block) { struct ext4_xattr_entry *last, *next; struct ext4_xattr_entry *here = s->here; size_t min_offs = s->end - s->base, name_len = strlen(i->name); int in_inode = i->in_inode; struct inode *old_ea_inode = NULL; size_t old_size, new_size; int ret; /* Space used by old and new values. */ old_size = (!s->not_found && !here->e_value_inum) ? EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0; new_size = (i->value && !in_inode) ? EXT4_XATTR_SIZE(i->value_len) : 0; /* * Optimization for the simple case when old and new values have the * same padded sizes. Not applicable if external inodes are involved. */ if (new_size && new_size == old_size) { size_t offs = le16_to_cpu(here->e_value_offs); void *val = s->base + offs; here->e_value_size = cpu_to_le32(i->value_len); if (i->value == EXT4_ZERO_XATTR_VALUE) { memset(val, 0, new_size); } else { memcpy(val, i->value, i->value_len); /* Clear padding bytes. */ memset(val + i->value_len, 0, new_size - i->value_len); } goto update_hash; } /* Compute min_offs and last. */ last = s->first; for (; !IS_LAST_ENTRY(last); last = next) { next = EXT4_XATTR_NEXT(last); if ((void *)next >= s->end) { EXT4_ERROR_INODE(inode, "corrupted xattr entries"); ret = -EFSCORRUPTED; goto out; } if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < min_offs) min_offs = offs; } } /* Check whether we have enough space. */ if (i->value) { size_t free; free = min_offs - ((void *)last - s->base) - sizeof(__u32); if (!s->not_found) free += EXT4_XATTR_LEN(name_len) + old_size; if (free < EXT4_XATTR_LEN(name_len) + new_size) { ret = -ENOSPC; goto out; } /* * If storing the value in an external inode is an option, * reserve space for xattr entries/names in the external * attribute block so that a long value does not occupy the * whole space and prevent further entries being added. */ if (ext4_has_feature_ea_inode(inode->i_sb) && new_size && is_block && (min_offs + old_size - new_size) < EXT4_XATTR_BLOCK_RESERVE(inode)) { ret = -ENOSPC; goto out; } } /* * Getting access to old and new ea inodes is subject to failures. * Finish that work before doing any modifications to the xattr data. */ if (!s->not_found && here->e_value_inum) { ret = ext4_xattr_inode_iget(inode, le32_to_cpu(here->e_value_inum), le32_to_cpu(here->e_hash), &old_ea_inode); if (ret) { old_ea_inode = NULL; goto out; } /* We are ready to release ref count on the old_ea_inode. */ ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode); if (ret) goto out; ext4_xattr_inode_free_quota(inode, old_ea_inode, le32_to_cpu(here->e_value_size)); } /* No failures allowed past this point. */ if (!s->not_found && here->e_value_size && !here->e_value_inum) { /* Remove the old value. */ void *first_val = s->base + min_offs; size_t offs = le16_to_cpu(here->e_value_offs); void *val = s->base + offs; memmove(first_val + old_size, first_val, val - first_val); memset(first_val, 0, old_size); min_offs += old_size; /* Adjust all value offsets. */ last = s->first; while (!IS_LAST_ENTRY(last)) { size_t o = le16_to_cpu(last->e_value_offs); if (!last->e_value_inum && last->e_value_size && o < offs) last->e_value_offs = cpu_to_le16(o + old_size); last = EXT4_XATTR_NEXT(last); } } if (!i->value) { /* Remove old name. */ size_t size = EXT4_XATTR_LEN(name_len); last = ENTRY((void *)last - size); memmove(here, (void *)here + size, (void *)last - (void *)here + sizeof(__u32)); memset(last, 0, size); /* * Update i_inline_off - moved ibody region might contain * system.data attribute. Handling a failure here won't * cause other complications for setting an xattr. */ if (!is_block && ext4_has_inline_data(inode)) { ret = ext4_find_inline_data_nolock(inode); if (ret) { ext4_warning_inode(inode, "unable to update i_inline_off"); goto out; } } } else if (s->not_found) { /* Insert new name. */ size_t size = EXT4_XATTR_LEN(name_len); size_t rest = (void *)last - (void *)here + sizeof(__u32); memmove((void *)here + size, here, rest); memset(here, 0, size); here->e_name_index = i->name_index; here->e_name_len = name_len; memcpy(here->e_name, i->name, name_len); } else { /* This is an update, reset value info. */ here->e_value_inum = 0; here->e_value_offs = 0; here->e_value_size = 0; } if (i->value) { /* Insert new value. */ if (in_inode) { here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino); } else if (i->value_len) { void *val = s->base + min_offs - new_size; here->e_value_offs = cpu_to_le16(min_offs - new_size); if (i->value == EXT4_ZERO_XATTR_VALUE) { memset(val, 0, new_size); } else { memcpy(val, i->value, i->value_len); /* Clear padding bytes. */ memset(val + i->value_len, 0, new_size - i->value_len); } } here->e_value_size = cpu_to_le32(i->value_len); } update_hash: if (i->value) { __le32 hash = 0; /* Entry hash calculation. */ if (in_inode) { __le32 crc32c_hash; /* * Feed crc32c hash instead of the raw value for entry * hash calculation. This is to avoid walking * potentially long value buffer again. */ crc32c_hash = cpu_to_le32( ext4_xattr_inode_get_hash(new_ea_inode)); hash = ext4_xattr_hash_entry(here->e_name, here->e_name_len, &crc32c_hash, 1); } else if (is_block) { __le32 *value = s->base + le16_to_cpu( here->e_value_offs); hash = ext4_xattr_hash_entry(here->e_name, here->e_name_len, value, new_size >> 2); } here->e_hash = hash; } if (is_block) ext4_xattr_rehash((struct ext4_xattr_header *)s->base); ret = 0; out: iput(old_ea_inode); return ret; } struct ext4_xattr_block_find { struct ext4_xattr_search s; struct buffer_head *bh; }; static int ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_block_find *bs) { struct super_block *sb = inode->i_sb; int error; ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", i->name_index, i->name, i->value, (long)i->value_len); if (EXT4_I(inode)->i_file_acl) { /* The inode already has an extended attribute block. */ bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bs->bh)) { error = PTR_ERR(bs->bh); bs->bh = NULL; return error; } ea_bdebug(bs->bh, "b_count=%d, refcount=%d", atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); error = ext4_xattr_check_block(inode, bs->bh); if (error) return error; /* Find the named attribute. */ bs->s.base = BHDR(bs->bh); bs->s.first = BFIRST(bs->bh); bs->s.end = bs->bh->b_data + bs->bh->b_size; bs->s.here = bs->s.first; error = xattr_find_entry(inode, &bs->s.here, bs->s.end, i->name_index, i->name, 1); if (error && error != -ENODATA) return error; bs->s.not_found = error; } return 0; } static int ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_block_find *bs) { struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; struct ext4_xattr_search s_copy = bs->s; struct ext4_xattr_search *s = &s_copy; struct mb_cache_entry *ce = NULL; int error = 0; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); struct inode *ea_inode = NULL, *tmp_inode; size_t old_ea_inode_quota = 0; unsigned int ea_ino; #define header(x) ((struct ext4_xattr_header *)(x)) /* If we need EA inode, prepare it before locking the buffer */ if (i->value && i->in_inode) { WARN_ON_ONCE(!i->value_len); ea_inode = ext4_xattr_inode_lookup_create(handle, inode, i->value, i->value_len); if (IS_ERR(ea_inode)) { error = PTR_ERR(ea_inode); ea_inode = NULL; goto cleanup; } } if (s->base) { int offset = (char *)s->here - bs->bh->b_data; BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, sb, bs->bh, EXT4_JTR_NONE); if (error) goto cleanup; lock_buffer(bs->bh); if (header(s->base)->h_refcount == cpu_to_le32(1)) { __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash); /* * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect modified * block */ if (ea_block_cache) { struct mb_cache_entry *oe; oe = mb_cache_entry_delete_or_get(ea_block_cache, hash, bs->bh->b_blocknr); if (oe) { /* * Xattr block is getting reused. Leave * it alone. */ mb_cache_entry_put(ea_block_cache, oe); goto clone_block; } } ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, true /* is_block */); ext4_xattr_block_csum_set(inode, bs->bh); unlock_buffer(bs->bh); if (error == -EFSCORRUPTED) goto bad_block; if (!error) error = ext4_handle_dirty_metadata(handle, inode, bs->bh); if (error) goto cleanup; goto inserted; } clone_block: unlock_buffer(bs->bh); ea_bdebug(bs->bh, "cloning"); s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS); error = -ENOMEM; if (s->base == NULL) goto cleanup; s->first = ENTRY(header(s->base)+1); header(s->base)->h_refcount = cpu_to_le32(1); s->here = ENTRY(s->base + offset); s->end = s->base + bs->bh->b_size; /* * If existing entry points to an xattr inode, we need * to prevent ext4_xattr_set_entry() from decrementing * ref count on it because the reference belongs to the * original block. In this case, make the entry look * like it has an empty value. */ if (!s->not_found && s->here->e_value_inum) { ea_ino = le32_to_cpu(s->here->e_value_inum); error = ext4_xattr_inode_iget(inode, ea_ino, le32_to_cpu(s->here->e_hash), &tmp_inode); if (error) goto cleanup; if (!ext4_test_inode_state(tmp_inode, EXT4_STATE_LUSTRE_EA_INODE)) { /* * Defer quota free call for previous * inode until success is guaranteed. */ old_ea_inode_quota = le32_to_cpu( s->here->e_value_size); } iput(tmp_inode); s->here->e_value_inum = 0; s->here->e_value_size = 0; } } else { /* Allocate a buffer where we construct the new block. */ s->base = kzalloc(sb->s_blocksize, GFP_NOFS); error = -ENOMEM; if (s->base == NULL) goto cleanup; header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); header(s->base)->h_blocks = cpu_to_le32(1); header(s->base)->h_refcount = cpu_to_le32(1); s->first = ENTRY(header(s->base)+1); s->here = ENTRY(header(s->base)+1); s->end = s->base + sb->s_blocksize; } error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, true /* is_block */); if (error == -EFSCORRUPTED) goto bad_block; if (error) goto cleanup; inserted: if (!IS_LAST_ENTRY(s->first)) { new_bh = ext4_xattr_block_cache_find(inode, header(s->base), &ce); if (IS_ERR(new_bh)) { error = PTR_ERR(new_bh); new_bh = NULL; goto cleanup; } if (new_bh) { /* We found an identical block in the cache. */ if (new_bh == bs->bh) ea_bdebug(new_bh, "keeping"); else { u32 ref; #ifdef EXT4_XATTR_DEBUG WARN_ON_ONCE(dquot_initialize_needed(inode)); #endif /* The old block is released after updating the inode. */ error = dquot_alloc_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); if (error) goto cleanup; BUFFER_TRACE(new_bh, "get_write_access"); error = ext4_journal_get_write_access( handle, sb, new_bh, EXT4_JTR_NONE); if (error) goto cleanup_dquot; lock_buffer(new_bh); /* * We have to be careful about races with * adding references to xattr block. Once we * hold buffer lock xattr block's state is * stable so we can check the additional * reference fits. */ ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1; if (ref > EXT4_XATTR_REFCOUNT_MAX) { /* * Undo everything and check mbcache * again. */ unlock_buffer(new_bh); dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); brelse(new_bh); mb_cache_entry_put(ea_block_cache, ce); ce = NULL; new_bh = NULL; goto inserted; } BHDR(new_bh)->h_refcount = cpu_to_le32(ref); if (ref == EXT4_XATTR_REFCOUNT_MAX) clear_bit(MBE_REUSABLE_B, &ce->e_flags); ea_bdebug(new_bh, "reusing; refcount now=%d", ref); ext4_xattr_block_csum_set(inode, new_bh); unlock_buffer(new_bh); error = ext4_handle_dirty_metadata(handle, inode, new_bh); if (error) goto cleanup_dquot; } mb_cache_entry_touch(ea_block_cache, ce); mb_cache_entry_put(ea_block_cache, ce); ce = NULL; } else if (bs->bh && s->base == bs->bh->b_data) { /* We were modifying this block in-place. */ ea_bdebug(bs->bh, "keeping this block"); ext4_xattr_block_cache_insert(ea_block_cache, bs->bh); new_bh = bs->bh; get_bh(new_bh); } else { /* We need to allocate a new block */ ext4_fsblk_t goal, block; #ifdef EXT4_XATTR_DEBUG WARN_ON_ONCE(dquot_initialize_needed(inode)); #endif goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); if (error) goto cleanup; ea_idebug(inode, "creating block %llu", (unsigned long long)block); new_bh = sb_getblk(sb, block); if (unlikely(!new_bh)) { error = -ENOMEM; getblk_failed: ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA); goto cleanup; } error = ext4_xattr_inode_inc_ref_all(handle, inode, ENTRY(header(s->base)+1)); if (error) goto getblk_failed; if (ea_inode) { /* Drop the extra ref on ea_inode. */ error = ext4_xattr_inode_dec_ref(handle, ea_inode); if (error) ext4_warning_inode(ea_inode, "dec ref error=%d", error); iput(ea_inode); ea_inode = NULL; } lock_buffer(new_bh); error = ext4_journal_get_create_access(handle, sb, new_bh, EXT4_JTR_NONE); if (error) { unlock_buffer(new_bh); error = -EIO; goto getblk_failed; } memcpy(new_bh->b_data, s->base, new_bh->b_size); ext4_xattr_block_csum_set(inode, new_bh); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); ext4_xattr_block_cache_insert(ea_block_cache, new_bh); error = ext4_handle_dirty_metadata(handle, inode, new_bh); if (error) goto cleanup; } } if (old_ea_inode_quota) ext4_xattr_inode_free_quota(inode, NULL, old_ea_inode_quota); /* Update the inode. */ EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; /* Drop the previous xattr block. */ if (bs->bh && bs->bh != new_bh) { struct ext4_xattr_inode_array *ea_inode_array = NULL; ext4_xattr_release_block(handle, inode, bs->bh, &ea_inode_array, 0 /* extra_credits */); ext4_xattr_inode_array_free(ea_inode_array); } error = 0; cleanup: if (ea_inode) { if (error) { int error2; error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); if (error2) ext4_warning_inode(ea_inode, "dec ref error=%d", error2); ext4_xattr_inode_free_quota(inode, ea_inode, i_size_read(ea_inode)); } iput(ea_inode); } if (ce) mb_cache_entry_put(ea_block_cache, ce); brelse(new_bh); if (!(bs->bh && s->base == bs->bh->b_data)) kfree(s->base); return error; cleanup_dquot: dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); goto cleanup; bad_block: EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); goto cleanup; #undef header } int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; int error; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return 0; raw_inode = ext4_raw_inode(&is->iloc); header = IHDR(inode, raw_inode); is->s.base = is->s.first = IFIRST(header); is->s.here = is->s.first; is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { error = xattr_check_inode(inode, header, is->s.end); if (error) return error; /* Find the named attribute. */ error = xattr_find_entry(inode, &is->s.here, is->s.end, i->name_index, i->name, 0); if (error && error != -ENODATA) return error; is->s.not_found = error; } return 0; } int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_search *s = &is->s; struct inode *ea_inode = NULL; int error; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return -ENOSPC; /* If we need EA inode, prepare it before locking the buffer */ if (i->value && i->in_inode) { WARN_ON_ONCE(!i->value_len); ea_inode = ext4_xattr_inode_lookup_create(handle, inode, i->value, i->value_len); if (IS_ERR(ea_inode)) return PTR_ERR(ea_inode); } error = ext4_xattr_set_entry(i, s, handle, inode, ea_inode, false /* is_block */); if (error) { if (ea_inode) { int error2; error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); if (error2) ext4_warning_inode(ea_inode, "dec ref error=%d", error2); ext4_xattr_inode_free_quota(inode, ea_inode, i_size_read(ea_inode)); iput(ea_inode); } return error; } header = IHDR(inode, ext4_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); ext4_set_inode_state(inode, EXT4_STATE_XATTR); } else { header->h_magic = cpu_to_le32(0); ext4_clear_inode_state(inode, EXT4_STATE_XATTR); } iput(ea_inode); return 0; } static int ext4_xattr_value_same(struct ext4_xattr_search *s, struct ext4_xattr_info *i) { void *value; /* When e_value_inum is set the value is stored externally. */ if (s->here->e_value_inum) return 0; if (le32_to_cpu(s->here->e_value_size) != i->value_len) return 0; value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs); return !memcmp(value, i->value, i->value_len); } static struct buffer_head *ext4_xattr_get_block(struct inode *inode) { struct buffer_head *bh; int error; if (!EXT4_I(inode)->i_file_acl) return NULL; bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) return bh; error = ext4_xattr_check_block(inode, bh); if (error) { brelse(bh); return ERR_PTR(error); } return bh; } /* * ext4_xattr_set_handle() * * Create, replace or remove an extended attribute for this inode. Value * is NULL to remove an existing extended attribute, and non-NULL to * either replace an existing extended attribute, or create a new extended * attribute. The flags XATTR_REPLACE and XATTR_CREATE * specify that an extended attribute must exist and must not exist * previous to the call, respectively. * * Returns 0, or a negative error number on failure. */ int ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, const char *name, const void *value, size_t value_len, int flags) { struct ext4_xattr_info i = { .name_index = name_index, .name = name, .value = value, .value_len = value_len, .in_inode = 0, }; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_block_find bs = { .s = { .not_found = -ENODATA, }, }; int no_expand; int error; if (!name) return -EINVAL; if (strlen(name) > 255) return -ERANGE; ext4_write_lock_xattr(inode, &no_expand); /* Check journal credits under write lock. */ if (ext4_handle_valid(handle)) { struct buffer_head *bh; int credits; bh = ext4_xattr_get_block(inode); if (IS_ERR(bh)) { error = PTR_ERR(bh); goto cleanup; } credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, value_len, flags & XATTR_CREATE); brelse(bh); if (jbd2_handle_buffer_credits(handle) < credits) { error = -ENOSPC; goto cleanup; } WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); } error = ext4_reserve_inode_write(handle, inode, &is.iloc); if (error) goto cleanup; if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) { struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); ext4_clear_inode_state(inode, EXT4_STATE_NEW); } error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto cleanup; if (is.s.not_found) error = ext4_xattr_block_find(inode, &i, &bs); if (error) goto cleanup; if (is.s.not_found && bs.s.not_found) { error = -ENODATA; if (flags & XATTR_REPLACE) goto cleanup; error = 0; if (!value) goto cleanup; } else { error = -EEXIST; if (flags & XATTR_CREATE) goto cleanup; } if (!value) { if (!is.s.not_found) error = ext4_xattr_ibody_set(handle, inode, &i, &is); else if (!bs.s.not_found) error = ext4_xattr_block_set(handle, inode, &i, &bs); } else { error = 0; /* Xattr value did not change? Save us some work and bail out */ if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i)) goto cleanup; if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) goto cleanup; if (ext4_has_feature_ea_inode(inode->i_sb) && (EXT4_XATTR_SIZE(i.value_len) > EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize))) i.in_inode = 1; retry_inode: error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (!error && !bs.s.not_found) { i.value = NULL; error = ext4_xattr_block_set(handle, inode, &i, &bs); } else if (error == -ENOSPC) { if (EXT4_I(inode)->i_file_acl && !bs.s.base) { brelse(bs.bh); bs.bh = NULL; error = ext4_xattr_block_find(inode, &i, &bs); if (error) goto cleanup; } error = ext4_xattr_block_set(handle, inode, &i, &bs); if (!error && !is.s.not_found) { i.value = NULL; error = ext4_xattr_ibody_set(handle, inode, &i, &is); } else if (error == -ENOSPC) { /* * Xattr does not fit in the block, store at * external inode if possible. */ if (ext4_has_feature_ea_inode(inode->i_sb) && i.value_len && !i.in_inode) { i.in_inode = 1; goto retry_inode; } } } } if (!error) { ext4_xattr_update_super_block(handle, inode->i_sb); inode_set_ctime_current(inode); inode_inc_iversion(inode); if (!value) no_expand = 0; error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); /* * The bh is consumed by ext4_mark_iloc_dirty, even with * error != 0. */ is.iloc.bh = NULL; if (IS_SYNC(inode)) ext4_handle_sync(handle); } ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); cleanup: brelse(is.iloc.bh); brelse(bs.bh); ext4_write_unlock_xattr(inode, &no_expand); return error; } int ext4_xattr_set_credits(struct inode *inode, size_t value_len, bool is_create, int *credits) { struct buffer_head *bh; int err; *credits = 0; if (!EXT4_SB(inode->i_sb)->s_journal) return 0; down_read(&EXT4_I(inode)->xattr_sem); bh = ext4_xattr_get_block(inode); if (IS_ERR(bh)) { err = PTR_ERR(bh); } else { *credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, value_len, is_create); brelse(bh); err = 0; } up_read(&EXT4_I(inode)->xattr_sem); return err; } /* * ext4_xattr_set() * * Like ext4_xattr_set_handle, but start from an inode. This extended * attribute modification is a filesystem transaction by itself. * * Returns 0, or a negative error number on failure. */ int ext4_xattr_set(struct inode *inode, int name_index, const char *name, const void *value, size_t value_len, int flags) { handle_t *handle; struct super_block *sb = inode->i_sb; int error, retries = 0; int credits; error = dquot_initialize(inode); if (error) return error; retry: error = ext4_xattr_set_credits(inode, value_len, flags & XATTR_CREATE, &credits); if (error) return error; handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) { error = PTR_ERR(handle); } else { int error2; error = ext4_xattr_set_handle(handle, inode, name_index, name, value, value_len, flags); ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); error2 = ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(sb, &retries)) goto retry; if (error == 0) error = error2; } return error; } /* * Shift the EA entries in the inode to create space for the increased * i_extra_isize. */ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, int value_offs_shift, void *to, void *from, size_t n) { struct ext4_xattr_entry *last = entry; int new_offs; /* We always shift xattr headers further thus offsets get lower */ BUG_ON(value_offs_shift > 0); /* Adjust the value offsets of the entries */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { if (!last->e_value_inum && last->e_value_size) { new_offs = le16_to_cpu(last->e_value_offs) + value_offs_shift; last->e_value_offs = cpu_to_le16(new_offs); } } /* Shift the entries by n bytes */ memmove(to, from, n); } /* * Move xattr pointed to by 'entry' from inode into external xattr block */ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, struct ext4_inode *raw_inode, struct ext4_xattr_entry *entry) { struct ext4_xattr_ibody_find *is = NULL; struct ext4_xattr_block_find *bs = NULL; char *buffer = NULL, *b_entry_name = NULL; size_t value_size = le32_to_cpu(entry->e_value_size); struct ext4_xattr_info i = { .value = NULL, .value_len = 0, .name_index = entry->e_name_index, .in_inode = !!entry->e_value_inum, }; struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); int needs_kvfree = 0; int error; is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); if (!is || !bs || !b_entry_name) { error = -ENOMEM; goto out; } is->s.not_found = -ENODATA; bs->s.not_found = -ENODATA; is->iloc.bh = NULL; bs->bh = NULL; /* Save the entry name and the entry value */ if (entry->e_value_inum) { buffer = kvmalloc(value_size, GFP_NOFS); if (!buffer) { error = -ENOMEM; goto out; } needs_kvfree = 1; error = ext4_xattr_inode_get(inode, entry, buffer, value_size); if (error) goto out; } else { size_t value_offs = le16_to_cpu(entry->e_value_offs); buffer = (void *)IFIRST(header) + value_offs; } memcpy(b_entry_name, entry->e_name, entry->e_name_len); b_entry_name[entry->e_name_len] = '\0'; i.name = b_entry_name; error = ext4_get_inode_loc(inode, &is->iloc); if (error) goto out; error = ext4_xattr_ibody_find(inode, &i, is); if (error) goto out; i.value = buffer; i.value_len = value_size; error = ext4_xattr_block_find(inode, &i, bs); if (error) goto out; /* Move ea entry from the inode into the block */ error = ext4_xattr_block_set(handle, inode, &i, bs); if (error) goto out; /* Remove the chosen entry from the inode */ i.value = NULL; i.value_len = 0; error = ext4_xattr_ibody_set(handle, inode, &i, is); out: kfree(b_entry_name); if (needs_kvfree && buffer) kvfree(buffer); if (is) brelse(is->iloc.bh); if (bs) brelse(bs->bh); kfree(is); kfree(bs); return error; } static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, struct ext4_inode *raw_inode, int isize_diff, size_t ifree, size_t bfree, int *total_ino) { struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); struct ext4_xattr_entry *small_entry; struct ext4_xattr_entry *entry; struct ext4_xattr_entry *last; unsigned int entry_size; /* EA entry size */ unsigned int total_size; /* EA entry size + value size */ unsigned int min_total_size; int error; while (isize_diff > ifree) { entry = NULL; small_entry = NULL; min_total_size = ~0U; last = IFIRST(header); /* Find the entry best suited to be pushed into EA block */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { /* never move system.data out of the inode */ if ((last->e_name_len == 4) && (last->e_name_index == EXT4_XATTR_INDEX_SYSTEM) && !memcmp(last->e_name, "data", 4)) continue; total_size = EXT4_XATTR_LEN(last->e_name_len); if (!last->e_value_inum) total_size += EXT4_XATTR_SIZE( le32_to_cpu(last->e_value_size)); if (total_size <= bfree && total_size < min_total_size) { if (total_size + ifree < isize_diff) { small_entry = last; } else { entry = last; min_total_size = total_size; } } } if (entry == NULL) { if (small_entry == NULL) return -ENOSPC; entry = small_entry; } entry_size = EXT4_XATTR_LEN(entry->e_name_len); total_size = entry_size; if (!entry->e_value_inum) total_size += EXT4_XATTR_SIZE( le32_to_cpu(entry->e_value_size)); error = ext4_xattr_move_to_block(handle, inode, raw_inode, entry); if (error) return error; *total_ino -= entry_size; ifree += total_size; bfree -= total_size; } return 0; } /* * Expand an inode by new_extra_isize bytes when EAs are present. * Returns 0 on success or negative error number on failure. */ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle) { struct ext4_xattr_ibody_header *header; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); static unsigned int mnt_count; size_t min_offs; size_t ifree, bfree; int total_ino; void *base, *end; int error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(sbi->s_es->s_min_extra_isize); int isize_diff; /* How much do we need to grow i_extra_isize */ retry: isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) return 0; header = IHDR(inode, raw_inode); /* * Check if enough free space is available in the inode to shift the * entries ahead by new_extra_isize. */ base = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; min_offs = end - base; total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32); error = xattr_check_inode(inode, header, end); if (error) goto cleanup; ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino); if (ifree >= isize_diff) goto shift; /* * Enough free space isn't available in the inode, check if * EA block can hold new_extra_isize bytes. */ if (EXT4_I(inode)->i_file_acl) { struct buffer_head *bh; bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { error = PTR_ERR(bh); goto cleanup; } error = ext4_xattr_check_block(inode, bh); if (error) { brelse(bh); goto cleanup; } base = BHDR(bh); end = bh->b_data + bh->b_size; min_offs = end - base; bfree = ext4_xattr_free_space(BFIRST(bh), &min_offs, base, NULL); brelse(bh); if (bfree + ifree < isize_diff) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; goto retry; } error = -ENOSPC; goto cleanup; } } else { bfree = inode->i_sb->s_blocksize; } error = ext4_xattr_make_inode_space(handle, inode, raw_inode, isize_diff, ifree, bfree, &total_ino); if (error) { if (error == -ENOSPC && !tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; goto retry; } goto cleanup; } shift: /* Adjust the offsets and shift the remaining entries ahead */ ext4_xattr_shift_entries(IFIRST(header), EXT4_I(inode)->i_extra_isize - new_extra_isize, (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, (void *)header, total_ino); EXT4_I(inode)->i_extra_isize = new_extra_isize; if (ext4_has_inline_data(inode)) error = ext4_find_inline_data_nolock(inode); cleanup: if (error && (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count))) { ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete some EAs or run e2fsck.", inode->i_ino); mnt_count = le16_to_cpu(sbi->s_es->s_mnt_count); } return error; } #define EIA_INCR 16 /* must be 2^n */ #define EIA_MASK (EIA_INCR - 1) /* Add the large xattr @inode into @ea_inode_array for deferred iput(). * If @ea_inode_array is new or full it will be grown and the old * contents copied over. */ static int ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, struct inode *inode) { if (*ea_inode_array == NULL) { /* * Start with 15 inodes, so it fits into a power-of-two size. */ (*ea_inode_array) = kmalloc( struct_size(*ea_inode_array, inodes, EIA_MASK), GFP_NOFS); if (*ea_inode_array == NULL) return -ENOMEM; (*ea_inode_array)->count = 0; } else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) { /* expand the array once all 15 + n * 16 slots are full */ struct ext4_xattr_inode_array *new_array = NULL; new_array = kmalloc( struct_size(*ea_inode_array, inodes, (*ea_inode_array)->count + EIA_INCR), GFP_NOFS); if (new_array == NULL) return -ENOMEM; memcpy(new_array, *ea_inode_array, struct_size(*ea_inode_array, inodes, (*ea_inode_array)->count)); kfree(*ea_inode_array); *ea_inode_array = new_array; } (*ea_inode_array)->count++; (*ea_inode_array)->inodes[(*ea_inode_array)->count - 1] = inode; return 0; } /* * ext4_xattr_delete_inode() * * Free extended attribute resources associated with this inode. Traverse * all entries and decrement reference on any xattr inodes associated with this * inode. This is called immediately before an inode is freed. We have exclusive * access to the inode. If an orphan inode is deleted it will also release its * references on xattr block and xattr inodes. */ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct ext4_xattr_inode_array **ea_inode_array, int extra_credits) { struct buffer_head *bh = NULL; struct ext4_xattr_ibody_header *header; struct ext4_iloc iloc = { .bh = NULL }; struct ext4_xattr_entry *entry; struct inode *ea_inode; int error; error = ext4_journal_ensure_credits(handle, extra_credits, ext4_free_metadata_revoke_credits(inode->i_sb, 1)); if (error < 0) { EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error); goto cleanup; } if (ext4_has_feature_ea_inode(inode->i_sb) && ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { error = ext4_get_inode_loc(inode, &iloc); if (error) { EXT4_ERROR_INODE(inode, "inode loc (error %d)", error); goto cleanup; } error = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE); if (error) { EXT4_ERROR_INODE(inode, "write access (error %d)", error); goto cleanup; } header = IHDR(inode, ext4_raw_inode(&iloc)); if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC)) ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh, IFIRST(header), false /* block_csum */, ea_inode_array, extra_credits, false /* skip_quota */); } if (EXT4_I(inode)->i_file_acl) { bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { error = PTR_ERR(bh); if (error == -EIO) { EXT4_ERROR_INODE_ERR(inode, EIO, "block %llu read error", EXT4_I(inode)->i_file_acl); } bh = NULL; goto cleanup; } error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; if (ext4_has_feature_ea_inode(inode->i_sb)) { for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; error = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), le32_to_cpu(entry->e_hash), &ea_inode); if (error) continue; ext4_xattr_inode_free_quota(inode, ea_inode, le32_to_cpu(entry->e_value_size)); iput(ea_inode); } } ext4_xattr_release_block(handle, inode, bh, ea_inode_array, extra_credits); /* * Update i_file_acl value in the same transaction that releases * block. */ EXT4_I(inode)->i_file_acl = 0; error = ext4_mark_inode_dirty(handle, inode); if (error) { EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)", error); goto cleanup; } ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); } error = 0; cleanup: brelse(iloc.bh); brelse(bh); return error; } void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array) { int idx; if (ea_inode_array == NULL) return; for (idx = 0; idx < ea_inode_array->count; ++idx) iput(ea_inode_array->inodes[idx]); kfree(ea_inode_array); } /* * ext4_xattr_block_cache_insert() * * Create a new entry in the extended attribute block cache, and insert * it unless such an entry is already in the cache. */ static void ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache, struct buffer_head *bh) { struct ext4_xattr_header *header = BHDR(bh); __u32 hash = le32_to_cpu(header->h_hash); int reusable = le32_to_cpu(header->h_refcount) < EXT4_XATTR_REFCOUNT_MAX; int error; if (!ea_block_cache) return; error = mb_cache_entry_create(ea_block_cache, GFP_NOFS, hash, bh->b_blocknr, reusable); if (error) { if (error == -EBUSY) ea_bdebug(bh, "already in cache"); } else ea_bdebug(bh, "inserting [%x]", (int)hash); } /* * ext4_xattr_cmp() * * Compare two extended attribute blocks for equality. * * Returns 0 if the blocks are equal, 1 if they differ. */ static int ext4_xattr_cmp(struct ext4_xattr_header *header1, struct ext4_xattr_header *header2) { struct ext4_xattr_entry *entry1, *entry2; entry1 = ENTRY(header1+1); entry2 = ENTRY(header2+1); while (!IS_LAST_ENTRY(entry1)) { if (IS_LAST_ENTRY(entry2)) return 1; if (entry1->e_hash != entry2->e_hash || entry1->e_name_index != entry2->e_name_index || entry1->e_name_len != entry2->e_name_len || entry1->e_value_size != entry2->e_value_size || entry1->e_value_inum != entry2->e_value_inum || memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) return 1; if (!entry1->e_value_inum && memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), (char *)header2 + le16_to_cpu(entry2->e_value_offs), le32_to_cpu(entry1->e_value_size))) return 1; entry1 = EXT4_XATTR_NEXT(entry1); entry2 = EXT4_XATTR_NEXT(entry2); } if (!IS_LAST_ENTRY(entry2)) return 1; return 0; } /* * ext4_xattr_block_cache_find() * * Find an identical extended attribute block. * * Returns a pointer to the block found, or NULL if such a block was not * found, or an error pointer if an error occurred while reading ea block. */ static struct buffer_head * ext4_xattr_block_cache_find(struct inode *inode, struct ext4_xattr_header *header, struct mb_cache_entry **pce) { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); if (!ea_block_cache) return NULL; if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); ce = mb_cache_entry_find_first(ea_block_cache, hash); while (ce) { struct buffer_head *bh; bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO); if (IS_ERR(bh)) { if (PTR_ERR(bh) != -ENOMEM) EXT4_ERROR_INODE(inode, "block %lu read error", (unsigned long)ce->e_value); mb_cache_entry_put(ea_block_cache, ce); return bh; } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { *pce = ce; return bh; } brelse(bh); ce = mb_cache_entry_find_next(ea_block_cache, ce); } return NULL; } #define NAME_HASH_SHIFT 5 #define VALUE_HASH_SHIFT 16 /* * ext4_xattr_hash_entry() * * Compute the hash of an extended attribute. */ static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, size_t value_count) { __u32 hash = 0; while (name_len--) { hash = (hash << NAME_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ (unsigned char)*name++; } while (value_count--) { hash = (hash << VALUE_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ le32_to_cpu(*value++); } return cpu_to_le32(hash); } /* * ext4_xattr_hash_entry_signed() * * Compute the hash of an extended attribute incorrectly. */ static __le32 ext4_xattr_hash_entry_signed(char *name, size_t name_len, __le32 *value, size_t value_count) { __u32 hash = 0; while (name_len--) { hash = (hash << NAME_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ (signed char)*name++; } while (value_count--) { hash = (hash << VALUE_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ le32_to_cpu(*value++); } return cpu_to_le32(hash); } #undef NAME_HASH_SHIFT #undef VALUE_HASH_SHIFT #define BLOCK_HASH_SHIFT 16 /* * ext4_xattr_rehash() * * Re-compute the extended attribute hash value after an entry has changed. */ static void ext4_xattr_rehash(struct ext4_xattr_header *header) { struct ext4_xattr_entry *here; __u32 hash = 0; here = ENTRY(header+1); while (!IS_LAST_ENTRY(here)) { if (!here->e_hash) { /* Block is not shared if an entry's hash value == 0 */ hash = 0; break; } hash = (hash << BLOCK_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ le32_to_cpu(here->e_hash); here = EXT4_XATTR_NEXT(here); } header->h_hash = cpu_to_le32(hash); } #undef BLOCK_HASH_SHIFT #define HASH_BUCKET_BITS 10 struct mb_cache * ext4_xattr_create_cache(void) { return mb_cache_create(HASH_BUCKET_BITS); } void ext4_xattr_destroy_cache(struct mb_cache *cache) { if (cache) mb_cache_destroy(cache); }
10 3 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (c) 2016, Amir Vadai <amir@vadai.me> * Copyright (c) 2016, Mellanox Technologies. All rights reserved. */ #ifndef __NET_TC_TUNNEL_KEY_H #define __NET_TC_TUNNEL_KEY_H #include <net/act_api.h> #include <linux/tc_act/tc_tunnel_key.h> #include <net/dst_metadata.h> struct tcf_tunnel_key_params { struct rcu_head rcu; int tcft_action; struct metadata_dst *tcft_enc_metadata; }; struct tcf_tunnel_key { struct tc_action common; struct tcf_tunnel_key_params __rcu *params; }; #define to_tunnel_key(a) ((struct tcf_tunnel_key *)a) static inline bool is_tcf_tunnel_set(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; params = rcu_dereference_protected(t->params, lockdep_is_held(&a->tcfa_lock)); if (a->ops && a->ops->id == TCA_ID_TUNNEL_KEY) return params->tcft_action == TCA_TUNNEL_KEY_ACT_SET; #endif return false; } static inline bool is_tcf_tunnel_release(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; params = rcu_dereference_protected(t->params, lockdep_is_held(&a->tcfa_lock)); if (a->ops && a->ops->id == TCA_ID_TUNNEL_KEY) return params->tcft_action == TCA_TUNNEL_KEY_ACT_RELEASE; #endif return false; } static inline struct ip_tunnel_info *tcf_tunnel_info(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; params = rcu_dereference_protected(t->params, lockdep_is_held(&a->tcfa_lock)); return &params->tcft_enc_metadata->u.tun_info; #else return NULL; #endif } static inline struct ip_tunnel_info * tcf_tunnel_info_copy(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT struct ip_tunnel_info *tun = tcf_tunnel_info(a); if (tun) { size_t tun_size = sizeof(*tun) + tun->options_len; struct ip_tunnel_info *tun_copy = kmemdup(tun, tun_size, GFP_ATOMIC); return tun_copy; } #endif return NULL; } #endif /* __NET_TC_TUNNEL_KEY_H */
1 691 565 358 359 29 695 683 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* include/asm-generic/tlb.h * * Generic TLB shootdown code * * Copyright 2001 Red Hat, Inc. * Based on code from mm/memory.c Copyright Linus Torvalds and others. * * Copyright 2011 Red Hat, Inc., Peter Zijlstra */ #ifndef _ASM_GENERIC__TLB_H #define _ASM_GENERIC__TLB_H #include <linux/mmu_notifier.h> #include <linux/swap.h> #include <linux/hugetlb_inline.h> #include <asm/tlbflush.h> #include <asm/cacheflush.h> /* * Blindly accessing user memory from NMI context can be dangerous * if we're in the middle of switching the current user task or switching * the loaded mm. */ #ifndef nmi_uaccess_okay # define nmi_uaccess_okay() true #endif #ifdef CONFIG_MMU /* * Generic MMU-gather implementation. * * The mmu_gather data structure is used by the mm code to implement the * correct and efficient ordering of freeing pages and TLB invalidations. * * This correct ordering is: * * 1) unhook page * 2) TLB invalidate page * 3) free page * * That is, we must never free a page before we have ensured there are no live * translations left to it. Otherwise it might be possible to observe (or * worse, change) the page content after it has been reused. * * The mmu_gather API consists of: * * - tlb_gather_mmu() / tlb_gather_mmu_fullmm() / tlb_finish_mmu() * * start and finish a mmu_gather * * Finish in particular will issue a (final) TLB invalidate and free * all (remaining) queued pages. * * - tlb_start_vma() / tlb_end_vma(); marks the start / end of a VMA * * Defaults to flushing at tlb_end_vma() to reset the range; helps when * there's large holes between the VMAs. * * - tlb_remove_table() * * tlb_remove_table() is the basic primitive to free page-table directories * (__p*_free_tlb()). In it's most primitive form it is an alias for * tlb_remove_page() below, for when page directories are pages and have no * additional constraints. * * See also MMU_GATHER_TABLE_FREE and MMU_GATHER_RCU_TABLE_FREE. * * - tlb_remove_page() / __tlb_remove_page() * - tlb_remove_page_size() / __tlb_remove_page_size() * - __tlb_remove_folio_pages() * * __tlb_remove_page_size() is the basic primitive that queues a page for * freeing. __tlb_remove_page() assumes PAGE_SIZE. Both will return a * boolean indicating if the queue is (now) full and a call to * tlb_flush_mmu() is required. * * tlb_remove_page() and tlb_remove_page_size() imply the call to * tlb_flush_mmu() when required and has no return value. * * __tlb_remove_folio_pages() is similar to __tlb_remove_page(), however, * instead of removing a single page, remove the given number of consecutive * pages that are all part of the same (large) folio: just like calling * __tlb_remove_page() on each page individually. * * - tlb_change_page_size() * * call before __tlb_remove_page*() to set the current page-size; implies a * possible tlb_flush_mmu() call. * * - tlb_flush_mmu() / tlb_flush_mmu_tlbonly() * * tlb_flush_mmu_tlbonly() - does the TLB invalidate (and resets * related state, like the range) * * tlb_flush_mmu() - in addition to the above TLB invalidate, also frees * whatever pages are still batched. * * - mmu_gather::fullmm * * A flag set by tlb_gather_mmu_fullmm() to indicate we're going to free * the entire mm; this allows a number of optimizations. * * - We can ignore tlb_{start,end}_vma(); because we don't * care about ranges. Everything will be shot down. * * - (RISC) architectures that use ASIDs can cycle to a new ASID * and delay the invalidation until ASID space runs out. * * - mmu_gather::need_flush_all * * A flag that can be set by the arch code if it wants to force * flush the entire TLB irrespective of the range. For instance * x86-PAE needs this when changing top-level entries. * * And allows the architecture to provide and implement tlb_flush(): * * tlb_flush() may, in addition to the above mentioned mmu_gather fields, make * use of: * * - mmu_gather::start / mmu_gather::end * * which provides the range that needs to be flushed to cover the pages to * be freed. * * - mmu_gather::freed_tables * * set when we freed page table pages * * - tlb_get_unmap_shift() / tlb_get_unmap_size() * * returns the smallest TLB entry size unmapped in this range. * * If an architecture does not provide tlb_flush() a default implementation * based on flush_tlb_range() will be used, unless MMU_GATHER_NO_RANGE is * specified, in which case we'll default to flush_tlb_mm(). * * Additionally there are a few opt-in features: * * MMU_GATHER_PAGE_SIZE * * This ensures we call tlb_flush() every time tlb_change_page_size() actually * changes the size and provides mmu_gather::page_size to tlb_flush(). * * This might be useful if your architecture has size specific TLB * invalidation instructions. * * MMU_GATHER_TABLE_FREE * * This provides tlb_remove_table(), to be used instead of tlb_remove_page() * for page directores (__p*_free_tlb()). * * Useful if your architecture has non-page page directories. * * When used, an architecture is expected to provide __tlb_remove_table() or * use the generic __tlb_remove_table(), which does the actual freeing of these * pages. * * MMU_GATHER_RCU_TABLE_FREE * * Like MMU_GATHER_TABLE_FREE, and adds semi-RCU semantics to the free (see * comment below). * * Useful if your architecture doesn't use IPIs for remote TLB invalidates * and therefore doesn't naturally serialize with software page-table walkers. * * MMU_GATHER_NO_FLUSH_CACHE * * Indicates the architecture has flush_cache_range() but it needs *NOT* be called * before unmapping a VMA. * * NOTE: strictly speaking we shouldn't have this knob and instead rely on * flush_cache_range() being a NOP, except Sparc64 seems to be * different here. * * MMU_GATHER_MERGE_VMAS * * Indicates the architecture wants to merge ranges over VMAs; typical when * multiple range invalidates are more expensive than a full invalidate. * * MMU_GATHER_NO_RANGE * * Use this if your architecture lacks an efficient flush_tlb_range(). This * option implies MMU_GATHER_MERGE_VMAS above. * * MMU_GATHER_NO_GATHER * * If the option is set the mmu_gather will not track individual pages for * delayed page free anymore. A platform that enables the option needs to * provide its own implementation of the __tlb_remove_page_size() function to * free pages. * * This is useful if your architecture already flushes TLB entries in the * various ptep_get_and_clear() functions. */ #ifdef CONFIG_MMU_GATHER_TABLE_FREE struct mmu_table_batch { #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE struct rcu_head rcu; #endif unsigned int nr; void *tables[]; }; #define MAX_TABLE_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) #ifndef __HAVE_ARCH_TLB_REMOVE_TABLE static inline void __tlb_remove_table(void *table) { struct ptdesc *ptdesc = (struct ptdesc *)table; pagetable_dtor_free(ptdesc); } #endif extern void tlb_remove_table(struct mmu_gather *tlb, void *table); #else /* !CONFIG_MMU_GATHER_TABLE_FREE */ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page); /* * Without MMU_GATHER_TABLE_FREE the architecture is assumed to have page based * page directories and we can use the normal page batching to free them. */ static inline void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct page *page = (struct page *)table; pagetable_dtor(page_ptdesc(page)); tlb_remove_page(tlb, page); } #endif /* CONFIG_MMU_GATHER_TABLE_FREE */ #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE /* * This allows an architecture that does not use the linux page-tables for * hardware to skip the TLBI when freeing page tables. */ #ifndef tlb_needs_table_invalidate #define tlb_needs_table_invalidate() (true) #endif void tlb_remove_table_sync_one(void); #else #ifdef tlb_needs_table_invalidate #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE #endif static inline void tlb_remove_table_sync_one(void) { } #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */ #ifndef CONFIG_MMU_GATHER_NO_GATHER /* * If we can't allocate a page to make a big batch of page pointers * to work on, then just handle a few from the on-stack structure. */ #define MMU_GATHER_BUNDLE 8 struct mmu_gather_batch { struct mmu_gather_batch *next; unsigned int nr; unsigned int max; struct encoded_page *encoded_pages[]; }; #define MAX_GATHER_BATCH \ ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *)) /* * Limit the maximum number of mmu_gather batches to reduce a risk of soft * lockups for non-preemptible kernels on huge machines when a lot of memory * is zapped during unmapping. * 10K pages freed at once should be safe even without a preemption point. */ #define MAX_GATHER_BATCH_COUNT (10000UL/MAX_GATHER_BATCH) extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, bool delay_rmap, int page_size); bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page, unsigned int nr_pages, bool delay_rmap); #ifdef CONFIG_SMP /* * This both sets 'delayed_rmap', and returns true. It would be an inline * function, except we define it before the 'struct mmu_gather'. */ #define tlb_delay_rmap(tlb) (((tlb)->delayed_rmap = 1), true) extern void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma); #endif #endif /* * We have a no-op version of the rmap removal that doesn't * delay anything. That is used on S390, which flushes remote * TLBs synchronously, and on UP, which doesn't have any * remote TLBs to flush and is not preemptible due to this * all happening under the page table lock. */ #ifndef tlb_delay_rmap #define tlb_delay_rmap(tlb) (false) static inline void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma) { } #endif /* * struct mmu_gather is an opaque type used by the mm code for passing around * any data needed by arch specific code for tlb_remove_page. */ struct mmu_gather { struct mm_struct *mm; #ifdef CONFIG_MMU_GATHER_TABLE_FREE struct mmu_table_batch *batch; #endif unsigned long start; unsigned long end; /* * we are in the middle of an operation to clear * a full mm and can make some optimizations */ unsigned int fullmm : 1; /* * we have performed an operation which * requires a complete flush of the tlb */ unsigned int need_flush_all : 1; /* * we have removed page directories */ unsigned int freed_tables : 1; /* * Do we have pending delayed rmap removals? */ unsigned int delayed_rmap : 1; /* * at which levels have we cleared entries? */ unsigned int cleared_ptes : 1; unsigned int cleared_pmds : 1; unsigned int cleared_puds : 1; unsigned int cleared_p4ds : 1; /* * tracks VM_EXEC | VM_HUGETLB in tlb_start_vma */ unsigned int vma_exec : 1; unsigned int vma_huge : 1; unsigned int vma_pfn : 1; unsigned int batch_count; #ifndef CONFIG_MMU_GATHER_NO_GATHER struct mmu_gather_batch *active; struct mmu_gather_batch local; struct page *__pages[MMU_GATHER_BUNDLE]; #ifdef CONFIG_MMU_GATHER_PAGE_SIZE unsigned int page_size; #endif #endif }; void tlb_flush_mmu(struct mmu_gather *tlb); static inline void __tlb_adjust_range(struct mmu_gather *tlb, unsigned long address, unsigned int range_size) { tlb->start = min(tlb->start, address); tlb->end = max(tlb->end, address + range_size); } static inline void __tlb_reset_range(struct mmu_gather *tlb) { if (tlb->fullmm) { tlb->start = tlb->end = ~0; } else { tlb->start = TASK_SIZE; tlb->end = 0; } tlb->freed_tables = 0; tlb->cleared_ptes = 0; tlb->cleared_pmds = 0; tlb->cleared_puds = 0; tlb->cleared_p4ds = 0; /* * Do not reset mmu_gather::vma_* fields here, we do not * call into tlb_start_vma() again to set them if there is an * intermediate flush. */ } #ifdef CONFIG_MMU_GATHER_NO_RANGE #if defined(tlb_flush) #error MMU_GATHER_NO_RANGE relies on default tlb_flush() #endif /* * When an architecture does not have efficient means of range flushing TLBs * there is no point in doing intermediate flushes on tlb_end_vma() to keep the * range small. We equally don't have to worry about page granularity or other * things. * * All we need to do is issue a full flush for any !0 range. */ static inline void tlb_flush(struct mmu_gather *tlb) { if (tlb->end) flush_tlb_mm(tlb->mm); } #else /* CONFIG_MMU_GATHER_NO_RANGE */ #ifndef tlb_flush /* * When an architecture does not provide its own tlb_flush() implementation * but does have a reasonably efficient flush_vma_range() implementation * use that. */ static inline void tlb_flush(struct mmu_gather *tlb) { if (tlb->fullmm || tlb->need_flush_all) { flush_tlb_mm(tlb->mm); } else if (tlb->end) { struct vm_area_struct vma = { .vm_mm = tlb->mm, .vm_flags = (tlb->vma_exec ? VM_EXEC : 0) | (tlb->vma_huge ? VM_HUGETLB : 0), }; flush_tlb_range(&vma, tlb->start, tlb->end); } } #endif #endif /* CONFIG_MMU_GATHER_NO_RANGE */ static inline void tlb_update_vma_flags(struct mmu_gather *tlb, struct vm_area_struct *vma) { /* * flush_tlb_range() implementations that look at VM_HUGETLB (tile, * mips-4k) flush only large pages. * * flush_tlb_range() implementations that flush I-TLB also flush D-TLB * (tile, xtensa, arm), so it's ok to just add VM_EXEC to an existing * range. * * We rely on tlb_end_vma() to issue a flush, such that when we reset * these values the batch is empty. *