64 63 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 | // SPDX-License-Identifier: GPL-2.0-or-later /* mpihelp-lshift.c - MPI helper functions * Copyright (C) 1994, 1996, 1998, 2001 Free Software Foundation, Inc. * * This file is part of GnuPG. * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. * The GNU MP Library itself is published under the LGPL; * however I decided to publish this code under the plain GPL. */ #include "mpi-internal.h" /* Shift U (pointed to by UP and USIZE digits long) CNT bits to the left * and store the USIZE least significant digits of the result at WP. * Return the bits shifted out from the most significant digit. * * Argument constraints: * 1. 0 < CNT < BITS_PER_MP_LIMB * 2. If the result is to be written over the input, WP must be >= UP. */ mpi_limb_t mpihelp_lshift(mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize, unsigned int cnt) { mpi_limb_t high_limb, low_limb; unsigned sh_1, sh_2; mpi_size_t i; mpi_limb_t retval; sh_1 = cnt; wp += 1; sh_2 = BITS_PER_MPI_LIMB - sh_1; i = usize - 1; low_limb = up[i]; retval = low_limb >> sh_2; high_limb = low_limb; while (--i >= 0) { low_limb = up[i]; wp[i] = (high_limb << sh_1) | (low_limb >> sh_2); high_limb = low_limb; } wp[i] = high_limb << sh_1; return retval; } |
14 14 14 14 14 14 14 14 14 3 3 24 23 23 23 21 1 2 16 2 2 2 2 2 2 2 2 22 1 21 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_prio.c Simple 3-band priority "scheduler". * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * Fixes: 19990609: J Hadi Salim <hadi@nortelnetworks.com>: * Init -- EINVAL when opt undefined */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> struct prio_sched_data { int bands; struct tcf_proto __rcu *filter_list; struct tcf_block *block; u8 prio2band[TC_PRIO_MAX+1]; struct Qdisc *queues[TCQ_PRIO_BANDS]; }; static struct Qdisc * prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct prio_sched_data *q = qdisc_priv(sch); u32 band = skb->priority; struct tcf_result res; struct tcf_proto *fl; int err; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; if (TC_H_MAJ(skb->priority) != sch->handle) { fl = rcu_dereference_bh(q->filter_list); err = tcf_classify(skb, NULL, fl, &res, false); #ifdef CONFIG_NET_CLS_ACT switch (err) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return NULL; } #endif if (!fl || err < 0) { if (TC_H_MAJ(band)) band = 0; return q->queues[q->prio2band[band & TC_PRIO_MAX]]; } band = res.classid; } band = TC_H_MIN(band) - 1; if (band >= q->bands) return q->queues[q->prio2band[0]]; return q->queues[band]; } static int prio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { unsigned int len = qdisc_pkt_len(skb); struct Qdisc *qdisc; int ret; qdisc = prio_classify(skb, sch, &ret); #ifdef CONFIG_NET_CLS_ACT if (qdisc == NULL) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return ret; } #endif ret = qdisc_enqueue(skb, qdisc, to_free); if (ret == NET_XMIT_SUCCESS) { sch->qstats.backlog += len; sch->q.qlen++; return NET_XMIT_SUCCESS; } if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); return ret; } static struct sk_buff *prio_peek(struct Qdisc *sch) { struct prio_sched_data *q = qdisc_priv(sch); int prio; for (prio = 0; prio < q->bands; prio++) { struct Qdisc *qdisc = q->queues[prio]; struct sk_buff *skb = qdisc->ops->peek(qdisc); if (skb) return skb; } return NULL; } static struct sk_buff *prio_dequeue(struct Qdisc *sch) { struct prio_sched_data *q = qdisc_priv(sch); int prio; for (prio = 0; prio < q->bands; prio++) { struct Qdisc *qdisc = q->queues[prio]; struct sk_buff *skb = qdisc_dequeue_peeked(qdisc); if (skb) { qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } } return NULL; } static void prio_reset(struct Qdisc *sch) { int prio; struct prio_sched_data *q = qdisc_priv(sch); for (prio = 0; prio < q->bands; prio++) qdisc_reset(q->queues[prio]); } static int prio_offload(struct Qdisc *sch, struct tc_prio_qopt *qopt) { struct net_device *dev = qdisc_dev(sch); struct tc_prio_qopt_offload opt = { .handle = sch->handle, .parent = sch->parent, }; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; if (qopt) { opt.command = TC_PRIO_REPLACE; opt.replace_params.bands = qopt->bands; memcpy(&opt.replace_params.priomap, qopt->priomap, TC_PRIO_MAX + 1); opt.replace_params.qstats = &sch->qstats; } else { opt.command = TC_PRIO_DESTROY; } return dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_PRIO, &opt); } static void prio_destroy(struct Qdisc *sch) { int prio; struct prio_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); prio_offload(sch, NULL); for (prio = 0; prio < q->bands; prio++) qdisc_put(q->queues[prio]); } static int prio_tune(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); struct Qdisc *queues[TCQ_PRIO_BANDS]; int oldbands = q->bands, i; struct tc_prio_qopt *qopt; if (nla_len(opt) < sizeof(*qopt)) return -EINVAL; qopt = nla_data(opt); if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < TCQ_MIN_PRIO_BANDS) return -EINVAL; for (i = 0; i <= TC_PRIO_MAX; i++) { if (qopt->priomap[i] >= qopt->bands) return -EINVAL; } /* Before commit, make sure we can allocate all new qdiscs */ for (i = oldbands; i < qopt->bands; i++) { queues[i] = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, i + 1), extack); if (!queues[i]) { while (i > oldbands) qdisc_put(queues[--i]); return -ENOMEM; } } prio_offload(sch, qopt); sch_tree_lock(sch); q->bands = qopt->bands; memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); for (i = q->bands; i < oldbands; i++) qdisc_tree_flush_backlog(q->queues[i]); for (i = oldbands; i < q->bands; i++) { q->queues[i] = queues[i]; if (q->queues[i] != &noop_qdisc) qdisc_hash_add(q->queues[i], true); } sch_tree_unlock(sch); for (i = q->bands; i < oldbands; i++) qdisc_put(q->queues[i]); return 0; } static int prio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); int err; if (!opt) return -EINVAL; err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; return prio_tune(sch, opt, extack); } static int prio_dump_offload(struct Qdisc *sch) { struct tc_prio_qopt_offload hw_stats = { .command = TC_PRIO_STATS, .handle = sch->handle, .parent = sch->parent, { .stats = { .bstats = &sch->bstats, .qstats = &sch->qstats, }, }, }; return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_PRIO, &hw_stats); } static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct prio_sched_data *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); struct tc_prio_qopt opt; int err; opt.bands = q->bands; memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1); err = prio_dump_offload(sch); if (err) goto nla_put_failure; if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; return skb->len; nla_put_failure: nlmsg_trim(skb, b); return -1; } static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); struct tc_prio_qopt_offload graft_offload; unsigned long band = arg - 1; if (!new) { new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, arg), extack); if (!new) new = &noop_qdisc; else qdisc_hash_add(new, true); } *old = qdisc_replace(sch, new, &q->queues[band]); graft_offload.handle = sch->handle; graft_offload.parent = sch->parent; graft_offload.graft_params.band = band; graft_offload.graft_params.child_handle = new->handle; graft_offload.command = TC_PRIO_GRAFT; qdisc_offload_graft_helper(qdisc_dev(sch), sch, new, *old, TC_SETUP_QDISC_PRIO, &graft_offload, extack); return 0; } static struct Qdisc * prio_leaf(struct Qdisc *sch, unsigned long arg) { struct prio_sched_data *q = qdisc_priv(sch); unsigned long band = arg - 1; return q->queues[band]; } static unsigned long prio_find(struct Qdisc *sch, u32 classid) { struct prio_sched_data *q = qdisc_priv(sch); unsigned long band = TC_H_MIN(classid); if (band - 1 >= q->bands) return 0; return band; } static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { return prio_find(sch, classid); } static void prio_unbind(struct Qdisc *q, unsigned long cl) { } static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct prio_sched_data *q = qdisc_priv(sch); tcm->tcm_handle |= TC_H_MIN(cl); tcm->tcm_info = q->queues[cl-1]->handle; return 0; } static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct prio_sched_data *q = qdisc_priv(sch); struct Qdisc *cl_q; cl_q = q->queues[cl - 1]; if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, &cl_q->bstats, true) < 0 || qdisc_qstats_copy(d, cl_q) < 0) return -1; return 0; } static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct prio_sched_data *q = qdisc_priv(sch); int prio; if (arg->stop) return; for (prio = 0; prio < q->bands; prio++) { if (!tc_qdisc_stats_dump(sch, prio + 1, arg)) break; } } static struct tcf_block *prio_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct prio_sched_data *q = qdisc_priv(sch); if (cl) return NULL; return q->block; } static const struct Qdisc_class_ops prio_class_ops = { .graft = prio_graft, .leaf = prio_leaf, .find = prio_find, .walk = prio_walk, .tcf_block = prio_tcf_block, .bind_tcf = prio_bind, .unbind_tcf = prio_unbind, .dump = prio_dump_class, .dump_stats = prio_dump_class_stats, }; static struct Qdisc_ops prio_qdisc_ops __read_mostly = { .next = NULL, .cl_ops = &prio_class_ops, .id = "prio", .priv_size = sizeof(struct prio_sched_data), .enqueue = prio_enqueue, .dequeue = prio_dequeue, .peek = prio_peek, .init = prio_init, .reset = prio_reset, .destroy = prio_destroy, .change = prio_tune, .dump = prio_dump, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("prio"); static int __init prio_module_init(void) { return register_qdisc(&prio_qdisc_ops); } static void __exit prio_module_exit(void) { unregister_qdisc(&prio_qdisc_ops); } module_init(prio_module_init) module_exit(prio_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Simple 3-band priority qdisc"); |
1311 1307 1004 14 14 604 603 74 489 474 15 455 13 453 59 59 5 26 26 13 26 29 6 23 26 14 13 10 2 8 10 35 26 2 7 2 11 28 1 3 1 3 7 19 19 18 24 11 3 2 8 465 407 467 95 4 6 5 355 65 19 4 7 101 82 474 478 475 209 5 5 476 478 477 476 5 384 375 375 130 130 130 477 428 32 458 458 455 14 346 3 341 45 1 43 1 2 3 322 2 8 340 323 324 445 3 2 1 7 7 7 422 17 321 164 440 7 436 360 115 5 3 1 2 3 1 2 3 3 3 3 423 37 8 8 654 505 505 466 427 26 17 442 17 4 7 3 5 10 459 440 4 445 413 39 445 4 48 1 1 1 1 1 430 15 10 3 43 2081 830 2081 2086 2080 2080 10 2068 2084 52 15 75 55 52 74 75 61 13 2 6 6 71 1 1 335 2 367 44 345 345 345 344 340 335 334 333 3 342 339 341 3 318 3 179 236 28 338 386 368 474 474 2 7 8 8 474 3 301 424 424 41 341 363 382 71 474 475 479 430 473 470 4 475 386 1306 1307 1306 1309 119 1309 1310 1309 1310 1309 1312 2 18 1311 1304 118 1305 1310 1306 1307 13 1296 1280 1191 536 536 1195 1194 426 12 1280 1285 1283 1192 1195 535 533 536 53 498 53 14 498 498 498 498 497 496 498 54 494 59 59 5 5 4 5 4 3 5 3 5 5 5 5 5 5 5 3 3 2 3 5 5 5 3 5 3 5 5 5 3 2 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux INET6 implementation * Forwarding Information Database * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Changes: * Yuji SEKIYA @USAGI: Support default route on router node; * remove ip6_null_entry from the top of * routing table. * Ville Nuorvala: Fixed routing subtrees. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/bpf.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> #include <linux/route.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/lwtunnel.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { struct fib6_walker w; struct net *net; int (*func)(struct fib6_info *, void *arg); int sernum; void *arg; bool skip_notify; }; #ifdef CONFIG_IPV6_SUBTREES #define FWS_INIT FWS_S #else #define FWS_INIT FWS_L #endif static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); /* * A routing update causes an increase of the serial number on the * affected subtree. This allows for cached routes to be asynchronously * tested when modifications are made to the destination cache as a * result of redirects, path MTU changes, etc. */ static void fib6_gc_timer_cb(struct timer_list *t); #define FOR_WALKERS(net, w) \ list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh) static void fib6_walker_link(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_add(&w->lh, &net->ipv6.fib6_walkers); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static void fib6_walker_unlink(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_del(&w->lh); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static int fib6_new_sernum(struct net *net) { int new, old = atomic_read(&net->ipv6.fib6_sernum); do { new = old < INT_MAX ? old + 1 : 1; } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new)); return new; } enum { FIB6_NO_SERNUM_CHANGE = 0, }; void fib6_update_sernum(struct net *net, struct fib6_info *f6i) { struct fib6_node *fn; fn = rcu_dereference_protected(f6i->fib6_node, lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net)); } /* * Auxiliary address test functions for the radix tree. * * These assume a 32bit processor (although it will work on * 64bit processors) */ /* * test bit */ #if defined(__LITTLE_ENDIAN) # define BITOP_BE32_SWIZZLE (0x1F & ~7) #else # define BITOP_BE32_SWIZZLE 0 #endif static __be32 addr_bit_set(const void *token, int fn_bit) { const __be32 *addr = token; /* * Here, * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) * is optimized version of * htonl(1 << ((~fn_bit)&0x1F)) * See include/asm-generic/bitops/le.h. */ return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & addr[fn_bit >> 5]; } struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) { struct fib6_info *f6i; size_t sz = sizeof(*f6i); if (with_fib6_nh) sz += sizeof(struct fib6_nh); f6i = kzalloc(sz, gfp_flags); if (!f6i) return NULL; /* fib6_siblings is a union with nh_list, so this initializes both */ INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); INIT_HLIST_NODE(&f6i->gc_link); return f6i; } void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); WARN_ON(f6i->fib6_node); if (f6i->nh) nexthop_put(f6i->nh); else fib6_nh_release(f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); if (fn) net->ipv6.rt6_stats->fib_nodes++; return fn; } static void node_free_immediate(struct net *net, struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); net->ipv6.rt6_stats->fib_nodes--; } static void node_free_rcu(struct rcu_head *head) { struct fib6_node *fn = container_of(head, struct fib6_node, rcu); kmem_cache_free(fib6_node_kmem, fn); } static void node_free(struct net *net, struct fib6_node *fn) { call_rcu(&fn->rcu, node_free_rcu); net->ipv6.rt6_stats->fib_nodes--; } static void fib6_free_table(struct fib6_table *table) { inetpeer_invalidate_tree(&table->tb6_peers); kfree(table); } static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; /* * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. */ spin_lock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* * No protection necessary, this is the only list mutatation * operation, tables never disappear once they exist. */ hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) { struct fib6_table *table; table = kzalloc(sizeof(*table), GFP_ATOMIC); if (table) { table->tb6_id = id; rcu_assign_pointer(table->tb6_root.leaf, net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); INIT_HLIST_HEAD(&table->tb6_gc_hlist); } return table; } struct fib6_table *fib6_new_table(struct net *net, u32 id) { struct fib6_table *tb; if (id == 0) id = RT6_TABLE_MAIN; tb = fib6_get_table(net, id); if (tb) return tb; tb = fib6_alloc_table(net, id); if (tb) fib6_link_table(net, tb); return tb; } EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { struct fib6_table *tb; struct hlist_head *head; unsigned int h; if (id == 0) id = RT6_TABLE_MAIN; h = id & (FIB6_TABLE_HASHSZ - 1); rcu_read_lock(); head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (tb->tb6_id == id) { rcu_read_unlock(); return tb; } } rcu_read_unlock(); return NULL; } EXPORT_SYMBOL_GPL(fib6_get_table); static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); fib6_link_table(net, net->ipv6.fib6_local_tbl); } #else struct fib6_table *fib6_new_table(struct net *net, u32 id) { return fib6_get_table(net, id); } struct fib6_table *fib6_get_table(struct net *net, u32 id) { return net->ipv6.fib6_main_tbl; } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup) { struct rt6_info *rt; rt = pol_lookup_func(lookup, net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&rt->dst); } return &rt->dst; } /* called with rcu lock held; no reference taken on fib6_info */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, res, flags); } static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); } #endif unsigned int fib6_tables_seq_read(struct net *net) { unsigned int h, fib_seq = 0; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) fib_seq += tb->fib_seq; } rcu_read_unlock(); return fib_seq; } static int call_fib6_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; return call_fib6_notifier(nb, event_type, &info.info); } static int call_fib6_multipath_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; return call_fib6_notifier(nb, event_type, &info.info); } int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_multipath_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) { struct fib6_entry_notifier_info info = { .rt = rt, .nsiblings = rt->fib6_nsiblings, }; rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); } struct fib6_dump_arg { struct net *net; struct notifier_block *nb; struct netlink_ext_ack *extack; }; static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE; int err; if (!rt || rt == arg->net->ipv6.fib6_null_entry) return 0; if (rt->fib6_nsiblings) err = call_fib6_multipath_entry_notifier(arg->nb, fib_event, rt, rt->fib6_nsiblings, arg->extack); else err = call_fib6_entry_notifier(arg->nb, fib_event, rt, arg->extack); return err; } static int fib6_node_dump(struct fib6_walker *w) { int err; err = fib6_rt_dump(w->leaf, w->args); w->leaf = NULL; return err; } static int fib6_table_dump(struct net *net, struct fib6_table *tb, struct fib6_walker *w) { int err; w->root = &tb->tb6_root; spin_lock_bh(&tb->tb6_lock); err = fib6_walk(net, w); spin_unlock_bh(&tb->tb6_lock); return err; } /* Called with rcu_read_lock() */ int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct fib6_dump_arg arg; struct fib6_walker *w; unsigned int h; int err = 0; w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_node_dump; arg.net = net; arg.nb = nb; arg.extack = extack; w->args = &arg; for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { err = fib6_table_dump(net, tb, w); if (err) goto out; } } out: kfree(w); /* The tree traversal function should never return a positive value. */ return err > 0 ? -EINVAL : err; } static int fib6_dump_node(struct fib6_walker *w) { int res; struct fib6_info *rt; for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args, w->skip_in_node); if (res >= 0) { /* Frame is full, suspend walking */ w->leaf = rt; /* We'll restart from this node, so if some routes were * already dumped, skip them next time. */ w->skip_in_node += res; return 1; } w->skip_in_node = 0; /* Multipath routes are dumped in one route with the * RTA_MULTIPATH attribute. Jump 'rt' to point to the * last sibling of this route (no need to dump the * sibling routes again) */ if (rt->fib6_nsiblings) rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); } w->leaf = NULL; return 0; } static void fib6_dump_end(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { cb->args[4] = 0; fib6_walker_unlink(net, w); } cb->args[2] = 0; kfree(w); } cb->done = (void *)cb->args[3]; cb->args[1] = 3; } static int fib6_dump_done(struct netlink_callback *cb) { fib6_dump_end(cb); return cb->done ? cb->done(cb) : 0; } static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct fib6_walker *w; int res; w = (void *)cb->args[2]; w->root = &table->tb6_root; if (cb->args[4] == 0) { w->count = 0; w->skip = 0; w->skip_in_node = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; cb->args[5] = READ_ONCE(w->root->fn_sernum); } } else { int sernum = READ_ONCE(w->root->fn_sernum); if (cb->args[5] != sernum) { /* Begin at the root if the tree changed */ cb->args[5] = sernum; w->state = FWS_INIT; w->node = w->root; w->skip = w->count; w->skip_in_node = 0; } else w->skip = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); spin_unlock_bh(&table->tb6_lock); if (res <= 0) { fib6_walker_unlink(net, w); cb->args[4] = 0; } } return res; } static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, .filter.dump_routes = true, .filter.rtnl_held = false, }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int e = 0, s_e; struct hlist_head *head; struct fib6_walker *w; struct fib6_table *tb; unsigned int h, s_h; int err = 0; rcu_read_lock(); if (cb->strict_check) { err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb); if (err < 0) goto unlock; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); if (rtm->rtm_flags & RTM_F_PREFIX) arg.filter.flags = RTM_F_PREFIX; } w = (void *)cb->args[2]; if (!w) { /* New dump: * * 1. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) { err = -ENOMEM; goto unlock; } w->func = fib6_dump_node; cb->args[2] = (long)w; /* 2. hook callback destructor. */ cb->args[3] = (long)cb->done; cb->done = fib6_dump_done; } arg.skb = skb; arg.cb = cb; arg.net = net; w->args = &arg; if (arg.filter.table_id) { tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET6) goto unlock; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); err = -ENOENT; goto unlock; } if (!cb->args[0]) { err = fib6_dump_table(tb, skb, cb); if (!err) cb->args[0] = 1; } goto unlock; } s_h = cb->args[0]; s_e = cb->args[1]; for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; err = fib6_dump_table(tb, skb, cb); if (err != 0) goto out; next: e++; } } out: cb->args[1] = e; cb->args[0] = h; unlock: rcu_read_unlock(); if (err <= 0) fib6_dump_end(cb); return err; } void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) { if (!f6i) return; if (f6i->fib6_metrics == &dst_default_metrics) { struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC); if (!p) return; refcount_set(&p->refcnt, 1); f6i->fib6_metrics = p; } f6i->fib6_metrics->metrics[metric - 1] = val; } /* * Routing Table * * return the appropriate node for a routing tree "add" operation * by either creating and inserting or by returning an existing * node. */ static struct fib6_node *fib6_add_1(struct net *net, struct fib6_table *table, struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, int replace_required, struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; struct rt6key *key; int bit; __be32 dir = 0; /* insert node in tree */ fn = root; do { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { if (!allow_create) { if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } goto insert_above; } /* * Exact match ? */ if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); fib6_info_release(leaf); /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == net->ipv6.fib6_null_entry) { RCU_INIT_POINTER(fn->leaf, NULL); } return fn; } /* * We have more bits to go */ /* Try to walk down on tree. */ dir = addr_bit_set(addr, fn->fn_bit); pn = fn; fn = dir ? rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)) : rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); } while (fn); if (!allow_create) { /* We should not create new node because * NLM_F_REPLACE was specified without NLM_F_CREATE * I assume it is safe to require NLM_F_CREATE when * REPLACE flag is used! Later we may want to remove the * check for replace_required, because according * to netlink specification, NLM_F_CREATE * MUST be specified if new route is created. * That would keep IPv6 consistent with IPv4 */ if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } /* * We walked to the bottom of tree. * Create new leaf node without children. */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); return ln; insert_above: /* * split since we don't have a common prefix anymore or * we have a less significant route. * we've to insert an intermediate node on the list * this new node will point to the one we need to create * and the current */ pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); /* find 1st bit in difference between the 2 addrs. See comment in __ipv6_addr_diff: bit may be an invalid value, but if it is >= plen, the value is ignored in any case. */ bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr)); /* * (intermediate)[in] * / \ * (new leaf node)[ln] (old node)[fn] */ if (plen > bit) { in = node_alloc(net); ln = node_alloc(net); if (!in || !ln) { if (in) node_free_immediate(net, in); if (ln) node_free_immediate(net, ln); return ERR_PTR(-ENOMEM); } /* * new intermediate node. * RTN_RTINFO will * be off since that an address that chooses one of * the branches would not match less specific routes * in the other branch */ in->fn_bit = bit; RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; fib6_info_hold(rcu_dereference_protected(in->leaf, lockdep_is_held(&table->tb6_lock))); /* update parent pointer */ if (dir) rcu_assign_pointer(pn->right, in); else rcu_assign_pointer(pn->left, in); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, in); rcu_assign_pointer(fn->parent, in); if (addr_bit_set(addr, bit)) { rcu_assign_pointer(in->right, ln); rcu_assign_pointer(in->left, fn); } else { rcu_assign_pointer(in->left, ln); rcu_assign_pointer(in->right, fn); } } else { /* plen <= bit */ /* * (new leaf node)[ln] * / \ * (old node)[fn] NULL */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (addr_bit_set(&key->addr, plen)) RCU_INIT_POINTER(ln->right, fn); else RCU_INIT_POINTER(ln->left, fn); rcu_assign_pointer(fn->parent, ln); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); } return ln; } static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, const struct fib6_info *match, const struct fib6_table *table) { int cpu; if (!fib6_nh->rt6i_pcpu) return; rcu_read_lock(); /* release the reference to this fib entry from * all of its cached pcpu routes */ for_each_possible_cpu(cpu) { struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); /* Paired with xchg() in rt6_get_pcpu_route() */ pcpu_rt = READ_ONCE(*ppcpu_rt); /* only dropping the 'from' reference if the cached route * is using 'match'. The cached pcpu_rt->from only changes * from a fib6_info to NULL (ip6_dst_destroy); it can never * change from one fib6_info reference to another */ if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; from = unrcu_pointer(xchg(&pcpu_rt->from, NULL)); fib6_info_release(from); } } rcu_read_unlock(); } struct fib6_nh_pcpu_arg { struct fib6_info *from; const struct fib6_table *table; }; static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) { struct fib6_nh_pcpu_arg *arg = _arg; __fib6_drop_pcpu_from(nh, arg->from, arg->table); return 0; } static void fib6_drop_pcpu_from(struct fib6_info *f6i, const struct fib6_table *table) { /* Make sure rt6_make_pcpu_route() wont add other percpu routes * while we are cleaning them here. */ f6i->fib6_destroying = 1; mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ if (f6i->nh) { struct fib6_nh_pcpu_arg arg = { .from = f6i, .table = table }; nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, &arg); } else { struct fib6_nh *fib6_nh; fib6_nh = f6i->fib6_nh; __fib6_drop_pcpu_from(fib6_nh, f6i, table); } } static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->fib6_table; /* Flush all cached dst in exception table */ rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt, table); if (rt->nh && !list_empty(&rt->nh_list)) list_del_init(&rt->nh_list); if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, * which must be released in time. So, scan ascendant nodes * and replace dummy references to this route with references * to still alive ones. */ while (fn) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); fib6_info_hold(new_leaf); rcu_assign_pointer(fn->leaf, new_leaf); fib6_info_release(rt); } fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); } } fib6_clean_expires(rt); fib6_remove_gc_list(rt); } /* * Insert routing information in a node. */ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); struct fib6_info *iter = NULL; struct fib6_info __rcu **ins; struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); bool notify_sibling_rt = false; u16 nlflags = NLM_F_EXCL; int err; if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) nlflags |= NLM_F_APPEND; ins = &fn->leaf; for (iter = leaf; iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock))) { /* * Search for duplicates */ if (iter->fib6_metric == rt->fib6_metric) { /* * Same priority level */ if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; nlflags &= ~NLM_F_EXCL; if (replace) { if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { found++; break; } fallback_ins = fallback_ins ?: ins; goto next_iter; } if (rt6_duplicate_nexthop(iter, rt)) { if (rt->fib6_nsiblings) rt->fib6_nsiblings = 0; if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->fib6_flags & RTF_EXPIRES)) { fib6_clean_expires(iter); fib6_remove_gc_list(iter); } else { fib6_set_expires(iter, rt->expires); fib6_add_gc_list(iter); } if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } /* If we have the same destination and the same metric, * but not the same gateway, then the route we try to * add is sibling to this route, increment our counter * of siblings, and later we will add our route to the * list. * Only static routes (which don't have flag * RTF_EXPIRES) are used for ECMPv6. * * To avoid long list, we only had siblings if the * route have a gateway. */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) rt->fib6_nsiblings++; } if (iter->fib6_metric > rt->fib6_metric) break; next_iter: ins = &iter->fib6_next; } if (fallback_ins && !found) { /* No matching route with same ecmp-able-ness found, replace * first matching route */ ins = fallback_ins; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); found++; } /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; /* Link this route to others same route. */ if (rt->fib6_nsiblings) { unsigned int fib6_nsiblings; struct fib6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ sibling = leaf; notify_sibling_rt = true; while (sibling) { if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { list_add_tail(&rt->fib6_siblings, &sibling->fib6_siblings); break; } sibling = rcu_dereference_protected(sibling->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); notify_sibling_rt = false; } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, &rt->fib6_siblings, fib6_siblings) { sibling->fib6_nsiblings++; BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); fib6_nsiblings++; } BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); rt6_multipath_rebalance(temp_sibling); } /* * insert node */ if (!replace) { if (!add) pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: nlflags |= NLM_F_CREATE; /* The route should only be notified if it is the first * route in the node or if it is added as a sibling * route to the first route in the node. */ if (!info->skip_notify_kernel && (notify_sibling_rt || ins == &fn->leaf)) { enum fib_event_type fib_event; if (notify_sibling_rt) fib_event = FIB_EVENT_ENTRY_APPEND; else fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_entry_notifiers(info->nl_net, fib_event, rt, extack); if (err) { struct fib6_info *sibling, *next_sibling; /* If the route has siblings, then it first * needs to be unlinked from them. */ if (!rt->fib6_nsiblings) return err; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; list_del_init(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); return err; } } rcu_assign_pointer(rt->fib6_next, iter); fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } } else { int nsiblings; if (!found) { if (add) goto add; pr_warn("NLM_F_REPLACE set, but no existing node found!\n"); return -ENOENT; } if (!info->skip_notify_kernel && ins == &fn->leaf) { err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); if (err) return err; } fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } nsiblings = iter->fib6_nsiblings; iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; fib6_info_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ ins = &rt->fib6_next; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric > rt->fib6_metric) break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->fib6_next; iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; fib6_info_release(iter); nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { ins = &iter->fib6_next; } iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); } WARN_ON(nsiblings != 0); } } return 0; } static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->fib6_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } void fib6_force_start_gc(struct net *net) { if (!timer_pending(&net->ipv6.ip6_fib_timer)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } static void __fib6_update_sernum_upto_root(struct fib6_info *rt, int sernum) { struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&rt->fib6_table->tb6_lock)); /* paired with smp_rmb() in fib6_get_cookie_safe() */ smp_wmb(); while (fn) { WRITE_ONCE(fn->fn_sernum, sernum); fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&rt->fib6_table->tb6_lock)); } } void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) { __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } /* allow ipv4 to update sernum via ipv6_stub */ void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_update_sernum_upto_root(net, f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); } /* * Add routing information to the routing tree. * <destination addr>/<source addr> * with source addr info in sub-trees * Need to own table->tb6_lock */ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; struct fib6_node *fn; #ifdef CONFIG_IPV6_SUBTREES struct fib6_node *pn = NULL; #endif int err = -ENOMEM; int allow_create = 1; int replace_required = 0; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; if (info->nlh->nlmsg_flags & NLM_F_REPLACE) replace_required = 1; } if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); fn = fib6_add_1(info->nl_net, table, root, &rt->fib6_dst.addr, rt->fib6_dst.plen, offsetof(struct fib6_info, fib6_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; goto out; } #ifdef CONFIG_IPV6_SUBTREES pn = fn; if (rt->fib6_src.plen) { struct fib6_node *sn; if (!rcu_access_pointer(fn->subtree)) { struct fib6_node *sfn; /* * Create subtree. * * fn[main tree] * | * sfn[subtree root] * \ * sn[new leaf node] */ /* Create subtree root node */ sfn = node_alloc(info->nl_net); if (!sfn) goto failure; fib6_info_hold(info->nl_net->ipv6.fib6_null_entry); rcu_assign_pointer(sfn->leaf, info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(info->nl_net, table, sfn, &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated root, and then (in failure) stale node in main tree. */ node_free_immediate(info->nl_net, sfn); err = PTR_ERR(sn); goto failure; } /* Now link new subtree to main tree */ rcu_assign_pointer(sfn->parent, fn); rcu_assign_pointer(fn->subtree, sfn); } else { sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); goto failure; } } if (!rcu_access_pointer(fn->leaf)) { if (fn->fn_flags & RTN_TL_ROOT) { /* put back null_entry for root node */ rcu_assign_pointer(fn->leaf, info->nl_net->ipv6.fib6_null_entry); } else { fib6_info_hold(rt); rcu_assign_pointer(fn->leaf, rt); } } fn = sn; } #endif err = fib6_add_rt2node(fn, rt, info, extack); if (!err) { if (rt->nh) list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); if (rt->fib6_flags & RTF_EXPIRES) fib6_add_gc_list(rt); fib6_start_gc(info->nl_net, rt); } out: if (err) { #ifdef CONFIG_IPV6_SUBTREES /* * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ if (pn != fn) { struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); if (pn_leaf == rt) { pn_leaf = NULL; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(rt); } if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { pn_leaf = fib6_find_prefix(info->nl_net, table, pn); if (!pn_leaf) pn_leaf = info->nl_net->ipv6.fib6_null_entry; fib6_info_hold(pn_leaf); rcu_assign_pointer(pn->leaf, pn_leaf); } } #endif goto failure; } else if (fib6_requires_src(rt)) { fib6_routes_require_src_inc(info->nl_net); } return err; failure: /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: * 1. fn is an intermediate node and we failed to add the new * route to it in both subtree creation failure and fib6_add_rt2node() * failure case. * 2. fn is the root node in the table and we fail to add the first * default route to it. */ if (fn && (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) || (fn->fn_flags & RTN_TL_ROOT && !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); return err; } /* * Routing tree lookup * */ struct lookup_args { int offset; /* key offset on fib6_info */ const struct in6_addr *addr; /* search key */ }; static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root, struct lookup_args *args) { struct fib6_node *fn; __be32 dir; if (unlikely(args->offset == 0)) return NULL; /* * Descend on a tree */ fn = root; for (;;) { struct fib6_node *next; dir = addr_bit_set(args->addr, fn->fn_bit); next = dir ? rcu_dereference(fn->right) : rcu_dereference(fn->left); if (next) { fn = next; continue; } break; } while (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree || fn->fn_flags & RTN_RTINFO) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) goto backtrack; key = (struct rt6key *) ((u8 *)leaf + args->offset); if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES if (subtree) { struct fib6_node *sfn; sfn = fib6_node_lookup_1(subtree, args + 1); if (!sfn) goto backtrack; fn = sfn; } #endif if (fn->fn_flags & RTN_RTINFO) return fn; } } backtrack: if (fn->fn_flags & RTN_ROOT) break; fn = rcu_dereference(fn->parent); } return NULL; } /* called with rcu_read_lock() held */ struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct fib6_node *fn; struct lookup_args args[] = { { .offset = offsetof(struct fib6_info, fib6_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { .offset = offsetof(struct fib6_info, fib6_src), .addr = saddr, }, #endif { .offset = 0, /* sentinel */ } }; fn = fib6_node_lookup_1(root, daddr ? args : args + 1); if (!fn || fn->fn_flags & RTN_TL_ROOT) fn = root; return fn; } /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) * exact_match == true means we try to find fn with exact match of * the passed in prefix addr * exact_match == false means we try to find fn with longest prefix * match of the passed in prefix addr. This is useful for finding fn * for cached route as it will be stored in the exception table under * the node with longest prefix length. */ static struct fib6_node *fib6_locate_1(struct fib6_node *root, const struct in6_addr *addr, int plen, int offset, bool exact_match) { struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ if (!leaf) { if (plen <= fn->fn_bit) goto out; else goto next; } key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) goto out; if (plen == fn->fn_bit) return fn; if (fn->fn_flags & RTN_RTINFO) prev = fn; next: /* * We have more bits to go */ if (addr_bit_set(addr, fn->fn_bit)) fn = rcu_dereference(fn->right); else fn = rcu_dereference(fn->left); } out: if (exact_match) return NULL; else return prev; } struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, const struct in6_addr *saddr, int src_len, bool exact_match) { struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, offsetof(struct fib6_info, fib6_dst), exact_match); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { WARN_ON(saddr == NULL); if (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree) { fn = fib6_locate_1(subtree, saddr, src_len, offsetof(struct fib6_info, fib6_src), exact_match); } } } #endif if (fn && fn->fn_flags & RTN_RTINFO) return fn; return NULL; } /* * Deletion * */ static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn) { struct fib6_node *child_left, *child_right; if (fn->fn_flags & RTN_ROOT) return net->ipv6.fib6_null_entry; while (fn) { child_left = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); child_right = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); if (child_left) return rcu_dereference_protected(child_left->leaf, lockdep_is_held(&table->tb6_lock)); if (child_right) return rcu_dereference_protected(child_right->leaf, lockdep_is_held(&table->tb6_lock)); fn = FIB6_SUBTREE(fn); } return NULL; } /* * Called to trim the tree of intermediate nodes when possible. "fn" * is the node we want to try and remove. * Need to own table->tb6_lock */ static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn) { int children; int nstate; struct fib6_node *child; struct fib6_walker *w; int iter = 0; /* Set fn->leaf to null_entry for root node. */ if (fn->fn_flags & RTN_TL_ROOT) { rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry); return fn; } for (;;) { struct fib6_node *fn_r = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *fn_l = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_r = rcu_dereference_protected(pn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_l = rcu_dereference_protected(pn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_fn_leaf; pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); WARN_ON(fn_leaf); children = 0; child = NULL; if (fn_r) { child = fn_r; children |= 1; } if (fn_l) { child = fn_l; children |= 2; } if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES /* Subtree root (i.e. fn) may have one child */ || (children && fn->fn_flags & RTN_ROOT) #endif ) { new_fn_leaf = fib6_find_prefix(net, table, fn); #if RT6_DEBUG >= 2 if (!new_fn_leaf) { WARN_ON(!new_fn_leaf); new_fn_leaf = net->ipv6.fib6_null_entry; } #endif fib6_info_hold(new_fn_leaf); rcu_assign_pointer(fn->leaf, new_fn_leaf); return pn; } #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); RCU_INIT_POINTER(pn->subtree, NULL); nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif if (pn_r == fn) rcu_assign_pointer(pn->right, child); else if (pn_l == fn) rcu_assign_pointer(pn->left, child); #if RT6_DEBUG >= 2 else WARN_ON(1); #endif if (child) rcu_assign_pointer(child->parent, pn); nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } #endif read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (!child) { if (w->node == fn) { pr_debug("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); w->node = pn; w->state = nstate; } } else { if (w->node == fn) { w->node = child; if (children&2) { pr_debug("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_R ? FWS_U : FWS_INIT; } else { pr_debug("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_C ? FWS_U : FWS_INIT; } } } } read_unlock(&net->ipv6.fib6_walker_lock); node_free(net, fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(pn_leaf); fn = pn; } } static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, struct fib6_info __rcu **rtp, struct nl_info *info) { struct fib6_info *leaf, *replace_rt = NULL; struct fib6_walker *w; struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; bool notify_del = false; /* If the deleted route is the first in the node and it is not part of * a multipath route, then we need to replace it with the next route * in the node, if exists. */ leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); if (leaf == rt && !rt->fib6_nsiblings) { if (rcu_access_pointer(rt->fib6_next)) replace_rt = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); else notify_del = true; } /* Unlink it */ *rtp = rt->fib6_next; rt->fib6_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; /* Reset round-robin state, if necessary */ if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; /* Remove this entry from other siblings */ if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; /* The route is deleted from a multipath route. If this * multipath route is the first route in the node, then we need * to emit a delete notification. Otherwise, we need to skip * the notification. */ if (rt->fib6_metric == leaf->fib6_metric && rt6_qualify_for_ecmp(leaf)) notify_del = true; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; list_del_init(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } /* Adjust walkers */ read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { pr_debug("walker %p adjusted by delroute\n", w); w->leaf = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; } } read_unlock(&net->ipv6.fib6_walker_lock); /* If it was last route, call fib6_repair_tree() to: * 1. For root node, put back null_entry as how the table was created. * 2. For other nodes, expunge its radix tree node. */ if (!rcu_access_pointer(fn->leaf)) { if (!(fn->fn_flags & RTN_TL_ROOT)) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; } fn = fib6_repair_tree(net, table, fn); } fib6_purge_rt(rt, fn, net); if (!info->skip_notify_kernel) { if (notify_del) call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); else if (replace_rt) call_fib6_entry_notifiers_replace(net, replace_rt); } if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); fib6_info_release(rt); } /* Need to own table->tb6_lock */ int fib6_del(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_info __rcu **rtp; struct fib6_info __rcu **rtp_next; struct fib6_table *table; struct fib6_node *fn; if (rt == net->ipv6.fib6_null_entry) return -ENOENT; table = rt->fib6_table; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&table->tb6_lock)); if (!fn) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); /* * Walk the leaf entries looking for ourself */ for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { struct fib6_info *cur = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); if (rt == cur) { if (fib6_requires_src(cur)) fib6_routes_require_src_dec(info->nl_net); fib6_del_route(table, fn, rtp, info); return 0; } rtp_next = &cur->fib6_next; } return -ENOENT; } /* * Tree traversal function. * * Certainly, it is not interrupt safe. * However, it is internally reenterable wrt itself and fib6_add/fib6_del. * It means, that we can modify tree during walking * and use this function for garbage collection, clone pruning, * cleaning tree when a device goes down etc. etc. * * It guarantees that every node will be traversed, * and that it will be traversed only once. * * Callback function w->func may return: * 0 -> continue walking. * positive value -> walking is suspended (used by tree dumps, * and probably by gc, if it will be split to several slices) * negative value -> terminate walking. * * The function itself returns: * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. * * This function is called with tb6_lock held. */ static int fib6_walk_continue(struct fib6_walker *w) { struct fib6_node *fn, *pn, *left, *right; /* w->root should always be table->tb6_root */ WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); for (;;) { fn = w->node; if (!fn) return 0; switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: if (FIB6_SUBTREE(fn)) { w->node = FIB6_SUBTREE(fn); continue; } w->state = FWS_L; fallthrough; #endif case FWS_L: left = rcu_dereference_protected(fn->left, 1); if (left) { w->node = left; w->state = FWS_INIT; continue; } w->state = FWS_R; fallthrough; case FWS_R: right = rcu_dereference_protected(fn->right, 1); if (right) { w->node = right; w->state = FWS_INIT; continue; } w->state = FWS_C; w->leaf = rcu_dereference_protected(fn->leaf, 1); fallthrough; case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; if (w->skip) { w->skip--; goto skip; } err = w->func(w); if (err) return err; w->count++; continue; } skip: w->state = FWS_U; fallthrough; case FWS_U: if (fn == w->root) return 0; pn = rcu_dereference_protected(fn->parent, 1); left = rcu_dereference_protected(pn->left, 1); right = rcu_dereference_protected(pn->right, 1); w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); w->state = FWS_L; continue; } #endif if (left == fn) { w->state = FWS_R; continue; } if (right == fn) { w->state = FWS_C; w->leaf = rcu_dereference_protected(w->node->leaf, 1); continue; } #if RT6_DEBUG >= 2 WARN_ON(1); #endif } } } static int fib6_walk(struct net *net, struct fib6_walker *w) { int res; w->state = FWS_INIT; w->node = w->root; fib6_walker_link(net, w); res = fib6_walk_continue(w); if (res <= 0) fib6_walker_unlink(net, w); return res; } static int fib6_clean_node(struct fib6_walker *w) { int res; struct fib6_info *rt; struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, .skip_notify = c->skip_notify, }; if (c->sernum != FIB6_NO_SERNUM_CHANGE && READ_ONCE(w->node->fn_sernum) != c->sernum) WRITE_ONCE(w->node->fn_sernum, c->sernum); if (!c->func) { WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); w->leaf = NULL; return 0; } for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); if (res == -1) { w->leaf = rt; res = fib6_del(rt, &info); if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", __func__, rt, rcu_access_pointer(rt->fib6_node), res); #endif continue; } return 0; } else if (res == -2) { if (WARN_ON(!rt->fib6_nsiblings)) continue; rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); continue; } WARN_ON(res != 0); } w->leaf = rt; return 0; } /* * Convenient frontend to tree walker. * * func is called on each route. * It may return -2 -> skip multipath route. * -1 -> delete this route. * 0 -> continue walking */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct fib6_info *, void *arg), int sernum, void *arg, bool skip_notify) { struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; c.w.count = 0; c.w.skip = 0; c.w.skip_in_node = 0; c.func = func; c.sernum = sernum; c.arg = arg; c.net = net; c.skip_notify = skip_notify; fib6_walk(net, &c.w); } static void __fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), int sernum, void *arg, bool skip_notify) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, sernum, arg, skip_notify); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false); } void fib6_clean_all_skip_notify(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true); } static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); __fib6_clean_all(net, NULL, new_sernum, NULL, false); } /* * Garbage collection */ static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args) { unsigned long now = jiffies; /* * check addrconf expiration here. * Routes are expired even if they are in use. */ if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { if (time_after(now, rt->expires)) { pr_debug("expiring %p\n", rt); return -1; } gc_args->more++; } /* Also age clones in the exception table. * Note, that clones are aged out * only if they are not in use now. */ rt6_age_exceptions(rt, gc_args, now); return 0; } static void fib6_gc_table(struct net *net, struct fib6_table *tb6, struct fib6_gc_args *gc_args) { struct fib6_info *rt; struct hlist_node *n; struct nl_info info = { .nl_net = net, .skip_notify = false, }; hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link) if (fib6_age(rt, gc_args) == -1) fib6_del(rt, &info); } static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_gc_table(net, table, gc_args); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; unsigned long now; if (force) { spin_lock_bh(&net->ipv6.fib6_gc_lock); } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) { mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); return; } gc_args.timeout = expires ? (int)expires : net->ipv6.sysctl.ip6_rt_gc_interval; gc_args.more = 0; fib6_gc_all(net, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; if (gc_args.more) mod_timer(&net->ipv6.ip6_fib_timer, round_jiffies(now + net->ipv6.sysctl.ip6_rt_gc_interval)); else del_timer(&net->ipv6.ip6_fib_timer); spin_unlock_bh(&net->ipv6.fib6_gc_lock); } static void fib6_gc_timer_cb(struct timer_list *t) { struct net *arg = from_timer(arg, t, ipv6.ip6_fib_timer); fib6_run_gc(0, arg, true); } static int __net_init fib6_net_init(struct net *net) { size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; int err; err = fib6_notifier_init(net); if (err) return err; /* Default to 3-tuple */ net->ipv6.sysctl.multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; spin_lock_init(&net->ipv6.fib6_gc_lock); rwlock_init(&net->ipv6.fib6_walker_lock); INIT_LIST_HEAD(&net->ipv6.fib6_walkers); timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0); net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); if (!net->ipv6.rt6_stats) goto out_notifier; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv6.fib_table_hash) goto out_rt6_stats; net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), GFP_KERNEL); if (!net->ipv6.fib6_main_tbl) goto out_fib_table_hash; net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), GFP_KERNEL); if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist); #endif fib6_tables_init(net); return 0; #ifdef CONFIG_IPV6_MULTIPLE_TABLES out_fib6_main_tbl: kfree(net->ipv6.fib6_main_tbl); #endif out_fib_table_hash: kfree(net->ipv6.fib_table_hash); out_rt6_stats: kfree(net->ipv6.rt6_stats); out_notifier: fib6_notifier_exit(net); return -ENOMEM; } static void fib6_net_exit(struct net *net) { unsigned int i; del_timer_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; struct hlist_node *tmp; struct fib6_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { hlist_del(&tb->tb6_hlist); fib6_free_table(tb); } } kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); fib6_notifier_exit(net); } static struct pernet_operations fib6_net_ops = { .init = fib6_net_init, .exit = fib6_net_exit, }; int __init fib6_init(void) { int ret = -ENOMEM; fib6_node_kmem = KMEM_CACHE(fib6_node, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT); if (!fib6_node_kmem) goto out; ret = register_pernet_subsys(&fib6_net_ops); if (ret) goto out_kmem_cache_create; ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib, RTNL_FLAG_DUMP_UNLOCKED | RTNL_FLAG_DUMP_SPLIT_NLM_DONE); if (ret) goto out_unregister_subsys; __fib6_flush_trees = fib6_flush_trees; out: return ret; out_unregister_subsys: unregister_pernet_subsys(&fib6_net_ops); out_kmem_cache_create: kmem_cache_destroy(fib6_node_kmem); goto out; } void fib6_gc_cleanup(void) { unregister_pernet_subsys(&fib6_net_ops); kmem_cache_destroy(fib6_node_kmem); } #ifdef CONFIG_PROC_FS static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; struct fib6_nh *fib6_nh = rt->fib6_nh; unsigned int flags = rt->fib6_flags; const struct net_device *dev; if (rt->nh) fib6_nh = nexthop_fib6_nh(rt->nh); seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen); #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif if (fib6_nh->fib_nh_gw_family) { flags |= RTF_GATEWAY; seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6); } else { seq_puts(seq, "00000000000000000000000000000000"); } dev = fib6_nh->fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, flags, dev ? dev->name : ""); iter->w.leaf = NULL; return 0; } static int ipv6_route_yield(struct fib6_walker *w) { struct ipv6_route_iter *iter = w->args; if (!iter->skip) return 1; do { iter->w.leaf = rcu_dereference_protected( iter->w.leaf->fib6_next, lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) return 1; } while (iter->w.leaf); return 0; } static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, struct net *net) { memset(&iter->w, 0, sizeof(iter->w)); iter->w.func = ipv6_route_yield; iter->w.root = &iter->tbl->tb6_root; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; iter->w.args = iter; iter->sernum = READ_ONCE(iter->w.root->fn_sernum); INIT_LIST_HEAD(&iter->w.lh); fib6_walker_link(net, &iter->w); } static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, struct net *net) { unsigned int h; struct hlist_node *node; if (tbl) { h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1; node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist)); } else { h = 0; node = NULL; } while (!node && h < FIB6_TABLE_HASHSZ) { node = rcu_dereference( hlist_first_rcu(&net->ipv6.fib_table_hash[h++])); } return hlist_entry_safe(node, struct fib6_table, tb6_hlist); } static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) { int sernum = READ_ONCE(iter->w.root->fn_sernum); if (iter->sernum != sernum) { iter->sernum = sernum; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; WARN_ON(iter->w.skip); iter->w.skip = iter->w.count; } } static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int r; struct fib6_info *n; struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; ++(*pos); if (!v) goto iter_table; n = rcu_dereference(((struct fib6_info *)v)->fib6_next); if (n) return n; iter_table: ipv6_route_check_sernum(iter); spin_lock_bh(&iter->tbl->tb6_lock); r = fib6_walk_continue(&iter->w); spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { return iter->w.leaf; } else if (r < 0) { fib6_walker_unlink(net, &iter->w); return NULL; } fib6_walker_unlink(net, &iter->w); iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); if (!iter->tbl) return NULL; ipv6_route_seq_setup_walk(iter, net); goto iter_table; } static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; rcu_read_lock(); iter->tbl = ipv6_route_seq_next_table(NULL, net); iter->skip = *pos; if (iter->tbl) { loff_t p = 0; ipv6_route_seq_setup_walk(iter, net); return ipv6_route_seq_next(seq, NULL, &p); } else { return NULL; } } static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) { struct fib6_walker *w = &iter->w; return w->node && !(w->state == FWS_U && w->node == w->root); } static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (ipv6_route_iter_active(iter)) fib6_walker_unlink(net, &iter->w); rcu_read_unlock(); } #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) static int ipv6_route_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) { struct bpf_iter__ipv6_route ctx; ctx.meta = meta; ctx.rt = v; return bpf_iter_run_prog(prog, &ctx); } static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct ipv6_route_iter *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; int ret; meta.seq = seq; prog = bpf_iter_get_info(&meta, false); if (!prog) return ipv6_route_native_seq_show(seq, v); ret = ipv6_route_prog_seq_show(prog, &meta, v); iter->w.leaf = NULL; return ret; } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)ipv6_route_prog_seq_show(prog, &meta, v); } ipv6_route_native_seq_stop(seq, v); } #else static int ipv6_route_seq_show(struct seq_file *seq, void *v) { return ipv6_route_native_seq_show(seq, v); } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { ipv6_route_native_seq_stop(seq, v); } #endif const struct seq_operations ipv6_route_seq_ops = { .start = ipv6_route_seq_start, .next = ipv6_route_seq_next, .stop = ipv6_route_seq_stop, .show = ipv6_route_seq_show }; #endif /* CONFIG_PROC_FS */ |
24 7 5 39 17 5 19 24 19 260 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 | /* SPDX-License-Identifier: GPL-2.0 */ /* * fscrypt_private.h * * Copyright (C) 2015, Google, Inc. * * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar. * Heavily modified since then. */ #ifndef _FSCRYPT_PRIVATE_H #define _FSCRYPT_PRIVATE_H #include <linux/fscrypt.h> #include <linux/siphash.h> #include <crypto/hash.h> #include <linux/blk-crypto.h> #define CONST_STRLEN(str) (sizeof(str) - 1) #define FSCRYPT_FILE_NONCE_SIZE 16 /* * Minimum size of an fscrypt master key. Note: a longer key will be required * if ciphers with a 256-bit security strength are used. This is just the * absolute minimum, which applies when only 128-bit encryption is used. */ #define FSCRYPT_MIN_KEY_SIZE 16 #define FSCRYPT_CONTEXT_V1 1 #define FSCRYPT_CONTEXT_V2 2 /* Keep this in sync with include/uapi/linux/fscrypt.h */ #define FSCRYPT_MODE_MAX FSCRYPT_MODE_AES_256_HCTR2 struct fscrypt_context_v1 { u8 version; /* FSCRYPT_CONTEXT_V1 */ u8 contents_encryption_mode; u8 filenames_encryption_mode; u8 flags; u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE]; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; struct fscrypt_context_v2 { u8 version; /* FSCRYPT_CONTEXT_V2 */ u8 contents_encryption_mode; u8 filenames_encryption_mode; u8 flags; u8 log2_data_unit_size; u8 __reserved[3]; u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; /* * fscrypt_context - the encryption context of an inode * * This is the on-disk equivalent of an fscrypt_policy, stored alongside each * encrypted file usually in a hidden extended attribute. It contains the * fields from the fscrypt_policy, in order to identify the encryption algorithm * and key with which the file is encrypted. It also contains a nonce that was * randomly generated by fscrypt itself; this is used as KDF input or as a tweak * to cause different files to be encrypted differently. */ union fscrypt_context { u8 version; struct fscrypt_context_v1 v1; struct fscrypt_context_v2 v2; }; /* * Return the size expected for the given fscrypt_context based on its version * number, or 0 if the context version is unrecognized. */ static inline int fscrypt_context_size(const union fscrypt_context *ctx) { switch (ctx->version) { case FSCRYPT_CONTEXT_V1: BUILD_BUG_ON(sizeof(ctx->v1) != 28); return sizeof(ctx->v1); case FSCRYPT_CONTEXT_V2: BUILD_BUG_ON(sizeof(ctx->v2) != 40); return sizeof(ctx->v2); } return 0; } /* Check whether an fscrypt_context has a recognized version number and size */ static inline bool fscrypt_context_is_valid(const union fscrypt_context *ctx, int ctx_size) { return ctx_size >= 1 && ctx_size == fscrypt_context_size(ctx); } /* Retrieve the context's nonce, assuming the context was already validated */ static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx) { switch (ctx->version) { case FSCRYPT_CONTEXT_V1: return ctx->v1.nonce; case FSCRYPT_CONTEXT_V2: return ctx->v2.nonce; } WARN_ON_ONCE(1); return NULL; } union fscrypt_policy { u8 version; struct fscrypt_policy_v1 v1; struct fscrypt_policy_v2 v2; }; /* * Return the size expected for the given fscrypt_policy based on its version * number, or 0 if the policy version is unrecognized. */ static inline int fscrypt_policy_size(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return sizeof(policy->v1); case FSCRYPT_POLICY_V2: return sizeof(policy->v2); } return 0; } /* Return the contents encryption mode of a valid encryption policy */ static inline u8 fscrypt_policy_contents_mode(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return policy->v1.contents_encryption_mode; case FSCRYPT_POLICY_V2: return policy->v2.contents_encryption_mode; } BUG(); } /* Return the filenames encryption mode of a valid encryption policy */ static inline u8 fscrypt_policy_fnames_mode(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return policy->v1.filenames_encryption_mode; case FSCRYPT_POLICY_V2: return policy->v2.filenames_encryption_mode; } BUG(); } /* Return the flags (FSCRYPT_POLICY_FLAG*) of a valid encryption policy */ static inline u8 fscrypt_policy_flags(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return policy->v1.flags; case FSCRYPT_POLICY_V2: return policy->v2.flags; } BUG(); } static inline int fscrypt_policy_v2_du_bits(const struct fscrypt_policy_v2 *policy, const struct inode *inode) { return policy->log2_data_unit_size ?: inode->i_blkbits; } static inline int fscrypt_policy_du_bits(const union fscrypt_policy *policy, const struct inode *inode) { switch (policy->version) { case FSCRYPT_POLICY_V1: return inode->i_blkbits; case FSCRYPT_POLICY_V2: return fscrypt_policy_v2_du_bits(&policy->v2, inode); } BUG(); } /* * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. */ struct fscrypt_symlink_data { __le16 len; char encrypted_path[]; } __packed; /** * struct fscrypt_prepared_key - a key prepared for actual encryption/decryption * @tfm: crypto API transform object * @blk_key: key for blk-crypto * * Normally only one of the fields will be non-NULL. */ struct fscrypt_prepared_key { struct crypto_skcipher *tfm; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT struct blk_crypto_key *blk_key; #endif }; /* * fscrypt_inode_info - the "encryption key" for an inode * * When an encrypted file's key is made available, an instance of this struct is * allocated and stored in ->i_crypt_info. Once created, it remains until the * inode is evicted. */ struct fscrypt_inode_info { /* The key in a form prepared for actual encryption/decryption */ struct fscrypt_prepared_key ci_enc_key; /* True if ci_enc_key should be freed when this struct is freed */ u8 ci_owns_key : 1; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT /* * True if this inode will use inline encryption (blk-crypto) instead of * the traditional filesystem-layer encryption. */ u8 ci_inlinecrypt : 1; #endif /* True if ci_dirhash_key is initialized */ u8 ci_dirhash_key_initialized : 1; /* * log2 of the data unit size (granularity of contents encryption) of * this file. This is computable from ci_policy and ci_inode but is * cached here for efficiency. Only used for regular files. */ u8 ci_data_unit_bits; /* Cached value: log2 of number of data units per FS block */ u8 ci_data_units_per_block_bits; /* Hashed inode number. Only set for IV_INO_LBLK_32 */ u32 ci_hashed_ino; /* * Encryption mode used for this inode. It corresponds to either the * contents or filenames encryption mode, depending on the inode type. */ struct fscrypt_mode *ci_mode; /* Back-pointer to the inode */ struct inode *ci_inode; /* * The master key with which this inode was unlocked (decrypted). This * will be NULL if the master key was found in a process-subscribed * keyring rather than in the filesystem-level keyring. */ struct fscrypt_master_key *ci_master_key; /* * Link in list of inodes that were unlocked with the master key. * Only used when ->ci_master_key is set. */ struct list_head ci_master_key_link; /* * If non-NULL, then encryption is done using the master key directly * and ci_enc_key will equal ci_direct_key->dk_key. */ struct fscrypt_direct_key *ci_direct_key; /* * This inode's hash key for filenames. This is a 128-bit SipHash-2-4 * key. This is only set for directories that use a keyed dirhash over * the plaintext filenames -- currently just casefolded directories. */ siphash_key_t ci_dirhash_key; /* The encryption policy used by this inode */ union fscrypt_policy ci_policy; /* This inode's nonce, copied from the fscrypt_context */ u8 ci_nonce[FSCRYPT_FILE_NONCE_SIZE]; }; typedef enum { FS_DECRYPT = 0, FS_ENCRYPT, } fscrypt_direction_t; /* crypto.c */ extern struct kmem_cache *fscrypt_inode_info_cachep; int fscrypt_initialize(struct super_block *sb); int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci, fscrypt_direction_t rw, u64 index, struct page *src_page, struct page *dest_page, unsigned int len, unsigned int offs, gfp_t gfp_flags); struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); void __printf(3, 4) __cold fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...); #define fscrypt_warn(inode, fmt, ...) \ fscrypt_msg((inode), KERN_WARNING, fmt, ##__VA_ARGS__) #define fscrypt_err(inode, fmt, ...) \ fscrypt_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__) #define FSCRYPT_MAX_IV_SIZE 32 union fscrypt_iv { struct { /* zero-based index of data unit within the file */ __le64 index; /* per-file nonce; only set in DIRECT_KEY mode */ u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; u8 raw[FSCRYPT_MAX_IV_SIZE]; __le64 dun[FSCRYPT_MAX_IV_SIZE / sizeof(__le64)]; }; void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index, const struct fscrypt_inode_info *ci); /* * Return the number of bits used by the maximum file data unit index that is * possible on the given filesystem, using the given log2 data unit size. */ static inline int fscrypt_max_file_dun_bits(const struct super_block *sb, int du_bits) { return fls64(sb->s_maxbytes - 1) - du_bits; } /* fname.c */ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); /* hkdf.c */ struct fscrypt_hkdf { struct crypto_shash *hmac_tfm; }; int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, unsigned int master_key_size); /* * The list of contexts in which fscrypt uses HKDF. These values are used as * the first byte of the HKDF application-specific info string to guarantee that * info strings are never repeated between contexts. This ensures that all HKDF * outputs are unique and cryptographically isolated, i.e. knowledge of one * output doesn't reveal another. */ #define HKDF_CONTEXT_KEY_IDENTIFIER 1 /* info=<empty> */ #define HKDF_CONTEXT_PER_FILE_ENC_KEY 2 /* info=file_nonce */ #define HKDF_CONTEXT_DIRECT_KEY 3 /* info=mode_num */ #define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 /* info=mode_num||fs_uuid */ #define HKDF_CONTEXT_DIRHASH_KEY 5 /* info=file_nonce */ #define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6 /* info=mode_num||fs_uuid */ #define HKDF_CONTEXT_INODE_HASH_KEY 7 /* info=<empty> */ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, u8 *okm, unsigned int okmlen); void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci); static inline bool fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci) { return ci->ci_inlinecrypt; } int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_inode_info *ci); void fscrypt_destroy_inline_crypt_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key); /* * Check whether the crypto transform or blk-crypto key has been allocated in * @prep_key, depending on which encryption implementation the file will use. */ static inline bool fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, const struct fscrypt_inode_info *ci) { /* * The two smp_load_acquire()'s here pair with the smp_store_release()'s * in fscrypt_prepare_inline_crypt_key() and fscrypt_prepare_key(). * I.e., in some cases (namely, if this prep_key is a per-mode * encryption key) another task can publish blk_key or tfm concurrently, * executing a RELEASE barrier. We need to use smp_load_acquire() here * to safely ACQUIRE the memory the other task published. */ if (fscrypt_using_inline_encryption(ci)) return smp_load_acquire(&prep_key->blk_key) != NULL; return smp_load_acquire(&prep_key->tfm) != NULL; } #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ static inline int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci) { return 0; } static inline bool fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci) { return false; } static inline int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_inode_info *ci) { WARN_ON_ONCE(1); return -EOPNOTSUPP; } static inline void fscrypt_destroy_inline_crypt_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key) { } static inline bool fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, const struct fscrypt_inode_info *ci) { return smp_load_acquire(&prep_key->tfm) != NULL; } #endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ /* keyring.c */ /* * fscrypt_master_key_secret - secret key material of an in-use master key */ struct fscrypt_master_key_secret { /* * For v2 policy keys: HKDF context keyed by this master key. * For v1 policy keys: not set (hkdf.hmac_tfm == NULL). */ struct fscrypt_hkdf hkdf; /* * Size of the raw key in bytes. This remains set even if ->raw was * zeroized due to no longer being needed. I.e. we still remember the * size of the key even if we don't need to remember the key itself. */ u32 size; /* For v1 policy keys: the raw key. Wiped for v2 policy keys. */ u8 raw[FSCRYPT_MAX_KEY_SIZE]; } __randomize_layout; /* * fscrypt_master_key - an in-use master key * * This represents a master encryption key which has been added to the * filesystem. There are three high-level states that a key can be in: * * FSCRYPT_KEY_STATUS_PRESENT * Key is fully usable; it can be used to unlock inodes that are encrypted * with it (this includes being able to create new inodes). ->mk_present * indicates whether the key is in this state. ->mk_secret exists, the key * is in the keyring, and ->mk_active_refs > 0 due to ->mk_present. * * FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED * Removal of this key has been initiated, but some inodes that were * unlocked with it are still in-use. Like ABSENT, ->mk_secret is wiped, * and the key can no longer be used to unlock inodes. Unlike ABSENT, the * key is still in the keyring; ->mk_decrypted_inodes is nonempty; and * ->mk_active_refs > 0, being equal to the size of ->mk_decrypted_inodes. * * This state transitions to ABSENT if ->mk_decrypted_inodes becomes empty, * or to PRESENT if FS_IOC_ADD_ENCRYPTION_KEY is called again for this key. * * FSCRYPT_KEY_STATUS_ABSENT * Key is fully removed. The key is no longer in the keyring, * ->mk_decrypted_inodes is empty, ->mk_active_refs == 0, ->mk_secret is * wiped, and the key can no longer be used to unlock inodes. */ struct fscrypt_master_key { /* * Link in ->s_master_keys->key_hashtable. * Only valid if ->mk_active_refs > 0. */ struct hlist_node mk_node; /* Semaphore that protects ->mk_secret, ->mk_users, and ->mk_present */ struct rw_semaphore mk_sem; /* * Active and structural reference counts. An active ref guarantees * that the struct continues to exist, continues to be in the keyring * ->s_master_keys, and that any embedded subkeys (e.g. * ->mk_direct_keys) that have been prepared continue to exist. * A structural ref only guarantees that the struct continues to exist. * * There is one active ref associated with ->mk_present being true, and * one active ref for each inode in ->mk_decrypted_inodes. * * There is one structural ref associated with the active refcount being * nonzero. Finding a key in the keyring also takes a structural ref, * which is then held temporarily while the key is operated on. */ refcount_t mk_active_refs; refcount_t mk_struct_refs; struct rcu_head mk_rcu_head; /* * The secret key material. Wiped as soon as it is no longer needed; * for details, see the fscrypt_master_key struct comment. * * Locking: protected by ->mk_sem. */ struct fscrypt_master_key_secret mk_secret; /* * For v1 policy keys: an arbitrary key descriptor which was assigned by * userspace (->descriptor). * * For v2 policy keys: a cryptographic hash of this key (->identifier). */ struct fscrypt_key_specifier mk_spec; /* * Keyring which contains a key of type 'key_type_fscrypt_user' for each * user who has added this key. Normally each key will be added by just * one user, but it's possible that multiple users share a key, and in * that case we need to keep track of those users so that one user can't * remove the key before the others want it removed too. * * This is NULL for v1 policy keys; those can only be added by root. * * Locking: protected by ->mk_sem. (We don't just rely on the keyrings * subsystem semaphore ->mk_users->sem, as we need support for atomic * search+insert along with proper synchronization with other fields.) */ struct key *mk_users; /* * List of inodes that were unlocked using this key. This allows the * inodes to be evicted efficiently if the key is removed. */ struct list_head mk_decrypted_inodes; spinlock_t mk_decrypted_inodes_lock; /* * Per-mode encryption keys for the various types of encryption policies * that use them. Allocated and derived on-demand. */ struct fscrypt_prepared_key mk_direct_keys[FSCRYPT_MODE_MAX + 1]; struct fscrypt_prepared_key mk_iv_ino_lblk_64_keys[FSCRYPT_MODE_MAX + 1]; struct fscrypt_prepared_key mk_iv_ino_lblk_32_keys[FSCRYPT_MODE_MAX + 1]; /* Hash key for inode numbers. Initialized only when needed. */ siphash_key_t mk_ino_hash_key; bool mk_ino_hash_key_initialized; /* * Whether this key is in the "present" state, i.e. fully usable. For * details, see the fscrypt_master_key struct comment. * * Locking: protected by ->mk_sem, but can be read locklessly using * READ_ONCE(). Writers must use WRITE_ONCE() when concurrent readers * are possible. */ bool mk_present; } __randomize_layout; static inline const char *master_key_spec_type( const struct fscrypt_key_specifier *spec) { switch (spec->type) { case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: return "descriptor"; case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: return "identifier"; } return "[unknown]"; } static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) { switch (spec->type) { case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: return FSCRYPT_KEY_DESCRIPTOR_SIZE; case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: return FSCRYPT_KEY_IDENTIFIER_SIZE; } return 0; } void fscrypt_put_master_key(struct fscrypt_master_key *mk); void fscrypt_put_master_key_activeref(struct super_block *sb, struct fscrypt_master_key *mk); struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); int fscrypt_get_test_dummy_key_identifier( u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); int fscrypt_add_test_dummy_key(struct super_block *sb, struct fscrypt_key_specifier *key_spec); int fscrypt_verify_key_added(struct super_block *sb, const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); int __init fscrypt_init_keyring(void); /* keysetup.c */ struct fscrypt_mode { const char *friendly_name; const char *cipher_str; int keysize; /* key size in bytes */ int security_strength; /* security strength in bytes */ int ivsize; /* IV size in bytes */ int logged_cryptoapi_impl; int logged_blk_crypto_native; int logged_blk_crypto_fallback; enum blk_crypto_mode_num blk_crypto_mode; }; extern struct fscrypt_mode fscrypt_modes[]; int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_inode_info *ci); void fscrypt_destroy_prepared_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key); int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci, const u8 *raw_key); int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk); void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk); int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported); /** * fscrypt_require_key() - require an inode's encryption key * @inode: the inode we need the key for * * If the inode is encrypted, set up its encryption key if not already done. * Then require that the key be present and return -ENOKEY otherwise. * * No locks are needed, and the key will live as long as the struct inode --- so * it won't go away from under you. * * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code * if a problem occurred while setting up the encryption key. */ static inline int fscrypt_require_key(struct inode *inode) { if (IS_ENCRYPTED(inode)) { int err = fscrypt_get_encryption_info(inode, false); if (err) return err; if (!fscrypt_has_encryption_key(inode)) return -ENOKEY; } return 0; } /* keysetup_v1.c */ void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci, const u8 *raw_master_key); int fscrypt_setup_v1_file_key_via_subscribed_keyrings( struct fscrypt_inode_info *ci); /* policy.c */ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, const union fscrypt_policy *policy2); int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, struct fscrypt_key_specifier *key_spec); const union fscrypt_policy *fscrypt_get_dummy_policy(struct super_block *sb); bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, const struct inode *inode); int fscrypt_policy_from_context(union fscrypt_policy *policy_u, const union fscrypt_context *ctx_u, int ctx_size); const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir); #endif /* _FSCRYPT_PRIVATE_H */ |
1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 | // SPDX-License-Identifier: GPL-2.0 /* * (C) 2001 Clemson University and The University of Chicago * * Changes by Acxiom Corporation to add protocol version to kernel * communication, Copyright Acxiom Corporation, 2005. * * See COPYING in top-level directory. */ #include "protocol.h" #include "orangefs-kernel.h" #include "orangefs-dev-proto.h" #include "orangefs-bufmap.h" #include "orangefs-debugfs.h" #include <linux/debugfs.h> #include <linux/slab.h> /* this file implements the /dev/pvfs2-req device node */ uint32_t orangefs_userspace_version; static int open_access_count; static DEFINE_MUTEX(devreq_mutex); #define DUMP_DEVICE_ERROR() \ do { \ gossip_err("*****************************************************\n");\ gossip_err("ORANGEFS Device Error: You cannot open the device file "); \ gossip_err("\n/dev/%s more than once. Please make sure that\nthere " \ "are no ", ORANGEFS_REQDEVICE_NAME); \ gossip_err("instances of a program using this device\ncurrently " \ "running. (You must verify this!)\n"); \ gossip_err("For example, you can use the lsof program as follows:\n");\ gossip_err("'lsof | grep %s' (run this as root)\n", \ ORANGEFS_REQDEVICE_NAME); \ gossip_err(" open_access_count = %d\n", open_access_count); \ gossip_err("*****************************************************\n");\ } while (0) static int hash_func(__u64 tag, int table_size) { return do_div(tag, (unsigned int)table_size); } static void orangefs_devreq_add_op(struct orangefs_kernel_op_s *op) { int index = hash_func(op->tag, hash_table_size); list_add_tail(&op->list, &orangefs_htable_ops_in_progress[index]); } /* * find the op with this tag and remove it from the in progress * hash table. */ static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag) { struct orangefs_kernel_op_s *op, *next; int index; index = hash_func(tag, hash_table_size); spin_lock(&orangefs_htable_ops_in_progress_lock); list_for_each_entry_safe(op, next, &orangefs_htable_ops_in_progress[index], list) { if (op->tag == tag && !op_state_purged(op) && !op_state_given_up(op)) { list_del_init(&op->list); spin_unlock(&orangefs_htable_ops_in_progress_lock); return op; } } spin_unlock(&orangefs_htable_ops_in_progress_lock); return NULL; } /* Returns whether any FS are still pending remounted */ static int mark_all_pending_mounts(void) { int unmounted = 1; struct orangefs_sb_info_s *orangefs_sb = NULL; spin_lock(&orangefs_superblocks_lock); list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) { /* All of these file system require a remount */ orangefs_sb->mount_pending = 1; unmounted = 0; } spin_unlock(&orangefs_superblocks_lock); return unmounted; } /* * Determine if a given file system needs to be remounted or not * Returns -1 on error * 0 if already mounted * 1 if needs remount */ static int fs_mount_pending(__s32 fsid) { int mount_pending = -1; struct orangefs_sb_info_s *orangefs_sb = NULL; spin_lock(&orangefs_superblocks_lock); list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) { if (orangefs_sb->fs_id == fsid) { mount_pending = orangefs_sb->mount_pending; break; } } spin_unlock(&orangefs_superblocks_lock); return mount_pending; } static int orangefs_devreq_open(struct inode *inode, struct file *file) { int ret = -EINVAL; /* in order to ensure that the filesystem driver sees correct UIDs */ if (file->f_cred->user_ns != &init_user_ns) { gossip_err("%s: device cannot be opened outside init_user_ns\n", __func__); goto out; } if (!(file->f_flags & O_NONBLOCK)) { gossip_err("%s: device cannot be opened in blocking mode\n", __func__); goto out; } ret = -EACCES; gossip_debug(GOSSIP_DEV_DEBUG, "client-core: opening device\n"); mutex_lock(&devreq_mutex); if (open_access_count == 0) { open_access_count = 1; ret = 0; } else { DUMP_DEVICE_ERROR(); } mutex_unlock(&devreq_mutex); out: gossip_debug(GOSSIP_DEV_DEBUG, "pvfs2-client-core: open device complete (ret = %d)\n", ret); return ret; } /* Function for read() callers into the device */ static ssize_t orangefs_devreq_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { struct orangefs_kernel_op_s *op, *temp; __s32 proto_ver = ORANGEFS_KERNEL_PROTO_VERSION; static __s32 magic = ORANGEFS_DEVREQ_MAGIC; struct orangefs_kernel_op_s *cur_op; unsigned long ret; /* We do not support blocking IO. */ if (!(file->f_flags & O_NONBLOCK)) { gossip_err("%s: blocking read from client-core.\n", __func__); return -EINVAL; } /* * The client will do an ioctl to find MAX_DEV_REQ_UPSIZE, then * always read with that size buffer. */ if (count != MAX_DEV_REQ_UPSIZE) { gossip_err("orangefs: client-core tried to read wrong size\n"); return -EINVAL; } /* Check for an empty list before locking. */ if (list_empty(&orangefs_request_list)) return -EAGAIN; restart: cur_op = NULL; /* Get next op (if any) from top of list. */ spin_lock(&orangefs_request_list_lock); list_for_each_entry_safe(op, temp, &orangefs_request_list, list) { __s32 fsid; /* This lock is held past the end of the loop when we break. */ spin_lock(&op->lock); if (unlikely(op_state_purged(op) || op_state_given_up(op))) { spin_unlock(&op->lock); continue; } fsid = fsid_of_op(op); if (fsid != ORANGEFS_FS_ID_NULL) { int ret; /* Skip ops whose filesystem needs to be mounted. */ ret = fs_mount_pending(fsid); if (ret == 1) { gossip_debug(GOSSIP_DEV_DEBUG, "%s: mount pending, skipping op tag " "%llu %s\n", __func__, llu(op->tag), get_opname_string(op)); spin_unlock(&op->lock); continue; /* * Skip ops whose filesystem we don't know about unless * it is being mounted or unmounted. It is possible for * a filesystem we don't know about to be unmounted if * it fails to mount in the kernel after userspace has * been sent the mount request. */ /* XXX: is there a better way to detect this? */ } else if (ret == -1 && !(op->upcall.type == ORANGEFS_VFS_OP_FS_MOUNT || op->upcall.type == ORANGEFS_VFS_OP_GETATTR || op->upcall.type == ORANGEFS_VFS_OP_FS_UMOUNT)) { gossip_debug(GOSSIP_DEV_DEBUG, "orangefs: skipping op tag %llu %s\n", llu(op->tag), get_opname_string(op)); gossip_err( "orangefs: ERROR: fs_mount_pending %d\n", fsid); spin_unlock(&op->lock); continue; } } /* * Either this op does not pertain to a filesystem, is mounting * a filesystem, or pertains to a mounted filesystem. Let it * through. */ cur_op = op; break; } /* * At this point we either have a valid op and can continue or have not * found an op and must ask the client to try again later. */ if (!cur_op) { spin_unlock(&orangefs_request_list_lock); return -EAGAIN; } gossip_debug(GOSSIP_DEV_DEBUG, "%s: reading op tag %llu %s\n", __func__, llu(cur_op->tag), get_opname_string(cur_op)); /* * Such an op should never be on the list in the first place. If so, we * will abort. */ if (op_state_in_progress(cur_op) || op_state_serviced(cur_op)) { gossip_err("orangefs: ERROR: Current op already queued.\n"); list_del_init(&cur_op->list); spin_unlock(&cur_op->lock); spin_unlock(&orangefs_request_list_lock); return -EAGAIN; } list_del_init(&cur_op->list); spin_unlock(&orangefs_request_list_lock); spin_unlock(&cur_op->lock); /* Push the upcall out. */ ret = copy_to_user(buf, &proto_ver, sizeof(__s32)); if (ret != 0) goto error; ret = copy_to_user(buf + sizeof(__s32), &magic, sizeof(__s32)); if (ret != 0) goto error; ret = copy_to_user(buf + 2 * sizeof(__s32), &cur_op->tag, sizeof(__u64)); if (ret != 0) goto error; ret = copy_to_user(buf + 2 * sizeof(__s32) + sizeof(__u64), &cur_op->upcall, sizeof(struct orangefs_upcall_s)); if (ret != 0) goto error; spin_lock(&orangefs_htable_ops_in_progress_lock); spin_lock(&cur_op->lock); if (unlikely(op_state_given_up(cur_op))) { spin_unlock(&cur_op->lock); spin_unlock(&orangefs_htable_ops_in_progress_lock); complete(&cur_op->waitq); goto restart; } /* * Set the operation to be in progress and move it between lists since * it has been sent to the client. */ set_op_state_inprogress(cur_op); gossip_debug(GOSSIP_DEV_DEBUG, "%s: 1 op:%s: op_state:%d: process:%s:\n", __func__, get_opname_string(cur_op), cur_op->op_state, current->comm); orangefs_devreq_add_op(cur_op); spin_unlock(&cur_op->lock); spin_unlock(&orangefs_htable_ops_in_progress_lock); /* The client only asks to read one size buffer. */ return MAX_DEV_REQ_UPSIZE; error: /* * We were unable to copy the op data to the client. Put the op back in * list. If client has crashed, the op will be purged later when the * device is released. */ gossip_err("orangefs: Failed to copy data to user space\n"); spin_lock(&orangefs_request_list_lock); spin_lock(&cur_op->lock); if (likely(!op_state_given_up(cur_op))) { set_op_state_waiting(cur_op); gossip_debug(GOSSIP_DEV_DEBUG, "%s: 2 op:%s: op_state:%d: process:%s:\n", __func__, get_opname_string(cur_op), cur_op->op_state, current->comm); list_add(&cur_op->list, &orangefs_request_list); spin_unlock(&cur_op->lock); } else { spin_unlock(&cur_op->lock); complete(&cur_op->waitq); } spin_unlock(&orangefs_request_list_lock); return -EFAULT; } /* * Function for writev() callers into the device. * * Userspace should have written: * - __u32 version * - __u32 magic * - __u64 tag * - struct orangefs_downcall_s * - trailer buffer (in the case of READDIR operations) */ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, struct iov_iter *iter) { ssize_t ret; struct orangefs_kernel_op_s *op = NULL; struct { __u32 version; __u32 magic; __u64 tag; } head; int total = ret = iov_iter_count(iter); int downcall_size = sizeof(struct orangefs_downcall_s); int head_size = sizeof(head); gossip_debug(GOSSIP_DEV_DEBUG, "%s: total:%d: ret:%zd:\n", __func__, total, ret); if (total < MAX_DEV_REQ_DOWNSIZE) { gossip_err("%s: total:%d: must be at least:%u:\n", __func__, total, (unsigned int) MAX_DEV_REQ_DOWNSIZE); return -EFAULT; } if (!copy_from_iter_full(&head, head_size, iter)) { gossip_err("%s: failed to copy head.\n", __func__); return -EFAULT; } if (head.version < ORANGEFS_MINIMUM_USERSPACE_VERSION) { gossip_err("%s: userspace claims version" "%d, minimum version required: %d.\n", __func__, head.version, ORANGEFS_MINIMUM_USERSPACE_VERSION); return -EPROTO; } if (head.magic != ORANGEFS_DEVREQ_MAGIC) { gossip_err("Error: Device magic number does not match.\n"); return -EPROTO; } if (!orangefs_userspace_version) { orangefs_userspace_version = head.version; } else if (orangefs_userspace_version != head.version) { gossip_err("Error: userspace version changes\n"); return -EPROTO; } /* remove the op from the in progress hash table */ op = orangefs_devreq_remove_op(head.tag); if (!op) { gossip_debug(GOSSIP_DEV_DEBUG, "%s: No one's waiting for tag %llu\n", __func__, llu(head.tag)); return ret; } if (!copy_from_iter_full(&op->downcall, downcall_size, iter)) { gossip_err("%s: failed to copy downcall.\n", __func__); goto Efault; } if (op->downcall.status) goto wakeup; /* * We've successfully peeled off the head and the downcall. * Something has gone awry if total doesn't equal the * sum of head_size, downcall_size and trailer_size. */ if ((head_size + downcall_size + op->downcall.trailer_size) != total) { gossip_err("%s: funky write, head_size:%d" ": downcall_size:%d: trailer_size:%lld" ": total size:%d:\n", __func__, head_size, downcall_size, op->downcall.trailer_size, total); goto Efault; } /* Only READDIR operations should have trailers. */ if ((op->downcall.type != ORANGEFS_VFS_OP_READDIR) && (op->downcall.trailer_size != 0)) { gossip_err("%s: %x operation with trailer.", __func__, op->downcall.type); goto Efault; } /* READDIR operations should always have trailers. */ if ((op->downcall.type == ORANGEFS_VFS_OP_READDIR) && (op->downcall.trailer_size == 0)) { gossip_err("%s: %x operation with no trailer.", __func__, op->downcall.type); goto Efault; } if (op->downcall.type != ORANGEFS_VFS_OP_READDIR) goto wakeup; op->downcall.trailer_buf = vzalloc(op->downcall.trailer_size); if (!op->downcall.trailer_buf) goto Enomem; if (!copy_from_iter_full(op->downcall.trailer_buf, op->downcall.trailer_size, iter)) { gossip_err("%s: failed to copy trailer.\n", __func__); vfree(op->downcall.trailer_buf); goto Efault; } wakeup: /* * Return to vfs waitqueue, and back to service_operation * through wait_for_matching_downcall. */ spin_lock(&op->lock); if (unlikely(op_is_cancel(op))) { spin_unlock(&op->lock); put_cancel(op); } else if (unlikely(op_state_given_up(op))) { spin_unlock(&op->lock); complete(&op->waitq); } else { set_op_state_serviced(op); gossip_debug(GOSSIP_DEV_DEBUG, "%s: op:%s: op_state:%d: process:%s:\n", __func__, get_opname_string(op), op->op_state, current->comm); spin_unlock(&op->lock); } return ret; Efault: op->downcall.status = -(ORANGEFS_ERROR_BIT | 9); ret = -EFAULT; goto wakeup; Enomem: op->downcall.status = -(ORANGEFS_ERROR_BIT | 8); ret = -ENOMEM; goto wakeup; } /* * NOTE: gets called when the last reference to this device is dropped. * Using the open_access_count variable, we enforce a reference count * on this file so that it can be opened by only one process at a time. * the devreq_mutex is used to make sure all i/o has completed * before we call orangefs_bufmap_finalize, and similar such tricky * situations */ static int orangefs_devreq_release(struct inode *inode, struct file *file) { int unmounted = 0; gossip_debug(GOSSIP_DEV_DEBUG, "%s:pvfs2-client-core: exiting, closing device\n", __func__); mutex_lock(&devreq_mutex); orangefs_bufmap_finalize(); open_access_count = -1; unmounted = mark_all_pending_mounts(); gossip_debug(GOSSIP_DEV_DEBUG, "ORANGEFS Device Close: Filesystem(s) %s\n", (unmounted ? "UNMOUNTED" : "MOUNTED")); purge_waiting_ops(); purge_inprogress_ops(); orangefs_bufmap_run_down(); gossip_debug(GOSSIP_DEV_DEBUG, "pvfs2-client-core: device close complete\n"); open_access_count = 0; orangefs_userspace_version = 0; mutex_unlock(&devreq_mutex); return 0; } int is_daemon_in_service(void) { int in_service; /* * What this function does is checks if client-core is alive * based on the access count we maintain on the device. */ mutex_lock(&devreq_mutex); in_service = open_access_count == 1 ? 0 : -EIO; mutex_unlock(&devreq_mutex); return in_service; } bool __is_daemon_in_service(void) { return open_access_count == 1; } static inline long check_ioctl_command(unsigned int command) { /* Check for valid ioctl codes */ if (_IOC_TYPE(command) != ORANGEFS_DEV_MAGIC) { gossip_err("device ioctl magic numbers don't match! Did you rebuild pvfs2-client-core/libpvfs2? [cmd %x, magic %x != %x]\n", command, _IOC_TYPE(command), ORANGEFS_DEV_MAGIC); return -EINVAL; } /* and valid ioctl commands */ if (_IOC_NR(command) >= ORANGEFS_DEV_MAXNR || _IOC_NR(command) <= 0) { gossip_err("Invalid ioctl command number [%d >= %d]\n", _IOC_NR(command), ORANGEFS_DEV_MAXNR); return -ENOIOCTLCMD; } return 0; } static long dispatch_ioctl_command(unsigned int command, unsigned long arg) { static __s32 magic = ORANGEFS_DEVREQ_MAGIC; static __s32 max_up_size = MAX_DEV_REQ_UPSIZE; static __s32 max_down_size = MAX_DEV_REQ_DOWNSIZE; struct ORANGEFS_dev_map_desc user_desc; int ret = 0; int upstream_kmod = 1; struct orangefs_sb_info_s *orangefs_sb; /* mtmoore: add locking here */ switch (command) { case ORANGEFS_DEV_GET_MAGIC: return ((put_user(magic, (__s32 __user *) arg) == -EFAULT) ? -EIO : 0); case ORANGEFS_DEV_GET_MAX_UPSIZE: return ((put_user(max_up_size, (__s32 __user *) arg) == -EFAULT) ? -EIO : 0); case ORANGEFS_DEV_GET_MAX_DOWNSIZE: return ((put_user(max_down_size, (__s32 __user *) arg) == -EFAULT) ? -EIO : 0); case ORANGEFS_DEV_MAP: ret = copy_from_user(&user_desc, (struct ORANGEFS_dev_map_desc __user *) arg, sizeof(struct ORANGEFS_dev_map_desc)); /* WTF -EIO and not -EFAULT? */ return ret ? -EIO : orangefs_bufmap_initialize(&user_desc); case ORANGEFS_DEV_REMOUNT_ALL: gossip_debug(GOSSIP_DEV_DEBUG, "%s: got ORANGEFS_DEV_REMOUNT_ALL\n", __func__); /* * remount all mounted orangefs volumes to regain the lost * dynamic mount tables (if any) -- NOTE: this is done * without keeping the superblock list locked due to the * upcall/downcall waiting. also, the request mutex is * used to ensure that no operations will be serviced until * all of the remounts are serviced (to avoid ops between * mounts to fail) */ ret = mutex_lock_interruptible(&orangefs_request_mutex); if (ret < 0) return ret; gossip_debug(GOSSIP_DEV_DEBUG, "%s: priority remount in progress\n", __func__); spin_lock(&orangefs_superblocks_lock); list_for_each_entry(orangefs_sb, &orangefs_superblocks, list) { /* * We have to drop the spinlock, so entries can be * removed. They can't be freed, though, so we just * keep the forward pointers and zero the back ones - * that way we can get to the rest of the list. */ if (!orangefs_sb->list.prev) continue; gossip_debug(GOSSIP_DEV_DEBUG, "%s: Remounting SB %p\n", __func__, orangefs_sb); spin_unlock(&orangefs_superblocks_lock); ret = orangefs_remount(orangefs_sb); spin_lock(&orangefs_superblocks_lock); if (ret) { gossip_debug(GOSSIP_DEV_DEBUG, "SB %p remount failed\n", orangefs_sb); break; } } spin_unlock(&orangefs_superblocks_lock); gossip_debug(GOSSIP_DEV_DEBUG, "%s: priority remount complete\n", __func__); mutex_unlock(&orangefs_request_mutex); return ret; case ORANGEFS_DEV_UPSTREAM: ret = copy_to_user((void __user *)arg, &upstream_kmod, sizeof(upstream_kmod)); if (ret != 0) return -EIO; else return ret; case ORANGEFS_DEV_CLIENT_MASK: return orangefs_debugfs_new_client_mask((void __user *)arg); case ORANGEFS_DEV_CLIENT_STRING: return orangefs_debugfs_new_client_string((void __user *)arg); case ORANGEFS_DEV_DEBUG: return orangefs_debugfs_new_debug((void __user *)arg); default: return -ENOIOCTLCMD; } return -ENOIOCTLCMD; } static long orangefs_devreq_ioctl(struct file *file, unsigned int command, unsigned long arg) { long ret; /* Check for properly constructed commands */ ret = check_ioctl_command(command); if (ret < 0) return (int)ret; return (int)dispatch_ioctl_command(command, arg); } #ifdef CONFIG_COMPAT /* CONFIG_COMPAT is in .config */ /* Compat structure for the ORANGEFS_DEV_MAP ioctl */ struct ORANGEFS_dev_map_desc32 { compat_uptr_t ptr; __s32 total_size; __s32 size; __s32 count; }; /* * 32 bit user-space apps' ioctl handlers when kernel modules * is compiled as a 64 bit one */ static long orangefs_devreq_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long args) { long ret; /* Check for properly constructed commands */ ret = check_ioctl_command(cmd); if (ret < 0) return ret; if (cmd == ORANGEFS_DEV_MAP) { struct ORANGEFS_dev_map_desc desc; struct ORANGEFS_dev_map_desc32 d32; if (copy_from_user(&d32, (void __user *)args, sizeof(d32))) return -EFAULT; desc.ptr = compat_ptr(d32.ptr); desc.total_size = d32.total_size; desc.size = d32.size; desc.count = d32.count; return orangefs_bufmap_initialize(&desc); } /* no other ioctl requires translation */ return dispatch_ioctl_command(cmd, args); } #endif /* CONFIG_COMPAT is in .config */ static __poll_t orangefs_devreq_poll(struct file *file, struct poll_table_struct *poll_table) { __poll_t poll_revent_mask = 0; poll_wait(file, &orangefs_request_list_waitq, poll_table); if (!list_empty(&orangefs_request_list)) poll_revent_mask |= EPOLLIN; return poll_revent_mask; } /* the assigned character device major number */ static int orangefs_dev_major; static const struct file_operations orangefs_devreq_file_operations = { .owner = THIS_MODULE, .read = orangefs_devreq_read, .write_iter = orangefs_devreq_write_iter, .open = orangefs_devreq_open, .release = orangefs_devreq_release, .unlocked_ioctl = orangefs_devreq_ioctl, #ifdef CONFIG_COMPAT /* CONFIG_COMPAT is in .config */ .compat_ioctl = orangefs_devreq_compat_ioctl, #endif .poll = orangefs_devreq_poll }; /* * Initialize orangefs device specific state: * Must be called at module load time only */ int orangefs_dev_init(void) { /* register orangefs-req device */ orangefs_dev_major = register_chrdev(0, ORANGEFS_REQDEVICE_NAME, &orangefs_devreq_file_operations); if (orangefs_dev_major < 0) { gossip_debug(GOSSIP_DEV_DEBUG, "Failed to register /dev/%s (error %d)\n", ORANGEFS_REQDEVICE_NAME, orangefs_dev_major); return orangefs_dev_major; } gossip_debug(GOSSIP_DEV_DEBUG, "*** /dev/%s character device registered ***\n", ORANGEFS_REQDEVICE_NAME); gossip_debug(GOSSIP_DEV_DEBUG, "'mknod /dev/%s c %d 0'.\n", ORANGEFS_REQDEVICE_NAME, orangefs_dev_major); return 0; } void orangefs_dev_cleanup(void) { unregister_chrdev(orangefs_dev_major, ORANGEFS_REQDEVICE_NAME); gossip_debug(GOSSIP_DEV_DEBUG, "*** /dev/%s character device unregistered ***\n", ORANGEFS_REQDEVICE_NAME); } |
3782 3776 3780 3784 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | // SPDX-License-Identifier: GPL-2.0-or-later /* * The "hash function" used as the core of the ChaCha stream cipher (RFC7539) * * Copyright (C) 2015 Martin Willi */ #include <linux/bug.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/bitops.h> #include <linux/string.h> #include <asm/unaligned.h> #include <crypto/chacha.h> static void chacha_permute(u32 *x, int nrounds) { int i; /* whitelist the allowed round counts */ WARN_ON_ONCE(nrounds != 20 && nrounds != 12); for (i = 0; i < nrounds; i += 2) { x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 16); x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 16); x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 16); x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 16); x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 12); x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 12); x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 12); x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 12); x[0] += x[4]; x[12] = rol32(x[12] ^ x[0], 8); x[1] += x[5]; x[13] = rol32(x[13] ^ x[1], 8); x[2] += x[6]; x[14] = rol32(x[14] ^ x[2], 8); x[3] += x[7]; x[15] = rol32(x[15] ^ x[3], 8); x[8] += x[12]; x[4] = rol32(x[4] ^ x[8], 7); x[9] += x[13]; x[5] = rol32(x[5] ^ x[9], 7); x[10] += x[14]; x[6] = rol32(x[6] ^ x[10], 7); x[11] += x[15]; x[7] = rol32(x[7] ^ x[11], 7); x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 16); x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 16); x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 16); x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 16); x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 12); x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 12); x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 12); x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 12); x[0] += x[5]; x[15] = rol32(x[15] ^ x[0], 8); x[1] += x[6]; x[12] = rol32(x[12] ^ x[1], 8); x[2] += x[7]; x[13] = rol32(x[13] ^ x[2], 8); x[3] += x[4]; x[14] = rol32(x[14] ^ x[3], 8); x[10] += x[15]; x[5] = rol32(x[5] ^ x[10], 7); x[11] += x[12]; x[6] = rol32(x[6] ^ x[11], 7); x[8] += x[13]; x[7] = rol32(x[7] ^ x[8], 7); x[9] += x[14]; x[4] = rol32(x[4] ^ x[9], 7); } } /** * chacha_block_generic - generate one keystream block and increment block counter * @state: input state matrix (16 32-bit words) * @stream: output keystream block (64 bytes) * @nrounds: number of rounds (20 or 12; 20 is recommended) * * This is the ChaCha core, a function from 64-byte strings to 64-byte strings. * The caller has already converted the endianness of the input. This function * also handles incrementing the block counter in the input matrix. */ void chacha_block_generic(u32 *state, u8 *stream, int nrounds) { u32 x[16]; int i; memcpy(x, state, 64); chacha_permute(x, nrounds); for (i = 0; i < ARRAY_SIZE(x); i++) put_unaligned_le32(x[i] + state[i], &stream[i * sizeof(u32)]); state[12]++; } EXPORT_SYMBOL(chacha_block_generic); /** * hchacha_block_generic - abbreviated ChaCha core, for XChaCha * @state: input state matrix (16 32-bit words) * @stream: output (8 32-bit words) * @nrounds: number of rounds (20 or 12; 20 is recommended) * * HChaCha is the ChaCha equivalent of HSalsa and is an intermediate step * towards XChaCha (see https://cr.yp.to/snuffle/xsalsa-20081128.pdf). HChaCha * skips the final addition of the initial state, and outputs only certain words * of the state. It should not be used for streaming directly. */ void hchacha_block_generic(const u32 *state, u32 *stream, int nrounds) { u32 x[16]; memcpy(x, state, 64); chacha_permute(x, nrounds); memcpy(&stream[0], &x[0], 16); memcpy(&stream[4], &x[12], 16); } EXPORT_SYMBOL(hchacha_block_generic); |
2793 2794 2723 2725 37 37 37 2725 2729 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 | // SPDX-License-Identifier: GPL-2.0 /* * of.c The helpers for hcd device tree support * * Copyright (C) 2016 Freescale Semiconductor, Inc. * Author: Peter Chen <peter.chen@freescale.com> * Copyright (C) 2017 Johan Hovold <johan@kernel.org> */ #include <linux/of.h> #include <linux/of_graph.h> #include <linux/usb/of.h> /** * usb_of_get_device_node() - get a USB device node * @hub: hub to which device is connected * @port1: one-based index of port * * Look up the node of a USB device given its parent hub device and one-based * port number. * * Return: A pointer to the node with incremented refcount if found, or * %NULL otherwise. */ struct device_node *usb_of_get_device_node(struct usb_device *hub, int port1) { struct device_node *node; u32 reg; for_each_child_of_node(hub->dev.of_node, node) { if (of_property_read_u32(node, "reg", ®)) continue; if (reg == port1) return node; } return NULL; } EXPORT_SYMBOL_GPL(usb_of_get_device_node); /** * usb_of_has_combined_node() - determine whether a device has a combined node * @udev: USB device * * Determine whether a USB device has a so called combined node which is * shared with its sole interface. This is the case if and only if the device * has a node and its descriptors report the following: * * 1) bDeviceClass is 0 or 9, and * 2) bNumConfigurations is 1, and * 3) bNumInterfaces is 1. * * Return: True iff the device has a device node and its descriptors match the * criteria for a combined node. */ bool usb_of_has_combined_node(struct usb_device *udev) { struct usb_device_descriptor *ddesc = &udev->descriptor; struct usb_config_descriptor *cdesc; if (!udev->dev.of_node) return false; switch (ddesc->bDeviceClass) { case USB_CLASS_PER_INTERFACE: case USB_CLASS_HUB: if (ddesc->bNumConfigurations == 1) { cdesc = &udev->config->desc; if (cdesc->bNumInterfaces == 1) return true; } } return false; } EXPORT_SYMBOL_GPL(usb_of_has_combined_node); static bool usb_of_has_devices_or_graph(const struct usb_device *hub) { const struct device_node *np = hub->dev.of_node; struct device_node *child; if (of_graph_is_present(np)) return true; for_each_child_of_node(np, child) { if (of_property_present(child, "reg")) { of_node_put(child); return true; } } return false; } /** * usb_of_get_connect_type() - get a USB hub's port connect_type * @hub: hub to which port is for @port1 * @port1: one-based index of port * * Get the connect_type of @port1 based on the device node for @hub. If the * port is described in the OF graph, the connect_type is "hotplug". If the * @hub has a child device has with a 'reg' property equal to @port1 the * connect_type is "hard-wired". If there isn't an OF graph or child node at * all then the connect_type is "unknown". Otherwise, the port is considered * "unused" because it isn't described at all. * * Return: A connect_type for @port1 based on the device node for @hub. */ enum usb_port_connect_type usb_of_get_connect_type(struct usb_device *hub, int port1) { struct device_node *np, *child, *ep, *remote_np; enum usb_port_connect_type connect_type; /* Only set connect_type if binding has ports/hardwired devices. */ if (!usb_of_has_devices_or_graph(hub)) return USB_PORT_CONNECT_TYPE_UNKNOWN; /* Assume port is unused if there's a graph or a child node. */ connect_type = USB_PORT_NOT_USED; np = hub->dev.of_node; /* * Hotplug ports are connected to an available remote node, e.g. * usb-a-connector compatible node, in the OF graph. */ if (of_graph_is_present(np)) { ep = of_graph_get_endpoint_by_regs(np, port1, -1); if (ep) { remote_np = of_graph_get_remote_port_parent(ep); of_node_put(ep); if (of_device_is_available(remote_np)) connect_type = USB_PORT_CONNECT_TYPE_HOT_PLUG; of_node_put(remote_np); } } /* * Hard-wired ports are child nodes with a reg property corresponding * to the port number, i.e. a usb device. */ child = usb_of_get_device_node(hub, port1); if (of_device_is_available(child)) connect_type = USB_PORT_CONNECT_TYPE_HARD_WIRED; of_node_put(child); return connect_type; } EXPORT_SYMBOL_GPL(usb_of_get_connect_type); /** * usb_of_get_interface_node() - get a USB interface node * @udev: USB device of interface * @config: configuration value * @ifnum: interface number * * Look up the node of a USB interface given its USB device, configuration * value and interface number. * * Return: A pointer to the node with incremented refcount if found, or * %NULL otherwise. */ struct device_node * usb_of_get_interface_node(struct usb_device *udev, u8 config, u8 ifnum) { struct device_node *node; u32 reg[2]; for_each_child_of_node(udev->dev.of_node, node) { if (of_property_read_u32_array(node, "reg", reg, 2)) continue; if (reg[0] == ifnum && reg[1] == config) return node; } return NULL; } EXPORT_SYMBOL_GPL(usb_of_get_interface_node); |
195 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* SCTP kernel reference Implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * * This file is part of the SCTP kernel reference Implementation * * Various protocol defined structures. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Or submit a bug report through the following website: * http://www.sf.net/projects/lksctp * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Xingang Guo <xingang.guo@intel.com> * randall@sctp.chicago.il.us * kmorneau@cisco.com * qxie1@email.mot.com * Sridhar Samudrala <sri@us.ibm.com> * Kevin Gao <kevin.gao@intel.com> * * Any bugs reported given to us we will try to fix... any fixes shared will * be incorporated into the next SCTP release. */ #ifndef __LINUX_SCTP_H__ #define __LINUX_SCTP_H__ #include <linux/in.h> /* We need in_addr. */ #include <linux/in6.h> /* We need in6_addr. */ #include <linux/skbuff.h> #include <uapi/linux/sctp.h> /* Section 3.1. SCTP Common Header Format */ struct sctphdr { __be16 source; __be16 dest; __be32 vtag; __le32 checksum; }; static inline struct sctphdr *sctp_hdr(const struct sk_buff *skb) { return (struct sctphdr *)skb_transport_header(skb); } /* Section 3.2. Chunk Field Descriptions. */ struct sctp_chunkhdr { __u8 type; __u8 flags; __be16 length; }; /* Section 3.2. Chunk Type Values. * [Chunk Type] identifies the type of information contained in the Chunk * Value field. It takes a value from 0 to 254. The value of 255 is * reserved for future use as an extension field. */ enum sctp_cid { SCTP_CID_DATA = 0, SCTP_CID_INIT = 1, SCTP_CID_INIT_ACK = 2, SCTP_CID_SACK = 3, SCTP_CID_HEARTBEAT = 4, SCTP_CID_HEARTBEAT_ACK = 5, SCTP_CID_ABORT = 6, SCTP_CID_SHUTDOWN = 7, SCTP_CID_SHUTDOWN_ACK = 8, SCTP_CID_ERROR = 9, SCTP_CID_COOKIE_ECHO = 10, SCTP_CID_COOKIE_ACK = 11, SCTP_CID_ECN_ECNE = 12, SCTP_CID_ECN_CWR = 13, SCTP_CID_SHUTDOWN_COMPLETE = 14, /* AUTH Extension Section 4.1 */ SCTP_CID_AUTH = 0x0F, /* sctp ndata 5.1. I-DATA */ SCTP_CID_I_DATA = 0x40, /* PR-SCTP Sec 3.2 */ SCTP_CID_FWD_TSN = 0xC0, /* Use hex, as defined in ADDIP sec. 3.1 */ SCTP_CID_ASCONF = 0xC1, SCTP_CID_I_FWD_TSN = 0xC2, SCTP_CID_ASCONF_ACK = 0x80, SCTP_CID_RECONF = 0x82, SCTP_CID_PAD = 0x84, }; /* enum */ /* Section 3.2 * Chunk Types are encoded such that the highest-order two bits specify * the action that must be taken if the processing endpoint does not * recognize the Chunk Type. */ enum { SCTP_CID_ACTION_DISCARD = 0x00, SCTP_CID_ACTION_DISCARD_ERR = 0x40, SCTP_CID_ACTION_SKIP = 0x80, SCTP_CID_ACTION_SKIP_ERR = 0xc0, }; enum { SCTP_CID_ACTION_MASK = 0xc0, }; /* This flag is used in Chunk Flags for ABORT and SHUTDOWN COMPLETE. * * 3.3.7 Abort Association (ABORT) (6): * The T bit is set to 0 if the sender had a TCB that it destroyed. * If the sender did not have a TCB it should set this bit to 1. */ enum { SCTP_CHUNK_FLAG_T = 0x01 }; /* * Set the T bit * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 14 |Reserved |T| Length = 4 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Chunk Flags: 8 bits * * Reserved: 7 bits * Set to 0 on transmit and ignored on receipt. * * T bit: 1 bit * The T bit is set to 0 if the sender had a TCB that it destroyed. If * the sender did NOT have a TCB it should set this bit to 1. * * Note: Special rules apply to this chunk for verification, please * see Section 8.5.1 for details. */ #define sctp_test_T_bit(c) ((c)->chunk_hdr->flags & SCTP_CHUNK_FLAG_T) /* RFC 2960 * Section 3.2.1 Optional/Variable-length Parmaeter Format. */ struct sctp_paramhdr { __be16 type; __be16 length; }; enum sctp_param { /* RFC 2960 Section 3.3.5 */ SCTP_PARAM_HEARTBEAT_INFO = cpu_to_be16(1), /* RFC 2960 Section 3.3.2.1 */ SCTP_PARAM_IPV4_ADDRESS = cpu_to_be16(5), SCTP_PARAM_IPV6_ADDRESS = cpu_to_be16(6), SCTP_PARAM_STATE_COOKIE = cpu_to_be16(7), SCTP_PARAM_UNRECOGNIZED_PARAMETERS = cpu_to_be16(8), SCTP_PARAM_COOKIE_PRESERVATIVE = cpu_to_be16(9), SCTP_PARAM_HOST_NAME_ADDRESS = cpu_to_be16(11), SCTP_PARAM_SUPPORTED_ADDRESS_TYPES = cpu_to_be16(12), SCTP_PARAM_ECN_CAPABLE = cpu_to_be16(0x8000), /* AUTH Extension Section 3 */ SCTP_PARAM_RANDOM = cpu_to_be16(0x8002), SCTP_PARAM_CHUNKS = cpu_to_be16(0x8003), SCTP_PARAM_HMAC_ALGO = cpu_to_be16(0x8004), /* Add-IP: Supported Extensions, Section 4.2 */ SCTP_PARAM_SUPPORTED_EXT = cpu_to_be16(0x8008), /* PR-SCTP Sec 3.1 */ SCTP_PARAM_FWD_TSN_SUPPORT = cpu_to_be16(0xc000), /* Add-IP Extension. Section 3.2 */ SCTP_PARAM_ADD_IP = cpu_to_be16(0xc001), SCTP_PARAM_DEL_IP = cpu_to_be16(0xc002), SCTP_PARAM_ERR_CAUSE = cpu_to_be16(0xc003), SCTP_PARAM_SET_PRIMARY = cpu_to_be16(0xc004), SCTP_PARAM_SUCCESS_REPORT = cpu_to_be16(0xc005), SCTP_PARAM_ADAPTATION_LAYER_IND = cpu_to_be16(0xc006), /* RE-CONFIG. Section 4 */ SCTP_PARAM_RESET_OUT_REQUEST = cpu_to_be16(0x000d), SCTP_PARAM_RESET_IN_REQUEST = cpu_to_be16(0x000e), SCTP_PARAM_RESET_TSN_REQUEST = cpu_to_be16(0x000f), SCTP_PARAM_RESET_RESPONSE = cpu_to_be16(0x0010), SCTP_PARAM_RESET_ADD_OUT_STREAMS = cpu_to_be16(0x0011), SCTP_PARAM_RESET_ADD_IN_STREAMS = cpu_to_be16(0x0012), }; /* enum */ /* RFC 2960 Section 3.2.1 * The Parameter Types are encoded such that the highest-order two bits * specify the action that must be taken if the processing endpoint does * not recognize the Parameter Type. * */ enum { SCTP_PARAM_ACTION_DISCARD = cpu_to_be16(0x0000), SCTP_PARAM_ACTION_DISCARD_ERR = cpu_to_be16(0x4000), SCTP_PARAM_ACTION_SKIP = cpu_to_be16(0x8000), SCTP_PARAM_ACTION_SKIP_ERR = cpu_to_be16(0xc000), }; enum { SCTP_PARAM_ACTION_MASK = cpu_to_be16(0xc000), }; /* RFC 2960 Section 3.3.1 Payload Data (DATA) (0) */ struct sctp_datahdr { __be32 tsn; __be16 stream; __be16 ssn; __u32 ppid; /* __u8 payload[]; */ }; struct sctp_data_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_datahdr data_hdr; }; struct sctp_idatahdr { __be32 tsn; __be16 stream; __be16 reserved; __be32 mid; union { __u32 ppid; __be32 fsn; }; __u8 payload[0]; }; struct sctp_idata_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_idatahdr data_hdr; }; /* DATA Chuck Specific Flags */ enum { SCTP_DATA_MIDDLE_FRAG = 0x00, SCTP_DATA_LAST_FRAG = 0x01, SCTP_DATA_FIRST_FRAG = 0x02, SCTP_DATA_NOT_FRAG = 0x03, SCTP_DATA_UNORDERED = 0x04, SCTP_DATA_SACK_IMM = 0x08, }; enum { SCTP_DATA_FRAG_MASK = 0x03, }; /* RFC 2960 Section 3.3.2 Initiation (INIT) (1) * * This chunk is used to initiate a SCTP association between two * endpoints. */ struct sctp_inithdr { __be32 init_tag; __be32 a_rwnd; __be16 num_outbound_streams; __be16 num_inbound_streams; __be32 initial_tsn; /* __u8 params[]; */ }; struct sctp_init_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_inithdr init_hdr; }; /* Section 3.3.2.1. IPv4 Address Parameter (5) */ struct sctp_ipv4addr_param { struct sctp_paramhdr param_hdr; struct in_addr addr; }; /* Section 3.3.2.1. IPv6 Address Parameter (6) */ struct sctp_ipv6addr_param { struct sctp_paramhdr param_hdr; struct in6_addr addr; }; /* Section 3.3.2.1 Cookie Preservative (9) */ struct sctp_cookie_preserve_param { struct sctp_paramhdr param_hdr; __be32 lifespan_increment; }; /* Section 3.3.2.1 Host Name Address (11) */ struct sctp_hostname_param { struct sctp_paramhdr param_hdr; uint8_t hostname[]; }; /* Section 3.3.2.1 Supported Address Types (12) */ struct sctp_supported_addrs_param { struct sctp_paramhdr param_hdr; __be16 types[]; }; /* ADDIP Section 3.2.6 Adaptation Layer Indication */ struct sctp_adaptation_ind_param { struct sctp_paramhdr param_hdr; __be32 adaptation_ind; }; /* ADDIP Section 4.2.7 Supported Extensions Parameter */ struct sctp_supported_ext_param { struct sctp_paramhdr param_hdr; __u8 chunks[]; }; /* AUTH Section 3.1 Random */ struct sctp_random_param { struct sctp_paramhdr param_hdr; __u8 random_val[]; }; /* AUTH Section 3.2 Chunk List */ struct sctp_chunks_param { struct sctp_paramhdr param_hdr; __u8 chunks[]; }; /* AUTH Section 3.3 HMAC Algorithm */ struct sctp_hmac_algo_param { struct sctp_paramhdr param_hdr; __be16 hmac_ids[]; }; /* RFC 2960. Section 3.3.3 Initiation Acknowledgement (INIT ACK) (2): * The INIT ACK chunk is used to acknowledge the initiation of an SCTP * association. */ struct sctp_initack_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_inithdr init_hdr; }; /* Section 3.3.3.1 State Cookie (7) */ struct sctp_cookie_param { struct sctp_paramhdr p; __u8 body[]; }; /* Section 3.3.3.1 Unrecognized Parameters (8) */ struct sctp_unrecognized_param { struct sctp_paramhdr param_hdr; struct sctp_paramhdr unrecognized; }; /* * 3.3.4 Selective Acknowledgement (SACK) (3): * * This chunk is sent to the peer endpoint to acknowledge received DATA * chunks and to inform the peer endpoint of gaps in the received * subsequences of DATA chunks as represented by their TSNs. */ struct sctp_gap_ack_block { __be16 start; __be16 end; }; union sctp_sack_variable { struct sctp_gap_ack_block gab; __be32 dup; }; struct sctp_sackhdr { __be32 cum_tsn_ack; __be32 a_rwnd; __be16 num_gap_ack_blocks; __be16 num_dup_tsns; /* union sctp_sack_variable variable[]; */ }; struct sctp_sack_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_sackhdr sack_hdr; }; /* RFC 2960. Section 3.3.5 Heartbeat Request (HEARTBEAT) (4): * * An endpoint should send this chunk to its peer endpoint to probe the * reachability of a particular destination transport address defined in * the present association. */ struct sctp_heartbeathdr { struct sctp_paramhdr info; }; struct sctp_heartbeat_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_heartbeathdr hb_hdr; }; /* PAD chunk could be bundled with heartbeat chunk to probe pmtu */ struct sctp_pad_chunk { struct sctp_chunkhdr uh; }; /* For the abort and shutdown ACK we must carry the init tag in the * common header. Just the common header is all that is needed with a * chunk descriptor. */ struct sctp_abort_chunk { struct sctp_chunkhdr uh; }; /* For the graceful shutdown we must carry the tag (in common header) * and the highest consecutive acking value. */ struct sctp_shutdownhdr { __be32 cum_tsn_ack; }; struct sctp_shutdown_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_shutdownhdr shutdown_hdr; }; /* RFC 2960. Section 3.3.10 Operation Error (ERROR) (9) */ struct sctp_errhdr { __be16 cause; __be16 length; /* __u8 variable[]; */ }; struct sctp_operr_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_errhdr err_hdr; }; /* RFC 2960 3.3.10 - Operation Error * * Cause Code: 16 bits (unsigned integer) * * Defines the type of error conditions being reported. * Cause Code * Value Cause Code * --------- ---------------- * 1 Invalid Stream Identifier * 2 Missing Mandatory Parameter * 3 Stale Cookie Error * 4 Out of Resource * 5 Unresolvable Address * 6 Unrecognized Chunk Type * 7 Invalid Mandatory Parameter * 8 Unrecognized Parameters * 9 No User Data * 10 Cookie Received While Shutting Down */ enum sctp_error { SCTP_ERROR_NO_ERROR = cpu_to_be16(0x00), SCTP_ERROR_INV_STRM = cpu_to_be16(0x01), SCTP_ERROR_MISS_PARAM = cpu_to_be16(0x02), SCTP_ERROR_STALE_COOKIE = cpu_to_be16(0x03), SCTP_ERROR_NO_RESOURCE = cpu_to_be16(0x04), SCTP_ERROR_DNS_FAILED = cpu_to_be16(0x05), SCTP_ERROR_UNKNOWN_CHUNK = cpu_to_be16(0x06), SCTP_ERROR_INV_PARAM = cpu_to_be16(0x07), SCTP_ERROR_UNKNOWN_PARAM = cpu_to_be16(0x08), SCTP_ERROR_NO_DATA = cpu_to_be16(0x09), SCTP_ERROR_COOKIE_IN_SHUTDOWN = cpu_to_be16(0x0a), /* SCTP Implementation Guide: * 11 Restart of an association with new addresses * 12 User Initiated Abort * 13 Protocol Violation * 14 Restart of an Association with New Encapsulation Port */ SCTP_ERROR_RESTART = cpu_to_be16(0x0b), SCTP_ERROR_USER_ABORT = cpu_to_be16(0x0c), SCTP_ERROR_PROTO_VIOLATION = cpu_to_be16(0x0d), SCTP_ERROR_NEW_ENCAP_PORT = cpu_to_be16(0x0e), /* ADDIP Section 3.3 New Error Causes * * Four new Error Causes are added to the SCTP Operational Errors, * primarily for use in the ASCONF-ACK chunk. * * Value Cause Code * --------- ---------------- * 0x00A0 Request to Delete Last Remaining IP Address. * 0x00A1 Operation Refused Due to Resource Shortage. * 0x00A2 Request to Delete Source IP Address. * 0x00A3 Association Aborted due to illegal ASCONF-ACK * 0x00A4 Request refused - no authorization. */ SCTP_ERROR_DEL_LAST_IP = cpu_to_be16(0x00A0), SCTP_ERROR_RSRC_LOW = cpu_to_be16(0x00A1), SCTP_ERROR_DEL_SRC_IP = cpu_to_be16(0x00A2), SCTP_ERROR_ASCONF_ACK = cpu_to_be16(0x00A3), SCTP_ERROR_REQ_REFUSED = cpu_to_be16(0x00A4), /* AUTH Section 4. New Error Cause * * This section defines a new error cause that will be sent if an AUTH * chunk is received with an unsupported HMAC identifier. * illustrates the new error cause. * * Cause Code Error Cause Name * -------------------------------------------------------------- * 0x0105 Unsupported HMAC Identifier */ SCTP_ERROR_UNSUP_HMAC = cpu_to_be16(0x0105) }; /* RFC 2960. Appendix A. Explicit Congestion Notification. * Explicit Congestion Notification Echo (ECNE) (12) */ struct sctp_ecnehdr { __be32 lowest_tsn; }; struct sctp_ecne_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_ecnehdr ence_hdr; }; /* RFC 2960. Appendix A. Explicit Congestion Notification. * Congestion Window Reduced (CWR) (13) */ struct sctp_cwrhdr { __be32 lowest_tsn; }; /* PR-SCTP * 3.2 Forward Cumulative TSN Chunk Definition (FORWARD TSN) * * Forward Cumulative TSN chunk has the following format: * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 192 | Flags = 0x00 | Length = Variable | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | New Cumulative TSN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream-1 | Stream Sequence-1 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ / * / \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream-N | Stream Sequence-N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Chunk Flags: * * Set to all zeros on transmit and ignored on receipt. * * New Cumulative TSN: 32 bit u_int * * This indicates the new cumulative TSN to the data receiver. Upon * the reception of this value, the data receiver MUST consider * any missing TSNs earlier than or equal to this value as received * and stop reporting them as gaps in any subsequent SACKs. * * Stream-N: 16 bit u_int * * This field holds a stream number that was skipped by this * FWD-TSN. * * Stream Sequence-N: 16 bit u_int * This field holds the sequence number associated with the stream * that was skipped. The stream sequence field holds the largest stream * sequence number in this stream being skipped. The receiver of * the FWD-TSN's can use the Stream-N and Stream Sequence-N fields * to enable delivery of any stranded TSN's that remain on the stream * re-ordering queues. This field MUST NOT report TSN's corresponding * to DATA chunk that are marked as unordered. For ordered DATA * chunks this field MUST be filled in. */ struct sctp_fwdtsn_skip { __be16 stream; __be16 ssn; }; struct sctp_fwdtsn_hdr { __be32 new_cum_tsn; /* struct sctp_fwdtsn_skip skip[]; */ }; struct sctp_fwdtsn_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_fwdtsn_hdr fwdtsn_hdr; }; struct sctp_ifwdtsn_skip { __be16 stream; __u8 reserved; __u8 flags; __be32 mid; }; struct sctp_ifwdtsn_hdr { __be32 new_cum_tsn; /* struct sctp_ifwdtsn_skip skip[]; */ }; struct sctp_ifwdtsn_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_ifwdtsn_hdr fwdtsn_hdr; }; /* ADDIP * Section 3.1.1 Address Configuration Change Chunk (ASCONF) * * Serial Number: 32 bits (unsigned integer) * This value represents a Serial Number for the ASCONF Chunk. The * valid range of Serial Number is from 0 to 2^32-1. * Serial Numbers wrap back to 0 after reaching 2^32 -1. * * Address Parameter: 8 or 20 bytes (depending on type) * The address is an address of the sender of the ASCONF chunk, * the address MUST be considered part of the association by the * peer endpoint. This field may be used by the receiver of the * ASCONF to help in finding the association. This parameter MUST * be present in every ASCONF message i.e. it is a mandatory TLV * parameter. * * ASCONF Parameter: TLV format * Each Address configuration change is represented by a TLV * parameter as defined in Section 3.2. One or more requests may * be present in an ASCONF Chunk. * * Section 3.1.2 Address Configuration Acknowledgement Chunk (ASCONF-ACK) * * Serial Number: 32 bits (unsigned integer) * This value represents the Serial Number for the received ASCONF * Chunk that is acknowledged by this chunk. This value is copied * from the received ASCONF Chunk. * * ASCONF Parameter Response: TLV format * The ASCONF Parameter Response is used in the ASCONF-ACK to * report status of ASCONF processing. */ struct sctp_addip_param { struct sctp_paramhdr param_hdr; __be32 crr_id; }; struct sctp_addiphdr { __be32 serial; /* __u8 params[]; */ }; struct sctp_addip_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_addiphdr addip_hdr; }; /* AUTH * Section 4.1 Authentication Chunk (AUTH) * * This chunk is used to hold the result of the HMAC calculation. * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0x0F | Flags=0 | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Shared Key Identifier | HMAC Identifier | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | | * \ HMAC / * / \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Type: 1 byte (unsigned integer) * This value MUST be set to 0x0F for all AUTH-chunks. * * Flags: 1 byte (unsigned integer) * Set to zero on transmit and ignored on receipt. * * Length: 2 bytes (unsigned integer) * This value holds the length of the HMAC in bytes plus 8. * * Shared Key Identifier: 2 bytes (unsigned integer) * This value describes which endpoint pair shared key is used. * * HMAC Identifier: 2 bytes (unsigned integer) * This value describes which message digest is being used. Table 2 * shows the currently defined values. * * The following Table 2 shows the currently defined values for HMAC * identifiers. * * +-----------------+--------------------------+ * | HMAC Identifier | Message Digest Algorithm | * +-----------------+--------------------------+ * | 0 | Reserved | * | 1 | SHA-1 defined in [8] | * | 2 | Reserved | * | 3 | SHA-256 defined in [8] | * +-----------------+--------------------------+ * * * HMAC: n bytes (unsigned integer) This hold the result of the HMAC * calculation. */ struct sctp_authhdr { __be16 shkey_id; __be16 hmac_id; /* __u8 hmac[]; */ }; struct sctp_auth_chunk { struct sctp_chunkhdr chunk_hdr; struct sctp_authhdr auth_hdr; }; struct sctp_infox { struct sctp_info *sctpinfo; struct sctp_association *asoc; }; struct sctp_reconf_chunk { struct sctp_chunkhdr chunk_hdr; /* __u8 params[]; */ }; struct sctp_strreset_outreq { struct sctp_paramhdr param_hdr; __be32 request_seq; __be32 response_seq; __be32 send_reset_at_tsn; __be16 list_of_streams[]; }; struct sctp_strreset_inreq { struct sctp_paramhdr param_hdr; __be32 request_seq; __be16 list_of_streams[]; }; struct sctp_strreset_tsnreq { struct sctp_paramhdr param_hdr; __be32 request_seq; }; struct sctp_strreset_addstrm { struct sctp_paramhdr param_hdr; __be32 request_seq; __be16 number_of_streams; __be16 reserved; }; enum { SCTP_STRRESET_NOTHING_TO_DO = 0x00, SCTP_STRRESET_PERFORMED = 0x01, SCTP_STRRESET_DENIED = 0x02, SCTP_STRRESET_ERR_WRONG_SSN = 0x03, SCTP_STRRESET_ERR_IN_PROGRESS = 0x04, SCTP_STRRESET_ERR_BAD_SEQNO = 0x05, SCTP_STRRESET_IN_PROGRESS = 0x06, }; struct sctp_strreset_resp { struct sctp_paramhdr param_hdr; __be32 response_seq; __be32 result; }; struct sctp_strreset_resptsn { struct sctp_paramhdr param_hdr; __be32 response_seq; __be32 result; __be32 senders_next_tsn; __be32 receivers_next_tsn; }; enum { SCTP_DSCP_SET_MASK = 0x1, SCTP_DSCP_VAL_MASK = 0xfc, SCTP_FLOWLABEL_SET_MASK = 0x100000, SCTP_FLOWLABEL_VAL_MASK = 0xfffff }; /* UDP Encapsulation * draft-tuexen-tsvwg-sctp-udp-encaps-cons-03.html#section-4-4 * * The error cause indicating an "Restart of an Association with * New Encapsulation Port" * * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Cause Code = 14 | Cause Length = 8 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Current Encapsulation Port | New Encapsulation Port | * +-------------------------------+-------------------------------+ */ struct sctp_new_encap_port_hdr { __be16 cur_port; __be16 new_port; }; /* Round an int up to the next multiple of 4. */ #define SCTP_PAD4(s) (((s)+3)&~3) /* Truncate to the previous multiple of 4. */ #define SCTP_TRUNC4(s) ((s)&~3) #endif /* __LINUX_SCTP_H__ */ |
6 30 27 43 51 63 215 68 41 1 63 9 107 145 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 | /* * net/tipc/trace.h: TIPC tracepoints * * Copyright (c) 2018, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "ASIS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM tipc #if !defined(_TIPC_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TIPC_TRACE_H #include <linux/tracepoint.h> #include "core.h" #include "link.h" #include "socket.h" #include "node.h" #define SKB_LMIN (100) #define SKB_LMAX (SKB_LMIN * 2) #define LIST_LMIN (SKB_LMIN * 3) #define LIST_LMAX (SKB_LMIN * 11) #define SK_LMIN (SKB_LMIN * 2) #define SK_LMAX (SKB_LMIN * 11) #define LINK_LMIN (SKB_LMIN) #define LINK_LMAX (SKB_LMIN * 16) #define NODE_LMIN (SKB_LMIN) #define NODE_LMAX (SKB_LMIN * 11) #ifndef __TIPC_TRACE_ENUM #define __TIPC_TRACE_ENUM enum { TIPC_DUMP_NONE = 0, TIPC_DUMP_TRANSMQ = 1, TIPC_DUMP_BACKLOGQ = (1 << 1), TIPC_DUMP_DEFERDQ = (1 << 2), TIPC_DUMP_INPUTQ = (1 << 3), TIPC_DUMP_WAKEUP = (1 << 4), TIPC_DUMP_SK_SNDQ = (1 << 8), TIPC_DUMP_SK_RCVQ = (1 << 9), TIPC_DUMP_SK_BKLGQ = (1 << 10), TIPC_DUMP_ALL = 0xffffu }; #endif /* Link & Node FSM states: */ #define state_sym(val) \ __print_symbolic(val, \ {(0xe), "ESTABLISHED" },\ {(0xe << 4), "ESTABLISHING" },\ {(0x1 << 8), "RESET" },\ {(0x2 << 12), "RESETTING" },\ {(0xd << 16), "PEER_RESET" },\ {(0xf << 20), "FAILINGOVER" },\ {(0xc << 24), "SYNCHING" },\ {(0xdd), "SELF_DOWN_PEER_DOWN" },\ {(0xaa), "SELF_UP_PEER_UP" },\ {(0xd1), "SELF_DOWN_PEER_LEAVING" },\ {(0xac), "SELF_UP_PEER_COMING" },\ {(0xca), "SELF_COMING_PEER_UP" },\ {(0x1d), "SELF_LEAVING_PEER_DOWN" },\ {(0xf0), "FAILINGOVER" },\ {(0xcc), "SYNCHING" }) /* Link & Node FSM events: */ #define evt_sym(val) \ __print_symbolic(val, \ {(0xec1ab1e), "ESTABLISH_EVT" },\ {(0x9eed0e), "PEER_RESET_EVT" },\ {(0xfa110e), "FAILURE_EVT" },\ {(0x10ca1d0e), "RESET_EVT" },\ {(0xfa110bee), "FAILOVER_BEGIN_EVT" },\ {(0xfa110ede), "FAILOVER_END_EVT" },\ {(0xc1ccbee), "SYNCH_BEGIN_EVT" },\ {(0xc1ccede), "SYNCH_END_EVT" },\ {(0xece), "SELF_ESTABL_CONTACT_EVT" },\ {(0x1ce), "SELF_LOST_CONTACT_EVT" },\ {(0x9ece), "PEER_ESTABL_CONTACT_EVT" },\ {(0x91ce), "PEER_LOST_CONTACT_EVT" },\ {(0xfbe), "FAILOVER_BEGIN_EVT" },\ {(0xfee), "FAILOVER_END_EVT" },\ {(0xcbe), "SYNCH_BEGIN_EVT" },\ {(0xcee), "SYNCH_END_EVT" }) /* Bearer, net device events: */ #define dev_evt_sym(val) \ __print_symbolic(val, \ {(NETDEV_CHANGE), "NETDEV_CHANGE" },\ {(NETDEV_GOING_DOWN), "NETDEV_GOING_DOWN" },\ {(NETDEV_UP), "NETDEV_UP" },\ {(NETDEV_CHANGEMTU), "NETDEV_CHANGEMTU" },\ {(NETDEV_CHANGEADDR), "NETDEV_CHANGEADDR" },\ {(NETDEV_UNREGISTER), "NETDEV_UNREGISTER" },\ {(NETDEV_CHANGENAME), "NETDEV_CHANGENAME" }) extern unsigned long sysctl_tipc_sk_filter[5] __read_mostly; int tipc_skb_dump(struct sk_buff *skb, bool more, char *buf); int tipc_list_dump(struct sk_buff_head *list, bool more, char *buf); int tipc_sk_dump(struct sock *sk, u16 dqueues, char *buf); int tipc_link_dump(struct tipc_link *l, u16 dqueues, char *buf); int tipc_node_dump(struct tipc_node *n, bool more, char *buf); bool tipc_sk_filtering(struct sock *sk); DECLARE_EVENT_CLASS(tipc_skb_class, TP_PROTO(struct sk_buff *skb, bool more, const char *header), TP_ARGS(skb, more, header), TP_STRUCT__entry( __string(header, header) __dynamic_array(char, buf, (more) ? SKB_LMAX : SKB_LMIN) ), TP_fast_assign( __assign_str(header); tipc_skb_dump(skb, more, __get_str(buf)); ), TP_printk("%s\n%s", __get_str(header), __get_str(buf)) ) #define DEFINE_SKB_EVENT(name) \ DEFINE_EVENT(tipc_skb_class, name, \ TP_PROTO(struct sk_buff *skb, bool more, const char *header), \ TP_ARGS(skb, more, header)) DEFINE_SKB_EVENT(tipc_skb_dump); DEFINE_SKB_EVENT(tipc_proto_build); DEFINE_SKB_EVENT(tipc_proto_rcv); DECLARE_EVENT_CLASS(tipc_list_class, TP_PROTO(struct sk_buff_head *list, bool more, const char *header), TP_ARGS(list, more, header), TP_STRUCT__entry( __string(header, header) __dynamic_array(char, buf, (more) ? LIST_LMAX : LIST_LMIN) ), TP_fast_assign( __assign_str(header); tipc_list_dump(list, more, __get_str(buf)); ), TP_printk("%s\n%s", __get_str(header), __get_str(buf)) ); #define DEFINE_LIST_EVENT(name) \ DEFINE_EVENT(tipc_list_class, name, \ TP_PROTO(struct sk_buff_head *list, bool more, const char *header), \ TP_ARGS(list, more, header)) DEFINE_LIST_EVENT(tipc_list_dump); DECLARE_EVENT_CLASS(tipc_sk_class, TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues, const char *header), TP_ARGS(sk, skb, dqueues, header), TP_STRUCT__entry( __string(header, header) __field(u32, portid) __dynamic_array(char, buf, (dqueues) ? SK_LMAX : SK_LMIN) __dynamic_array(char, skb_buf, (skb) ? SKB_LMIN : 1) ), TP_fast_assign( __assign_str(header); __entry->portid = tipc_sock_get_portid(sk); tipc_sk_dump(sk, dqueues, __get_str(buf)); if (skb) tipc_skb_dump(skb, false, __get_str(skb_buf)); else *(__get_str(skb_buf)) = '\0'; ), TP_printk("<%u> %s\n%s%s", __entry->portid, __get_str(header), __get_str(skb_buf), __get_str(buf)) ); #define DEFINE_SK_EVENT_FILTER(name) \ DEFINE_EVENT_CONDITION(tipc_sk_class, name, \ TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues, \ const char *header), \ TP_ARGS(sk, skb, dqueues, header), \ TP_CONDITION(tipc_sk_filtering(sk))) DEFINE_SK_EVENT_FILTER(tipc_sk_dump); DEFINE_SK_EVENT_FILTER(tipc_sk_create); DEFINE_SK_EVENT_FILTER(tipc_sk_sendmcast); DEFINE_SK_EVENT_FILTER(tipc_sk_sendmsg); DEFINE_SK_EVENT_FILTER(tipc_sk_sendstream); DEFINE_SK_EVENT_FILTER(tipc_sk_poll); DEFINE_SK_EVENT_FILTER(tipc_sk_filter_rcv); DEFINE_SK_EVENT_FILTER(tipc_sk_advance_rx); DEFINE_SK_EVENT_FILTER(tipc_sk_rej_msg); DEFINE_SK_EVENT_FILTER(tipc_sk_drop_msg); DEFINE_SK_EVENT_FILTER(tipc_sk_release); DEFINE_SK_EVENT_FILTER(tipc_sk_shutdown); #define DEFINE_SK_EVENT_FILTER_COND(name, cond) \ DEFINE_EVENT_CONDITION(tipc_sk_class, name, \ TP_PROTO(struct sock *sk, struct sk_buff *skb, u16 dqueues, \ const char *header), \ TP_ARGS(sk, skb, dqueues, header), \ TP_CONDITION(tipc_sk_filtering(sk) && (cond))) DEFINE_SK_EVENT_FILTER_COND(tipc_sk_overlimit1, tipc_sk_overlimit1(sk, skb)); DEFINE_SK_EVENT_FILTER_COND(tipc_sk_overlimit2, tipc_sk_overlimit2(sk, skb)); DECLARE_EVENT_CLASS(tipc_link_class, TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header), TP_ARGS(l, dqueues, header), TP_STRUCT__entry( __string(header, header) __array(char, name, TIPC_MAX_LINK_NAME) __dynamic_array(char, buf, (dqueues) ? LINK_LMAX : LINK_LMIN) ), TP_fast_assign( __assign_str(header); memcpy(__entry->name, tipc_link_name(l), TIPC_MAX_LINK_NAME); tipc_link_dump(l, dqueues, __get_str(buf)); ), TP_printk("<%s> %s\n%s", __entry->name, __get_str(header), __get_str(buf)) ); #define DEFINE_LINK_EVENT(name) \ DEFINE_EVENT(tipc_link_class, name, \ TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header), \ TP_ARGS(l, dqueues, header)) DEFINE_LINK_EVENT(tipc_link_dump); DEFINE_LINK_EVENT(tipc_link_conges); DEFINE_LINK_EVENT(tipc_link_timeout); DEFINE_LINK_EVENT(tipc_link_reset); #define DEFINE_LINK_EVENT_COND(name, cond) \ DEFINE_EVENT_CONDITION(tipc_link_class, name, \ TP_PROTO(struct tipc_link *l, u16 dqueues, const char *header), \ TP_ARGS(l, dqueues, header), \ TP_CONDITION(cond)) DEFINE_LINK_EVENT_COND(tipc_link_too_silent, tipc_link_too_silent(l)); DECLARE_EVENT_CLASS(tipc_link_transmq_class, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), TP_ARGS(r, f, t, tq), TP_STRUCT__entry( __array(char, name, TIPC_MAX_LINK_NAME) __field(u16, from) __field(u16, to) __field(u32, len) __field(u16, fseqno) __field(u16, lseqno) ), TP_fast_assign( memcpy(__entry->name, tipc_link_name(r), TIPC_MAX_LINK_NAME); __entry->from = f; __entry->to = t; __entry->len = skb_queue_len(tq); __entry->fseqno = __entry->len ? msg_seqno(buf_msg(skb_peek(tq))) : 0; __entry->lseqno = __entry->len ? msg_seqno(buf_msg(skb_peek_tail(tq))) : 0; ), TP_printk("<%s> retrans req: [%u-%u] transmq: %u [%u-%u]\n", __entry->name, __entry->from, __entry->to, __entry->len, __entry->fseqno, __entry->lseqno) ); DEFINE_EVENT_CONDITION(tipc_link_transmq_class, tipc_link_retrans, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), TP_ARGS(r, f, t, tq), TP_CONDITION(less_eq(f, t)) ); DEFINE_EVENT_PRINT(tipc_link_transmq_class, tipc_link_bc_ack, TP_PROTO(struct tipc_link *r, u16 f, u16 t, struct sk_buff_head *tq), TP_ARGS(r, f, t, tq), TP_printk("<%s> acked: %u gap: %u transmq: %u [%u-%u]\n", __entry->name, __entry->from, __entry->to, __entry->len, __entry->fseqno, __entry->lseqno) ); DECLARE_EVENT_CLASS(tipc_node_class, TP_PROTO(struct tipc_node *n, bool more, const char *header), TP_ARGS(n, more, header), TP_STRUCT__entry( __string(header, header) __field(u32, addr) __dynamic_array(char, buf, (more) ? NODE_LMAX : NODE_LMIN) ), TP_fast_assign( __assign_str(header); __entry->addr = tipc_node_get_addr(n); tipc_node_dump(n, more, __get_str(buf)); ), TP_printk("<%x> %s\n%s", __entry->addr, __get_str(header), __get_str(buf)) ); #define DEFINE_NODE_EVENT(name) \ DEFINE_EVENT(tipc_node_class, name, \ TP_PROTO(struct tipc_node *n, bool more, const char *header), \ TP_ARGS(n, more, header)) DEFINE_NODE_EVENT(tipc_node_dump); DEFINE_NODE_EVENT(tipc_node_create); DEFINE_NODE_EVENT(tipc_node_delete); DEFINE_NODE_EVENT(tipc_node_lost_contact); DEFINE_NODE_EVENT(tipc_node_timeout); DEFINE_NODE_EVENT(tipc_node_link_up); DEFINE_NODE_EVENT(tipc_node_link_down); DEFINE_NODE_EVENT(tipc_node_reset_links); DEFINE_NODE_EVENT(tipc_node_check_state); DECLARE_EVENT_CLASS(tipc_fsm_class, TP_PROTO(const char *name, u32 os, u32 ns, int evt), TP_ARGS(name, os, ns, evt), TP_STRUCT__entry( __string(name, name) __field(u32, os) __field(u32, ns) __field(u32, evt) ), TP_fast_assign( __assign_str(name); __entry->os = os; __entry->ns = ns; __entry->evt = evt; ), TP_printk("<%s> %s--(%s)->%s\n", __get_str(name), state_sym(__entry->os), evt_sym(__entry->evt), state_sym(__entry->ns)) ); #define DEFINE_FSM_EVENT(fsm_name) \ DEFINE_EVENT(tipc_fsm_class, fsm_name, \ TP_PROTO(const char *name, u32 os, u32 ns, int evt), \ TP_ARGS(name, os, ns, evt)) DEFINE_FSM_EVENT(tipc_link_fsm); DEFINE_FSM_EVENT(tipc_node_fsm); TRACE_EVENT(tipc_l2_device_event, TP_PROTO(struct net_device *dev, struct tipc_bearer *b, unsigned long evt), TP_ARGS(dev, b, evt), TP_STRUCT__entry( __string(dev_name, dev->name) __string(b_name, b->name) __field(unsigned long, evt) __field(u8, b_up) __field(u8, carrier) __field(u8, oper) ), TP_fast_assign( __assign_str(dev_name); __assign_str(b_name); __entry->evt = evt; __entry->b_up = test_bit(0, &b->up); __entry->carrier = netif_carrier_ok(dev); __entry->oper = netif_oper_up(dev); ), TP_printk("%s on: <%s>/<%s> oper: %s carrier: %s bearer: %s\n", dev_evt_sym(__entry->evt), __get_str(dev_name), __get_str(b_name), (__entry->oper) ? "up" : "down", (__entry->carrier) ? "ok" : "notok", (__entry->b_up) ? "up" : "down") ); #endif /* _TIPC_TRACE_H */ /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h> |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BTRFS_DEFRAG_H #define BTRFS_DEFRAG_H #include <linux/types.h> #include <linux/compiler_types.h> struct inode; struct file_ra_state; struct btrfs_fs_info; struct btrfs_root; struct btrfs_trans_handle; struct btrfs_ioctl_defrag_range_args; int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag); int __init btrfs_auto_defrag_init(void); void __cold btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u32 extent_thresh); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_defrag_root(struct btrfs_root *root); static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) { return signal_pending(current); } #endif |
2 2 1 1 2 2 2 1 1 2 2 2 2 2 2 1 1 2 2 2 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 | // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "buckets.h" #include "disk_accounting.h" #include "journal.h" #include "replicas.h" #include "super-io.h" #include <linux/sort.h> static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, struct bch_replicas_cpu *); /* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */ static int bch2_memcmp(const void *l, const void *r, const void *priv) { size_t size = (size_t) priv; return memcmp(l, r, size); } /* Replicas tracking - in memory: */ static void verify_replicas_entry(struct bch_replicas_entry_v1 *e) { #ifdef CONFIG_BCACHEFS_DEBUG BUG_ON(!e->nr_devs); BUG_ON(e->nr_required > 1 && e->nr_required >= e->nr_devs); for (unsigned i = 0; i + 1 < e->nr_devs; i++) BUG_ON(e->devs[i] >= e->devs[i + 1]); #endif } void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e) { bubble_sort(e->devs, e->nr_devs, u8_cmp); } static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) { eytzinger0_sort_r(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL, (void *)(size_t)r->entry_size); } static void bch2_replicas_entry_v0_to_text(struct printbuf *out, struct bch_replicas_entry_v0 *e) { bch2_prt_data_type(out, e->data_type); prt_printf(out, ": %u [", e->nr_devs); for (unsigned i = 0; i < e->nr_devs; i++) prt_printf(out, i ? " %u" : "%u", e->devs[i]); prt_printf(out, "]"); } void bch2_replicas_entry_to_text(struct printbuf *out, struct bch_replicas_entry_v1 *e) { bch2_prt_data_type(out, e->data_type); prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs); for (unsigned i = 0; i < e->nr_devs; i++) prt_printf(out, i ? " %u" : "%u", e->devs[i]); prt_printf(out, "]"); } int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, struct bch_sb *sb, struct printbuf *err) { if (!r->nr_devs) { prt_printf(err, "no devices in entry "); goto bad; } if (r->nr_required > 1 && r->nr_required >= r->nr_devs) { prt_printf(err, "bad nr_required in entry "); goto bad; } for (unsigned i = 0; i < r->nr_devs; i++) if (!bch2_member_exists(sb, r->devs[i])) { prt_printf(err, "invalid device %u in entry ", r->devs[i]); goto bad; } return 0; bad: bch2_replicas_entry_to_text(err, r); return -BCH_ERR_invalid_replicas_entry; } void bch2_cpu_replicas_to_text(struct printbuf *out, struct bch_replicas_cpu *r) { struct bch_replicas_entry_v1 *e; bool first = true; for_each_cpu_replicas_entry(r, e) { if (!first) prt_printf(out, " "); first = false; bch2_replicas_entry_to_text(out, e); } } static void extent_to_replicas(struct bkey_s_c k, struct bch_replicas_entry_v1 *r) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; r->nr_required = 1; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (p.ptr.cached) continue; if (!p.has_ec) r->devs[r->nr_devs++] = p.ptr.dev; else r->nr_required = 0; } } static void stripe_to_replicas(struct bkey_s_c k, struct bch_replicas_entry_v1 *r) { struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); const struct bch_extent_ptr *ptr; r->nr_required = s.v->nr_blocks - s.v->nr_redundant; for (ptr = s.v->ptrs; ptr < s.v->ptrs + s.v->nr_blocks; ptr++) r->devs[r->nr_devs++] = ptr->dev; } void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e, struct bkey_s_c k) { e->nr_devs = 0; switch (k.k->type) { case KEY_TYPE_btree_ptr: case KEY_TYPE_btree_ptr_v2: e->data_type = BCH_DATA_btree; extent_to_replicas(k, e); break; case KEY_TYPE_extent: case KEY_TYPE_reflink_v: e->data_type = BCH_DATA_user; extent_to_replicas(k, e); break; case KEY_TYPE_stripe: e->data_type = BCH_DATA_parity; stripe_to_replicas(k, e); break; } bch2_replicas_entry_sort(e); } void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e, enum bch_data_type data_type, struct bch_devs_list devs) { BUG_ON(!data_type || data_type == BCH_DATA_sb || data_type >= BCH_DATA_NR); e->data_type = data_type; e->nr_devs = 0; e->nr_required = 1; darray_for_each(devs, i) e->devs[e->nr_devs++] = *i; bch2_replicas_entry_sort(e); } static struct bch_replicas_cpu cpu_replicas_add_entry(struct bch_fs *c, struct bch_replicas_cpu *old, struct bch_replicas_entry_v1 *new_entry) { struct bch_replicas_cpu new = { .nr = old->nr + 1, .entry_size = max_t(unsigned, old->entry_size, replicas_entry_bytes(new_entry)), }; new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); if (!new.entries) return new; for (unsigned i = 0; i < old->nr; i++) memcpy(cpu_replicas_entry(&new, i), cpu_replicas_entry(old, i), old->entry_size); memcpy(cpu_replicas_entry(&new, old->nr), new_entry, replicas_entry_bytes(new_entry)); bch2_cpu_replicas_sort(&new); return new; } static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, struct bch_replicas_entry_v1 *search) { int idx, entry_size = replicas_entry_bytes(search); if (unlikely(entry_size > r->entry_size)) return -1; #define entry_cmp(_l, _r) memcmp(_l, _r, entry_size) idx = eytzinger0_find(r->entries, r->nr, r->entry_size, entry_cmp, search); #undef entry_cmp return idx < r->nr ? idx : -1; } int bch2_replicas_entry_idx(struct bch_fs *c, struct bch_replicas_entry_v1 *search) { bch2_replicas_entry_sort(search); return __replicas_entry_idx(&c->replicas, search); } static bool __replicas_has_entry(struct bch_replicas_cpu *r, struct bch_replicas_entry_v1 *search) { return __replicas_entry_idx(r, search) >= 0; } bool bch2_replicas_marked_locked(struct bch_fs *c, struct bch_replicas_entry_v1 *search) { verify_replicas_entry(search); return !search->nr_devs || (__replicas_has_entry(&c->replicas, search) && (likely((!c->replicas_gc.entries)) || __replicas_has_entry(&c->replicas_gc, search))); } bool bch2_replicas_marked(struct bch_fs *c, struct bch_replicas_entry_v1 *search) { percpu_down_read(&c->mark_lock); bool ret = bch2_replicas_marked_locked(c, search); percpu_up_read(&c->mark_lock); return ret; } noinline static int bch2_mark_replicas_slowpath(struct bch_fs *c, struct bch_replicas_entry_v1 *new_entry) { struct bch_replicas_cpu new_r, new_gc; int ret = 0; verify_replicas_entry(new_entry); memset(&new_r, 0, sizeof(new_r)); memset(&new_gc, 0, sizeof(new_gc)); mutex_lock(&c->sb_lock); if (c->replicas_gc.entries && !__replicas_has_entry(&c->replicas_gc, new_entry)) { new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry); if (!new_gc.entries) { ret = -BCH_ERR_ENOMEM_cpu_replicas; goto err; } } if (!__replicas_has_entry(&c->replicas, new_entry)) { new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry); if (!new_r.entries) { ret = -BCH_ERR_ENOMEM_cpu_replicas; goto err; } ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); if (ret) goto err; } if (!new_r.entries && !new_gc.entries) goto out; /* allocations done, now commit: */ if (new_r.entries) bch2_write_super(c); /* don't update in memory replicas until changes are persistent */ percpu_down_write(&c->mark_lock); if (new_r.entries) swap(c->replicas, new_r); if (new_gc.entries) swap(new_gc, c->replicas_gc); percpu_up_write(&c->mark_lock); out: mutex_unlock(&c->sb_lock); kfree(new_r.entries); kfree(new_gc.entries); return ret; err: bch_err_msg(c, ret, "adding replicas entry"); goto out; } int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r) { return likely(bch2_replicas_marked(c, r)) ? 0 : bch2_mark_replicas_slowpath(c, r); } /* * Old replicas_gc mechanism: only used for journal replicas entries now, should * die at some point: */ int bch2_replicas_gc_end(struct bch_fs *c, int ret) { lockdep_assert_held(&c->replicas_gc_lock); mutex_lock(&c->sb_lock); percpu_down_write(&c->mark_lock); ret = ret ?: bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); if (!ret) swap(c->replicas, c->replicas_gc); kfree(c->replicas_gc.entries); c->replicas_gc.entries = NULL; percpu_up_write(&c->mark_lock); if (!ret) bch2_write_super(c); mutex_unlock(&c->sb_lock); return ret; } int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) { struct bch_replicas_entry_v1 *e; unsigned i = 0; lockdep_assert_held(&c->replicas_gc_lock); mutex_lock(&c->sb_lock); BUG_ON(c->replicas_gc.entries); c->replicas_gc.nr = 0; c->replicas_gc.entry_size = 0; for_each_cpu_replicas_entry(&c->replicas, e) { /* Preserve unknown data types */ if (e->data_type >= BCH_DATA_NR || !((1 << e->data_type) & typemask)) { c->replicas_gc.nr++; c->replicas_gc.entry_size = max_t(unsigned, c->replicas_gc.entry_size, replicas_entry_bytes(e)); } } c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, c->replicas_gc.entry_size, GFP_KERNEL); if (!c->replicas_gc.entries) { mutex_unlock(&c->sb_lock); bch_err(c, "error allocating c->replicas_gc"); return -BCH_ERR_ENOMEM_replicas_gc; } for_each_cpu_replicas_entry(&c->replicas, e) if (e->data_type >= BCH_DATA_NR || !((1 << e->data_type) & typemask)) memcpy(cpu_replicas_entry(&c->replicas_gc, i++), e, c->replicas_gc.entry_size); bch2_cpu_replicas_sort(&c->replicas_gc); mutex_unlock(&c->sb_lock); return 0; } /* * New much simpler mechanism for clearing out unneeded replicas entries - drop * replicas entries that have 0 sectors used. * * However, we don't track sector counts for journal usage, so this doesn't drop * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism * is retained for that. */ int bch2_replicas_gc2(struct bch_fs *c) { struct bch_replicas_cpu new = { 0 }; unsigned nr; int ret = 0; bch2_accounting_mem_gc(c); retry: nr = READ_ONCE(c->replicas.nr); new.entry_size = READ_ONCE(c->replicas.entry_size); new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); if (!new.entries) { bch_err(c, "error allocating c->replicas_gc"); return -BCH_ERR_ENOMEM_replicas_gc; } mutex_lock(&c->sb_lock); percpu_down_write(&c->mark_lock); if (nr != c->replicas.nr || new.entry_size != c->replicas.entry_size) { percpu_up_write(&c->mark_lock); mutex_unlock(&c->sb_lock); kfree(new.entries); goto retry; } for (unsigned i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry_v1 *e = cpu_replicas_entry(&c->replicas, i); struct disk_accounting_pos k = { .type = BCH_DISK_ACCOUNTING_replicas, }; unsafe_memcpy(&k.replicas, e, replicas_entry_bytes(e), "embedded variable length struct"); struct bpos p = disk_accounting_pos_to_bpos(&k); struct bch_accounting_mem *acc = &c->accounting; bool kill = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &p) >= acc->k.nr; if (e->data_type == BCH_DATA_journal || !kill) memcpy(cpu_replicas_entry(&new, new.nr++), e, new.entry_size); } bch2_cpu_replicas_sort(&new); ret = bch2_cpu_replicas_to_sb_replicas(c, &new); if (!ret) swap(c->replicas, new); kfree(new.entries); percpu_up_write(&c->mark_lock); if (!ret) bch2_write_super(c); mutex_unlock(&c->sb_lock); return ret; } /* Replicas tracking - superblock: */ static int __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, struct bch_replicas_cpu *cpu_r) { struct bch_replicas_entry_v1 *e, *dst; unsigned nr = 0, entry_size = 0, idx = 0; for_each_replicas_entry(sb_r, e) { entry_size = max_t(unsigned, entry_size, replicas_entry_bytes(e)); nr++; } cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) return -BCH_ERR_ENOMEM_cpu_replicas; cpu_r->nr = nr; cpu_r->entry_size = entry_size; for_each_replicas_entry(sb_r, e) { dst = cpu_replicas_entry(cpu_r, idx++); memcpy(dst, e, replicas_entry_bytes(e)); bch2_replicas_entry_sort(dst); } return 0; } static int __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, struct bch_replicas_cpu *cpu_r) { struct bch_replicas_entry_v0 *e; unsigned nr = 0, entry_size = 0, idx = 0; for_each_replicas_entry(sb_r, e) { entry_size = max_t(unsigned, entry_size, replicas_entry_bytes(e)); nr++; } entry_size += sizeof(struct bch_replicas_entry_v1) - sizeof(struct bch_replicas_entry_v0); cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); if (!cpu_r->entries) return -BCH_ERR_ENOMEM_cpu_replicas; cpu_r->nr = nr; cpu_r->entry_size = entry_size; for_each_replicas_entry(sb_r, e) { struct bch_replicas_entry_v1 *dst = cpu_replicas_entry(cpu_r, idx++); dst->data_type = e->data_type; dst->nr_devs = e->nr_devs; dst->nr_required = 1; memcpy(dst->devs, e->devs, e->nr_devs); bch2_replicas_entry_sort(dst); } return 0; } int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) { struct bch_sb_field_replicas *sb_v1; struct bch_sb_field_replicas_v0 *sb_v0; struct bch_replicas_cpu new_r = { 0, 0, NULL }; int ret = 0; if ((sb_v1 = bch2_sb_field_get(c->disk_sb.sb, replicas))) ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); else if ((sb_v0 = bch2_sb_field_get(c->disk_sb.sb, replicas_v0))) ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); if (ret) return ret; bch2_cpu_replicas_sort(&new_r); percpu_down_write(&c->mark_lock); swap(c->replicas, new_r); percpu_up_write(&c->mark_lock); kfree(new_r.entries); return 0; } static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, struct bch_replicas_cpu *r) { struct bch_sb_field_replicas_v0 *sb_r; struct bch_replicas_entry_v0 *dst; struct bch_replicas_entry_v1 *src; size_t bytes; bytes = sizeof(struct bch_sb_field_replicas); for_each_cpu_replicas_entry(r, src) bytes += replicas_entry_bytes(src) - 1; sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) return -BCH_ERR_ENOSPC_sb_replicas; bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0); memset(&sb_r->entries, 0, vstruct_end(&sb_r->field) - (void *) &sb_r->entries); dst = sb_r->entries; for_each_cpu_replicas_entry(r, src) { dst->data_type = src->data_type; dst->nr_devs = src->nr_devs; memcpy(dst->devs, src->devs, src->nr_devs); dst = replicas_entry_next(dst); BUG_ON((void *) dst > vstruct_end(&sb_r->field)); } return 0; } static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, struct bch_replicas_cpu *r) { struct bch_sb_field_replicas *sb_r; struct bch_replicas_entry_v1 *dst, *src; bool need_v1 = false; size_t bytes; bytes = sizeof(struct bch_sb_field_replicas); for_each_cpu_replicas_entry(r, src) { bytes += replicas_entry_bytes(src); if (src->nr_required != 1) need_v1 = true; } if (!need_v1) return bch2_cpu_replicas_to_sb_replicas_v0(c, r); sb_r = bch2_sb_field_resize(&c->disk_sb, replicas, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) return -BCH_ERR_ENOSPC_sb_replicas; bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas); memset(&sb_r->entries, 0, vstruct_end(&sb_r->field) - (void *) &sb_r->entries); dst = sb_r->entries; for_each_cpu_replicas_entry(r, src) { memcpy(dst, src, replicas_entry_bytes(src)); dst = replicas_entry_next(dst); BUG_ON((void *) dst > vstruct_end(&sb_r->field)); } return 0; } static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, struct bch_sb *sb, struct printbuf *err) { unsigned i; sort_r(cpu_r->entries, cpu_r->nr, cpu_r->entry_size, bch2_memcmp, NULL, (void *)(size_t)cpu_r->entry_size); for (i = 0; i < cpu_r->nr; i++) { struct bch_replicas_entry_v1 *e = cpu_replicas_entry(cpu_r, i); int ret = bch2_replicas_entry_validate(e, sb, err); if (ret) return ret; if (i + 1 < cpu_r->nr) { struct bch_replicas_entry_v1 *n = cpu_replicas_entry(cpu_r, i + 1); BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); if (!memcmp(e, n, cpu_r->entry_size)) { prt_printf(err, "duplicate replicas entry "); bch2_replicas_entry_to_text(err, e); return -BCH_ERR_invalid_sb_replicas; } } } return 0; } static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); struct bch_replicas_cpu cpu_r; int ret; ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r); if (ret) return ret; ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); kfree(cpu_r.entries); return ret; } static void bch2_sb_replicas_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_replicas *r = field_to_type(f, replicas); struct bch_replicas_entry_v1 *e; bool first = true; for_each_replicas_entry(r, e) { if (!first) prt_printf(out, " "); first = false; bch2_replicas_entry_to_text(out, e); } prt_newline(out); } const struct bch_sb_field_ops bch_sb_field_ops_replicas = { .validate = bch2_sb_replicas_validate, .to_text = bch2_sb_replicas_to_text, }; static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, enum bch_validate_flags flags, struct printbuf *err) { struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); struct bch_replicas_cpu cpu_r; int ret; ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r); if (ret) return ret; ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); kfree(cpu_r.entries); return ret; } static void bch2_sb_replicas_v0_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) { struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); struct bch_replicas_entry_v0 *e; bool first = true; for_each_replicas_entry(sb_r, e) { if (!first) prt_printf(out, " "); first = false; bch2_replicas_entry_v0_to_text(out, e); } prt_newline(out); } const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { .validate = bch2_sb_replicas_v0_validate, .to_text = bch2_sb_replicas_v0_to_text, }; /* Query replicas: */ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, unsigned flags, bool print) { struct bch_replicas_entry_v1 *e; bool ret = true; percpu_down_read(&c->mark_lock); for_each_cpu_replicas_entry(&c->replicas, e) { unsigned nr_online = 0, nr_failed = 0, dflags = 0; bool metadata = e->data_type < BCH_DATA_user; if (e->data_type == BCH_DATA_cached) continue; rcu_read_lock(); for (unsigned i = 0; i < e->nr_devs; i++) { nr_online += test_bit(e->devs[i], devs.d); struct bch_dev *ca = bch2_dev_rcu(c, e->devs[i]); nr_failed += ca && ca->mi.state == BCH_MEMBER_STATE_failed; } rcu_read_unlock(); if (nr_failed == e->nr_devs) continue; if (nr_online < e->nr_required) dflags |= metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; if (nr_online < e->nr_devs) dflags |= metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED; if (dflags & ~flags) { if (print) { struct printbuf buf = PRINTBUF; bch2_replicas_entry_to_text(&buf, e); bch_err(c, "insufficient devices online (%u) for replicas entry %s", nr_online, buf.buf); printbuf_exit(&buf); } ret = false; break; } } percpu_up_read(&c->mark_lock); return ret; } unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) { struct bch_sb_field_replicas *replicas; struct bch_sb_field_replicas_v0 *replicas_v0; unsigned data_has = 0; replicas = bch2_sb_field_get(sb, replicas); replicas_v0 = bch2_sb_field_get(sb, replicas_v0); if (replicas) { struct bch_replicas_entry_v1 *r; for_each_replicas_entry(replicas, r) { if (r->data_type >= sizeof(data_has) * 8) continue; for (unsigned i = 0; i < r->nr_devs; i++) if (r->devs[i] == dev) data_has |= 1 << r->data_type; } } else if (replicas_v0) { struct bch_replicas_entry_v0 *r; for_each_replicas_entry_v0(replicas_v0, r) { if (r->data_type >= sizeof(data_has) * 8) continue; for (unsigned i = 0; i < r->nr_devs; i++) if (r->devs[i] == dev) data_has |= 1 << r->data_type; } } return data_has; } unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) { mutex_lock(&c->sb_lock); unsigned ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); mutex_unlock(&c->sb_lock); return ret; } void bch2_fs_replicas_exit(struct bch_fs *c) { kfree(c->replicas.entries); kfree(c->replicas_gc.entries); } |
3 7 7 3 3 3 3 3 3 3 3 12 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 | // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/ceph/libceph.h> #include <linux/ceph/osdmap.h> #include <linux/ceph/decode.h> #include <linux/crush/hash.h> #include <linux/crush/mapper.h> static __printf(2, 3) void osdmap_info(const struct ceph_osdmap *map, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; printk(KERN_INFO "%s (%pU e%u): %pV", KBUILD_MODNAME, &map->fsid, map->epoch, &vaf); va_end(args); } char *ceph_osdmap_state_str(char *str, int len, u32 state) { if (!len) return str; if ((state & CEPH_OSD_EXISTS) && (state & CEPH_OSD_UP)) snprintf(str, len, "exists, up"); else if (state & CEPH_OSD_EXISTS) snprintf(str, len, "exists"); else if (state & CEPH_OSD_UP) snprintf(str, len, "up"); else snprintf(str, len, "doesn't exist"); return str; } /* maps */ static int calc_bits_of(unsigned int t) { int b = 0; while (t) { t = t >> 1; b++; } return b; } /* * the foo_mask is the smallest value 2^n-1 that is >= foo. */ static void calc_pg_masks(struct ceph_pg_pool_info *pi) { pi->pg_num_mask = (1 << calc_bits_of(pi->pg_num-1)) - 1; pi->pgp_num_mask = (1 << calc_bits_of(pi->pgp_num-1)) - 1; } /* * decode crush map */ static int crush_decode_uniform_bucket(void **p, void *end, struct crush_bucket_uniform *b) { dout("crush_decode_uniform_bucket %p to %p\n", *p, end); ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad); b->item_weight = ceph_decode_32(p); return 0; bad: return -EINVAL; } static int crush_decode_list_bucket(void **p, void *end, struct crush_bucket_list *b) { int j; dout("crush_decode_list_bucket %p to %p\n", *p, end); b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); if (b->item_weights == NULL) return -ENOMEM; b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); if (b->sum_weights == NULL) return -ENOMEM; ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); for (j = 0; j < b->h.size; j++) { b->item_weights[j] = ceph_decode_32(p); b->sum_weights[j] = ceph_decode_32(p); } return 0; bad: return -EINVAL; } static int crush_decode_tree_bucket(void **p, void *end, struct crush_bucket_tree *b) { int j; dout("crush_decode_tree_bucket %p to %p\n", *p, end); ceph_decode_8_safe(p, end, b->num_nodes, bad); b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS); if (b->node_weights == NULL) return -ENOMEM; ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad); for (j = 0; j < b->num_nodes; j++) b->node_weights[j] = ceph_decode_32(p); return 0; bad: return -EINVAL; } static int crush_decode_straw_bucket(void **p, void *end, struct crush_bucket_straw *b) { int j; dout("crush_decode_straw_bucket %p to %p\n", *p, end); b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); if (b->item_weights == NULL) return -ENOMEM; b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); if (b->straws == NULL) return -ENOMEM; ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad); for (j = 0; j < b->h.size; j++) { b->item_weights[j] = ceph_decode_32(p); b->straws[j] = ceph_decode_32(p); } return 0; bad: return -EINVAL; } static int crush_decode_straw2_bucket(void **p, void *end, struct crush_bucket_straw2 *b) { int j; dout("crush_decode_straw2_bucket %p to %p\n", *p, end); b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS); if (b->item_weights == NULL) return -ENOMEM; ceph_decode_need(p, end, b->h.size * sizeof(u32), bad); for (j = 0; j < b->h.size; j++) b->item_weights[j] = ceph_decode_32(p); return 0; bad: return -EINVAL; } struct crush_name_node { struct rb_node cn_node; int cn_id; char cn_name[]; }; static struct crush_name_node *alloc_crush_name(size_t name_len) { struct crush_name_node *cn; cn = kmalloc(sizeof(*cn) + name_len + 1, GFP_NOIO); if (!cn) return NULL; RB_CLEAR_NODE(&cn->cn_node); return cn; } static void free_crush_name(struct crush_name_node *cn) { WARN_ON(!RB_EMPTY_NODE(&cn->cn_node)); kfree(cn); } DEFINE_RB_FUNCS(crush_name, struct crush_name_node, cn_id, cn_node) static int decode_crush_names(void **p, void *end, struct rb_root *root) { u32 n; ceph_decode_32_safe(p, end, n, e_inval); while (n--) { struct crush_name_node *cn; int id; u32 name_len; ceph_decode_32_safe(p, end, id, e_inval); ceph_decode_32_safe(p, end, name_len, e_inval); ceph_decode_need(p, end, name_len, e_inval); cn = alloc_crush_name(name_len); if (!cn) return -ENOMEM; cn->cn_id = id; memcpy(cn->cn_name, *p, name_len); cn->cn_name[name_len] = '\0'; *p += name_len; if (!__insert_crush_name(root, cn)) { free_crush_name(cn); return -EEXIST; } } return 0; e_inval: return -EINVAL; } void clear_crush_names(struct rb_root *root) { while (!RB_EMPTY_ROOT(root)) { struct crush_name_node *cn = rb_entry(rb_first(root), struct crush_name_node, cn_node); erase_crush_name(root, cn); free_crush_name(cn); } } static struct crush_choose_arg_map *alloc_choose_arg_map(void) { struct crush_choose_arg_map *arg_map; arg_map = kzalloc(sizeof(*arg_map), GFP_NOIO); if (!arg_map) return NULL; RB_CLEAR_NODE(&arg_map->node); return arg_map; } static void free_choose_arg_map(struct crush_choose_arg_map *arg_map) { if (arg_map) { int i, j; WARN_ON(!RB_EMPTY_NODE(&arg_map->node)); for (i = 0; i < arg_map->size; i++) { struct crush_choose_arg *arg = &arg_map->args[i]; for (j = 0; j < arg->weight_set_size; j++) kfree(arg->weight_set[j].weights); kfree(arg->weight_set); kfree(arg->ids); } kfree(arg_map->args); kfree(arg_map); } } DEFINE_RB_FUNCS(choose_arg_map, struct crush_choose_arg_map, choose_args_index, node); void clear_choose_args(struct crush_map *c) { while (!RB_EMPTY_ROOT(&c->choose_args)) { struct crush_choose_arg_map *arg_map = rb_entry(rb_first(&c->choose_args), struct crush_choose_arg_map, node); erase_choose_arg_map(&c->choose_args, arg_map); free_choose_arg_map(arg_map); } } static u32 *decode_array_32_alloc(void **p, void *end, u32 *plen) { u32 *a = NULL; u32 len; int ret; ceph_decode_32_safe(p, end, len, e_inval); if (len) { u32 i; a = kmalloc_array(len, sizeof(u32), GFP_NOIO); if (!a) { ret = -ENOMEM; goto fail; } ceph_decode_need(p, end, len * sizeof(u32), e_inval); for (i = 0; i < len; i++) a[i] = ceph_decode_32(p); } *plen = len; return a; e_inval: ret = -EINVAL; fail: kfree(a); return ERR_PTR(ret); } /* * Assumes @arg is zero-initialized. */ static int decode_choose_arg(void **p, void *end, struct crush_choose_arg *arg) { int ret; ceph_decode_32_safe(p, end, arg->weight_set_size, e_inval); if (arg->weight_set_size) { u32 i; arg->weight_set = kmalloc_array(arg->weight_set_size, sizeof(*arg->weight_set), GFP_NOIO); if (!arg->weight_set) return -ENOMEM; for (i = 0; i < arg->weight_set_size; i++) { struct crush_weight_set *w = &arg->weight_set[i]; w->weights = decode_array_32_alloc(p, end, &w->size); if (IS_ERR(w->weights)) { ret = PTR_ERR(w->weights); w->weights = NULL; return ret; } } } arg->ids = decode_array_32_alloc(p, end, &arg->ids_size); if (IS_ERR(arg->ids)) { ret = PTR_ERR(arg->ids); arg->ids = NULL; return ret; } return 0; e_inval: return -EINVAL; } static int decode_choose_args(void **p, void *end, struct crush_map *c) { struct crush_choose_arg_map *arg_map = NULL; u32 num_choose_arg_maps, num_buckets; int ret; ceph_decode_32_safe(p, end, num_choose_arg_maps, e_inval); while (num_choose_arg_maps--) { arg_map = alloc_choose_arg_map(); if (!arg_map) { ret = -ENOMEM; goto fail; } ceph_decode_64_safe(p, end, arg_map->choose_args_index, e_inval); arg_map->size = c->max_buckets; arg_map->args = kcalloc(arg_map->size, sizeof(*arg_map->args), GFP_NOIO); if (!arg_map->args) { ret = -ENOMEM; goto fail; } ceph_decode_32_safe(p, end, num_buckets, e_inval); while (num_buckets--) { struct crush_choose_arg *arg; u32 bucket_index; ceph_decode_32_safe(p, end, bucket_index, e_inval); if (bucket_index >= arg_map->size) goto e_inval; arg = &arg_map->args[bucket_index]; ret = decode_choose_arg(p, end, arg); if (ret) goto fail; if (arg->ids_size && arg->ids_size != c->buckets[bucket_index]->size) goto e_inval; } insert_choose_arg_map(&c->choose_args, arg_map); } return 0; e_inval: ret = -EINVAL; fail: free_choose_arg_map(arg_map); return ret; } static void crush_finalize(struct crush_map *c) { __s32 b; /* Space for the array of pointers to per-bucket workspace */ c->working_size = sizeof(struct crush_work) + c->max_buckets * sizeof(struct crush_work_bucket *); for (b = 0; b < c->max_buckets; b++) { if (!c->buckets[b]) continue; switch (c->buckets[b]->alg) { default: /* * The base case, permutation variables and * the pointer to the permutation array. */ c->working_size += sizeof(struct crush_work_bucket); break; } /* Every bucket has a permutation array. */ c->working_size += c->buckets[b]->size * sizeof(__u32); } } static struct crush_map *crush_decode(void *pbyval, void *end) { struct crush_map *c; int err; int i, j; void **p = &pbyval; void *start = pbyval; u32 magic; dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); c = kzalloc(sizeof(*c), GFP_NOFS); if (c == NULL) return ERR_PTR(-ENOMEM); c->type_names = RB_ROOT; c->names = RB_ROOT; c->choose_args = RB_ROOT; /* set tunables to default values */ c->choose_local_tries = 2; c->choose_local_fallback_tries = 5; c->choose_total_tries = 19; c->chooseleaf_descend_once = 0; ceph_decode_need(p, end, 4*sizeof(u32), bad); magic = ceph_decode_32(p); if (magic != CRUSH_MAGIC) { pr_err("crush_decode magic %x != current %x\n", (unsigned int)magic, (unsigned int)CRUSH_MAGIC); goto bad; } c->max_buckets = ceph_decode_32(p); c->max_rules = ceph_decode_32(p); c->max_devices = ceph_decode_32(p); c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS); if (c->buckets == NULL) goto badmem; c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS); if (c->rules == NULL) goto badmem; /* buckets */ for (i = 0; i < c->max_buckets; i++) { int size = 0; u32 alg; struct crush_bucket *b; ceph_decode_32_safe(p, end, alg, bad); if (alg == 0) { c->buckets[i] = NULL; continue; } dout("crush_decode bucket %d off %x %p to %p\n", i, (int)(*p-start), *p, end); switch (alg) { case CRUSH_BUCKET_UNIFORM: size = sizeof(struct crush_bucket_uniform); break; case CRUSH_BUCKET_LIST: size = sizeof(struct crush_bucket_list); break; case CRUSH_BUCKET_TREE: size = sizeof(struct crush_bucket_tree); break; case CRUSH_BUCKET_STRAW: size = sizeof(struct crush_bucket_straw); break; case CRUSH_BUCKET_STRAW2: size = sizeof(struct crush_bucket_straw2); break; default: goto bad; } BUG_ON(size == 0); b = c->buckets[i] = kzalloc(size, GFP_NOFS); if (b == NULL) goto badmem; ceph_decode_need(p, end, 4*sizeof(u32), bad); b->id = ceph_decode_32(p); b->type = ceph_decode_16(p); b->alg = ceph_decode_8(p); b->hash = ceph_decode_8(p); b->weight = ceph_decode_32(p); b->size = ceph_decode_32(p); dout("crush_decode bucket size %d off %x %p to %p\n", b->size, (int)(*p-start), *p, end); b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS); if (b->items == NULL) goto badmem; ceph_decode_need(p, end, b->size*sizeof(u32), bad); for (j = 0; j < b->size; j++) b->items[j] = ceph_decode_32(p); switch (b->alg) { case CRUSH_BUCKET_UNIFORM: err = crush_decode_uniform_bucket(p, end, (struct crush_bucket_uniform *)b); if (err < 0) goto fail; break; case CRUSH_BUCKET_LIST: err = crush_decode_list_bucket(p, end, (struct crush_bucket_list *)b); if (err < 0) goto fail; break; case CRUSH_BUCKET_TREE: err = crush_decode_tree_bucket(p, end, (struct crush_bucket_tree *)b); if (err < 0) goto fail; break; case CRUSH_BUCKET_STRAW: err = crush_decode_straw_bucket(p, end, (struct crush_bucket_straw *)b); if (err < 0) goto fail; break; case CRUSH_BUCKET_STRAW2: err = crush_decode_straw2_bucket(p, end, (struct crush_bucket_straw2 *)b); if (err < 0) goto fail; break; } } /* rules */ dout("rule vec is %p\n", c->rules); for (i = 0; i < c->max_rules; i++) { u32 yes; struct crush_rule *r; ceph_decode_32_safe(p, end, yes, bad); if (!yes) { dout("crush_decode NO rule %d off %x %p to %p\n", i, (int)(*p-start), *p, end); c->rules[i] = NULL; continue; } dout("crush_decode rule %d off %x %p to %p\n", i, (int)(*p-start), *p, end); /* len */ ceph_decode_32_safe(p, end, yes, bad); #if BITS_PER_LONG == 32 if (yes > (ULONG_MAX - sizeof(*r)) / sizeof(struct crush_rule_step)) goto bad; #endif r = kmalloc(struct_size(r, steps, yes), GFP_NOFS); if (r == NULL) goto badmem; dout(" rule %d is at %p\n", i, r); c->rules[i] = r; r->len = yes; ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */ ceph_decode_need(p, end, r->len*3*sizeof(u32), bad); for (j = 0; j < r->len; j++) { r->steps[j].op = ceph_decode_32(p); r->steps[j].arg1 = ceph_decode_32(p); r->steps[j].arg2 = ceph_decode_32(p); } } err = decode_crush_names(p, end, &c->type_names); if (err) goto fail; err = decode_crush_names(p, end, &c->names); if (err) goto fail; ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */ /* tunables */ ceph_decode_need(p, end, 3*sizeof(u32), done); c->choose_local_tries = ceph_decode_32(p); c->choose_local_fallback_tries = ceph_decode_32(p); c->choose_total_tries = ceph_decode_32(p); dout("crush decode tunable choose_local_tries = %d\n", c->choose_local_tries); dout("crush decode tunable choose_local_fallback_tries = %d\n", c->choose_local_fallback_tries); dout("crush decode tunable choose_total_tries = %d\n", c->choose_total_tries); ceph_decode_need(p, end, sizeof(u32), done); c->chooseleaf_descend_once = ceph_decode_32(p); dout("crush decode tunable chooseleaf_descend_once = %d\n", c->chooseleaf_descend_once); ceph_decode_need(p, end, sizeof(u8), done); c->chooseleaf_vary_r = ceph_decode_8(p); dout("crush decode tunable chooseleaf_vary_r = %d\n", c->chooseleaf_vary_r); /* skip straw_calc_version, allowed_bucket_algs */ ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done); *p += sizeof(u8) + sizeof(u32); ceph_decode_need(p, end, sizeof(u8), done); c->chooseleaf_stable = ceph_decode_8(p); dout("crush decode tunable chooseleaf_stable = %d\n", c->chooseleaf_stable); if (*p != end) { /* class_map */ ceph_decode_skip_map(p, end, 32, 32, bad); /* class_name */ ceph_decode_skip_map(p, end, 32, string, bad); /* class_bucket */ ceph_decode_skip_map_of_map(p, end, 32, 32, 32, bad); } if (*p != end) { err = decode_choose_args(p, end, c); if (err) goto fail; } done: crush_finalize(c); dout("crush_decode success\n"); return c; badmem: err = -ENOMEM; fail: dout("crush_decode fail %d\n", err); crush_destroy(c); return ERR_PTR(err); bad: err = -EINVAL; goto fail; } int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs) { if (lhs->pool < rhs->pool) return -1; if (lhs->pool > rhs->pool) return 1; if (lhs->seed < rhs->seed) return -1; if (lhs->seed > rhs->seed) return 1; return 0; } int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs) { int ret; ret = ceph_pg_compare(&lhs->pgid, &rhs->pgid); if (ret) return ret; if (lhs->shard < rhs->shard) return -1; if (lhs->shard > rhs->shard) return 1; return 0; } static struct ceph_pg_mapping *alloc_pg_mapping(size_t payload_len) { struct ceph_pg_mapping *pg; pg = kmalloc(sizeof(*pg) + payload_len, GFP_NOIO); if (!pg) return NULL; RB_CLEAR_NODE(&pg->node); return pg; } static void free_pg_mapping(struct ceph_pg_mapping *pg) { WARN_ON(!RB_EMPTY_NODE(&pg->node)); kfree(pg); } /* * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid * to a set of osds) and primary_temp (explicit primary setting) */ DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare, RB_BYPTR, const struct ceph_pg *, node) /* * rbtree of pg pool info */ DEFINE_RB_FUNCS(pg_pool, struct ceph_pg_pool_info, id, node) struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, u64 id) { return lookup_pg_pool(&map->pg_pools, id); } const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id) { struct ceph_pg_pool_info *pi; if (id == CEPH_NOPOOL) return NULL; if (WARN_ON_ONCE(id > (u64) INT_MAX)) return NULL; pi = lookup_pg_pool(&map->pg_pools, id); return pi ? pi->name : NULL; } EXPORT_SYMBOL(ceph_pg_pool_name_by_id); int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name) { struct rb_node *rbp; for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) { struct ceph_pg_pool_info *pi = rb_entry(rbp, struct ceph_pg_pool_info, node); if (pi->name && strcmp(pi->name, name) == 0) return pi->id; } return -ENOENT; } EXPORT_SYMBOL(ceph_pg_poolid_by_name); u64 ceph_pg_pool_flags(struct ceph_osdmap *map, u64 id) { struct ceph_pg_pool_info *pi; pi = lookup_pg_pool(&map->pg_pools, id); return pi ? pi->flags : 0; } EXPORT_SYMBOL(ceph_pg_pool_flags); static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) { erase_pg_pool(root, pi); kfree(pi->name); kfree(pi); } static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) { u8 ev, cv; unsigned len, num; void *pool_end; ceph_decode_need(p, end, 2 + 4, bad); ev = ceph_decode_8(p); /* encoding version */ cv = ceph_decode_8(p); /* compat version */ if (ev < 5) { pr_warn("got v %d < 5 cv %d of ceph_pg_pool\n", ev, cv); return -EINVAL; } if (cv > 9) { pr_warn("got v %d cv %d > 9 of ceph_pg_pool\n", ev, cv); return -EINVAL; } len = ceph_decode_32(p); ceph_decode_need(p, end, len, bad); pool_end = *p + len; pi->type = ceph_decode_8(p); pi->size = ceph_decode_8(p); pi->crush_ruleset = ceph_decode_8(p); pi->object_hash = ceph_decode_8(p); pi->pg_num = ceph_decode_32(p); pi->pgp_num = ceph_decode_32(p); *p += 4 + 4; /* skip lpg* */ *p += 4; /* skip last_change */ *p += 8 + 4; /* skip snap_seq, snap_epoch */ /* skip snaps */ num = ceph_decode_32(p); while (num--) { *p += 8; /* snapid key */ *p += 1 + 1; /* versions */ len = ceph_decode_32(p); *p += len; } /* skip removed_snaps */ num = ceph_decode_32(p); *p += num * (8 + 8); *p += 8; /* skip auid */ pi->flags = ceph_decode_64(p); *p += 4; /* skip crash_replay_interval */ if (ev >= 7) pi->min_size = ceph_decode_8(p); else pi->min_size = pi->size - pi->size / 2; if (ev >= 8) *p += 8 + 8; /* skip quota_max_* */ if (ev >= 9) { /* skip tiers */ num = ceph_decode_32(p); *p += num * 8; *p += 8; /* skip tier_of */ *p += 1; /* skip cache_mode */ pi->read_tier = ceph_decode_64(p); pi->write_tier = ceph_decode_64(p); } else { pi->read_tier = -1; pi->write_tier = -1; } if (ev >= 10) { /* skip properties */ num = ceph_decode_32(p); while (num--) { len = ceph_decode_32(p); *p += len; /* key */ len = ceph_decode_32(p); *p += len; /* val */ } } if (ev >= 11) { /* skip hit_set_params */ *p += 1 + 1; /* versions */ len = ceph_decode_32(p); *p += len; *p += 4; /* skip hit_set_period */ *p += 4; /* skip hit_set_count */ } if (ev >= 12) *p += 4; /* skip stripe_width */ if (ev >= 13) { *p += 8; /* skip target_max_bytes */ *p += 8; /* skip target_max_objects */ *p += 4; /* skip cache_target_dirty_ratio_micro */ *p += 4; /* skip cache_target_full_ratio_micro */ *p += 4; /* skip cache_min_flush_age */ *p += 4; /* skip cache_min_evict_age */ } if (ev >= 14) { /* skip erasure_code_profile */ len = ceph_decode_32(p); *p += len; } /* * last_force_op_resend_preluminous, will be overridden if the * map was encoded with RESEND_ON_SPLIT */ if (ev >= 15) pi->last_force_request_resend = ceph_decode_32(p); else pi->last_force_request_resend = 0; if (ev >= 16) *p += 4; /* skip min_read_recency_for_promote */ if (ev >= 17) *p += 8; /* skip expected_num_objects */ if (ev >= 19) *p += 4; /* skip cache_target_dirty_high_ratio_micro */ if (ev >= 20) *p += 4; /* skip min_write_recency_for_promote */ if (ev >= 21) *p += 1; /* skip use_gmt_hitset */ if (ev >= 22) *p += 1; /* skip fast_read */ if (ev >= 23) { *p += 4; /* skip hit_set_grade_decay_rate */ *p += 4; /* skip hit_set_search_last_n */ } if (ev >= 24) { /* skip opts */ *p += 1 + 1; /* versions */ len = ceph_decode_32(p); *p += len; } if (ev >= 25) pi->last_force_request_resend = ceph_decode_32(p); /* ignore the rest */ *p = pool_end; calc_pg_masks(pi); return 0; bad: return -EINVAL; } static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map) { struct ceph_pg_pool_info *pi; u32 num, len; u64 pool; ceph_decode_32_safe(p, end, num, bad); dout(" %d pool names\n", num); while (num--) { ceph_decode_64_safe(p, end, pool, bad); ceph_decode_32_safe(p, end, len, bad); dout(" pool %llu len %d\n", pool, len); ceph_decode_need(p, end, len, bad); pi = lookup_pg_pool(&map->pg_pools, pool); if (pi) { char *name = kstrndup(*p, len, GFP_NOFS); if (!name) return -ENOMEM; kfree(pi->name); pi->name = name; dout(" name is %s\n", pi->name); } *p += len; } return 0; bad: return -EINVAL; } /* * CRUSH workspaces * * workspace_manager framework borrowed from fs/btrfs/compression.c. * Two simplifications: there is only one type of workspace and there * is always at least one workspace. */ static struct crush_work *alloc_workspace(const struct crush_map *c) { struct crush_work *work; size_t work_size; WARN_ON(!c->working_size); work_size = crush_work_size(c, CEPH_PG_MAX_SIZE); dout("%s work_size %zu bytes\n", __func__, work_size); work = kvmalloc(work_size, GFP_NOIO); if (!work) return NULL; INIT_LIST_HEAD(&work->item); crush_init_workspace(c, work); return work; } static void free_workspace(struct crush_work *work) { WARN_ON(!list_empty(&work->item)); kvfree(work); } static void init_workspace_manager(struct workspace_manager *wsm) { INIT_LIST_HEAD(&wsm->idle_ws); spin_lock_init(&wsm->ws_lock); atomic_set(&wsm->total_ws, 0); wsm->free_ws = 0; init_waitqueue_head(&wsm->ws_wait); } static void add_initial_workspace(struct workspace_manager *wsm, struct crush_work *work) { WARN_ON(!list_empty(&wsm->idle_ws)); list_add(&work->item, &wsm->idle_ws); atomic_set(&wsm->total_ws, 1); wsm->free_ws = 1; } static void cleanup_workspace_manager(struct workspace_manager *wsm) { struct crush_work *work; while (!list_empty(&wsm->idle_ws)) { work = list_first_entry(&wsm->idle_ws, struct crush_work, item); list_del_init(&work->item); free_workspace(work); } atomic_set(&wsm->total_ws, 0); wsm->free_ws = 0; } /* * Finds an available workspace or allocates a new one. If it's not * possible to allocate a new one, waits until there is one. */ static struct crush_work *get_workspace(struct workspace_manager *wsm, const struct crush_map *c) { struct crush_work *work; int cpus = num_online_cpus(); again: spin_lock(&wsm->ws_lock); if (!list_empty(&wsm->idle_ws)) { work = list_first_entry(&wsm->idle_ws, struct crush_work, item); list_del_init(&work->item); wsm->free_ws--; spin_unlock(&wsm->ws_lock); return work; } if (atomic_read(&wsm->total_ws) > cpus) { DEFINE_WAIT(wait); spin_unlock(&wsm->ws_lock); prepare_to_wait(&wsm->ws_wait, &wait, TASK_UNINTERRUPTIBLE); if (atomic_read(&wsm->total_ws) > cpus && !wsm->free_ws) schedule(); finish_wait(&wsm->ws_wait, &wait); goto again; } atomic_inc(&wsm->total_ws); spin_unlock(&wsm->ws_lock); work = alloc_workspace(c); if (!work) { atomic_dec(&wsm->total_ws); wake_up(&wsm->ws_wait); /* * Do not return the error but go back to waiting. We * have the initial workspace and the CRUSH computation * time is bounded so we will get it eventually. */ WARN_ON(atomic_read(&wsm->total_ws) < 1); goto again; } return work; } /* * Puts a workspace back on the list or frees it if we have enough * idle ones sitting around. */ static void put_workspace(struct workspace_manager *wsm, struct crush_work *work) { spin_lock(&wsm->ws_lock); if (wsm->free_ws <= num_online_cpus()) { list_add(&work->item, &wsm->idle_ws); wsm->free_ws++; spin_unlock(&wsm->ws_lock); goto wake; } spin_unlock(&wsm->ws_lock); free_workspace(work); atomic_dec(&wsm->total_ws); wake: if (wq_has_sleeper(&wsm->ws_wait)) wake_up(&wsm->ws_wait); } /* * osd map */ struct ceph_osdmap *ceph_osdmap_alloc(void) { struct ceph_osdmap *map; map = kzalloc(sizeof(*map), GFP_NOIO); if (!map) return NULL; map->pg_pools = RB_ROOT; map->pool_max = -1; map->pg_temp = RB_ROOT; map->primary_temp = RB_ROOT; map->pg_upmap = RB_ROOT; map->pg_upmap_items = RB_ROOT; init_workspace_manager(&map->crush_wsm); return map; } void ceph_osdmap_destroy(struct ceph_osdmap *map) { dout("osdmap_destroy %p\n", map); if (map->crush) crush_destroy(map->crush); cleanup_workspace_manager(&map->crush_wsm); while (!RB_EMPTY_ROOT(&map->pg_temp)) { struct ceph_pg_mapping *pg = rb_entry(rb_first(&map->pg_temp), struct ceph_pg_mapping, node); erase_pg_mapping(&map->pg_temp, pg); free_pg_mapping(pg); } while (!RB_EMPTY_ROOT(&map->primary_temp)) { struct ceph_pg_mapping *pg = rb_entry(rb_first(&map->primary_temp), struct ceph_pg_mapping, node); erase_pg_mapping(&map->primary_temp, pg); free_pg_mapping(pg); } while (!RB_EMPTY_ROOT(&map->pg_upmap)) { struct ceph_pg_mapping *pg = rb_entry(rb_first(&map->pg_upmap), struct ceph_pg_mapping, node); rb_erase(&pg->node, &map->pg_upmap); kfree(pg); } while (!RB_EMPTY_ROOT(&map->pg_upmap_items)) { struct ceph_pg_mapping *pg = rb_entry(rb_first(&map->pg_upmap_items), struct ceph_pg_mapping, node); rb_erase(&pg->node, &map->pg_upmap_items); kfree(pg); } while (!RB_EMPTY_ROOT(&map->pg_pools)) { struct ceph_pg_pool_info *pi = rb_entry(rb_first(&map->pg_pools), struct ceph_pg_pool_info, node); __remove_pg_pool(&map->pg_pools, pi); } kvfree(map->osd_state); kvfree(map->osd_weight); kvfree(map->osd_addr); kvfree(map->osd_primary_affinity); kfree(map); } /* * Adjust max_osd value, (re)allocate arrays. * * The new elements are properly initialized. */ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) { u32 *state; u32 *weight; struct ceph_entity_addr *addr; u32 to_copy; int i; dout("%s old %u new %u\n", __func__, map->max_osd, max); if (max == map->max_osd) return 0; state = kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); weight = kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); addr = kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); if (!state || !weight || !addr) { kvfree(state); kvfree(weight); kvfree(addr); return -ENOMEM; } to_copy = min(map->max_osd, max); if (map->osd_state) { memcpy(state, map->osd_state, to_copy * sizeof(*state)); memcpy(weight, map->osd_weight, to_copy * sizeof(*weight)); memcpy(addr, map->osd_addr, to_copy * sizeof(*addr)); kvfree(map->osd_state); kvfree(map->osd_weight); kvfree(map->osd_addr); } map->osd_state = state; map->osd_weight = weight; map->osd_addr = addr; for (i = map->max_osd; i < max; i++) { map->osd_state[i] = 0; map->osd_weight[i] = CEPH_OSD_OUT; memset(map->osd_addr + i, 0, sizeof(*map->osd_addr)); } if (map->osd_primary_affinity) { u32 *affinity; affinity = kvmalloc(array_size(max, sizeof(*affinity)), GFP_NOFS); if (!affinity) return -ENOMEM; memcpy(affinity, map->osd_primary_affinity, to_copy * sizeof(*affinity)); kvfree(map->osd_primary_affinity); map->osd_primary_affinity = affinity; for (i = map->max_osd; i < max; i++) map->osd_primary_affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; } map->max_osd = max; return 0; } static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) { struct crush_work *work; if (IS_ERR(crush)) return PTR_ERR(crush); work = alloc_workspace(crush); if (!work) { crush_destroy(crush); return -ENOMEM; } if (map->crush) crush_destroy(map->crush); cleanup_workspace_manager(&map->crush_wsm); map->crush = crush; add_initial_workspace(&map->crush_wsm, work); return 0; } #define OSDMAP_WRAPPER_COMPAT_VER 7 #define OSDMAP_CLIENT_DATA_COMPAT_VER 1 /* * Return 0 or error. On success, *v is set to 0 for old (v6) osdmaps, * to struct_v of the client_data section for new (v7 and above) * osdmaps. */ static int get_osdmap_client_data_v(void **p, void *end, const char *prefix, u8 *v) { u8 struct_v; ceph_decode_8_safe(p, end, struct_v, e_inval); if (struct_v >= 7) { u8 struct_compat; ceph_decode_8_safe(p, end, struct_compat, e_inval); if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) { pr_warn("got v %d cv %d > %d of %s ceph_osdmap\n", struct_v, struct_compat, OSDMAP_WRAPPER_COMPAT_VER, prefix); return -EINVAL; } *p += 4; /* ignore wrapper struct_len */ ceph_decode_8_safe(p, end, struct_v, e_inval); ceph_decode_8_safe(p, end, struct_compat, e_inval); if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) { pr_warn("got v %d cv %d > %d of %s ceph_osdmap client data\n", struct_v, struct_compat, OSDMAP_CLIENT_DATA_COMPAT_VER, prefix); return -EINVAL; } *p += 4; /* ignore client data struct_len */ } else { u16 version; *p -= 1; ceph_decode_16_safe(p, end, version, e_inval); if (version < 6) { pr_warn("got v %d < 6 of %s ceph_osdmap\n", version, prefix); return -EINVAL; } /* old osdmap encoding */ struct_v = 0; } *v = struct_v; return 0; e_inval: return -EINVAL; } static int __decode_pools(void **p, void *end, struct ceph_osdmap *map, bool incremental) { u32 n; ceph_decode_32_safe(p, end, n, e_inval); while (n--) { struct ceph_pg_pool_info *pi; u64 pool; int ret; ceph_decode_64_safe(p, end, pool, e_inval); pi = lookup_pg_pool(&map->pg_pools, pool); if (!incremental || !pi) { pi = kzalloc(sizeof(*pi), GFP_NOFS); if (!pi) return -ENOMEM; RB_CLEAR_NODE(&pi->node); pi->id = pool; if (!__insert_pg_pool(&map->pg_pools, pi)) { kfree(pi); return -EEXIST; } } ret = decode_pool(p, end, pi); if (ret) return ret; } return 0; e_inval: return -EINVAL; } static int decode_pools(void **p, void *end, struct ceph_osdmap *map) { return __decode_pools(p, end, map, false); } static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map) { return __decode_pools(p, end, map, true); } typedef struct ceph_pg_mapping *(*decode_mapping_fn_t)(void **, void *, bool); static int decode_pg_mapping(void **p, void *end, struct rb_root *mapping_root, decode_mapping_fn_t fn, bool incremental) { u32 n; WARN_ON(!incremental && !fn); ceph_decode_32_safe(p, end, n, e_inval); while (n--) { struct ceph_pg_mapping *pg; struct ceph_pg pgid; int ret; ret = ceph_decode_pgid(p, end, &pgid); if (ret) return ret; pg = lookup_pg_mapping(mapping_root, &pgid); if (pg) { WARN_ON(!incremental); erase_pg_mapping(mapping_root, pg); free_pg_mapping(pg); } if (fn) { pg = fn(p, end, incremental); if (IS_ERR(pg)) return PTR_ERR(pg); if (pg) { pg->pgid = pgid; /* struct */ insert_pg_mapping(mapping_root, pg); } } } return 0; e_inval: return -EINVAL; } static struct ceph_pg_mapping *__decode_pg_temp(void **p, void *end, bool incremental) { struct ceph_pg_mapping *pg; u32 len, i; ceph_decode_32_safe(p, end, len, e_inval); if (len == 0 && incremental) return NULL; /* new_pg_temp: [] to remove */ if (len > (SIZE_MAX - sizeof(*pg)) / sizeof(u32)) return ERR_PTR(-EINVAL); ceph_decode_need(p, end, len * sizeof(u32), e_inval); pg = alloc_pg_mapping(len * sizeof(u32)); if (!pg) return ERR_PTR(-ENOMEM); pg->pg_temp.len = len; for (i = 0; i < len; i++) pg->pg_temp.osds[i] = ceph_decode_32(p); return pg; e_inval: return ERR_PTR(-EINVAL); } static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp, false); } static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp, true); } static struct ceph_pg_mapping *__decode_primary_temp(void **p, void *end, bool incremental) { struct ceph_pg_mapping *pg; u32 osd; ceph_decode_32_safe(p, end, osd, e_inval); if (osd == (u32)-1 && incremental) return NULL; /* new_primary_temp: -1 to remove */ pg = alloc_pg_mapping(0); if (!pg) return ERR_PTR(-ENOMEM); pg->primary_temp.osd = osd; return pg; e_inval: return ERR_PTR(-EINVAL); } static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->primary_temp, __decode_primary_temp, false); } static int decode_new_primary_temp(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->primary_temp, __decode_primary_temp, true); } u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) { BUG_ON(osd >= map->max_osd); if (!map->osd_primary_affinity) return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; return map->osd_primary_affinity[osd]; } static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) { BUG_ON(osd >= map->max_osd); if (!map->osd_primary_affinity) { int i; map->osd_primary_affinity = kvmalloc( array_size(map->max_osd, sizeof(*map->osd_primary_affinity)), GFP_NOFS); if (!map->osd_primary_affinity) return -ENOMEM; for (i = 0; i < map->max_osd; i++) map->osd_primary_affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; } map->osd_primary_affinity[osd] = aff; return 0; } static int decode_primary_affinity(void **p, void *end, struct ceph_osdmap *map) { u32 len, i; ceph_decode_32_safe(p, end, len, e_inval); if (len == 0) { kvfree(map->osd_primary_affinity); map->osd_primary_affinity = NULL; return 0; } if (len != map->max_osd) goto e_inval; ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval); for (i = 0; i < map->max_osd; i++) { int ret; ret = set_primary_affinity(map, i, ceph_decode_32(p)); if (ret) return ret; } return 0; e_inval: return -EINVAL; } static int decode_new_primary_affinity(void **p, void *end, struct ceph_osdmap *map) { u32 n; ceph_decode_32_safe(p, end, n, e_inval); while (n--) { u32 osd, aff; int ret; ceph_decode_32_safe(p, end, osd, e_inval); ceph_decode_32_safe(p, end, aff, e_inval); ret = set_primary_affinity(map, osd, aff); if (ret) return ret; osdmap_info(map, "osd%d primary-affinity 0x%x\n", osd, aff); } return 0; e_inval: return -EINVAL; } static struct ceph_pg_mapping *__decode_pg_upmap(void **p, void *end, bool __unused) { return __decode_pg_temp(p, end, false); } static int decode_pg_upmap(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap, false); } static int decode_new_pg_upmap(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->pg_upmap, __decode_pg_upmap, true); } static int decode_old_pg_upmap(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->pg_upmap, NULL, true); } static struct ceph_pg_mapping *__decode_pg_upmap_items(void **p, void *end, bool __unused) { struct ceph_pg_mapping *pg; u32 len, i; ceph_decode_32_safe(p, end, len, e_inval); if (len > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32))) return ERR_PTR(-EINVAL); ceph_decode_need(p, end, 2 * len * sizeof(u32), e_inval); pg = alloc_pg_mapping(2 * len * sizeof(u32)); if (!pg) return ERR_PTR(-ENOMEM); pg->pg_upmap_items.len = len; for (i = 0; i < len; i++) { pg->pg_upmap_items.from_to[i][0] = ceph_decode_32(p); pg->pg_upmap_items.from_to[i][1] = ceph_decode_32(p); } return pg; e_inval: return ERR_PTR(-EINVAL); } static int decode_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->pg_upmap_items, __decode_pg_upmap_items, false); } static int decode_new_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->pg_upmap_items, __decode_pg_upmap_items, true); } static int decode_old_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map) { return decode_pg_mapping(p, end, &map->pg_upmap_items, NULL, true); } /* * decode a full map. */ static int osdmap_decode(void **p, void *end, bool msgr2, struct ceph_osdmap *map) { u8 struct_v; u32 epoch = 0; void *start = *p; u32 max; u32 len, i; int err; dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); err = get_osdmap_client_data_v(p, end, "full", &struct_v); if (err) goto bad; /* fsid, epoch, created, modified */ ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) + sizeof(map->created) + sizeof(map->modified), e_inval); ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); epoch = map->epoch = ceph_decode_32(p); ceph_decode_copy(p, &map->created, sizeof(map->created)); ceph_decode_copy(p, &map->modified, sizeof(map->modified)); /* pools */ err = decode_pools(p, end, map); if (err) goto bad; /* pool_name */ err = decode_pool_names(p, end, map); if (err) goto bad; ceph_decode_32_safe(p, end, map->pool_max, e_inval); ceph_decode_32_safe(p, end, map->flags, e_inval); /* max_osd */ ceph_decode_32_safe(p, end, max, e_inval); /* (re)alloc osd arrays */ err = osdmap_set_max_osd(map, max); if (err) goto bad; /* osd_state, osd_weight, osd_addrs->client_addr */ ceph_decode_need(p, end, 3*sizeof(u32) + map->max_osd*(struct_v >= 5 ? sizeof(u32) : sizeof(u8)) + sizeof(*map->osd_weight), e_inval); if (ceph_decode_32(p) != map->max_osd) goto e_inval; if (struct_v >= 5) { for (i = 0; i < map->max_osd; i++) map->osd_state[i] = ceph_decode_32(p); } else { for (i = 0; i < map->max_osd; i++) map->osd_state[i] = ceph_decode_8(p); } if (ceph_decode_32(p) != map->max_osd) goto e_inval; for (i = 0; i < map->max_osd; i++) map->osd_weight[i] = ceph_decode_32(p); if (ceph_decode_32(p) != map->max_osd) goto e_inval; for (i = 0; i < map->max_osd; i++) { struct ceph_entity_addr *addr = &map->osd_addr[i]; if (struct_v >= 8) err = ceph_decode_entity_addrvec(p, end, msgr2, addr); else err = ceph_decode_entity_addr(p, end, addr); if (err) goto bad; dout("%s osd%d addr %s\n", __func__, i, ceph_pr_addr(addr)); } /* pg_temp */ err = decode_pg_temp(p, end, map); if (err) goto bad; /* primary_temp */ if (struct_v >= 1) { err = decode_primary_temp(p, end, map); if (err) goto bad; } /* primary_affinity */ if (struct_v >= 2) { err = decode_primary_affinity(p, end, map); if (err) goto bad; } else { WARN_ON(map->osd_primary_affinity); } /* crush */ ceph_decode_32_safe(p, end, len, e_inval); err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end))); if (err) goto bad; *p += len; if (struct_v >= 3) { /* erasure_code_profiles */ ceph_decode_skip_map_of_map(p, end, string, string, string, e_inval); } if (struct_v >= 4) { err = decode_pg_upmap(p, end, map); if (err) goto bad; err = decode_pg_upmap_items(p, end, map); if (err) goto bad; } else { WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap)); WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap_items)); } /* ignore the rest */ *p = end; dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); return 0; e_inval: err = -EINVAL; bad: pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n", err, epoch, (int)(*p - start), *p, start, end); print_hex_dump(KERN_DEBUG, "osdmap: ", DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); return err; } /* * Allocate and decode a full map. */ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end, bool msgr2) { struct ceph_osdmap *map; int ret; map = ceph_osdmap_alloc(); if (!map) return ERR_PTR(-ENOMEM); ret = osdmap_decode(p, end, msgr2, map); if (ret) { ceph_osdmap_destroy(map); return ERR_PTR(ret); } return map; } /* * Encoding order is (new_up_client, new_state, new_weight). Need to * apply in the (new_weight, new_state, new_up_client) order, because * an incremental map may look like e.g. * * new_up_client: { osd=6, addr=... } # set osd_state and addr * new_state: { osd=6, xorstate=EXISTS } # clear osd_state */ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, bool msgr2, struct ceph_osdmap *map) { void *new_up_client; void *new_state; void *new_weight_end; u32 len; int ret; int i; new_up_client = *p; ceph_decode_32_safe(p, end, len, e_inval); for (i = 0; i < len; ++i) { struct ceph_entity_addr addr; ceph_decode_skip_32(p, end, e_inval); if (struct_v >= 7) ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); else ret = ceph_decode_entity_addr(p, end, &addr); if (ret) return ret; } new_state = *p; ceph_decode_32_safe(p, end, len, e_inval); len *= sizeof(u32) + (struct_v >= 5 ? sizeof(u32) : sizeof(u8)); ceph_decode_need(p, end, len, e_inval); *p += len; /* new_weight */ ceph_decode_32_safe(p, end, len, e_inval); while (len--) { s32 osd; u32 w; ceph_decode_need(p, end, 2*sizeof(u32), e_inval); osd = ceph_decode_32(p); w = ceph_decode_32(p); BUG_ON(osd >= map->max_osd); osdmap_info(map, "osd%d weight 0x%x %s\n", osd, w, w == CEPH_OSD_IN ? "(in)" : (w == CEPH_OSD_OUT ? "(out)" : "")); map->osd_weight[osd] = w; /* * If we are marking in, set the EXISTS, and clear the * AUTOOUT and NEW bits. */ if (w) { map->osd_state[osd] |= CEPH_OSD_EXISTS; map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT | CEPH_OSD_NEW); } } new_weight_end = *p; /* new_state (up/down) */ *p = new_state; len = ceph_decode_32(p); while (len--) { s32 osd; u32 xorstate; osd = ceph_decode_32(p); if (struct_v >= 5) xorstate = ceph_decode_32(p); else xorstate = ceph_decode_8(p); if (xorstate == 0) xorstate = CEPH_OSD_UP; BUG_ON(osd >= map->max_osd); if ((map->osd_state[osd] & CEPH_OSD_UP) && (xorstate & CEPH_OSD_UP)) osdmap_info(map, "osd%d down\n", osd); if ((map->osd_state[osd] & CEPH_OSD_EXISTS) && (xorstate & CEPH_OSD_EXISTS)) { osdmap_info(map, "osd%d does not exist\n", osd); ret = set_primary_affinity(map, osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); if (ret) return ret; memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr)); map->osd_state[osd] = 0; } else { map->osd_state[osd] ^= xorstate; } } /* new_up_client */ *p = new_up_client; len = ceph_decode_32(p); while (len--) { s32 osd; struct ceph_entity_addr addr; osd = ceph_decode_32(p); BUG_ON(osd >= map->max_osd); if (struct_v >= 7) ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); else ret = ceph_decode_entity_addr(p, end, &addr); if (ret) return ret; dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr)); osdmap_info(map, "osd%d up\n", osd); map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; map->osd_addr[osd] = addr; } *p = new_weight_end; return 0; e_inval: return -EINVAL; } /* * decode and apply an incremental map update. */ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, bool msgr2, struct ceph_osdmap *map) { struct ceph_fsid fsid; u32 epoch = 0; struct ceph_timespec modified; s32 len; u64 pool; __s64 new_pool_max; __s32 new_flags, max; void *start = *p; int err; u8 struct_v; dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p)); err = get_osdmap_client_data_v(p, end, "inc", &struct_v); if (err) goto bad; /* fsid, epoch, modified, new_pool_max, new_flags */ ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) + sizeof(u64) + sizeof(u32), e_inval); ceph_decode_copy(p, &fsid, sizeof(fsid)); epoch = ceph_decode_32(p); BUG_ON(epoch != map->epoch+1); ceph_decode_copy(p, &modified, sizeof(modified)); new_pool_max = ceph_decode_64(p); new_flags = ceph_decode_32(p); /* full map? */ ceph_decode_32_safe(p, end, len, e_inval); if (len > 0) { dout("apply_incremental full map len %d, %p to %p\n", len, *p, end); return ceph_osdmap_decode(p, min(*p+len, end), msgr2); } /* new crush? */ ceph_decode_32_safe(p, end, len, e_inval); if (len > 0) { err = osdmap_set_crush(map, crush_decode(*p, min(*p + len, end))); if (err) goto bad; *p += len; } /* new flags? */ if (new_flags >= 0) map->flags = new_flags; if (new_pool_max >= 0) map->pool_max = new_pool_max; /* new max? */ ceph_decode_32_safe(p, end, max, e_inval); if (max >= 0) { err = osdmap_set_max_osd(map, max); if (err) goto bad; } map->epoch++; map->modified = modified; /* new_pools */ err = decode_new_pools(p, end, map); if (err) goto bad; /* new_pool_names */ err = decode_pool_names(p, end, map); if (err) goto bad; /* old_pool */ ceph_decode_32_safe(p, end, len, e_inval); while (len--) { struct ceph_pg_pool_info *pi; ceph_decode_64_safe(p, end, pool, e_inval); pi = lookup_pg_pool(&map->pg_pools, pool); if (pi) __remove_pg_pool(&map->pg_pools, pi); } /* new_up_client, new_state, new_weight */ err = decode_new_up_state_weight(p, end, struct_v, msgr2, map); if (err) goto bad; /* new_pg_temp */ err = decode_new_pg_temp(p, end, map); if (err) goto bad; /* new_primary_temp */ if (struct_v >= 1) { err = decode_new_primary_temp(p, end, map); if (err) goto bad; } /* new_primary_affinity */ if (struct_v >= 2) { err = decode_new_primary_affinity(p, end, map); if (err) goto bad; } if (struct_v >= 3) { /* new_erasure_code_profiles */ ceph_decode_skip_map_of_map(p, end, string, string, string, e_inval); /* old_erasure_code_profiles */ ceph_decode_skip_set(p, end, string, e_inval); } if (struct_v >= 4) { err = decode_new_pg_upmap(p, end, map); if (err) goto bad; err = decode_old_pg_upmap(p, end, map); if (err) goto bad; err = decode_new_pg_upmap_items(p, end, map); if (err) goto bad; err = decode_old_pg_upmap_items(p, end, map); if (err) goto bad; } /* ignore the rest */ *p = end; dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd); return map; e_inval: err = -EINVAL; bad: pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n", err, epoch, (int)(*p - start), *p, start, end); print_hex_dump(KERN_DEBUG, "osdmap: ", DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); return ERR_PTR(err); } void ceph_oloc_copy(struct ceph_object_locator *dest, const struct ceph_object_locator *src) { ceph_oloc_destroy(dest); dest->pool = src->pool; if (src->pool_ns) dest->pool_ns = ceph_get_string(src->pool_ns); else dest->pool_ns = NULL; } EXPORT_SYMBOL(ceph_oloc_copy); void ceph_oloc_destroy(struct ceph_object_locator *oloc) { ceph_put_string(oloc->pool_ns); } EXPORT_SYMBOL(ceph_oloc_destroy); void ceph_oid_copy(struct ceph_object_id *dest, const struct ceph_object_id *src) { ceph_oid_destroy(dest); if (src->name != src->inline_name) { /* very rare, see ceph_object_id definition */ dest->name = kmalloc(src->name_len + 1, GFP_NOIO | __GFP_NOFAIL); } else { dest->name = dest->inline_name; } memcpy(dest->name, src->name, src->name_len + 1); dest->name_len = src->name_len; } EXPORT_SYMBOL(ceph_oid_copy); static __printf(2, 0) int oid_printf_vargs(struct ceph_object_id *oid, const char *fmt, va_list ap) { int len; WARN_ON(!ceph_oid_empty(oid)); len = vsnprintf(oid->inline_name, sizeof(oid->inline_name), fmt, ap); if (len >= sizeof(oid->inline_name)) return len; oid->name_len = len; return 0; } /* * If oid doesn't fit into inline buffer, BUG. */ void ceph_oid_printf(struct ceph_object_id *oid, const char *fmt, ...) { va_list ap; va_start(ap, fmt); BUG_ON(oid_printf_vargs(oid, fmt, ap)); va_end(ap); } EXPORT_SYMBOL(ceph_oid_printf); static __printf(3, 0) int oid_aprintf_vargs(struct ceph_object_id *oid, gfp_t gfp, const char *fmt, va_list ap) { va_list aq; int len; va_copy(aq, ap); len = oid_printf_vargs(oid, fmt, aq); va_end(aq); if (len) { char *external_name; external_name = kmalloc(len + 1, gfp); if (!external_name) return -ENOMEM; oid->name = external_name; WARN_ON(vsnprintf(oid->name, len + 1, fmt, ap) != len); oid->name_len = len; } return 0; } /* * If oid doesn't fit into inline buffer, allocate. */ int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp, const char *fmt, ...) { va_list ap; int ret; va_start(ap, fmt); ret = oid_aprintf_vargs(oid, gfp, fmt, ap); va_end(ap); return ret; } EXPORT_SYMBOL(ceph_oid_aprintf); void ceph_oid_destroy(struct ceph_object_id *oid) { if (oid->name != oid->inline_name) kfree(oid->name); } EXPORT_SYMBOL(ceph_oid_destroy); /* * osds only */ static bool __osds_equal(const struct ceph_osds *lhs, const struct ceph_osds *rhs) { if (lhs->size == rhs->size && !memcmp(lhs->osds, rhs->osds, rhs->size * sizeof(rhs->osds[0]))) return true; return false; } /* * osds + primary */ static bool osds_equal(const struct ceph_osds *lhs, const struct ceph_osds *rhs) { if (__osds_equal(lhs, rhs) && lhs->primary == rhs->primary) return true; return false; } static bool osds_valid(const struct ceph_osds *set) { /* non-empty set */ if (set->size > 0 && set->primary >= 0) return true; /* empty can_shift_osds set */ if (!set->size && set->primary == -1) return true; /* empty !can_shift_osds set - all NONE */ if (set->size > 0 && set->primary == -1) { int i; for (i = 0; i < set->size; i++) { if (set->osds[i] != CRUSH_ITEM_NONE) break; } if (i == set->size) return true; } return false; } void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src) { memcpy(dest->osds, src->osds, src->size * sizeof(src->osds[0])); dest->size = src->size; dest->primary = src->primary; } bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num, u32 new_pg_num) { int old_bits = calc_bits_of(old_pg_num); int old_mask = (1 << old_bits) - 1; int n; WARN_ON(pgid->seed >= old_pg_num); if (new_pg_num <= old_pg_num) return false; for (n = 1; ; n++) { int next_bit = n << (old_bits - 1); u32 s = next_bit | pgid->seed; if (s < old_pg_num || s == pgid->seed) continue; if (s >= new_pg_num) break; s = ceph_stable_mod(s, old_pg_num, old_mask); if (s == pgid->seed) return true; } return false; } bool ceph_is_new_interval(const struct ceph_osds *old_acting, const struct ceph_osds *new_acting, const struct ceph_osds *old_up, const struct ceph_osds *new_up, int old_size, int new_size, int old_min_size, int new_min_size, u32 old_pg_num, u32 new_pg_num, bool old_sort_bitwise, bool new_sort_bitwise, bool old_recovery_deletes, bool new_recovery_deletes, const struct ceph_pg *pgid) { return !osds_equal(old_acting, new_acting) || !osds_equal(old_up, new_up) || old_size != new_size || old_min_size != new_min_size || ceph_pg_is_split(pgid, old_pg_num, new_pg_num) || old_sort_bitwise != new_sort_bitwise || old_recovery_deletes != new_recovery_deletes; } static int calc_pg_rank(int osd, const struct ceph_osds *acting) { int i; for (i = 0; i < acting->size; i++) { if (acting->osds[i] == osd) return i; } return -1; } static bool primary_changed(const struct ceph_osds *old_acting, const struct ceph_osds *new_acting) { if (!old_acting->size && !new_acting->size) return false; /* both still empty */ if (!old_acting->size ^ !new_acting->size) return true; /* was empty, now not, or vice versa */ if (old_acting->primary != new_acting->primary) return true; /* primary changed */ if (calc_pg_rank(old_acting->primary, old_acting) != calc_pg_rank(new_acting->primary, new_acting)) return true; return false; /* same primary (tho replicas may have changed) */ } bool ceph_osds_changed(const struct ceph_osds *old_acting, const struct ceph_osds *new_acting, bool any_change) { if (primary_changed(old_acting, new_acting)) return true; if (any_change && !__osds_equal(old_acting, new_acting)) return true; return false; } /* * Map an object into a PG. * * Should only be called with target_oid and target_oloc (as opposed to * base_oid and base_oloc), since tiering isn't taken into account. */ void __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi, const struct ceph_object_id *oid, const struct ceph_object_locator *oloc, struct ceph_pg *raw_pgid) { WARN_ON(pi->id != oloc->pool); if (!oloc->pool_ns) { raw_pgid->pool = oloc->pool; raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name, oid->name_len); dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name, raw_pgid->pool, raw_pgid->seed); } else { char stack_buf[256]; char *buf = stack_buf; int nsl = oloc->pool_ns->len; size_t total = nsl + 1 + oid->name_len; if (total > sizeof(stack_buf)) buf = kmalloc(total, GFP_NOIO | __GFP_NOFAIL); memcpy(buf, oloc->pool_ns->str, nsl); buf[nsl] = '\037'; memcpy(buf + nsl + 1, oid->name, oid->name_len); raw_pgid->pool = oloc->pool; raw_pgid->seed = ceph_str_hash(pi->object_hash, buf, total); if (buf != stack_buf) kfree(buf); dout("%s %s ns %.*s -> raw_pgid %llu.%x\n", __func__, oid->name, nsl, oloc->pool_ns->str, raw_pgid->pool, raw_pgid->seed); } } int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, const struct ceph_object_id *oid, const struct ceph_object_locator *oloc, struct ceph_pg *raw_pgid) { struct ceph_pg_pool_info *pi; pi = ceph_pg_pool_by_id(osdmap, oloc->pool); if (!pi) return -ENOENT; __ceph_object_locator_to_pg(pi, oid, oloc, raw_pgid); return 0; } EXPORT_SYMBOL(ceph_object_locator_to_pg); /* * Map a raw PG (full precision ps) into an actual PG. */ static void raw_pg_to_pg(struct ceph_pg_pool_info *pi, const struct ceph_pg *raw_pgid, struct ceph_pg *pgid) { pgid->pool = raw_pgid->pool; pgid->seed = ceph_stable_mod(raw_pgid->seed, pi->pg_num, pi->pg_num_mask); } /* * Map a raw PG (full precision ps) into a placement ps (placement * seed). Include pool id in that value so that different pools don't * use the same seeds. */ static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi, const struct ceph_pg *raw_pgid) { if (pi->flags & CEPH_POOL_FLAG_HASHPSPOOL) { /* hash pool id and seed so that pool PGs do not overlap */ return crush_hash32_2(CRUSH_HASH_RJENKINS1, ceph_stable_mod(raw_pgid->seed, pi->pgp_num, pi->pgp_num_mask), raw_pgid->pool); } else { /* * legacy behavior: add ps and pool together. this is * not a great approach because the PGs from each pool * will overlap on top of each other: 0.5 == 1.4 == * 2.3 == ... */ return ceph_stable_mod(raw_pgid->seed, pi->pgp_num, pi->pgp_num_mask) + (unsigned)raw_pgid->pool; } } /* * Magic value used for a "default" fallback choose_args, used if the * crush_choose_arg_map passed to do_crush() does not exist. If this * also doesn't exist, fall back to canonical weights. */ #define CEPH_DEFAULT_CHOOSE_ARGS -1 static int do_crush(struct ceph_osdmap *map, int ruleno, int x, int *result, int result_max, const __u32 *weight, int weight_max, s64 choose_args_index) { struct crush_choose_arg_map *arg_map; struct crush_work *work; int r; BUG_ON(result_max > CEPH_PG_MAX_SIZE); arg_map = lookup_choose_arg_map(&map->crush->choose_args, choose_args_index); if (!arg_map) arg_map = lookup_choose_arg_map(&map->crush->choose_args, CEPH_DEFAULT_CHOOSE_ARGS); work = get_workspace(&map->crush_wsm, map->crush); r = crush_do_rule(map->crush, ruleno, x, result, result_max, weight, weight_max, work, arg_map ? arg_map->args : NULL); put_workspace(&map->crush_wsm, work); return r; } static void remove_nonexistent_osds(struct ceph_osdmap *osdmap, struct ceph_pg_pool_info *pi, struct ceph_osds *set) { int i; if (ceph_can_shift_osds(pi)) { int removed = 0; /* shift left */ for (i = 0; i < set->size; i++) { if (!ceph_osd_exists(osdmap, set->osds[i])) { removed++; continue; } if (removed) set->osds[i - removed] = set->osds[i]; } set->size -= removed; } else { /* set dne devices to NONE */ for (i = 0; i < set->size; i++) { if (!ceph_osd_exists(osdmap, set->osds[i])) set->osds[i] = CRUSH_ITEM_NONE; } } } /* * Calculate raw set (CRUSH output) for given PG and filter out * nonexistent OSDs. ->primary is undefined for a raw set. * * Placement seed (CRUSH input) is returned through @ppps. */ static void pg_to_raw_osds(struct ceph_osdmap *osdmap, struct ceph_pg_pool_info *pi, const struct ceph_pg *raw_pgid, struct ceph_osds *raw, u32 *ppps) { u32 pps = raw_pg_to_pps(pi, raw_pgid); int ruleno; int len; ceph_osds_init(raw); if (ppps) *ppps = pps; ruleno = crush_find_rule(osdmap->crush, pi->crush_ruleset, pi->type, pi->size); if (ruleno < 0) { pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n", pi->id, pi->crush_ruleset, pi->type, pi->size); return; } if (pi->size > ARRAY_SIZE(raw->osds)) { pr_err_ratelimited("pool %lld ruleset %d type %d too wide: size %d > %zu\n", pi->id, pi->crush_ruleset, pi->type, pi->size, ARRAY_SIZE(raw->osds)); return; } len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size, osdmap->osd_weight, osdmap->max_osd, pi->id); if (len < 0) { pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", len, ruleno, pi->id, pi->crush_ruleset, pi->type, pi->size); return; } raw->size = len; remove_nonexistent_osds(osdmap, pi, raw); } /* apply pg_upmap[_items] mappings */ static void apply_upmap(struct ceph_osdmap *osdmap, const struct ceph_pg *pgid, struct ceph_osds *raw) { struct ceph_pg_mapping *pg; int i, j; pg = lookup_pg_mapping(&osdmap->pg_upmap, pgid); if (pg) { /* make sure targets aren't marked out */ for (i = 0; i < pg->pg_upmap.len; i++) { int osd = pg->pg_upmap.osds[i]; if (osd != CRUSH_ITEM_NONE && osd < osdmap->max_osd && osdmap->osd_weight[osd] == 0) { /* reject/ignore explicit mapping */ return; } } for (i = 0; i < pg->pg_upmap.len; i++) raw->osds[i] = pg->pg_upmap.osds[i]; raw->size = pg->pg_upmap.len; /* check and apply pg_upmap_items, if any */ } pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid); if (pg) { /* * Note: this approach does not allow a bidirectional swap, * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1]. */ for (i = 0; i < pg->pg_upmap_items.len; i++) { int from = pg->pg_upmap_items.from_to[i][0]; int to = pg->pg_upmap_items.from_to[i][1]; int pos = -1; bool exists = false; /* make sure replacement doesn't already appear */ for (j = 0; j < raw->size; j++) { int osd = raw->osds[j]; if (osd == to) { exists = true; break; } /* ignore mapping if target is marked out */ if (osd == from && pos < 0 && !(to != CRUSH_ITEM_NONE && to < osdmap->max_osd && osdmap->osd_weight[to] == 0)) { pos = j; } } if (!exists && pos >= 0) raw->osds[pos] = to; } } } /* * Given raw set, calculate up set and up primary. By definition of an * up set, the result won't contain nonexistent or down OSDs. * * This is done in-place - on return @set is the up set. If it's * empty, ->primary will remain undefined. */ static void raw_to_up_osds(struct ceph_osdmap *osdmap, struct ceph_pg_pool_info *pi, struct ceph_osds *set) { int i; /* ->primary is undefined for a raw set */ BUG_ON(set->primary != -1); if (ceph_can_shift_osds(pi)) { int removed = 0; /* shift left */ for (i = 0; i < set->size; i++) { if (ceph_osd_is_down(osdmap, set->osds[i])) { removed++; continue; } if (removed) set->osds[i - removed] = set->osds[i]; } set->size -= removed; if (set->size > 0) set->primary = set->osds[0]; } else { /* set down/dne devices to NONE */ for (i = set->size - 1; i >= 0; i--) { if (ceph_osd_is_down(osdmap, set->osds[i])) set->osds[i] = CRUSH_ITEM_NONE; else set->primary = set->osds[i]; } } } static void apply_primary_affinity(struct ceph_osdmap *osdmap, struct ceph_pg_pool_info *pi, u32 pps, struct ceph_osds *up) { int i; int pos = -1; /* * Do we have any non-default primary_affinity values for these * osds? */ if (!osdmap->osd_primary_affinity) return; for (i = 0; i < up->size; i++) { int osd = up->osds[i]; if (osd != CRUSH_ITEM_NONE && osdmap->osd_primary_affinity[osd] != CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) { break; } } if (i == up->size) return; /* * Pick the primary. Feed both the seed (for the pg) and the * osd into the hash/rng so that a proportional fraction of an * osd's pgs get rejected as primary. */ for (i = 0; i < up->size; i++) { int osd = up->osds[i]; u32 aff; if (osd == CRUSH_ITEM_NONE) continue; aff = osdmap->osd_primary_affinity[osd]; if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY && (crush_hash32_2(CRUSH_HASH_RJENKINS1, pps, osd) >> 16) >= aff) { /* * We chose not to use this primary. Note it * anyway as a fallback in case we don't pick * anyone else, but keep looking. */ if (pos < 0) pos = i; } else { pos = i; break; } } if (pos < 0) return; up->primary = up->osds[pos]; if (ceph_can_shift_osds(pi) && pos > 0) { /* move the new primary to the front */ for (i = pos; i > 0; i--) up->osds[i] = up->osds[i - 1]; up->osds[0] = up->primary; } } /* * Get pg_temp and primary_temp mappings for given PG. * * Note that a PG may have none, only pg_temp, only primary_temp or * both pg_temp and primary_temp mappings. This means @temp isn't * always a valid OSD set on return: in the "only primary_temp" case, * @temp will have its ->primary >= 0 but ->size == 0. */ static void get_temp_osds(struct ceph_osdmap *osdmap, struct ceph_pg_pool_info *pi, const struct ceph_pg *pgid, struct ceph_osds *temp) { struct ceph_pg_mapping *pg; int i; ceph_osds_init(temp); /* pg_temp? */ pg = lookup_pg_mapping(&osdmap->pg_temp, pgid); if (pg) { for (i = 0; i < pg->pg_temp.len; i++) { if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { if (ceph_can_shift_osds(pi)) continue; temp->osds[temp->size++] = CRUSH_ITEM_NONE; } else { temp->osds[temp->size++] = pg->pg_temp.osds[i]; } } /* apply pg_temp's primary */ for (i = 0; i < temp->size; i++) { if (temp->osds[i] != CRUSH_ITEM_NONE) { temp->primary = temp->osds[i]; break; } } } /* primary_temp? */ pg = lookup_pg_mapping(&osdmap->primary_temp, pgid); if (pg) temp->primary = pg->primary_temp.osd; } /* * Map a PG to its acting set as well as its up set. * * Acting set is used for data mapping purposes, while up set can be * recorded for detecting interval changes and deciding whether to * resend a request. */ void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, struct ceph_pg_pool_info *pi, const struct ceph_pg *raw_pgid, struct ceph_osds *up, struct ceph_osds *acting) { struct ceph_pg pgid; u32 pps; WARN_ON(pi->id != raw_pgid->pool); raw_pg_to_pg(pi, raw_pgid, &pgid); pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps); apply_upmap(osdmap, &pgid, up); raw_to_up_osds(osdmap, pi, up); apply_primary_affinity(osdmap, pi, pps, up); get_temp_osds(osdmap, pi, &pgid, acting); if (!acting->size) { memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0])); acting->size = up->size; if (acting->primary == -1) acting->primary = up->primary; } WARN_ON(!osds_valid(up) || !osds_valid(acting)); } bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap, struct ceph_pg_pool_info *pi, const struct ceph_pg *raw_pgid, struct ceph_spg *spgid) { struct ceph_pg pgid; struct ceph_osds up, acting; int i; WARN_ON(pi->id != raw_pgid->pool); raw_pg_to_pg(pi, raw_pgid, &pgid); if (ceph_can_shift_osds(pi)) { spgid->pgid = pgid; /* struct */ spgid->shard = CEPH_SPG_NOSHARD; return true; } ceph_pg_to_up_acting_osds(osdmap, pi, &pgid, &up, &acting); for (i = 0; i < acting.size; i++) { if (acting.osds[i] == acting.primary) { spgid->pgid = pgid; /* struct */ spgid->shard = i; return true; } } return false; } /* * Return acting primary for given PG, or -1 if none. */ int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, const struct ceph_pg *raw_pgid) { struct ceph_pg_pool_info *pi; struct ceph_osds up, acting; pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool); if (!pi) return -1; ceph_pg_to_up_acting_osds(osdmap, pi, raw_pgid, &up, &acting); return acting.primary; } EXPORT_SYMBOL(ceph_pg_to_acting_primary); static struct crush_loc_node *alloc_crush_loc(size_t type_name_len, size_t name_len) { struct crush_loc_node *loc; loc = kmalloc(sizeof(*loc) + type_name_len + name_len + 2, GFP_NOIO); if (!loc) return NULL; RB_CLEAR_NODE(&loc->cl_node); return loc; } static void free_crush_loc(struct crush_loc_node *loc) { WARN_ON(!RB_EMPTY_NODE(&loc->cl_node)); kfree(loc); } static int crush_loc_compare(const struct crush_loc *loc1, const struct crush_loc *loc2) { return strcmp(loc1->cl_type_name, loc2->cl_type_name) ?: strcmp(loc1->cl_name, loc2->cl_name); } DEFINE_RB_FUNCS2(crush_loc, struct crush_loc_node, cl_loc, crush_loc_compare, RB_BYPTR, const struct crush_loc *, cl_node) /* * Parses a set of <bucket type name>':'<bucket name> pairs separated * by '|', e.g. "rack:foo1|rack:foo2|datacenter:bar". * * Note that @crush_location is modified by strsep(). */ int ceph_parse_crush_location(char *crush_location, struct rb_root *locs) { struct crush_loc_node *loc; const char *type_name, *name, *colon; size_t type_name_len, name_len; dout("%s '%s'\n", __func__, crush_location); while ((type_name = strsep(&crush_location, "|"))) { colon = strchr(type_name, ':'); if (!colon) return -EINVAL; type_name_len = colon - type_name; if (type_name_len == 0) return -EINVAL; name = colon + 1; name_len = strlen(name); if (name_len == 0) return -EINVAL; loc = alloc_crush_loc(type_name_len, name_len); if (!loc) return -ENOMEM; loc->cl_loc.cl_type_name = loc->cl_data; memcpy(loc->cl_loc.cl_type_name, type_name, type_name_len); loc->cl_loc.cl_type_name[type_name_len] = '\0'; loc->cl_loc.cl_name = loc->cl_data + type_name_len + 1; memcpy(loc->cl_loc.cl_name, name, name_len); loc->cl_loc.cl_name[name_len] = '\0'; if (!__insert_crush_loc(locs, loc)) { free_crush_loc(loc); return -EEXIST; } dout("%s type_name '%s' name '%s'\n", __func__, loc->cl_loc.cl_type_name, loc->cl_loc.cl_name); } return 0; } int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2) { struct rb_node *n1 = rb_first(locs1); struct rb_node *n2 = rb_first(locs2); int ret; for ( ; n1 && n2; n1 = rb_next(n1), n2 = rb_next(n2)) { struct crush_loc_node *loc1 = rb_entry(n1, struct crush_loc_node, cl_node); struct crush_loc_node *loc2 = rb_entry(n2, struct crush_loc_node, cl_node); ret = crush_loc_compare(&loc1->cl_loc, &loc2->cl_loc); if (ret) return ret; } if (!n1 && n2) return -1; if (n1 && !n2) return 1; return 0; } void ceph_clear_crush_locs(struct rb_root *locs) { while (!RB_EMPTY_ROOT(locs)) { struct crush_loc_node *loc = rb_entry(rb_first(locs), struct crush_loc_node, cl_node); erase_crush_loc(locs, loc); free_crush_loc(loc); } } /* * [a-zA-Z0-9-_.]+ */ static bool is_valid_crush_name(const char *name) { do { if (!('a' <= *name && *name <= 'z') && !('A' <= *name && *name <= 'Z') && !('0' <= *name && *name <= '9') && *name != '-' && *name != '_' && *name != '.') return false; } while (*++name != '\0'); return true; } /* * Gets the parent of an item. Returns its id (<0 because the * parent is always a bucket), type id (>0 for the same reason, * via @parent_type_id) and location (via @parent_loc). If no * parent, returns 0. * * Does a linear search, as there are no parent pointers of any * kind. Note that the result is ambiguous for items that occur * multiple times in the map. */ static int get_immediate_parent(struct crush_map *c, int id, u16 *parent_type_id, struct crush_loc *parent_loc) { struct crush_bucket *b; struct crush_name_node *type_cn, *cn; int i, j; for (i = 0; i < c->max_buckets; i++) { b = c->buckets[i]; if (!b) continue; /* ignore per-class shadow hierarchy */ cn = lookup_crush_name(&c->names, b->id); if (!cn || !is_valid_crush_name(cn->cn_name)) continue; for (j = 0; j < b->size; j++) { if (b->items[j] != id) continue; *parent_type_id = b->type; type_cn = lookup_crush_name(&c->type_names, b->type); parent_loc->cl_type_name = type_cn->cn_name; parent_loc->cl_name = cn->cn_name; return b->id; } } return 0; /* no parent */ } /* * Calculates the locality/distance from an item to a client * location expressed in terms of CRUSH hierarchy as a set of * (bucket type name, bucket name) pairs. Specifically, looks * for the lowest-valued bucket type for which the location of * @id matches one of the locations in @locs, so for standard * bucket types (host = 1, rack = 3, datacenter = 8, zone = 9) * a matching host is closer than a matching rack and a matching * data center is closer than a matching zone. * * Specifying multiple locations (a "multipath" location) such * as "rack=foo1 rack=foo2 datacenter=bar" is allowed -- @locs * is a multimap. The locality will be: * * - 3 for OSDs in racks foo1 and foo2 * - 8 for OSDs in data center bar * - -1 for all other OSDs * * The lowest possible bucket type is 1, so the best locality * for an OSD is 1 (i.e. a matching host). Locality 0 would be * the OSD itself. */ int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id, struct rb_root *locs) { struct crush_loc loc; u16 type_id; /* * Instead of repeated get_immediate_parent() calls, * the location of @id could be obtained with a single * depth-first traversal. */ for (;;) { id = get_immediate_parent(osdmap->crush, id, &type_id, &loc); if (id >= 0) return -1; /* not local */ if (lookup_crush_loc(locs, &loc)) return type_id; } } |
45 45 13 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 | // SPDX-License-Identifier: GPL-2.0 #include <linux/cache.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/pid_namespace.h> #include "internal.h" /* * /proc/thread_self: */ static const char *proc_thread_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { struct pid_namespace *ns = proc_pid_ns(inode->i_sb); pid_t tgid = task_tgid_nr_ns(current, ns); pid_t pid = task_pid_nr_ns(current, ns); char *name; if (!pid) return ERR_PTR(-ENOENT); name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); if (unlikely(!name)) return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); sprintf(name, "%u/task/%u", tgid, pid); set_delayed_call(done, kfree_link, name); return name; } static const struct inode_operations proc_thread_self_inode_operations = { .get_link = proc_thread_self_get_link, }; static unsigned thread_self_inum __ro_after_init; int proc_setup_thread_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); struct proc_fs_info *fs_info = proc_sb_info(s); struct dentry *thread_self; int ret = -ENOMEM; inode_lock(root_inode); thread_self = d_alloc_name(s->s_root, "thread-self"); if (thread_self) { struct inode *inode = new_inode(s); if (inode) { inode->i_ino = thread_self_inum; simple_inode_init_ts(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_thread_self_inode_operations; d_add(thread_self, inode); ret = 0; } else { dput(thread_self); } } inode_unlock(root_inode); if (ret) pr_err("proc_fill_super: can't allocate /proc/thread-self\n"); else fs_info->proc_thread_self = thread_self; return ret; } void __init proc_thread_self_init(void) { proc_alloc_inum(&thread_self_inum); } |
200 199 195 8 88 186 186 186 185 186 44 92 36 124 18 15 4 5 140 140 18 119 20 149 100 130 41 42 36 103 41 9 9 49 107 106 31 104 274 31 31 31 108 246 108 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 | // SPDX-License-Identifier: GPL-2.0 /* * Tty buffer allocation management */ #include <linux/types.h> #include <linux/errno.h> #include <linux/minmax.h> #include <linux/tty.h> #include <linux/tty_buffer.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/ratelimit.h> #include "tty.h" #define MIN_TTYB_SIZE 256 #define TTYB_ALIGN_MASK 0xff /* * Byte threshold to limit memory consumption for flip buffers. * The actual memory limit is > 2x this amount. */ #define TTYB_DEFAULT_MEM_LIMIT (640 * 1024UL) /* * We default to dicing tty buffer allocations to this many characters * in order to avoid multiple page allocations. We know the size of * tty_buffer itself but it must also be taken into account that the * buffer is 256 byte aligned. See tty_buffer_find for the allocation * logic this must match. */ #define TTY_BUFFER_PAGE (((PAGE_SIZE - sizeof(struct tty_buffer)) / 2) & ~TTYB_ALIGN_MASK) /** * tty_buffer_lock_exclusive - gain exclusive access to buffer * @port: tty port owning the flip buffer * * Guarantees safe use of the &tty_ldisc_ops.receive_buf() method by excluding * the buffer work and any pending flush from using the flip buffer. Data can * continue to be added concurrently to the flip buffer from the driver side. * * See also tty_buffer_unlock_exclusive(). */ void tty_buffer_lock_exclusive(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; atomic_inc(&buf->priority); mutex_lock(&buf->lock); } EXPORT_SYMBOL_GPL(tty_buffer_lock_exclusive); /** * tty_buffer_unlock_exclusive - release exclusive access * @port: tty port owning the flip buffer * * The buffer work is restarted if there is data in the flip buffer. * * See also tty_buffer_lock_exclusive(). */ void tty_buffer_unlock_exclusive(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; bool restart = buf->head->commit != buf->head->read; atomic_dec(&buf->priority); mutex_unlock(&buf->lock); if (restart) queue_work(system_unbound_wq, &buf->work); } EXPORT_SYMBOL_GPL(tty_buffer_unlock_exclusive); /** * tty_buffer_space_avail - return unused buffer space * @port: tty port owning the flip buffer * * Returns: the # of bytes which can be written by the driver without reaching * the buffer limit. * * Note: this does not guarantee that memory is available to write the returned * # of bytes (use tty_prepare_flip_string() to pre-allocate if memory * guarantee is required). */ unsigned int tty_buffer_space_avail(struct tty_port *port) { int space = port->buf.mem_limit - atomic_read(&port->buf.mem_used); return max(space, 0); } EXPORT_SYMBOL_GPL(tty_buffer_space_avail); static void tty_buffer_reset(struct tty_buffer *p, size_t size) { p->used = 0; p->size = size; p->next = NULL; p->commit = 0; p->lookahead = 0; p->read = 0; p->flags = true; } /** * tty_buffer_free_all - free buffers used by a tty * @port: tty port to free from * * Remove all the buffers pending on a tty whether queued with data or in the * free ring. Must be called when the tty is no longer in use. */ void tty_buffer_free_all(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; struct tty_buffer *p, *next; struct llist_node *llist; unsigned int freed = 0; int still_used; while ((p = buf->head) != NULL) { buf->head = p->next; freed += p->size; if (p->size > 0) kfree(p); } llist = llist_del_all(&buf->free); llist_for_each_entry_safe(p, next, llist, free) kfree(p); tty_buffer_reset(&buf->sentinel, 0); buf->head = &buf->sentinel; buf->tail = &buf->sentinel; still_used = atomic_xchg(&buf->mem_used, 0); WARN(still_used != freed, "we still have not freed %d bytes!", still_used - freed); } /** * tty_buffer_alloc - allocate a tty buffer * @port: tty port * @size: desired size (characters) * * Allocate a new tty buffer to hold the desired number of characters. We * round our buffers off in 256 character chunks to get better allocation * behaviour. * * Returns: %NULL if out of memory or the allocation would exceed the per * device queue. */ static struct tty_buffer *tty_buffer_alloc(struct tty_port *port, size_t size) { struct llist_node *free; struct tty_buffer *p; /* Round the buffer size out */ size = __ALIGN_MASK(size, TTYB_ALIGN_MASK); if (size <= MIN_TTYB_SIZE) { free = llist_del_first(&port->buf.free); if (free) { p = llist_entry(free, struct tty_buffer, free); goto found; } } /* Should possibly check if this fails for the largest buffer we * have queued and recycle that ? */ if (atomic_read(&port->buf.mem_used) > port->buf.mem_limit) return NULL; p = kmalloc(struct_size(p, data, 2 * size), GFP_ATOMIC | __GFP_NOWARN); if (p == NULL) return NULL; found: tty_buffer_reset(p, size); atomic_add(size, &port->buf.mem_used); return p; } /** * tty_buffer_free - free a tty buffer * @port: tty port owning the buffer * @b: the buffer to free * * Free a tty buffer, or add it to the free list according to our internal * strategy. */ static void tty_buffer_free(struct tty_port *port, struct tty_buffer *b) { struct tty_bufhead *buf = &port->buf; /* Dumb strategy for now - should keep some stats */ WARN_ON(atomic_sub_return(b->size, &buf->mem_used) < 0); if (b->size > MIN_TTYB_SIZE) kfree(b); else if (b->size > 0) llist_add(&b->free, &buf->free); } /** * tty_buffer_flush - flush full tty buffers * @tty: tty to flush * @ld: optional ldisc ptr (must be referenced) * * Flush all the buffers containing receive data. If @ld != %NULL, flush the * ldisc input buffer. * * Locking: takes buffer lock to ensure single-threaded flip buffer 'consumer'. */ void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld) { struct tty_port *port = tty->port; struct tty_bufhead *buf = &port->buf; struct tty_buffer *next; atomic_inc(&buf->priority); mutex_lock(&buf->lock); /* paired w/ release in __tty_buffer_request_room; ensures there are * no pending memory accesses to the freed buffer */ while ((next = smp_load_acquire(&buf->head->next)) != NULL) { tty_buffer_free(port, buf->head); buf->head = next; } buf->head->read = buf->head->commit; buf->head->lookahead = buf->head->read; if (ld && ld->ops->flush_buffer) ld->ops->flush_buffer(tty); atomic_dec(&buf->priority); mutex_unlock(&buf->lock); } /** * __tty_buffer_request_room - grow tty buffer if needed * @port: tty port * @size: size desired * @flags: buffer has to store flags along character data * * Make at least @size bytes of linear space available for the tty buffer. * * Will change over to a new buffer if the current buffer is encoded as * %TTY_NORMAL (so has no flags buffer) and the new buffer requires a flags * buffer. * * Returns: the size we managed to find. */ static int __tty_buffer_request_room(struct tty_port *port, size_t size, bool flags) { struct tty_bufhead *buf = &port->buf; struct tty_buffer *n, *b = buf->tail; size_t left = (b->flags ? 1 : 2) * b->size - b->used; bool change = !b->flags && flags; if (!change && left >= size) return size; /* This is the slow path - looking for new buffers to use */ n = tty_buffer_alloc(port, size); if (n == NULL) return change ? 0 : left; n->flags = flags; buf->tail = n; /* * Paired w/ acquire in flush_to_ldisc() and lookahead_bufs() * ensures they see all buffer data. */ smp_store_release(&b->commit, b->used); /* * Paired w/ acquire in flush_to_ldisc() and lookahead_bufs() * ensures the latest commit value can be read before the head * is advanced to the next buffer. */ smp_store_release(&b->next, n); return size; } int tty_buffer_request_room(struct tty_port *port, size_t size) { return __tty_buffer_request_room(port, size, true); } EXPORT_SYMBOL_GPL(tty_buffer_request_room); size_t __tty_insert_flip_string_flags(struct tty_port *port, const u8 *chars, const u8 *flags, bool mutable_flags, size_t size) { bool need_flags = mutable_flags || flags[0] != TTY_NORMAL; size_t copied = 0; do { size_t goal = min_t(size_t, size - copied, TTY_BUFFER_PAGE); size_t space = __tty_buffer_request_room(port, goal, need_flags); struct tty_buffer *tb = port->buf.tail; if (unlikely(space == 0)) break; memcpy(char_buf_ptr(tb, tb->used), chars, space); if (mutable_flags) { memcpy(flag_buf_ptr(tb, tb->used), flags, space); flags += space; } else if (tb->flags) { memset(flag_buf_ptr(tb, tb->used), flags[0], space); } else { /* tb->flags should be available once requested */ WARN_ON_ONCE(need_flags); } tb->used += space; copied += space; chars += space; /* There is a small chance that we need to split the data over * several buffers. If this is the case we must loop. */ } while (unlikely(size > copied)); return copied; } EXPORT_SYMBOL(__tty_insert_flip_string_flags); /** * tty_prepare_flip_string - make room for characters * @port: tty port * @chars: return pointer for character write area * @size: desired size * * Prepare a block of space in the buffer for data. * * This is used for drivers that need their own block copy routines into the * buffer. There is no guarantee the buffer is a DMA target! * * Returns: the length available and buffer pointer (@chars) to the space which * is now allocated and accounted for as ready for normal characters. */ size_t tty_prepare_flip_string(struct tty_port *port, u8 **chars, size_t size) { size_t space = __tty_buffer_request_room(port, size, false); if (likely(space)) { struct tty_buffer *tb = port->buf.tail; *chars = char_buf_ptr(tb, tb->used); if (tb->flags) memset(flag_buf_ptr(tb, tb->used), TTY_NORMAL, space); tb->used += space; } return space; } EXPORT_SYMBOL_GPL(tty_prepare_flip_string); /** * tty_ldisc_receive_buf - forward data to line discipline * @ld: line discipline to process input * @p: char buffer * @f: %TTY_NORMAL, %TTY_BREAK, etc. flags buffer * @count: number of bytes to process * * Callers other than flush_to_ldisc() need to exclude the kworker from * concurrent use of the line discipline, see paste_selection(). * * Returns: the number of bytes processed. */ size_t tty_ldisc_receive_buf(struct tty_ldisc *ld, const u8 *p, const u8 *f, size_t count) { if (ld->ops->receive_buf2) count = ld->ops->receive_buf2(ld->tty, p, f, count); else { count = min_t(size_t, count, ld->tty->receive_room); if (count && ld->ops->receive_buf) ld->ops->receive_buf(ld->tty, p, f, count); } return count; } EXPORT_SYMBOL_GPL(tty_ldisc_receive_buf); static void lookahead_bufs(struct tty_port *port, struct tty_buffer *head) { head->lookahead = max(head->lookahead, head->read); while (head) { struct tty_buffer *next; unsigned int count; /* * Paired w/ release in __tty_buffer_request_room(); * ensures commit value read is not stale if the head * is advancing to the next buffer. */ next = smp_load_acquire(&head->next); /* * Paired w/ release in __tty_buffer_request_room() or in * tty_buffer_flush(); ensures we see the committed buffer data. */ count = smp_load_acquire(&head->commit) - head->lookahead; if (!count) { head = next; continue; } if (port->client_ops->lookahead_buf) { u8 *p, *f = NULL; p = char_buf_ptr(head, head->lookahead); if (head->flags) f = flag_buf_ptr(head, head->lookahead); port->client_ops->lookahead_buf(port, p, f, count); } head->lookahead += count; } } static size_t receive_buf(struct tty_port *port, struct tty_buffer *head, size_t count) { u8 *p = char_buf_ptr(head, head->read); const u8 *f = NULL; size_t n; if (head->flags) f = flag_buf_ptr(head, head->read); n = port->client_ops->receive_buf(port, p, f, count); if (n > 0) memset(p, 0, n); return n; } /** * flush_to_ldisc - flush data from buffer to ldisc * @work: tty structure passed from work queue. * * This routine is called out of the software interrupt to flush data from the * buffer chain to the line discipline. * * The receive_buf() method is single threaded for each tty instance. * * Locking: takes buffer lock to ensure single-threaded flip buffer 'consumer'. */ static void flush_to_ldisc(struct work_struct *work) { struct tty_port *port = container_of(work, struct tty_port, buf.work); struct tty_bufhead *buf = &port->buf; mutex_lock(&buf->lock); while (1) { struct tty_buffer *head = buf->head; struct tty_buffer *next; size_t count, rcvd; /* Ldisc or user is trying to gain exclusive access */ if (atomic_read(&buf->priority)) break; /* paired w/ release in __tty_buffer_request_room(); * ensures commit value read is not stale if the head * is advancing to the next buffer */ next = smp_load_acquire(&head->next); /* paired w/ release in __tty_buffer_request_room() or in * tty_buffer_flush(); ensures we see the committed buffer data */ count = smp_load_acquire(&head->commit) - head->read; if (!count) { if (next == NULL) break; buf->head = next; tty_buffer_free(port, head); continue; } rcvd = receive_buf(port, head, count); head->read += rcvd; if (rcvd < count) lookahead_bufs(port, head); if (!rcvd) break; if (need_resched()) cond_resched(); } mutex_unlock(&buf->lock); } static inline void tty_flip_buffer_commit(struct tty_buffer *tail) { /* * Paired w/ acquire in flush_to_ldisc(); ensures flush_to_ldisc() sees * buffer data. */ smp_store_release(&tail->commit, tail->used); } /** * tty_flip_buffer_push - push terminal buffers * @port: tty port to push * * Queue a push of the terminal flip buffers to the line discipline. Can be * called from IRQ/atomic context. * * In the event of the queue being busy for flipping the work will be held off * and retried later. */ void tty_flip_buffer_push(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; tty_flip_buffer_commit(buf->tail); queue_work(system_unbound_wq, &buf->work); } EXPORT_SYMBOL(tty_flip_buffer_push); /** * tty_insert_flip_string_and_push_buffer - add characters to the tty buffer and * push * @port: tty port * @chars: characters * @size: size * * The function combines tty_insert_flip_string() and tty_flip_buffer_push() * with the exception of properly holding the @port->lock. * * To be used only internally (by pty currently). * * Returns: the number added. */ int tty_insert_flip_string_and_push_buffer(struct tty_port *port, const u8 *chars, size_t size) { struct tty_bufhead *buf = &port->buf; unsigned long flags; spin_lock_irqsave(&port->lock, flags); size = tty_insert_flip_string(port, chars, size); if (size) tty_flip_buffer_commit(buf->tail); spin_unlock_irqrestore(&port->lock, flags); queue_work(system_unbound_wq, &buf->work); return size; } /** * tty_buffer_init - prepare a tty buffer structure * @port: tty port to initialise * * Set up the initial state of the buffer management for a tty device. Must be * called before the other tty buffer functions are used. */ void tty_buffer_init(struct tty_port *port) { struct tty_bufhead *buf = &port->buf; mutex_init(&buf->lock); tty_buffer_reset(&buf->sentinel, 0); buf->head = &buf->sentinel; buf->tail = &buf->sentinel; init_llist_head(&buf->free); atomic_set(&buf->mem_used, 0); atomic_set(&buf->priority, 0); INIT_WORK(&buf->work, flush_to_ldisc); buf->mem_limit = TTYB_DEFAULT_MEM_LIMIT; } /** * tty_buffer_set_limit - change the tty buffer memory limit * @port: tty port to change * @limit: memory limit to set * * Change the tty buffer memory limit. * * Must be called before the other tty buffer functions are used. */ int tty_buffer_set_limit(struct tty_port *port, int limit) { if (limit < MIN_TTYB_SIZE) return -EINVAL; port->buf.mem_limit = limit; return 0; } EXPORT_SYMBOL_GPL(tty_buffer_set_limit); /* slave ptys can claim nested buffer lock when handling BRK and INTR */ void tty_buffer_set_lock_subclass(struct tty_port *port) { lockdep_set_subclass(&port->buf.lock, TTY_LOCK_SLAVE); } bool tty_buffer_restart_work(struct tty_port *port) { return queue_work(system_unbound_wq, &port->buf.work); } bool tty_buffer_cancel_work(struct tty_port *port) { return cancel_work_sync(&port->buf.work); } void tty_buffer_flush_work(struct tty_port *port) { flush_work(&port->buf.work); } |
116 2 116 68 68 108 115 116 105 13 9 116 163 5 3 2 122 19 21 163 2 144 19 19 163 163 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 | // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/skbuff.h> #include <linux/sctp.h> #include <net/gso.h> #include <net/gro.h> /** * skb_eth_gso_segment - segmentation handler for ethernet protocols. * @skb: buffer to segment * @features: features for the output path (see dev->features) * @type: Ethernet Protocol ID */ struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, netdev_features_t features, __be16 type) { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_offload *ptype; rcu_read_lock(); list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { segs = ptype->callbacks.gso_segment(skb, features); break; } } rcu_read_unlock(); return segs; } EXPORT_SYMBOL(skb_eth_gso_segment); /** * skb_mac_gso_segment - mac layer segmentation handler. * @skb: buffer to segment * @features: features for the output path (see dev->features) */ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); struct packet_offload *ptype; int vlan_depth = skb->mac_len; __be16 type = skb_network_protocol(skb, &vlan_depth); if (unlikely(!type)) return ERR_PTR(-EINVAL); __skb_pull(skb, vlan_depth); rcu_read_lock(); list_for_each_entry_rcu(ptype, &net_hotdata.offload_base, list) { if (ptype->type == type && ptype->callbacks.gso_segment) { segs = ptype->callbacks.gso_segment(skb, features); break; } } rcu_read_unlock(); __skb_push(skb, skb->data - skb_mac_header(skb)); return segs; } EXPORT_SYMBOL(skb_mac_gso_segment); /* openvswitch calls this on rx path, so we need a different check. */ static bool skb_needs_check(const struct sk_buff *skb, bool tx_path) { if (tx_path) return skb->ip_summed != CHECKSUM_PARTIAL && skb->ip_summed != CHECKSUM_UNNECESSARY; return skb->ip_summed == CHECKSUM_NONE; } /** * __skb_gso_segment - Perform segmentation on skb. * @skb: buffer to segment * @features: features for the output path (see dev->features) * @tx_path: whether it is called in TX path * * This function segments the given skb and returns a list of segments. * * It may return NULL if the skb requires no segmentation. This is * only possible when GSO is used for verifying header integrity. * * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb. */ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path) { struct sk_buff *segs; if (unlikely(skb_needs_check(skb, tx_path))) { int err; /* We're going to init ->check field in TCP or UDP header */ err = skb_cow_head(skb, 0); if (err < 0) return ERR_PTR(err); } /* Only report GSO partial support if it will enable us to * support segmentation on this frame without needing additional * work. */ if (features & NETIF_F_GSO_PARTIAL) { netdev_features_t partial_features = NETIF_F_GSO_ROBUST; struct net_device *dev = skb->dev; partial_features |= dev->features & dev->gso_partial_features; if (!skb_gso_ok(skb, features | partial_features)) features &= ~NETIF_F_GSO_PARTIAL; } BUILD_BUG_ON(SKB_GSO_CB_OFFSET + sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb)); SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb); SKB_GSO_CB(skb)->encap_level = 0; skb_reset_mac_header(skb); skb_reset_mac_len(skb); segs = skb_mac_gso_segment(skb, features); if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) skb_warn_bad_offload(skb); return segs; } EXPORT_SYMBOL(__skb_gso_segment); /** * skb_gso_transport_seglen - Return length of individual segments of a gso packet * * @skb: GSO skb * * skb_gso_transport_seglen is used to determine the real size of the * individual segments, including Layer4 headers (TCP/UDP). * * The MAC/L2 or network (IP, IPv6) headers are not accounted for. */ static unsigned int skb_gso_transport_seglen(const struct sk_buff *skb) { const struct skb_shared_info *shinfo = skb_shinfo(skb); unsigned int thlen = 0; if (skb->encapsulation) { thlen = skb_inner_transport_header(skb) - skb_transport_header(skb); if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) thlen += inner_tcp_hdrlen(skb); } else if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) { thlen = tcp_hdrlen(skb); } else if (unlikely(skb_is_gso_sctp(skb))) { thlen = sizeof(struct sctphdr); } else if (shinfo->gso_type & SKB_GSO_UDP_L4) { thlen = sizeof(struct udphdr); } /* UFO sets gso_size to the size of the fragmentation * payload, i.e. the size of the L4 (UDP) header is already * accounted for. */ return thlen + shinfo->gso_size; } /** * skb_gso_network_seglen - Return length of individual segments of a gso packet * * @skb: GSO skb * * skb_gso_network_seglen is used to determine the real size of the * individual segments, including Layer3 (IP, IPv6) and L4 headers (TCP/UDP). * * The MAC/L2 header is not accounted for. */ static unsigned int skb_gso_network_seglen(const struct sk_buff *skb) { unsigned int hdr_len = skb_transport_header(skb) - skb_network_header(skb); return hdr_len + skb_gso_transport_seglen(skb); } /** * skb_gso_mac_seglen - Return length of individual segments of a gso packet * * @skb: GSO skb * * skb_gso_mac_seglen is used to determine the real size of the * individual segments, including MAC/L2, Layer3 (IP, IPv6) and L4 * headers (TCP/UDP). */ static unsigned int skb_gso_mac_seglen(const struct sk_buff *skb) { unsigned int hdr_len = skb_transport_header(skb) - skb_mac_header(skb); return hdr_len + skb_gso_transport_seglen(skb); } /** * skb_gso_size_check - check the skb size, considering GSO_BY_FRAGS * * There are a couple of instances where we have a GSO skb, and we * want to determine what size it would be after it is segmented. * * We might want to check: * - L3+L4+payload size (e.g. IP forwarding) * - L2+L3+L4+payload size (e.g. sanity check before passing to driver) * * This is a helper to do that correctly considering GSO_BY_FRAGS. * * @skb: GSO skb * * @seg_len: The segmented length (from skb_gso_*_seglen). In the * GSO_BY_FRAGS case this will be [header sizes + GSO_BY_FRAGS]. * * @max_len: The maximum permissible length. * * Returns true if the segmented length <= max length. */ static inline bool skb_gso_size_check(const struct sk_buff *skb, unsigned int seg_len, unsigned int max_len) { const struct skb_shared_info *shinfo = skb_shinfo(skb); const struct sk_buff *iter; if (shinfo->gso_size != GSO_BY_FRAGS) return seg_len <= max_len; /* Undo this so we can re-use header sizes */ seg_len -= GSO_BY_FRAGS; skb_walk_frags(skb, iter) { if (seg_len + skb_headlen(iter) > max_len) return false; } return true; } /** * skb_gso_validate_network_len - Will a split GSO skb fit into a given MTU? * * @skb: GSO skb * @mtu: MTU to validate against * * skb_gso_validate_network_len validates if a given skb will fit a * wanted MTU once split. It considers L3 headers, L4 headers, and the * payload. */ bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu) { return skb_gso_size_check(skb, skb_gso_network_seglen(skb), mtu); } EXPORT_SYMBOL_GPL(skb_gso_validate_network_len); /** * skb_gso_validate_mac_len - Will a split GSO skb fit in a given length? * * @skb: GSO skb * @len: length to validate against * * skb_gso_validate_mac_len validates if a given skb will fit a wanted * length once split, including L2, L3 and L4 headers and the payload. */ bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len) { return skb_gso_size_check(skb, skb_gso_mac_seglen(skb), len); } EXPORT_SYMBOL_GPL(skb_gso_validate_mac_len); |
7 7 2 2 37 30 5 1 7 33 2 7 7 29 29 2 29 27 28 28 28 5 2 8 2 8 4 2 8 52 1 52 2 1 3 8 1 2 2 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 | // SPDX-License-Identifier: GPL-2.0 #include <linux/cgroup.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/signal.h> #include "cgroup-internal.h" #include <trace/events/cgroup.h> /* * Propagate the cgroup frozen state upwards by the cgroup tree. */ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) { int desc = 1; /* * If the new state is frozen, some freezing ancestor cgroups may change * their state too, depending on if all their descendants are frozen. * * Otherwise, all ancestor cgroups are forced into the non-frozen state. */ while ((cgrp = cgroup_parent(cgrp))) { if (frozen) { cgrp->freezer.nr_frozen_descendants += desc; if (!test_bit(CGRP_FROZEN, &cgrp->flags) && test_bit(CGRP_FREEZE, &cgrp->flags) && cgrp->freezer.nr_frozen_descendants == cgrp->nr_descendants) { set_bit(CGRP_FROZEN, &cgrp->flags); cgroup_file_notify(&cgrp->events_file); TRACE_CGROUP_PATH(notify_frozen, cgrp, 1); desc++; } } else { cgrp->freezer.nr_frozen_descendants -= desc; if (test_bit(CGRP_FROZEN, &cgrp->flags)) { clear_bit(CGRP_FROZEN, &cgrp->flags); cgroup_file_notify(&cgrp->events_file); TRACE_CGROUP_PATH(notify_frozen, cgrp, 0); desc++; } } } } /* * Revisit the cgroup frozen state. * Checks if the cgroup is really frozen and perform all state transitions. */ void cgroup_update_frozen(struct cgroup *cgrp) { bool frozen; lockdep_assert_held(&css_set_lock); /* * If the cgroup has to be frozen (CGRP_FREEZE bit set), * and all tasks are frozen and/or stopped, let's consider * the cgroup frozen. Otherwise it's not frozen. */ frozen = test_bit(CGRP_FREEZE, &cgrp->flags) && cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp); if (frozen) { /* Already there? */ if (test_bit(CGRP_FROZEN, &cgrp->flags)) return; set_bit(CGRP_FROZEN, &cgrp->flags); } else { /* Already there? */ if (!test_bit(CGRP_FROZEN, &cgrp->flags)) return; clear_bit(CGRP_FROZEN, &cgrp->flags); } cgroup_file_notify(&cgrp->events_file); TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen); /* Update the state of ancestor cgroups. */ cgroup_propagate_frozen(cgrp, frozen); } /* * Increment cgroup's nr_frozen_tasks. */ static void cgroup_inc_frozen_cnt(struct cgroup *cgrp) { cgrp->freezer.nr_frozen_tasks++; } /* * Decrement cgroup's nr_frozen_tasks. */ static void cgroup_dec_frozen_cnt(struct cgroup *cgrp) { cgrp->freezer.nr_frozen_tasks--; WARN_ON_ONCE(cgrp->freezer.nr_frozen_tasks < 0); } /* * Enter frozen/stopped state, if not yet there. Update cgroup's counters, * and revisit the state of the cgroup, if necessary. */ void cgroup_enter_frozen(void) { struct cgroup *cgrp; if (current->frozen) return; spin_lock_irq(&css_set_lock); current->frozen = true; cgrp = task_dfl_cgroup(current); cgroup_inc_frozen_cnt(cgrp); cgroup_update_frozen(cgrp); spin_unlock_irq(&css_set_lock); } /* * Conditionally leave frozen/stopped state. Update cgroup's counters, * and revisit the state of the cgroup, if necessary. * * If always_leave is not set, and the cgroup is freezing, * we're racing with the cgroup freezing. In this case, we don't * drop the frozen counter to avoid a transient switch to * the unfrozen state. */ void cgroup_leave_frozen(bool always_leave) { struct cgroup *cgrp; spin_lock_irq(&css_set_lock); cgrp = task_dfl_cgroup(current); if (always_leave || !test_bit(CGRP_FREEZE, &cgrp->flags)) { cgroup_dec_frozen_cnt(cgrp); cgroup_update_frozen(cgrp); WARN_ON_ONCE(!current->frozen); current->frozen = false; } else if (!(current->jobctl & JOBCTL_TRAP_FREEZE)) { spin_lock(¤t->sighand->siglock); current->jobctl |= JOBCTL_TRAP_FREEZE; set_thread_flag(TIF_SIGPENDING); spin_unlock(¤t->sighand->siglock); } spin_unlock_irq(&css_set_lock); } /* * Freeze or unfreeze the task by setting or clearing the JOBCTL_TRAP_FREEZE * jobctl bit. */ static void cgroup_freeze_task(struct task_struct *task, bool freeze) { unsigned long flags; /* If the task is about to die, don't bother with freezing it. */ if (!lock_task_sighand(task, &flags)) return; if (freeze) { task->jobctl |= JOBCTL_TRAP_FREEZE; signal_wake_up(task, false); } else { task->jobctl &= ~JOBCTL_TRAP_FREEZE; wake_up_process(task); } unlock_task_sighand(task, &flags); } /* * Freeze or unfreeze all tasks in the given cgroup. */ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) { struct css_task_iter it; struct task_struct *task; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); if (freeze) set_bit(CGRP_FREEZE, &cgrp->flags); else clear_bit(CGRP_FREEZE, &cgrp->flags); spin_unlock_irq(&css_set_lock); if (freeze) TRACE_CGROUP_PATH(freeze, cgrp); else TRACE_CGROUP_PATH(unfreeze, cgrp); css_task_iter_start(&cgrp->self, 0, &it); while ((task = css_task_iter_next(&it))) { /* * Ignore kernel threads here. Freezing cgroups containing * kthreads isn't supported. */ if (task->flags & PF_KTHREAD) continue; cgroup_freeze_task(task, freeze); } css_task_iter_end(&it); /* * Cgroup state should be revisited here to cover empty leaf cgroups * and cgroups which descendants are already in the desired state. */ spin_lock_irq(&css_set_lock); if (cgrp->nr_descendants == cgrp->freezer.nr_frozen_descendants) cgroup_update_frozen(cgrp); spin_unlock_irq(&css_set_lock); } /* * Adjust the task state (freeze or unfreeze) and revisit the state of * source and destination cgroups. */ void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src, struct cgroup *dst) { lockdep_assert_held(&css_set_lock); /* * Kernel threads are not supposed to be frozen at all. */ if (task->flags & PF_KTHREAD) return; /* * It's not necessary to do changes if both of the src and dst cgroups * are not freezing and task is not frozen. */ if (!test_bit(CGRP_FREEZE, &src->flags) && !test_bit(CGRP_FREEZE, &dst->flags) && !task->frozen) return; /* * Adjust counters of freezing and frozen tasks. * Note, that if the task is frozen, but the destination cgroup is not * frozen, we bump both counters to keep them balanced. */ if (task->frozen) { cgroup_inc_frozen_cnt(dst); cgroup_dec_frozen_cnt(src); } cgroup_update_frozen(dst); cgroup_update_frozen(src); /* * Force the task to the desired state. */ cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags)); } void cgroup_freeze(struct cgroup *cgrp, bool freeze) { struct cgroup_subsys_state *css; struct cgroup *dsct; bool applied = false; lockdep_assert_held(&cgroup_mutex); /* * Nothing changed? Just exit. */ if (cgrp->freezer.freeze == freeze) return; cgrp->freezer.freeze = freeze; /* * Propagate changes downwards the cgroup tree. */ css_for_each_descendant_pre(css, &cgrp->self) { dsct = css->cgroup; if (cgroup_is_dead(dsct)) continue; if (freeze) { dsct->freezer.e_freeze++; /* * Already frozen because of ancestor's settings? */ if (dsct->freezer.e_freeze > 1) continue; } else { dsct->freezer.e_freeze--; /* * Still frozen because of ancestor's settings? */ if (dsct->freezer.e_freeze > 0) continue; WARN_ON_ONCE(dsct->freezer.e_freeze < 0); } /* * Do change actual state: freeze or unfreeze. */ cgroup_do_freeze(dsct, freeze); applied = true; } /* * Even if the actual state hasn't changed, let's notify a user. * The state can be enforced by an ancestor cgroup: the cgroup * can already be in the desired state or it can be locked in the * opposite state, so that the transition will never happen. * In both cases it's better to notify a user, that there is * nothing to wait for. */ if (!applied) { TRACE_CGROUP_PATH(notify_frozen, cgrp, test_bit(CGRP_FROZEN, &cgrp->flags)); cgroup_file_notify(&cgrp->events_file); } } |
14 1 11 2 36 1 1 8 3 23 11 1 1 1 1 2 2 1 9 1 4 5 1 1 2 2 11 11 11 11 11 11 11 13 12 4 4 4 4 4 4 11 6 1 1 3 6 1 1 59 59 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 | // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) /* Copyright (C) 2019 Netronome Systems, Inc. */ #include <linux/if_arp.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/mpls.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <linux/tc_act/tc_mpls.h> #include <net/mpls.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/tc_act/tc_mpls.h> #include <net/tc_wrapper.h> static struct tc_action_ops act_mpls_ops; #define ACT_MPLS_TTL_DEFAULT 255 static __be32 tcf_mpls_get_lse(struct mpls_shim_hdr *lse, struct tcf_mpls_params *p, bool set_bos) { u32 new_lse = 0; if (lse) new_lse = be32_to_cpu(lse->label_stack_entry); if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET) { new_lse &= ~MPLS_LS_LABEL_MASK; new_lse |= p->tcfm_label << MPLS_LS_LABEL_SHIFT; } if (p->tcfm_ttl) { new_lse &= ~MPLS_LS_TTL_MASK; new_lse |= p->tcfm_ttl << MPLS_LS_TTL_SHIFT; } if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET) { new_lse &= ~MPLS_LS_TC_MASK; new_lse |= p->tcfm_tc << MPLS_LS_TC_SHIFT; } if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET) { new_lse &= ~MPLS_LS_S_MASK; new_lse |= p->tcfm_bos << MPLS_LS_S_SHIFT; } else if (set_bos) { new_lse |= 1 << MPLS_LS_S_SHIFT; } return cpu_to_be32(new_lse); } TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_mpls *m = to_mpls(a); struct tcf_mpls_params *p; __be32 new_lse; int ret, mac_len; tcf_lastuse_update(&m->tcf_tm); bstats_update(this_cpu_ptr(m->common.cpu_bstats), skb); /* Ensure 'data' points at mac_header prior calling mpls manipulating * functions. */ if (skb_at_tc_ingress(skb)) { skb_push_rcsum(skb, skb->mac_len); mac_len = skb->mac_len; } else { mac_len = skb_network_offset(skb); } ret = READ_ONCE(m->tcf_action); p = rcu_dereference_bh(m->mpls_p); switch (p->tcfm_action) { case TCA_MPLS_ACT_POP: if (skb_mpls_pop(skb, p->tcfm_proto, mac_len, skb->dev && skb->dev->type == ARPHRD_ETHER)) goto drop; break; case TCA_MPLS_ACT_PUSH: new_lse = tcf_mpls_get_lse(NULL, p, !eth_p_mpls(skb_protocol(skb, true))); if (skb_mpls_push(skb, new_lse, p->tcfm_proto, mac_len, skb->dev && skb->dev->type == ARPHRD_ETHER)) goto drop; break; case TCA_MPLS_ACT_MAC_PUSH: if (skb_vlan_tag_present(skb)) { if (__vlan_insert_inner_tag(skb, skb->vlan_proto, skb_vlan_tag_get(skb), ETH_HLEN) < 0) goto drop; skb->protocol = skb->vlan_proto; __vlan_hwaccel_clear_tag(skb); } new_lse = tcf_mpls_get_lse(NULL, p, mac_len || !eth_p_mpls(skb->protocol)); if (skb_mpls_push(skb, new_lse, p->tcfm_proto, 0, false)) goto drop; break; case TCA_MPLS_ACT_MODIFY: if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) goto drop; new_lse = tcf_mpls_get_lse(mpls_hdr(skb), p, false); if (skb_mpls_update_lse(skb, new_lse)) goto drop; break; case TCA_MPLS_ACT_DEC_TTL: if (skb_mpls_dec_ttl(skb)) goto drop; break; } if (skb_at_tc_ingress(skb)) skb_pull_rcsum(skb, skb->mac_len); return ret; drop: qstats_drop_inc(this_cpu_ptr(m->common.cpu_qstats)); return TC_ACT_SHOT; } static int valid_label(const struct nlattr *attr, struct netlink_ext_ack *extack) { const u32 *label = nla_data(attr); if (nla_len(attr) != sizeof(*label)) { NL_SET_ERR_MSG_MOD(extack, "Invalid MPLS label length"); return -EINVAL; } if (*label & ~MPLS_LABEL_MASK || *label == MPLS_LABEL_IMPLNULL) { NL_SET_ERR_MSG_MOD(extack, "MPLS label out of range"); return -EINVAL; } return 0; } static const struct nla_policy mpls_policy[TCA_MPLS_MAX + 1] = { [TCA_MPLS_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_mpls)), [TCA_MPLS_PROTO] = { .type = NLA_U16 }, [TCA_MPLS_LABEL] = NLA_POLICY_VALIDATE_FN(NLA_BINARY, valid_label), [TCA_MPLS_TC] = NLA_POLICY_RANGE(NLA_U8, 0, 7), [TCA_MPLS_TTL] = NLA_POLICY_MIN(NLA_U8, 1), [TCA_MPLS_BOS] = NLA_POLICY_RANGE(NLA_U8, 0, 1), }; static int tcf_mpls_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_mpls_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_MPLS_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tcf_mpls_params *p; struct tc_mpls *parm; bool exists = false; struct tcf_mpls *m; int ret = 0, err; u8 mpls_ttl = 0; u32 index; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "Missing netlink attributes"); return -EINVAL; } err = nla_parse_nested(tb, TCA_MPLS_MAX, nla, mpls_policy, extack); if (err < 0) return err; if (!tb[TCA_MPLS_PARMS]) { NL_SET_ERR_MSG_MOD(extack, "No MPLS params"); return -EINVAL; } parm = nla_data(tb[TCA_MPLS_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return ACT_P_BOUND; if (!exists) { ret = tcf_idr_create(tn, index, est, a, &act_mpls_ops, bind, true, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } /* Verify parameters against action type. */ switch (parm->m_action) { case TCA_MPLS_ACT_POP: if (!tb[TCA_MPLS_PROTO]) { NL_SET_ERR_MSG_MOD(extack, "Protocol must be set for MPLS pop"); err = -EINVAL; goto release_idr; } if (!eth_proto_is_802_3(nla_get_be16(tb[TCA_MPLS_PROTO]))) { NL_SET_ERR_MSG_MOD(extack, "Invalid protocol type for MPLS pop"); err = -EINVAL; goto release_idr; } if (tb[TCA_MPLS_LABEL] || tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] || tb[TCA_MPLS_BOS]) { NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC or BOS cannot be used with MPLS pop"); err = -EINVAL; goto release_idr; } break; case TCA_MPLS_ACT_DEC_TTL: if (tb[TCA_MPLS_PROTO] || tb[TCA_MPLS_LABEL] || tb[TCA_MPLS_TTL] || tb[TCA_MPLS_TC] || tb[TCA_MPLS_BOS]) { NL_SET_ERR_MSG_MOD(extack, "Label, TTL, TC, BOS or protocol cannot be used with MPLS dec_ttl"); err = -EINVAL; goto release_idr; } break; case TCA_MPLS_ACT_PUSH: case TCA_MPLS_ACT_MAC_PUSH: if (!tb[TCA_MPLS_LABEL]) { NL_SET_ERR_MSG_MOD(extack, "Label is required for MPLS push"); err = -EINVAL; goto release_idr; } if (tb[TCA_MPLS_PROTO] && !eth_p_mpls(nla_get_be16(tb[TCA_MPLS_PROTO]))) { NL_SET_ERR_MSG_MOD(extack, "Protocol must be an MPLS type for MPLS push"); err = -EPROTONOSUPPORT; goto release_idr; } /* Push needs a TTL - if not specified, set a default value. */ if (!tb[TCA_MPLS_TTL]) { #if IS_ENABLED(CONFIG_MPLS) mpls_ttl = net->mpls.default_ttl ? net->mpls.default_ttl : ACT_MPLS_TTL_DEFAULT; #else mpls_ttl = ACT_MPLS_TTL_DEFAULT; #endif } break; case TCA_MPLS_ACT_MODIFY: if (tb[TCA_MPLS_PROTO]) { NL_SET_ERR_MSG_MOD(extack, "Protocol cannot be used with MPLS modify"); err = -EINVAL; goto release_idr; } break; default: NL_SET_ERR_MSG_MOD(extack, "Unknown MPLS action"); err = -EINVAL; goto release_idr; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; m = to_mpls(*a); p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) { err = -ENOMEM; goto put_chain; } p->tcfm_action = parm->m_action; p->tcfm_label = tb[TCA_MPLS_LABEL] ? nla_get_u32(tb[TCA_MPLS_LABEL]) : ACT_MPLS_LABEL_NOT_SET; p->tcfm_tc = tb[TCA_MPLS_TC] ? nla_get_u8(tb[TCA_MPLS_TC]) : ACT_MPLS_TC_NOT_SET; p->tcfm_ttl = tb[TCA_MPLS_TTL] ? nla_get_u8(tb[TCA_MPLS_TTL]) : mpls_ttl; p->tcfm_bos = tb[TCA_MPLS_BOS] ? nla_get_u8(tb[TCA_MPLS_BOS]) : ACT_MPLS_BOS_NOT_SET; p->tcfm_proto = tb[TCA_MPLS_PROTO] ? nla_get_be16(tb[TCA_MPLS_PROTO]) : htons(ETH_P_MPLS_UC); spin_lock_bh(&m->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); p = rcu_replace_pointer(m->mpls_p, p, lockdep_is_held(&m->tcf_lock)); spin_unlock_bh(&m->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (p) kfree_rcu(p, rcu); return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static void tcf_mpls_cleanup(struct tc_action *a) { struct tcf_mpls *m = to_mpls(a); struct tcf_mpls_params *p; p = rcu_dereference_protected(m->mpls_p, 1); if (p) kfree_rcu(p, rcu); } static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_mpls *m = to_mpls(a); struct tcf_mpls_params *p; struct tc_mpls opt = { .index = m->tcf_index, .refcnt = refcount_read(&m->tcf_refcnt) - ref, .bindcnt = atomic_read(&m->tcf_bindcnt) - bind, }; struct tcf_t t; spin_lock_bh(&m->tcf_lock); opt.action = m->tcf_action; p = rcu_dereference_protected(m->mpls_p, lockdep_is_held(&m->tcf_lock)); opt.m_action = p->tcfm_action; if (nla_put(skb, TCA_MPLS_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if (p->tcfm_label != ACT_MPLS_LABEL_NOT_SET && nla_put_u32(skb, TCA_MPLS_LABEL, p->tcfm_label)) goto nla_put_failure; if (p->tcfm_tc != ACT_MPLS_TC_NOT_SET && nla_put_u8(skb, TCA_MPLS_TC, p->tcfm_tc)) goto nla_put_failure; if (p->tcfm_ttl && nla_put_u8(skb, TCA_MPLS_TTL, p->tcfm_ttl)) goto nla_put_failure; if (p->tcfm_bos != ACT_MPLS_BOS_NOT_SET && nla_put_u8(skb, TCA_MPLS_BOS, p->tcfm_bos)) goto nla_put_failure; if (nla_put_be16(skb, TCA_MPLS_PROTO, p->tcfm_proto)) goto nla_put_failure; tcf_tm_dump(&t, &m->tcf_tm); if (nla_put_64bit(skb, TCA_MPLS_TM, sizeof(t), &t, TCA_MPLS_PAD)) goto nla_put_failure; spin_unlock_bh(&m->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&m->tcf_lock); nlmsg_trim(skb, b); return -EMSGSIZE; } static int tcf_mpls_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { if (bind) { struct flow_action_entry *entry = entry_data; switch (tcf_mpls_action(act)) { case TCA_MPLS_ACT_PUSH: entry->id = FLOW_ACTION_MPLS_PUSH; entry->mpls_push.proto = tcf_mpls_proto(act); entry->mpls_push.label = tcf_mpls_label(act); entry->mpls_push.tc = tcf_mpls_tc(act); entry->mpls_push.bos = tcf_mpls_bos(act); entry->mpls_push.ttl = tcf_mpls_ttl(act); break; case TCA_MPLS_ACT_POP: entry->id = FLOW_ACTION_MPLS_POP; entry->mpls_pop.proto = tcf_mpls_proto(act); break; case TCA_MPLS_ACT_MODIFY: entry->id = FLOW_ACTION_MPLS_MANGLE; entry->mpls_mangle.label = tcf_mpls_label(act); entry->mpls_mangle.tc = tcf_mpls_tc(act); entry->mpls_mangle.bos = tcf_mpls_bos(act); entry->mpls_mangle.ttl = tcf_mpls_ttl(act); break; case TCA_MPLS_ACT_DEC_TTL: NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"dec_ttl\" option is used"); return -EOPNOTSUPP; case TCA_MPLS_ACT_MAC_PUSH: NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"mac_push\" option is used"); return -EOPNOTSUPP; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported MPLS mode offload"); return -EOPNOTSUPP; } *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; switch (tcf_mpls_action(act)) { case TCA_MPLS_ACT_PUSH: fl_action->id = FLOW_ACTION_MPLS_PUSH; break; case TCA_MPLS_ACT_POP: fl_action->id = FLOW_ACTION_MPLS_POP; break; case TCA_MPLS_ACT_MODIFY: fl_action->id = FLOW_ACTION_MPLS_MANGLE; break; default: return -EOPNOTSUPP; } } return 0; } static struct tc_action_ops act_mpls_ops = { .kind = "mpls", .id = TCA_ID_MPLS, .owner = THIS_MODULE, .act = tcf_mpls_act, .dump = tcf_mpls_dump, .init = tcf_mpls_init, .cleanup = tcf_mpls_cleanup, .offload_act_setup = tcf_mpls_offload_act_setup, .size = sizeof(struct tcf_mpls), }; MODULE_ALIAS_NET_ACT("mpls"); static __net_init int mpls_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_mpls_ops.net_id); return tc_action_net_init(net, tn, &act_mpls_ops); } static void __net_exit mpls_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_mpls_ops.net_id); } static struct pernet_operations mpls_net_ops = { .init = mpls_init_net, .exit_batch = mpls_exit_net, .id = &act_mpls_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init mpls_init_module(void) { return tcf_register_action(&act_mpls_ops, &mpls_net_ops); } static void __exit mpls_cleanup_module(void) { tcf_unregister_action(&act_mpls_ops, &mpls_net_ops); } module_init(mpls_init_module); module_exit(mpls_cleanup_module); MODULE_SOFTDEP("post: mpls_gso"); MODULE_AUTHOR("Netronome Systems <oss-drivers@netronome.com>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MPLS manipulation actions"); |
1 1 1 1 6 1 3 1 2 2 3 1 2 1 1 1 2 2 2 12 12 1 1 1 6 4 2 5 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 | // SPDX-License-Identifier: GPL-2.0-or-later /* * CALIPSO - Common Architecture Label IPv6 Security Option * * This is an implementation of the CALIPSO protocol as specified in * RFC 5570. * * Authors: Paul Moore <paul.moore@hp.com> * Huw Davies <huw@codeweavers.com> */ /* (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 * (c) Copyright Huw Davies <huw@codeweavers.com>, 2015 */ #include <linux/init.h> #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/jhash.h> #include <linux/audit.h> #include <linux/slab.h> #include <net/ip.h> #include <net/icmp.h> #include <net/tcp.h> #include <net/netlabel.h> #include <net/calipso.h> #include <linux/atomic.h> #include <linux/bug.h> #include <asm/unaligned.h> #include <linux/crc-ccitt.h> /* Maximium size of the calipso option including * the two-byte TLV header. */ #define CALIPSO_OPT_LEN_MAX (2 + 252) /* Size of the minimum calipso option including * the two-byte TLV header. */ #define CALIPSO_HDR_LEN (2 + 8) /* Maximium size of the calipso option including * the two-byte TLV header and upto 3 bytes of * leading pad and 7 bytes of trailing pad. */ #define CALIPSO_OPT_LEN_MAX_WITH_PAD (3 + CALIPSO_OPT_LEN_MAX + 7) /* Maximium size of u32 aligned buffer required to hold calipso * option. Max of 3 initial pad bytes starting from buffer + 3. * i.e. the worst case is when the previous tlv finishes on 4n + 3. */ #define CALIPSO_MAX_BUFFER (6 + CALIPSO_OPT_LEN_MAX) /* List of available DOI definitions */ static DEFINE_SPINLOCK(calipso_doi_list_lock); static LIST_HEAD(calipso_doi_list); /* Label mapping cache */ int calipso_cache_enabled = 1; int calipso_cache_bucketsize = 10; #define CALIPSO_CACHE_BUCKETBITS 7 #define CALIPSO_CACHE_BUCKETS BIT(CALIPSO_CACHE_BUCKETBITS) #define CALIPSO_CACHE_REORDERLIMIT 10 struct calipso_map_cache_bkt { spinlock_t lock; u32 size; struct list_head list; }; struct calipso_map_cache_entry { u32 hash; unsigned char *key; size_t key_len; struct netlbl_lsm_cache *lsm_data; u32 activity; struct list_head list; }; static struct calipso_map_cache_bkt *calipso_cache; static void calipso_cache_invalidate(void); static void calipso_doi_putdef(struct calipso_doi *doi_def); /* Label Mapping Cache Functions */ /** * calipso_cache_entry_free - Frees a cache entry * @entry: the entry to free * * Description: * This function frees the memory associated with a cache entry including the * LSM cache data if there are no longer any users, i.e. reference count == 0. * */ static void calipso_cache_entry_free(struct calipso_map_cache_entry *entry) { if (entry->lsm_data) netlbl_secattr_cache_free(entry->lsm_data); kfree(entry->key); kfree(entry); } /** * calipso_map_cache_hash - Hashing function for the CALIPSO cache * @key: the hash key * @key_len: the length of the key in bytes * * Description: * The CALIPSO tag hashing function. Returns a 32-bit hash value. * */ static u32 calipso_map_cache_hash(const unsigned char *key, u32 key_len) { return jhash(key, key_len, 0); } /** * calipso_cache_init - Initialize the CALIPSO cache * * Description: * Initializes the CALIPSO label mapping cache, this function should be called * before any of the other functions defined in this file. Returns zero on * success, negative values on error. * */ static int __init calipso_cache_init(void) { u32 iter; calipso_cache = kcalloc(CALIPSO_CACHE_BUCKETS, sizeof(struct calipso_map_cache_bkt), GFP_KERNEL); if (!calipso_cache) return -ENOMEM; for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) { spin_lock_init(&calipso_cache[iter].lock); calipso_cache[iter].size = 0; INIT_LIST_HEAD(&calipso_cache[iter].list); } return 0; } /** * calipso_cache_invalidate - Invalidates the current CALIPSO cache * * Description: * Invalidates and frees any entries in the CALIPSO cache. Returns zero on * success and negative values on failure. * */ static void calipso_cache_invalidate(void) { struct calipso_map_cache_entry *entry, *tmp_entry; u32 iter; for (iter = 0; iter < CALIPSO_CACHE_BUCKETS; iter++) { spin_lock_bh(&calipso_cache[iter].lock); list_for_each_entry_safe(entry, tmp_entry, &calipso_cache[iter].list, list) { list_del(&entry->list); calipso_cache_entry_free(entry); } calipso_cache[iter].size = 0; spin_unlock_bh(&calipso_cache[iter].lock); } } /** * calipso_cache_check - Check the CALIPSO cache for a label mapping * @key: the buffer to check * @key_len: buffer length in bytes * @secattr: the security attribute struct to use * * Description: * This function checks the cache to see if a label mapping already exists for * the given key. If there is a match then the cache is adjusted and the * @secattr struct is populated with the correct LSM security attributes. The * cache is adjusted in the following manner if the entry is not already the * first in the cache bucket: * * 1. The cache entry's activity counter is incremented * 2. The previous (higher ranking) entry's activity counter is decremented * 3. If the difference between the two activity counters is geater than * CALIPSO_CACHE_REORDERLIMIT the two entries are swapped * * Returns zero on success, -ENOENT for a cache miss, and other negative values * on error. * */ static int calipso_cache_check(const unsigned char *key, u32 key_len, struct netlbl_lsm_secattr *secattr) { u32 bkt; struct calipso_map_cache_entry *entry; struct calipso_map_cache_entry *prev_entry = NULL; u32 hash; if (!calipso_cache_enabled) return -ENOENT; hash = calipso_map_cache_hash(key, key_len); bkt = hash & (CALIPSO_CACHE_BUCKETS - 1); spin_lock_bh(&calipso_cache[bkt].lock); list_for_each_entry(entry, &calipso_cache[bkt].list, list) { if (entry->hash == hash && entry->key_len == key_len && memcmp(entry->key, key, key_len) == 0) { entry->activity += 1; refcount_inc(&entry->lsm_data->refcount); secattr->cache = entry->lsm_data; secattr->flags |= NETLBL_SECATTR_CACHE; secattr->type = NETLBL_NLTYPE_CALIPSO; if (!prev_entry) { spin_unlock_bh(&calipso_cache[bkt].lock); return 0; } if (prev_entry->activity > 0) prev_entry->activity -= 1; if (entry->activity > prev_entry->activity && entry->activity - prev_entry->activity > CALIPSO_CACHE_REORDERLIMIT) { __list_del(entry->list.prev, entry->list.next); __list_add(&entry->list, prev_entry->list.prev, &prev_entry->list); } spin_unlock_bh(&calipso_cache[bkt].lock); return 0; } prev_entry = entry; } spin_unlock_bh(&calipso_cache[bkt].lock); return -ENOENT; } /** * calipso_cache_add - Add an entry to the CALIPSO cache * @calipso_ptr: the CALIPSO option * @secattr: the packet's security attributes * * Description: * Add a new entry into the CALIPSO label mapping cache. Add the new entry to * head of the cache bucket's list, if the cache bucket is out of room remove * the last entry in the list first. It is important to note that there is * currently no checking for duplicate keys. Returns zero on success, * negative values on failure. The key stored starts at calipso_ptr + 2, * i.e. the type and length bytes are not stored, this corresponds to * calipso_ptr[1] bytes of data. * */ static int calipso_cache_add(const unsigned char *calipso_ptr, const struct netlbl_lsm_secattr *secattr) { int ret_val = -EPERM; u32 bkt; struct calipso_map_cache_entry *entry = NULL; struct calipso_map_cache_entry *old_entry = NULL; u32 calipso_ptr_len; if (!calipso_cache_enabled || calipso_cache_bucketsize <= 0) return 0; calipso_ptr_len = calipso_ptr[1]; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return -ENOMEM; entry->key = kmemdup(calipso_ptr + 2, calipso_ptr_len, GFP_ATOMIC); if (!entry->key) { ret_val = -ENOMEM; goto cache_add_failure; } entry->key_len = calipso_ptr_len; entry->hash = calipso_map_cache_hash(calipso_ptr, calipso_ptr_len); refcount_inc(&secattr->cache->refcount); entry->lsm_data = secattr->cache; bkt = entry->hash & (CALIPSO_CACHE_BUCKETS - 1); spin_lock_bh(&calipso_cache[bkt].lock); if (calipso_cache[bkt].size < calipso_cache_bucketsize) { list_add(&entry->list, &calipso_cache[bkt].list); calipso_cache[bkt].size += 1; } else { old_entry = list_entry(calipso_cache[bkt].list.prev, struct calipso_map_cache_entry, list); list_del(&old_entry->list); list_add(&entry->list, &calipso_cache[bkt].list); calipso_cache_entry_free(old_entry); } spin_unlock_bh(&calipso_cache[bkt].lock); return 0; cache_add_failure: if (entry) calipso_cache_entry_free(entry); return ret_val; } /* DOI List Functions */ /** * calipso_doi_search - Searches for a DOI definition * @doi: the DOI to search for * * Description: * Search the DOI definition list for a DOI definition with a DOI value that * matches @doi. The caller is responsible for calling rcu_read_[un]lock(). * Returns a pointer to the DOI definition on success and NULL on failure. */ static struct calipso_doi *calipso_doi_search(u32 doi) { struct calipso_doi *iter; list_for_each_entry_rcu(iter, &calipso_doi_list, list) if (iter->doi == doi && refcount_read(&iter->refcount)) return iter; return NULL; } /** * calipso_doi_add - Add a new DOI to the CALIPSO protocol engine * @doi_def: the DOI structure * @audit_info: NetLabel audit information * * Description: * The caller defines a new DOI for use by the CALIPSO engine and calls this * function to add it to the list of acceptable domains. The caller must * ensure that the mapping table specified in @doi_def->map meets all of the * requirements of the mapping type (see calipso.h for details). Returns * zero on success and non-zero on failure. * */ static int calipso_doi_add(struct calipso_doi *doi_def, struct netlbl_audit *audit_info) { int ret_val = -EINVAL; u32 doi; u32 doi_type; struct audit_buffer *audit_buf; doi = doi_def->doi; doi_type = doi_def->type; if (doi_def->doi == CALIPSO_DOI_UNKNOWN) goto doi_add_return; refcount_set(&doi_def->refcount, 1); spin_lock(&calipso_doi_list_lock); if (calipso_doi_search(doi_def->doi)) { spin_unlock(&calipso_doi_list_lock); ret_val = -EEXIST; goto doi_add_return; } list_add_tail_rcu(&doi_def->list, &calipso_doi_list); spin_unlock(&calipso_doi_list_lock); ret_val = 0; doi_add_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_ADD, audit_info); if (audit_buf) { const char *type_str; switch (doi_type) { case CALIPSO_MAP_PASS: type_str = "pass"; break; default: type_str = "(unknown)"; } audit_log_format(audit_buf, " calipso_doi=%u calipso_type=%s res=%u", doi, type_str, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * calipso_doi_free - Frees a DOI definition * @doi_def: the DOI definition * * Description: * This function frees all of the memory associated with a DOI definition. * */ static void calipso_doi_free(struct calipso_doi *doi_def) { kfree(doi_def); } /** * calipso_doi_free_rcu - Frees a DOI definition via the RCU pointer * @entry: the entry's RCU field * * Description: * This function is designed to be used as a callback to the call_rcu() * function so that the memory allocated to the DOI definition can be released * safely. * */ static void calipso_doi_free_rcu(struct rcu_head *entry) { struct calipso_doi *doi_def; doi_def = container_of(entry, struct calipso_doi, rcu); calipso_doi_free(doi_def); } /** * calipso_doi_remove - Remove an existing DOI from the CALIPSO protocol engine * @doi: the DOI value * @audit_info: NetLabel audit information * * Description: * Removes a DOI definition from the CALIPSO engine. The NetLabel routines will * be called to release their own LSM domain mappings as well as our own * domain list. Returns zero on success and negative values on failure. * */ static int calipso_doi_remove(u32 doi, struct netlbl_audit *audit_info) { int ret_val; struct calipso_doi *doi_def; struct audit_buffer *audit_buf; spin_lock(&calipso_doi_list_lock); doi_def = calipso_doi_search(doi); if (!doi_def) { spin_unlock(&calipso_doi_list_lock); ret_val = -ENOENT; goto doi_remove_return; } list_del_rcu(&doi_def->list); spin_unlock(&calipso_doi_list_lock); calipso_doi_putdef(doi_def); ret_val = 0; doi_remove_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CALIPSO_DEL, audit_info); if (audit_buf) { audit_log_format(audit_buf, " calipso_doi=%u res=%u", doi, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * calipso_doi_getdef - Returns a reference to a valid DOI definition * @doi: the DOI value * * Description: * Searches for a valid DOI definition and if one is found it is returned to * the caller. Otherwise NULL is returned. The caller must ensure that * calipso_doi_putdef() is called when the caller is done. * */ static struct calipso_doi *calipso_doi_getdef(u32 doi) { struct calipso_doi *doi_def; rcu_read_lock(); doi_def = calipso_doi_search(doi); if (!doi_def) goto doi_getdef_return; if (!refcount_inc_not_zero(&doi_def->refcount)) doi_def = NULL; doi_getdef_return: rcu_read_unlock(); return doi_def; } /** * calipso_doi_putdef - Releases a reference for the given DOI definition * @doi_def: the DOI definition * * Description: * Releases a DOI definition reference obtained from calipso_doi_getdef(). * */ static void calipso_doi_putdef(struct calipso_doi *doi_def) { if (!doi_def) return; if (!refcount_dec_and_test(&doi_def->refcount)) return; calipso_cache_invalidate(); call_rcu(&doi_def->rcu, calipso_doi_free_rcu); } /** * calipso_doi_walk - Iterate through the DOI definitions * @skip_cnt: skip past this number of DOI definitions, updated * @callback: callback for each DOI definition * @cb_arg: argument for the callback function * * Description: * Iterate over the DOI definition list, skipping the first @skip_cnt entries. * For each entry call @callback, if @callback returns a negative value stop * 'walking' through the list and return. Updates the value in @skip_cnt upon * return. Returns zero on success, negative values on failure. * */ static int calipso_doi_walk(u32 *skip_cnt, int (*callback)(struct calipso_doi *doi_def, void *arg), void *cb_arg) { int ret_val = -ENOENT; u32 doi_cnt = 0; struct calipso_doi *iter_doi; rcu_read_lock(); list_for_each_entry_rcu(iter_doi, &calipso_doi_list, list) if (refcount_read(&iter_doi->refcount) > 0) { if (doi_cnt++ < *skip_cnt) continue; ret_val = callback(iter_doi, cb_arg); if (ret_val < 0) { doi_cnt--; goto doi_walk_return; } } doi_walk_return: rcu_read_unlock(); *skip_cnt = doi_cnt; return ret_val; } /** * calipso_validate - Validate a CALIPSO option * @skb: the packet * @option: the start of the option * * Description: * This routine is called to validate a CALIPSO option. * If the option is valid then %true is returned, otherwise * %false is returned. * * The caller should have already checked that the length of the * option (including the TLV header) is >= 10 and that the catmap * length is consistent with the option length. * * We leave checks on the level and categories to the socket layer. */ bool calipso_validate(const struct sk_buff *skb, const unsigned char *option) { struct calipso_doi *doi_def; bool ret_val; u16 crc, len = option[1] + 2; static const u8 zero[2]; /* The original CRC runs over the option including the TLV header * with the CRC-16 field (at offset 8) zeroed out. */ crc = crc_ccitt(0xffff, option, 8); crc = crc_ccitt(crc, zero, sizeof(zero)); if (len > 10) crc = crc_ccitt(crc, option + 10, len - 10); crc = ~crc; if (option[8] != (crc & 0xff) || option[9] != ((crc >> 8) & 0xff)) return false; rcu_read_lock(); doi_def = calipso_doi_search(get_unaligned_be32(option + 2)); ret_val = !!doi_def; rcu_read_unlock(); return ret_val; } /** * calipso_map_cat_hton - Perform a category mapping from host to network * @doi_def: the DOI definition * @secattr: the security attributes * @net_cat: the zero'd out category bitmap in network/CALIPSO format * @net_cat_len: the length of the CALIPSO bitmap in bytes * * Description: * Perform a label mapping to translate a local MLS category bitmap to the * correct CALIPSO bitmap using the given DOI definition. Returns the minimum * size in bytes of the network bitmap on success, negative values otherwise. * */ static int calipso_map_cat_hton(const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *net_cat, u32 net_cat_len) { int spot = -1; u32 net_spot_max = 0; u32 net_clen_bits = net_cat_len * 8; for (;;) { spot = netlbl_catmap_walk(secattr->attr.mls.cat, spot + 1); if (spot < 0) break; if (spot >= net_clen_bits) return -ENOSPC; netlbl_bitmap_setbit(net_cat, spot, 1); if (spot > net_spot_max) net_spot_max = spot; } return (net_spot_max / 32 + 1) * 4; } /** * calipso_map_cat_ntoh - Perform a category mapping from network to host * @doi_def: the DOI definition * @net_cat: the category bitmap in network/CALIPSO format * @net_cat_len: the length of the CALIPSO bitmap in bytes * @secattr: the security attributes * * Description: * Perform a label mapping to translate a CALIPSO bitmap to the correct local * MLS category bitmap using the given DOI definition. Returns zero on * success, negative values on failure. * */ static int calipso_map_cat_ntoh(const struct calipso_doi *doi_def, const unsigned char *net_cat, u32 net_cat_len, struct netlbl_lsm_secattr *secattr) { int ret_val; int spot = -1; u32 net_clen_bits = net_cat_len * 8; for (;;) { spot = netlbl_bitmap_walk(net_cat, net_clen_bits, spot + 1, 1); if (spot < 0) return 0; ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat, spot, GFP_ATOMIC); if (ret_val != 0) return ret_val; } return -EINVAL; } /** * calipso_pad_write - Writes pad bytes in TLV format * @buf: the buffer * @offset: offset from start of buffer to write padding * @count: number of pad bytes to write * * Description: * Write @count bytes of TLV padding into @buffer starting at offset @offset. * @count should be less than 8 - see RFC 4942. * */ static int calipso_pad_write(unsigned char *buf, unsigned int offset, unsigned int count) { if (WARN_ON_ONCE(count >= 8)) return -EINVAL; switch (count) { case 0: break; case 1: buf[offset] = IPV6_TLV_PAD1; break; default: buf[offset] = IPV6_TLV_PADN; buf[offset + 1] = count - 2; if (count > 2) memset(buf + offset + 2, 0, count - 2); break; } return 0; } /** * calipso_genopt - Generate a CALIPSO option * @buf: the option buffer * @start: offset from which to write * @buf_len: the size of opt_buf * @doi_def: the CALIPSO DOI to use * @secattr: the security attributes * * Description: * Generate a CALIPSO option using the DOI definition and security attributes * passed to the function. This also generates upto three bytes of leading * padding that ensures that the option is 4n + 2 aligned. It returns the * number of bytes written (including any initial padding). */ static int calipso_genopt(unsigned char *buf, u32 start, u32 buf_len, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; u32 len, pad; u16 crc; static const unsigned char padding[4] = {2, 1, 0, 3}; unsigned char *calipso; /* CALIPSO has 4n + 2 alignment */ pad = padding[start & 3]; if (buf_len <= start + pad + CALIPSO_HDR_LEN) return -ENOSPC; if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) return -EPERM; len = CALIPSO_HDR_LEN; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = calipso_map_cat_hton(doi_def, secattr, buf + start + pad + len, buf_len - start - pad - len); if (ret_val < 0) return ret_val; len += ret_val; } calipso_pad_write(buf, start, pad); calipso = buf + start + pad; calipso[0] = IPV6_TLV_CALIPSO; calipso[1] = len - 2; *(__be32 *)(calipso + 2) = htonl(doi_def->doi); calipso[6] = (len - CALIPSO_HDR_LEN) / 4; calipso[7] = secattr->attr.mls.lvl; crc = ~crc_ccitt(0xffff, calipso, len); calipso[8] = crc & 0xff; calipso[9] = (crc >> 8) & 0xff; return pad + len; } /* Hop-by-hop hdr helper functions */ /** * calipso_opt_update - Replaces socket's hop options with a new set * @sk: the socket * @hop: new hop options * * Description: * Replaces @sk's hop options with @hop. @hop may be NULL to leave * the socket with no hop options. * */ static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop) { struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts; txopts = ipv6_renew_options(sk, old, IPV6_HOPOPTS, hop); txopt_put(old); if (IS_ERR(txopts)) return PTR_ERR(txopts); txopts = ipv6_update_options(sk, txopts); if (txopts) { atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); txopt_put(txopts); } return 0; } /** * calipso_tlv_len - Returns the length of the TLV * @opt: the option header * @offset: offset of the TLV within the header * * Description: * Returns the length of the TLV option at offset @offset within * the option header @opt. Checks that the entire TLV fits inside * the option header, returns a negative value if this is not the case. */ static int calipso_tlv_len(struct ipv6_opt_hdr *opt, unsigned int offset) { unsigned char *tlv = (unsigned char *)opt; unsigned int opt_len = ipv6_optlen(opt), tlv_len; if (offset < sizeof(*opt) || offset >= opt_len) return -EINVAL; if (tlv[offset] == IPV6_TLV_PAD1) return 1; if (offset + 1 >= opt_len) return -EINVAL; tlv_len = tlv[offset + 1] + 2; if (offset + tlv_len > opt_len) return -EINVAL; return tlv_len; } /** * calipso_opt_find - Finds the CALIPSO option in an IPv6 hop options header * @hop: the hop options header * @start: on return holds the offset of any leading padding * @end: on return holds the offset of the first non-pad TLV after CALIPSO * * Description: * Finds the space occupied by a CALIPSO option (including any leading and * trailing padding). * * If a CALIPSO option exists set @start and @end to the * offsets within @hop of the start of padding before the first * CALIPSO option and the end of padding after the first CALIPSO * option. In this case the function returns 0. * * In the absence of a CALIPSO option, @start and @end will be * set to the start and end of any trailing padding in the header. * This is useful when appending a new option, as the caller may want * to overwrite some of this padding. In this case the function will * return -ENOENT. */ static int calipso_opt_find(struct ipv6_opt_hdr *hop, unsigned int *start, unsigned int *end) { int ret_val = -ENOENT, tlv_len; unsigned int opt_len, offset, offset_s = 0, offset_e = 0; unsigned char *opt = (unsigned char *)hop; opt_len = ipv6_optlen(hop); offset = sizeof(*hop); while (offset < opt_len) { tlv_len = calipso_tlv_len(hop, offset); if (tlv_len < 0) return tlv_len; switch (opt[offset]) { case IPV6_TLV_PAD1: case IPV6_TLV_PADN: if (offset_e) offset_e = offset; break; case IPV6_TLV_CALIPSO: ret_val = 0; offset_e = offset; break; default: if (offset_e == 0) offset_s = offset; else goto out; } offset += tlv_len; } out: if (offset_s) *start = offset_s + calipso_tlv_len(hop, offset_s); else *start = sizeof(*hop); if (offset_e) *end = offset_e + calipso_tlv_len(hop, offset_e); else *end = opt_len; return ret_val; } /** * calipso_opt_insert - Inserts a CALIPSO option into an IPv6 hop opt hdr * @hop: the original hop options header * @doi_def: the CALIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Creates a new hop options header based on @hop with a * CALIPSO option added to it. If @hop already contains a CALIPSO * option this is overwritten, otherwise the new option is appended * after any existing options. If @hop is NULL then the new header * will contain just the CALIPSO option and any needed padding. * */ static struct ipv6_opt_hdr * calipso_opt_insert(struct ipv6_opt_hdr *hop, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { unsigned int start, end, buf_len, pad, hop_len; struct ipv6_opt_hdr *new; int ret_val; if (hop) { hop_len = ipv6_optlen(hop); ret_val = calipso_opt_find(hop, &start, &end); if (ret_val && ret_val != -ENOENT) return ERR_PTR(ret_val); } else { hop_len = 0; start = sizeof(*hop); end = 0; } buf_len = hop_len + start - end + CALIPSO_OPT_LEN_MAX_WITH_PAD; new = kzalloc(buf_len, GFP_ATOMIC); if (!new) return ERR_PTR(-ENOMEM); if (start > sizeof(*hop)) memcpy(new, hop, start); ret_val = calipso_genopt((unsigned char *)new, start, buf_len, doi_def, secattr); if (ret_val < 0) { kfree(new); return ERR_PTR(ret_val); } buf_len = start + ret_val; /* At this point buf_len aligns to 4n, so (buf_len & 4) pads to 8n */ pad = ((buf_len & 4) + (end & 7)) & 7; calipso_pad_write((unsigned char *)new, buf_len, pad); buf_len += pad; if (end != hop_len) { memcpy((char *)new + buf_len, (char *)hop + end, hop_len - end); buf_len += hop_len - end; } new->nexthdr = 0; new->hdrlen = buf_len / 8 - 1; return new; } /** * calipso_opt_del - Removes the CALIPSO option from an option header * @hop: the original header * @new: the new header * * Description: * Creates a new header based on @hop without any CALIPSO option. If @hop * doesn't contain a CALIPSO option it returns -ENOENT. If @hop contains * no other non-padding options, it returns zero with @new set to NULL. * Otherwise it returns zero, creates a new header without the CALIPSO * option (and removing as much padding as possible) and returns with * @new set to that header. * */ static int calipso_opt_del(struct ipv6_opt_hdr *hop, struct ipv6_opt_hdr **new) { int ret_val; unsigned int start, end, delta, pad, hop_len; ret_val = calipso_opt_find(hop, &start, &end); if (ret_val) return ret_val; hop_len = ipv6_optlen(hop); if (start == sizeof(*hop) && end == hop_len) { /* There's no other option in the header so return NULL */ *new = NULL; return 0; } delta = (end - start) & ~7; *new = kzalloc(hop_len - delta, GFP_ATOMIC); if (!*new) return -ENOMEM; memcpy(*new, hop, start); (*new)->hdrlen -= delta / 8; pad = (end - start) & 7; calipso_pad_write((unsigned char *)*new, start, pad); if (end != hop_len) memcpy((char *)*new + start + pad, (char *)hop + end, hop_len - end); return 0; } /** * calipso_opt_getattr - Get the security attributes from a memory block * @calipso: the CALIPSO option * @secattr: the security attributes * * Description: * Inspect @calipso and return the security attributes in @secattr. * Returns zero on success and negative values on failure. * */ static int calipso_opt_getattr(const unsigned char *calipso, struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; u32 doi, len = calipso[1], cat_len = calipso[6] * 4; struct calipso_doi *doi_def; if (cat_len + 8 > len) return -EINVAL; if (calipso_cache_check(calipso + 2, calipso[1], secattr) == 0) return 0; doi = get_unaligned_be32(calipso + 2); rcu_read_lock(); doi_def = calipso_doi_search(doi); if (!doi_def) goto getattr_return; secattr->attr.mls.lvl = calipso[7]; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (cat_len) { ret_val = calipso_map_cat_ntoh(doi_def, calipso + 10, cat_len, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); goto getattr_return; } if (secattr->attr.mls.cat) secattr->flags |= NETLBL_SECATTR_MLS_CAT; } secattr->type = NETLBL_NLTYPE_CALIPSO; getattr_return: rcu_read_unlock(); return ret_val; } /* sock functions. */ /** * calipso_sock_getattr - Get the security attributes from a sock * @sk: the sock * @secattr: the security attributes * * Description: * Query @sk to see if there is a CALIPSO option attached to the sock and if * there is return the CALIPSO security attributes in @secattr. This function * requires that @sk be locked, or privately held, but it does not do any * locking itself. Returns zero on success and negative values on failure. * */ static int calipso_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) { struct ipv6_opt_hdr *hop; int opt_len, len, ret_val = -ENOMSG, offset; unsigned char *opt; struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); if (!txopts || !txopts->hopopt) goto done; hop = txopts->hopopt; opt = (unsigned char *)hop; opt_len = ipv6_optlen(hop); offset = sizeof(*hop); while (offset < opt_len) { len = calipso_tlv_len(hop, offset); if (len < 0) { ret_val = len; goto done; } switch (opt[offset]) { case IPV6_TLV_CALIPSO: if (len < CALIPSO_HDR_LEN) ret_val = -EINVAL; else ret_val = calipso_opt_getattr(&opt[offset], secattr); goto done; default: offset += len; break; } } done: txopt_put(txopts); return ret_val; } /** * calipso_sock_setattr - Add a CALIPSO option to a socket * @sk: the socket * @doi_def: the CALIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CALIPSO option on the given socket using the DOI definition and * security attributes passed to the function. This function requires * exclusive access to @sk, which means it either needs to be in the * process of being created or locked. Returns zero on success and negative * values on failure. * */ static int calipso_sock_setattr(struct sock *sk, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; struct ipv6_opt_hdr *old, *new; struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); old = NULL; if (txopts) old = txopts->hopopt; new = calipso_opt_insert(old, doi_def, secattr); txopt_put(txopts); if (IS_ERR(new)) return PTR_ERR(new); ret_val = calipso_opt_update(sk, new); kfree(new); return ret_val; } /** * calipso_sock_delattr - Delete the CALIPSO option from a socket * @sk: the socket * * Description: * Removes the CALIPSO option from a socket, if present. * */ static void calipso_sock_delattr(struct sock *sk) { struct ipv6_opt_hdr *new_hop; struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk)); if (!txopts || !txopts->hopopt) goto done; if (calipso_opt_del(txopts->hopopt, &new_hop)) goto done; calipso_opt_update(sk, new_hop); kfree(new_hop); done: txopt_put(txopts); } /* request sock functions. */ /** * calipso_req_setattr - Add a CALIPSO option to a connection request socket * @req: the connection request socket * @doi_def: the CALIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CALIPSO option on the given socket using the DOI definition and * security attributes passed to the function. Returns zero on success and * negative values on failure. * */ static int calipso_req_setattr(struct request_sock *req, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { struct ipv6_txoptions *txopts; struct inet_request_sock *req_inet = inet_rsk(req); struct ipv6_opt_hdr *old, *new; struct sock *sk = sk_to_full_sk(req_to_sk(req)); if (req_inet->ipv6_opt && req_inet->ipv6_opt->hopopt) old = req_inet->ipv6_opt->hopopt; else old = NULL; new = calipso_opt_insert(old, doi_def, secattr); if (IS_ERR(new)) return PTR_ERR(new); txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new); kfree(new); if (IS_ERR(txopts)) return PTR_ERR(txopts); txopts = xchg(&req_inet->ipv6_opt, txopts); if (txopts) { atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); txopt_put(txopts); } return 0; } /** * calipso_req_delattr - Delete the CALIPSO option from a request socket * @req: the request socket * * Description: * Removes the CALIPSO option from a request socket, if present. * */ static void calipso_req_delattr(struct request_sock *req) { struct inet_request_sock *req_inet = inet_rsk(req); struct ipv6_opt_hdr *new; struct ipv6_txoptions *txopts; struct sock *sk = sk_to_full_sk(req_to_sk(req)); if (!req_inet->ipv6_opt || !req_inet->ipv6_opt->hopopt) return; if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new)) return; /* Nothing to do */ txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new); if (!IS_ERR(txopts)) { txopts = xchg(&req_inet->ipv6_opt, txopts); if (txopts) { atomic_sub(txopts->tot_len, &sk->sk_omem_alloc); txopt_put(txopts); } } kfree(new); } /* skbuff functions. */ /** * calipso_skbuff_optptr - Find the CALIPSO option in the packet * @skb: the packet * * Description: * Parse the packet's IP header looking for a CALIPSO option. Returns a pointer * to the start of the CALIPSO option on success, NULL if one if not found. * */ static unsigned char *calipso_skbuff_optptr(const struct sk_buff *skb) { const struct ipv6hdr *ip6_hdr = ipv6_hdr(skb); int offset; if (ip6_hdr->nexthdr != NEXTHDR_HOP) return NULL; offset = ipv6_find_tlv(skb, sizeof(*ip6_hdr), IPV6_TLV_CALIPSO); if (offset >= 0) return (unsigned char *)ip6_hdr + offset; return NULL; } /** * calipso_skbuff_setattr - Set the CALIPSO option on a packet * @skb: the packet * @doi_def: the CALIPSO DOI to use * @secattr: the security attributes * * Description: * Set the CALIPSO option on the given packet based on the security attributes. * Returns a pointer to the IP header on success and NULL on failure. * */ static int calipso_skbuff_setattr(struct sk_buff *skb, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; struct ipv6hdr *ip6_hdr; struct ipv6_opt_hdr *hop; unsigned char buf[CALIPSO_MAX_BUFFER]; int len_delta, new_end, pad, payload; unsigned int start, end; ip6_hdr = ipv6_hdr(skb); if (ip6_hdr->nexthdr == NEXTHDR_HOP) { hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); ret_val = calipso_opt_find(hop, &start, &end); if (ret_val && ret_val != -ENOENT) return ret_val; } else { start = 0; end = 0; } memset(buf, 0, sizeof(buf)); ret_val = calipso_genopt(buf, start & 3, sizeof(buf), doi_def, secattr); if (ret_val < 0) return ret_val; new_end = start + ret_val; /* At this point new_end aligns to 4n, so (new_end & 4) pads to 8n */ pad = ((new_end & 4) + (end & 7)) & 7; len_delta = new_end - (int)end + pad; ret_val = skb_cow(skb, skb_headroom(skb) + len_delta); if (ret_val < 0) return ret_val; ip6_hdr = ipv6_hdr(skb); /* Reset as skb_cow() may have moved it */ if (len_delta) { if (len_delta > 0) skb_push(skb, len_delta); else skb_pull(skb, -len_delta); memmove((char *)ip6_hdr - len_delta, ip6_hdr, sizeof(*ip6_hdr) + start); skb_reset_network_header(skb); ip6_hdr = ipv6_hdr(skb); payload = ntohs(ip6_hdr->payload_len); ip6_hdr->payload_len = htons(payload + len_delta); } hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); if (start == 0) { struct ipv6_opt_hdr *new_hop = (struct ipv6_opt_hdr *)buf; new_hop->nexthdr = ip6_hdr->nexthdr; new_hop->hdrlen = len_delta / 8 - 1; ip6_hdr->nexthdr = NEXTHDR_HOP; } else { hop->hdrlen += len_delta / 8; } memcpy((char *)hop + start, buf + (start & 3), new_end - start); calipso_pad_write((unsigned char *)hop, new_end, pad); return 0; } /** * calipso_skbuff_delattr - Delete any CALIPSO options from a packet * @skb: the packet * * Description: * Removes any and all CALIPSO options from the given packet. Returns zero on * success, negative values on failure. * */ static int calipso_skbuff_delattr(struct sk_buff *skb) { int ret_val; struct ipv6hdr *ip6_hdr; struct ipv6_opt_hdr *old_hop; u32 old_hop_len, start = 0, end = 0, delta, size, pad; if (!calipso_skbuff_optptr(skb)) return 0; /* since we are changing the packet we should make a copy */ ret_val = skb_cow(skb, skb_headroom(skb)); if (ret_val < 0) return ret_val; ip6_hdr = ipv6_hdr(skb); old_hop = (struct ipv6_opt_hdr *)(ip6_hdr + 1); old_hop_len = ipv6_optlen(old_hop); ret_val = calipso_opt_find(old_hop, &start, &end); if (ret_val) return ret_val; if (start == sizeof(*old_hop) && end == old_hop_len) { /* There's no other option in the header so we delete * the whole thing. */ delta = old_hop_len; size = sizeof(*ip6_hdr); ip6_hdr->nexthdr = old_hop->nexthdr; } else { delta = (end - start) & ~7; if (delta) old_hop->hdrlen -= delta / 8; pad = (end - start) & 7; size = sizeof(*ip6_hdr) + start + pad; calipso_pad_write((unsigned char *)old_hop, start, pad); } if (delta) { skb_pull(skb, delta); memmove((char *)ip6_hdr + delta, ip6_hdr, size); skb_reset_network_header(skb); } return 0; } static const struct netlbl_calipso_ops ops = { .doi_add = calipso_doi_add, .doi_free = calipso_doi_free, .doi_remove = calipso_doi_remove, .doi_getdef = calipso_doi_getdef, .doi_putdef = calipso_doi_putdef, .doi_walk = calipso_doi_walk, .sock_getattr = calipso_sock_getattr, .sock_setattr = calipso_sock_setattr, .sock_delattr = calipso_sock_delattr, .req_setattr = calipso_req_setattr, .req_delattr = calipso_req_delattr, .opt_getattr = calipso_opt_getattr, .skbuff_optptr = calipso_skbuff_optptr, .skbuff_setattr = calipso_skbuff_setattr, .skbuff_delattr = calipso_skbuff_delattr, .cache_invalidate = calipso_cache_invalidate, .cache_add = calipso_cache_add }; /** * calipso_init - Initialize the CALIPSO module * * Description: * Initialize the CALIPSO module and prepare it for use. Returns zero on * success and negative values on failure. * */ int __init calipso_init(void) { int ret_val; ret_val = calipso_cache_init(); if (!ret_val) netlbl_calipso_ops_register(&ops); return ret_val; } void calipso_exit(void) { netlbl_calipso_ops_register(NULL); calipso_cache_invalidate(); kfree(calipso_cache); } |
554 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * fs/kernfs/kernfs-internal.h - kernfs internal header file * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de> */ #ifndef __KERNFS_INTERNAL_H #define __KERNFS_INTERNAL_H #include <linux/lockdep.h> #include <linux/fs.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/xattr.h> #include <linux/kernfs.h> #include <linux/fs_context.h> struct kernfs_iattrs { kuid_t ia_uid; kgid_t ia_gid; struct timespec64 ia_atime; struct timespec64 ia_mtime; struct timespec64 ia_ctime; struct simple_xattrs xattrs; atomic_t nr_user_xattrs; atomic_t user_xattr_size; }; struct kernfs_root { /* published fields */ struct kernfs_node *kn; unsigned int flags; /* KERNFS_ROOT_* flags */ /* private fields, do not use outside kernfs proper */ struct idr ino_idr; u32 last_id_lowbits; u32 id_highbits; struct kernfs_syscall_ops *syscall_ops; /* list of kernfs_super_info of this root, protected by kernfs_rwsem */ struct list_head supers; wait_queue_head_t deactivate_waitq; struct rw_semaphore kernfs_rwsem; struct rw_semaphore kernfs_iattr_rwsem; struct rw_semaphore kernfs_supers_rwsem; struct rcu_head rcu; }; /* +1 to avoid triggering overflow warning when negating it */ #define KN_DEACTIVATED_BIAS (INT_MIN + 1) /* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ /** * kernfs_root - find out the kernfs_root a kernfs_node belongs to * @kn: kernfs_node of interest * * Return: the kernfs_root @kn belongs to. */ static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) { /* if parent exists, it's always a dir; otherwise, @sd is a dir */ if (kn->parent) kn = kn->parent; return kn->dir.root; } /* * mount.c */ struct kernfs_super_info { struct super_block *sb; /* * The root associated with this super_block. Each super_block is * identified by the root and ns it's associated with. */ struct kernfs_root *root; /* * Each sb is associated with one namespace tag, currently the * network namespace of the task which mounted this kernfs * instance. If multiple tags become necessary, make the following * an array and compare kernfs_node tag against every entry. */ const void *ns; /* anchored at kernfs_root->supers, protected by kernfs_rwsem */ struct list_head node; }; #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry) { if (d_really_is_negative(dentry)) return NULL; return d_inode(dentry)->i_private; } static inline void kernfs_set_rev(struct kernfs_node *parent, struct dentry *dentry) { dentry->d_time = parent->dir.rev; } static inline void kernfs_inc_rev(struct kernfs_node *parent) { parent->dir.rev++; } static inline bool kernfs_dir_changed(struct kernfs_node *parent, struct dentry *dentry) { if (parent->dir.rev != dentry->d_time) return true; return false; } extern const struct super_operations kernfs_sops; extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; /* * inode.c */ extern const struct xattr_handler * const kernfs_xattr_handlers[]; void kernfs_evict_inode(struct inode *inode); int kernfs_iop_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); int kernfs_iop_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); /* * dir.c */ extern const struct dentry_operations kernfs_dops; extern const struct file_operations kernfs_dir_fops; extern const struct inode_operations kernfs_dir_iops; struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); void kernfs_put_active(struct kernfs_node *kn); int kernfs_add_one(struct kernfs_node *kn); struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags); /* * file.c */ extern const struct file_operations kernfs_file_fops; bool kernfs_should_drain_open_files(struct kernfs_node *kn); void kernfs_drain_open_files(struct kernfs_node *kn); /* * symlink.c */ extern const struct inode_operations kernfs_symlink_iops; /* * kernfs locks */ extern struct kernfs_global_locks *kernfs_locks; #endif /* __KERNFS_INTERNAL_H */ |
81 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2007 Oracle. All rights reserved. */ #ifndef BTRFS_CTREE_H #define BTRFS_CTREE_H #include <linux/pagemap.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/mutex.h> #include <linux/wait.h> #include <linux/list.h> #include <linux/atomic.h> #include <linux/xarray.h> #include <linux/refcount.h> #include <uapi/linux/btrfs_tree.h> #include "locking.h" #include "fs.h" #include "accessors.h" #include "extent-io-tree.h" struct extent_buffer; struct btrfs_block_rsv; struct btrfs_trans_handle; struct btrfs_block_group; /* Read ahead values for struct btrfs_path.reada */ enum { READA_NONE, READA_BACK, READA_FORWARD, /* * Similar to READA_FORWARD but unlike it: * * 1) It will trigger readahead even for leaves that are not close to * each other on disk; * 2) It also triggers readahead for nodes; * 3) During a search, even when a node or leaf is already in memory, it * will still trigger readahead for other nodes and leaves that follow * it. * * This is meant to be used only when we know we are iterating over the * entire tree or a very large part of it. */ READA_FORWARD_ALWAYS, }; /* * btrfs_paths remember the path taken from the root down to the leaf. * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point * to any other levels that are present. * * The slots array records the index of the item or block pointer * used while walking the tree. */ struct btrfs_path { struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; /* if there is real range locking, this locks field will change */ u8 locks[BTRFS_MAX_LEVEL]; u8 reada; /* keep some upper locks as we walk down */ u8 lowest_level; /* * set by btrfs_split_item, tells search_slot to keep all locks * and to force calls to keep space in the nodes */ unsigned int search_for_split:1; unsigned int keep_locks:1; unsigned int skip_locking:1; unsigned int search_commit_root:1; unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; /* * Indicate that new item (btrfs_search_slot) is extending already * existing item and ins_len contains only the data size and not item * header (ie. sizeof(struct btrfs_item) is not included). */ unsigned int search_for_extension:1; /* Stop search if any locks need to be taken (for read) */ unsigned int nowait:1; }; /* * The state of btrfs root */ enum { /* * btrfs_record_root_in_trans is a multi-step process, and it can race * with the balancing code. But the race is very small, and only the * first time the root is added to each transaction. So IN_TRANS_SETUP * is used to tell us when more checks are required */ BTRFS_ROOT_IN_TRANS_SETUP, /* * Set if tree blocks of this root can be shared by other roots. * Only subvolume trees and their reloc trees have this bit set. * Conflicts with TRACK_DIRTY bit. * * This affects two things: * * - How balance works * For shareable roots, we need to use reloc tree and do path * replacement for balance, and need various pre/post hooks for * snapshot creation to handle them. * * While for non-shareable trees, we just simply do a tree search * with COW. * * - How dirty roots are tracked * For shareable roots, btrfs_record_root_in_trans() is needed to * track them, while non-subvolume roots have TRACK_DIRTY bit, they * don't need to set this manually. */ BTRFS_ROOT_SHAREABLE, BTRFS_ROOT_TRACK_DIRTY, BTRFS_ROOT_IN_RADIX, BTRFS_ROOT_ORPHAN_ITEM_INSERTED, BTRFS_ROOT_DEFRAG_RUNNING, BTRFS_ROOT_FORCE_COW, BTRFS_ROOT_MULTI_LOG_TASKS, BTRFS_ROOT_DIRTY, BTRFS_ROOT_DELETING, /* * Reloc tree is orphan, only kept here for qgroup delayed subtree scan * * Set for the subvolume tree owning the reloc tree. */ BTRFS_ROOT_DEAD_RELOC_TREE, /* Mark dead root stored on device whose cleanup needs to be resumed */ BTRFS_ROOT_DEAD_TREE, /* The root has a log tree. Used for subvolume roots and the tree root. */ BTRFS_ROOT_HAS_LOG_TREE, /* Qgroup flushing is in progress */ BTRFS_ROOT_QGROUP_FLUSHING, /* We started the orphan cleanup for this root. */ BTRFS_ROOT_ORPHAN_CLEANUP, /* This root has a drop operation that was started previously. */ BTRFS_ROOT_UNFINISHED_DROP, /* This reloc root needs to have its buffers lockdep class reset. */ BTRFS_ROOT_RESET_LOCKDEP_CLASS, }; /* * Record swapped tree blocks of a subvolume tree for delayed subtree trace * code. For detail check comment in fs/btrfs/qgroup.c. */ struct btrfs_qgroup_swapped_blocks { spinlock_t lock; /* RM_EMPTY_ROOT() of above blocks[] */ bool swapped; struct rb_root blocks[BTRFS_MAX_LEVEL]; }; /* * in ram representation of the tree. extent_root is used for all allocations * and for the extent tree extent_root root. */ struct btrfs_root { struct rb_node rb_node; struct extent_buffer *node; struct extent_buffer *commit_root; struct btrfs_root *log_root; struct btrfs_root *reloc_root; unsigned long state; struct btrfs_root_item root_item; struct btrfs_key root_key; struct btrfs_fs_info *fs_info; struct extent_io_tree dirty_log_pages; struct mutex objectid_mutex; spinlock_t accounting_lock; struct btrfs_block_rsv *block_rsv; struct mutex log_mutex; wait_queue_head_t log_writer_wait; wait_queue_head_t log_commit_wait[2]; struct list_head log_ctxs[2]; /* Used only for log trees of subvolumes, not for the log root tree */ atomic_t log_writers; atomic_t log_commit[2]; /* Used only for log trees of subvolumes, not for the log root tree */ atomic_t log_batch; /* * Protected by the 'log_mutex' lock but can be read without holding * that lock to avoid unnecessary lock contention, in which case it * should be read using btrfs_get_root_log_transid() except if it's a * log tree in which case it can be directly accessed. Updates to this * field should always use btrfs_set_root_log_transid(), except for log * trees where the field can be updated directly. */ int log_transid; /* No matter the commit succeeds or not*/ int log_transid_committed; /* * Just be updated when the commit succeeds. Use * btrfs_get_root_last_log_commit() and btrfs_set_root_last_log_commit() * to access this field. */ int last_log_commit; pid_t log_start_pid; u64 last_trans; u64 free_objectid; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; /* The dirty list is only used by non-shareable roots */ struct list_head dirty_list; struct list_head root_list; /* * Xarray that keeps track of in-memory inodes, protected by the lock * @inode_lock. */ struct xarray inodes; /* * Xarray that keeps track of delayed nodes of every inode, protected * by @inode_lock. */ struct xarray delayed_nodes; /* * right now this just gets used so that a root has its own devid * for stat. It may be used for more later */ dev_t anon_dev; spinlock_t root_item_lock; refcount_t refs; struct mutex delalloc_mutex; spinlock_t delalloc_lock; /* * all of the inodes that have delalloc bytes. It is possible for * this list to be empty even when there is still dirty data=ordered * extents waiting to finish IO. */ struct list_head delalloc_inodes; struct list_head delalloc_root; u64 nr_delalloc_inodes; struct mutex ordered_extent_mutex; /* * this is used by the balancing code to wait for all the pending * ordered extents */ spinlock_t ordered_extent_lock; /* * all of the data=ordered extents pending writeback * these can span multiple transactions and basically include * every dirty data page that isn't from nodatacow */ struct list_head ordered_extents; struct list_head ordered_root; u64 nr_ordered_extents; /* * Not empty if this subvolume root has gone through tree block swap * (relocation) * * Will be used by reloc_control::dirty_subvol_roots. */ struct list_head reloc_dirty_list; /* * Number of currently running SEND ioctls to prevent * manipulation with the read-only status via SUBVOL_SETFLAGS */ int send_in_progress; /* * Number of currently running deduplication operations that have a * destination inode belonging to this root. Protected by the lock * root_item_lock. */ int dedupe_in_progress; /* For exclusion of snapshot creation and nocow writes */ struct btrfs_drew_lock snapshot_lock; atomic_t snapshot_force_cow; /* For qgroup metadata reserved space */ spinlock_t qgroup_meta_rsv_lock; u64 qgroup_meta_rsv_pertrans; u64 qgroup_meta_rsv_prealloc; wait_queue_head_t qgroup_flush_wait; /* Number of active swapfiles */ atomic_t nr_swapfiles; /* Record pairs of swapped blocks for qgroup */ struct btrfs_qgroup_swapped_blocks swapped_blocks; /* Used only by log trees, when logging csum items */ struct extent_io_tree log_csum_range; /* Used in simple quotas, track root during relocation. */ u64 relocation_src_root; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS u64 alloc_bytenr; #endif #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; #endif }; static inline bool btrfs_root_readonly(const struct btrfs_root *root) { /* Byte-swap the constant at compile time, root_item::flags is LE */ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; } static inline bool btrfs_root_dead(const struct btrfs_root *root) { /* Byte-swap the constant at compile time, root_item::flags is LE */ return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; } static inline u64 btrfs_root_id(const struct btrfs_root *root) { return root->root_key.objectid; } static inline int btrfs_get_root_log_transid(const struct btrfs_root *root) { return READ_ONCE(root->log_transid); } static inline void btrfs_set_root_log_transid(struct btrfs_root *root, int log_transid) { WRITE_ONCE(root->log_transid, log_transid); } static inline int btrfs_get_root_last_log_commit(const struct btrfs_root *root) { return READ_ONCE(root->last_log_commit); } static inline void btrfs_set_root_last_log_commit(struct btrfs_root *root, int commit_id) { WRITE_ONCE(root->last_log_commit, commit_id); } static inline u64 btrfs_get_root_last_trans(const struct btrfs_root *root) { return READ_ONCE(root->last_trans); } static inline void btrfs_set_root_last_trans(struct btrfs_root *root, u64 transid) { WRITE_ONCE(root->last_trans, transid); } /* * Structure that conveys information about an extent that is going to replace * all the extents in a file range. */ struct btrfs_replace_extent_info { u64 disk_offset; u64 disk_len; u64 data_offset; u64 data_len; u64 file_offset; /* Pointer to a file extent item of type regular or prealloc. */ char *extent_buf; /* * Set to true when attempting to replace a file range with a new extent * described by this structure, set to false when attempting to clone an * existing extent into a file range. */ bool is_new_extent; /* Indicate if we should update the inode's mtime and ctime. */ bool update_times; /* Meaningful only if is_new_extent is true. */ int qgroup_reserved; /* * Meaningful only if is_new_extent is true. * Used to track how many extent items we have already inserted in a * subvolume tree that refer to the extent described by this structure, * so that we know when to create a new delayed ref or update an existing * one. */ int insertions; }; /* Arguments for btrfs_drop_extents() */ struct btrfs_drop_extents_args { /* Input parameters */ /* * If NULL, btrfs_drop_extents() will allocate and free its own path. * If 'replace_extent' is true, this must not be NULL. Also the path * is always released except if 'replace_extent' is true and * btrfs_drop_extents() sets 'extent_inserted' to true, in which case * the path is kept locked. */ struct btrfs_path *path; /* Start offset of the range to drop extents from */ u64 start; /* End (exclusive, last byte + 1) of the range to drop extents from */ u64 end; /* If true drop all the extent maps in the range */ bool drop_cache; /* * If true it means we want to insert a new extent after dropping all * the extents in the range. If this is true, the 'extent_item_size' * parameter must be set as well and the 'extent_inserted' field will * be set to true by btrfs_drop_extents() if it could insert the new * extent. * Note: when this is set to true the path must not be NULL. */ bool replace_extent; /* * Used if 'replace_extent' is true. Size of the file extent item to * insert after dropping all existing extents in the range */ u32 extent_item_size; /* Output parameters */ /* * Set to the minimum between the input parameter 'end' and the end * (exclusive, last byte + 1) of the last dropped extent. This is always * set even if btrfs_drop_extents() returns an error. */ u64 drop_end; /* * The number of allocated bytes found in the range. This can be smaller * than the range's length when there are holes in the range. */ u64 bytes_found; /* * Only set if 'replace_extent' is true. Set to true if we were able * to insert a replacement extent after dropping all extents in the * range, otherwise set to false by btrfs_drop_extents(). * Also, if btrfs_drop_extents() has set this to true it means it * returned with the path locked, otherwise if it has set this to * false it has returned with the path released. */ bool extent_inserted; }; struct btrfs_file_private { void *filldir_buf; u64 last_index; struct extent_state *llseek_cached_state; bool fsync_skip_inode_lock; }; static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { return info->nodesize - sizeof(struct btrfs_header); } static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info) { return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item); } static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_fs_info *info) { return BTRFS_LEAF_DATA_SIZE(info) / sizeof(struct btrfs_key_ptr); } static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) { return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ ((bytes) >> (fs_info)->sectorsize_bits) static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) { return mapping_gfp_constraint(mapping, ~__GFP_FS); } void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); /* ctree.c */ int __init btrfs_ctree_init(void); void __cold btrfs_ctree_exit(void); int btrfs_bin_search(struct extent_buffer *eb, int first_slot, const struct btrfs_key *key, int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); #ifdef __LITTLE_ENDIAN /* * Compare two keys, on little-endian the disk order is same as CPU order and * we can avoid the conversion. */ static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk_key, const struct btrfs_key *k2) { const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key; return btrfs_comp_cpu_keys(k1, k2); } #else /* Compare two keys in a memcmp fashion. */ static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk, const struct btrfs_key *k2) { struct btrfs_key k1; btrfs_disk_key_to_cpu(&k1, disk); return btrfs_comp_cpu_keys(&k1, k2); } #endif int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); int btrfs_previous_extent_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid); void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, struct btrfs_path *path, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, u64 min_trans); int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_path *path, u64 min_trans); struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, int slot); int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, enum btrfs_lock_nesting nest); int btrfs_force_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, u64 search_start, u64 empty_size, enum btrfs_lock_nesting nest); int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid); bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf); int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); void btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, u32 data_size); void btrfs_truncate_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *new_key, unsigned long split_offset); int btrfs_duplicate_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *new_key); int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path, u64 inum, u64 ioff, u8 key_type, struct btrfs_key *found_key); int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow); int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, u64 time_seq); int btrfs_search_slot_for_read(struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int find_higher, int return_any); void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr); static inline int btrfs_del_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path) { return btrfs_del_items(trans, root, path, path->slots[0], 1); } /* * Describes a batch of items to insert in a btree. This is used by * btrfs_insert_empty_items(). */ struct btrfs_item_batch { /* * Pointer to an array containing the keys of the items to insert (in * sorted order). */ const struct btrfs_key *keys; /* Pointer to an array containing the data size for each item to insert. */ const u32 *data_sizes; /* * The sum of data sizes for all items. The caller can compute this while * setting up the data_sizes array, so it ends up being more efficient * than having btrfs_insert_empty_items() or setup_item_for_insert() * doing it, as it would avoid an extra loop over a potentially large * array, and in the case of setup_item_for_insert(), we would be doing * it while holding a write lock on a leaf and often on upper level nodes * too, unnecessarily increasing the size of a critical section. */ u32 total_data_size; /* Size of the keys and data_sizes arrays (number of items in the batch). */ int nr; }; void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *key, u32 data_size); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_item_batch *batch); static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *key, u32 data_size) { struct btrfs_item_batch batch; batch.keys = key; batch.data_sizes = &data_size; batch.total_data_size = data_size; batch.nr = 1; return btrfs_insert_empty_items(trans, root, path, &batch); } int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *path); int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *path); /* * Search in @root for a given @key, and store the slot found in @found_key. * * @root: The root node of the tree. * @key: The key we are looking for. * @found_key: Will hold the found item. * @path: Holds the current slot/leaf. * @iter_ret: Contains the value returned from btrfs_search_slot or * btrfs_get_next_valid_item, whichever was executed last. * * The @iter_ret is an output variable that will contain the return value of * btrfs_search_slot, if it encountered an error, or the value returned from * btrfs_get_next_valid_item otherwise. That return value can be 0, if a valid * slot was found, 1 if there were no more leaves, and <0 if there was an error. * * It's recommended to use a separate variable for iter_ret and then use it to * set the function return value so there's no confusion of the 0/1/errno * values stemming from btrfs_search_slot. */ #define btrfs_for_each_slot(root, key, found_key, path, iter_ret) \ for (iter_ret = btrfs_search_slot(NULL, (root), (key), (path), 0, 0); \ (iter_ret) >= 0 && \ (iter_ret = btrfs_get_next_valid_item((root), (found_key), (path))) == 0; \ (path)->slots[0]++ \ ) int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); /* * Search the tree again to find a leaf with greater keys. * * Returns 0 if it found something or 1 if there are no greater leaves. * Returns < 0 on error. */ static inline int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) { return btrfs_next_old_leaf(root, path, 0); } static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) { return btrfs_next_old_item(root, p, 0); } int btrfs_leaf_free_space(const struct extent_buffer *leaf); static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID && !btrfs_qgroup_level(rootid))) return 1; return 0; } static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) { return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; } u16 btrfs_csum_type_size(u16 type); int btrfs_super_csum_size(const struct btrfs_super_block *s); const char *btrfs_super_csum_name(u16 csum_type); const char *btrfs_super_csum_driver(u16 csum_type); size_t __attribute_const__ btrfs_get_num_csums(void); /* * We use page status Private2 to indicate there is an ordered extent with * unfinished IO. * * Rename the Private2 accessors to Ordered, to improve readability. */ #define PageOrdered(page) PagePrivate2(page) #define SetPageOrdered(page) SetPagePrivate2(page) #define ClearPageOrdered(page) ClearPagePrivate2(page) #define folio_test_ordered(folio) folio_test_private_2(folio) #define folio_set_ordered(folio) folio_set_private_2(folio) #define folio_clear_ordered(folio) folio_clear_private_2(folio) #endif |
10 10 10 7 8 2 7 7 8 8 8 8 10 10 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 | // SPDX-License-Identifier: GPL-2.0 /* usb-urb.c is part of the DVB USB library. * * Copyright (C) 2004-6 Patrick Boettcher (patrick.boettcher@posteo.de) * see dvb-usb-init.c for copyright information. * * This file keeps functions for initializing and handling the * BULK and ISOC USB data transfers in a generic way. * Can be used for DVB-only and also, that's the plan, for * Hybrid USB devices (analog and DVB). */ #include "dvb_usb_common.h" /* URB stuff for streaming */ int usb_urb_reconfig(struct usb_data_stream *stream, struct usb_data_stream_properties *props); static void usb_urb_complete(struct urb *urb) { struct usb_data_stream *stream = urb->context; int ptype = usb_pipetype(urb->pipe); int i; u8 *b; dev_dbg_ratelimited(&stream->udev->dev, "%s: %s urb completed status=%d length=%d/%d pack_num=%d errors=%d\n", __func__, ptype == PIPE_ISOCHRONOUS ? "isoc" : "bulk", urb->status, urb->actual_length, urb->transfer_buffer_length, urb->number_of_packets, urb->error_count); switch (urb->status) { case 0: /* success */ case -ETIMEDOUT: /* NAK */ break; case -ECONNRESET: /* kill */ case -ENOENT: case -ESHUTDOWN: return; default: /* error */ dev_dbg_ratelimited(&stream->udev->dev, "%s: urb completion failed=%d\n", __func__, urb->status); break; } b = (u8 *) urb->transfer_buffer; switch (ptype) { case PIPE_ISOCHRONOUS: for (i = 0; i < urb->number_of_packets; i++) { if (urb->iso_frame_desc[i].status != 0) dev_dbg(&stream->udev->dev, "%s: iso frame descriptor has an error=%d\n", __func__, urb->iso_frame_desc[i].status); else if (urb->iso_frame_desc[i].actual_length > 0) stream->complete(stream, b + urb->iso_frame_desc[i].offset, urb->iso_frame_desc[i].actual_length); urb->iso_frame_desc[i].status = 0; urb->iso_frame_desc[i].actual_length = 0; } break; case PIPE_BULK: if (urb->actual_length > 0) stream->complete(stream, b, urb->actual_length); break; default: dev_err(&stream->udev->dev, "%s: unknown endpoint type in completion handler\n", KBUILD_MODNAME); return; } usb_submit_urb(urb, GFP_ATOMIC); } int usb_urb_killv2(struct usb_data_stream *stream) { int i; for (i = 0; i < stream->urbs_submitted; i++) { dev_dbg(&stream->udev->dev, "%s: kill urb=%d\n", __func__, i); /* stop the URB */ usb_kill_urb(stream->urb_list[i]); } stream->urbs_submitted = 0; return 0; } int usb_urb_submitv2(struct usb_data_stream *stream, struct usb_data_stream_properties *props) { int i, ret; if (props) { ret = usb_urb_reconfig(stream, props); if (ret < 0) return ret; } for (i = 0; i < stream->urbs_initialized; i++) { dev_dbg(&stream->udev->dev, "%s: submit urb=%d\n", __func__, i); ret = usb_submit_urb(stream->urb_list[i], GFP_ATOMIC); if (ret) { dev_err(&stream->udev->dev, "%s: could not submit urb no. %d - get them all back\n", KBUILD_MODNAME, i); usb_urb_killv2(stream); return ret; } stream->urbs_submitted++; } return 0; } static int usb_urb_free_urbs(struct usb_data_stream *stream) { int i; usb_urb_killv2(stream); for (i = stream->urbs_initialized - 1; i >= 0; i--) { if (stream->urb_list[i]) { dev_dbg(&stream->udev->dev, "%s: free urb=%d\n", __func__, i); /* free the URBs */ usb_free_urb(stream->urb_list[i]); } } stream->urbs_initialized = 0; return 0; } static int usb_urb_alloc_bulk_urbs(struct usb_data_stream *stream) { int i, j; /* allocate the URBs */ for (i = 0; i < stream->props.count; i++) { dev_dbg(&stream->udev->dev, "%s: alloc urb=%d\n", __func__, i); stream->urb_list[i] = usb_alloc_urb(0, GFP_ATOMIC); if (!stream->urb_list[i]) { dev_dbg(&stream->udev->dev, "%s: failed\n", __func__); for (j = 0; j < i; j++) usb_free_urb(stream->urb_list[j]); return -ENOMEM; } usb_fill_bulk_urb(stream->urb_list[i], stream->udev, usb_rcvbulkpipe(stream->udev, stream->props.endpoint), stream->buf_list[i], stream->props.u.bulk.buffersize, usb_urb_complete, stream); stream->urbs_initialized++; } return 0; } static int usb_urb_alloc_isoc_urbs(struct usb_data_stream *stream) { int i, j; /* allocate the URBs */ for (i = 0; i < stream->props.count; i++) { struct urb *urb; int frame_offset = 0; dev_dbg(&stream->udev->dev, "%s: alloc urb=%d\n", __func__, i); stream->urb_list[i] = usb_alloc_urb( stream->props.u.isoc.framesperurb, GFP_ATOMIC); if (!stream->urb_list[i]) { dev_dbg(&stream->udev->dev, "%s: failed\n", __func__); for (j = 0; j < i; j++) usb_free_urb(stream->urb_list[j]); return -ENOMEM; } urb = stream->urb_list[i]; urb->dev = stream->udev; urb->context = stream; urb->complete = usb_urb_complete; urb->pipe = usb_rcvisocpipe(stream->udev, stream->props.endpoint); urb->transfer_flags = URB_ISO_ASAP; urb->interval = stream->props.u.isoc.interval; urb->number_of_packets = stream->props.u.isoc.framesperurb; urb->transfer_buffer_length = stream->props.u.isoc.framesize * stream->props.u.isoc.framesperurb; urb->transfer_buffer = stream->buf_list[i]; for (j = 0; j < stream->props.u.isoc.framesperurb; j++) { urb->iso_frame_desc[j].offset = frame_offset; urb->iso_frame_desc[j].length = stream->props.u.isoc.framesize; frame_offset += stream->props.u.isoc.framesize; } stream->urbs_initialized++; } return 0; } static int usb_free_stream_buffers(struct usb_data_stream *stream) { if (stream->state & USB_STATE_URB_BUF) { while (stream->buf_num) { stream->buf_num--; kfree(stream->buf_list[stream->buf_num]); } } stream->state &= ~USB_STATE_URB_BUF; return 0; } static int usb_alloc_stream_buffers(struct usb_data_stream *stream, int num, unsigned long size) { stream->buf_num = 0; stream->buf_size = size; dev_dbg(&stream->udev->dev, "%s: all in all I will use %lu bytes for streaming\n", __func__, num * size); for (stream->buf_num = 0; stream->buf_num < num; stream->buf_num++) { stream->buf_list[stream->buf_num] = kzalloc(size, GFP_ATOMIC); if (!stream->buf_list[stream->buf_num]) { dev_dbg(&stream->udev->dev, "%s: alloc buf=%d failed\n", __func__, stream->buf_num); usb_free_stream_buffers(stream); return -ENOMEM; } dev_dbg(&stream->udev->dev, "%s: alloc buf=%d %p (dma %llu)\n", __func__, stream->buf_num, stream->buf_list[stream->buf_num], (long long)stream->dma_addr[stream->buf_num]); stream->state |= USB_STATE_URB_BUF; } return 0; } int usb_urb_reconfig(struct usb_data_stream *stream, struct usb_data_stream_properties *props) { int buf_size; if (!props) return 0; /* check allocated buffers are large enough for the request */ if (props->type == USB_BULK) { buf_size = stream->props.u.bulk.buffersize; } else if (props->type == USB_ISOC) { buf_size = props->u.isoc.framesize * props->u.isoc.framesperurb; } else { dev_err(&stream->udev->dev, "%s: invalid endpoint type=%d\n", KBUILD_MODNAME, props->type); return -EINVAL; } if (stream->buf_num < props->count || stream->buf_size < buf_size) { dev_err(&stream->udev->dev, "%s: cannot reconfigure as allocated buffers are too small\n", KBUILD_MODNAME); return -EINVAL; } /* check if all fields are same */ if (stream->props.type == props->type && stream->props.count == props->count && stream->props.endpoint == props->endpoint) { if (props->type == USB_BULK && props->u.bulk.buffersize == stream->props.u.bulk.buffersize) return 0; else if (props->type == USB_ISOC && props->u.isoc.framesperurb == stream->props.u.isoc.framesperurb && props->u.isoc.framesize == stream->props.u.isoc.framesize && props->u.isoc.interval == stream->props.u.isoc.interval) return 0; } dev_dbg(&stream->udev->dev, "%s: re-alloc urbs\n", __func__); usb_urb_free_urbs(stream); memcpy(&stream->props, props, sizeof(*props)); if (props->type == USB_BULK) return usb_urb_alloc_bulk_urbs(stream); else if (props->type == USB_ISOC) return usb_urb_alloc_isoc_urbs(stream); return 0; } int usb_urb_initv2(struct usb_data_stream *stream, const struct usb_data_stream_properties *props) { int ret; if (!stream || !props) return -EINVAL; memcpy(&stream->props, props, sizeof(*props)); if (!stream->complete) { dev_err(&stream->udev->dev, "%s: there is no data callback - this doesn't make sense\n", KBUILD_MODNAME); return -EINVAL; } switch (stream->props.type) { case USB_BULK: ret = usb_alloc_stream_buffers(stream, stream->props.count, stream->props.u.bulk.buffersize); if (ret < 0) return ret; return usb_urb_alloc_bulk_urbs(stream); case USB_ISOC: ret = usb_alloc_stream_buffers(stream, stream->props.count, stream->props.u.isoc.framesize * stream->props.u.isoc.framesperurb); if (ret < 0) return ret; return usb_urb_alloc_isoc_urbs(stream); default: dev_err(&stream->udev->dev, "%s: unknown urb-type for data transfer\n", KBUILD_MODNAME); return -EINVAL; } } int usb_urb_exitv2(struct usb_data_stream *stream) { usb_urb_free_urbs(stream); usb_free_stream_buffers(stream); return 0; } |
24 24 24 11 11 13 13 1 1 41 20 23 23 23 18 23 41 24 20 18 24 11 40 12 11 23 40 41 42 23 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 | /* * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved. * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include "core_priv.h" #include <linux/slab.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/netdevice.h> #include <linux/ethtool.h> #include <rdma/ib_mad.h> #include <rdma/ib_pma.h> #include <rdma/ib_cache.h> #include <rdma/rdma_counter.h> #include <rdma/ib_sysfs.h> struct port_table_attribute { struct ib_port_attribute attr; char name[8]; int index; __be16 attr_id; }; struct gid_attr_group { struct ib_port *port; struct kobject kobj; struct attribute_group groups[2]; const struct attribute_group *groups_list[3]; struct port_table_attribute attrs_list[]; }; struct ib_port { struct kobject kobj; struct ib_device *ibdev; struct gid_attr_group *gid_attr_group; struct hw_stats_port_data *hw_stats_data; struct attribute_group groups[3]; const struct attribute_group *groups_list[5]; u32 port_num; struct port_table_attribute attrs_list[]; }; struct hw_stats_device_attribute { struct device_attribute attr; ssize_t (*show)(struct ib_device *ibdev, struct rdma_hw_stats *stats, unsigned int index, unsigned int port_num, char *buf); ssize_t (*store)(struct ib_device *ibdev, struct rdma_hw_stats *stats, unsigned int index, unsigned int port_num, const char *buf, size_t count); }; struct hw_stats_port_attribute { struct ib_port_attribute attr; ssize_t (*show)(struct ib_device *ibdev, struct rdma_hw_stats *stats, unsigned int index, unsigned int port_num, char *buf); ssize_t (*store)(struct ib_device *ibdev, struct rdma_hw_stats *stats, unsigned int index, unsigned int port_num, const char *buf, size_t count); }; struct hw_stats_device_data { struct attribute_group group; struct rdma_hw_stats *stats; struct hw_stats_device_attribute attrs[]; }; struct hw_stats_port_data { struct rdma_hw_stats *stats; struct hw_stats_port_attribute attrs[]; }; static ssize_t port_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct ib_port_attribute *port_attr = container_of(attr, struct ib_port_attribute, attr); struct ib_port *p = container_of(kobj, struct ib_port, kobj); if (!port_attr->show) return -EIO; return port_attr->show(p->ibdev, p->port_num, port_attr, buf); } static ssize_t port_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct ib_port_attribute *port_attr = container_of(attr, struct ib_port_attribute, attr); struct ib_port *p = container_of(kobj, struct ib_port, kobj); if (!port_attr->store) return -EIO; return port_attr->store(p->ibdev, p->port_num, port_attr, buf, count); } struct ib_device *ib_port_sysfs_get_ibdev_kobj(struct kobject *kobj, u32 *port_num) { struct ib_port *port = container_of(kobj, struct ib_port, kobj); *port_num = port->port_num; return port->ibdev; } EXPORT_SYMBOL(ib_port_sysfs_get_ibdev_kobj); static const struct sysfs_ops port_sysfs_ops = { .show = port_attr_show, .store = port_attr_store }; static ssize_t hw_stat_device_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hw_stats_device_attribute *stat_attr = container_of(attr, struct hw_stats_device_attribute, attr); struct ib_device *ibdev = container_of(dev, struct ib_device, dev); return stat_attr->show(ibdev, ibdev->hw_stats_data->stats, stat_attr - ibdev->hw_stats_data->attrs, 0, buf); } static ssize_t hw_stat_device_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct hw_stats_device_attribute *stat_attr = container_of(attr, struct hw_stats_device_attribute, attr); struct ib_device *ibdev = container_of(dev, struct ib_device, dev); return stat_attr->store(ibdev, ibdev->hw_stats_data->stats, stat_attr - ibdev->hw_stats_data->attrs, 0, buf, count); } static ssize_t hw_stat_port_show(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *attr, char *buf) { struct hw_stats_port_attribute *stat_attr = container_of(attr, struct hw_stats_port_attribute, attr); struct ib_port *port = ibdev->port_data[port_num].sysfs; return stat_attr->show(ibdev, port->hw_stats_data->stats, stat_attr - port->hw_stats_data->attrs, port->port_num, buf); } static ssize_t hw_stat_port_store(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *attr, const char *buf, size_t count) { struct hw_stats_port_attribute *stat_attr = container_of(attr, struct hw_stats_port_attribute, attr); struct ib_port *port = ibdev->port_data[port_num].sysfs; return stat_attr->store(ibdev, port->hw_stats_data->stats, stat_attr - port->hw_stats_data->attrs, port->port_num, buf, count); } static ssize_t gid_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct ib_port_attribute *port_attr = container_of(attr, struct ib_port_attribute, attr); struct ib_port *p = container_of(kobj, struct gid_attr_group, kobj)->port; if (!port_attr->show) return -EIO; return port_attr->show(p->ibdev, p->port_num, port_attr, buf); } static const struct sysfs_ops gid_attr_sysfs_ops = { .show = gid_attr_show }; static ssize_t state_show(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; static const char *state_name[] = { [IB_PORT_NOP] = "NOP", [IB_PORT_DOWN] = "DOWN", [IB_PORT_INIT] = "INIT", [IB_PORT_ARMED] = "ARMED", [IB_PORT_ACTIVE] = "ACTIVE", [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER" }; ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; return sysfs_emit(buf, "%d: %s\n", attr.state, attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ? state_name[attr.state] : "UNKNOWN"); } static ssize_t lid_show(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; return sysfs_emit(buf, "0x%x\n", attr.lid); } static ssize_t lid_mask_count_show(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; return sysfs_emit(buf, "%u\n", attr.lmc); } static ssize_t sm_lid_show(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; return sysfs_emit(buf, "0x%x\n", attr.sm_lid); } static ssize_t sm_sl_show(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; return sysfs_emit(buf, "%u\n", attr.sm_sl); } static ssize_t cap_mask_show(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; return sysfs_emit(buf, "0x%08x\n", attr.port_cap_flags); } static ssize_t rate_show(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; char *speed = ""; int rate; /* in deci-Gb/sec */ ssize_t ret; ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; switch (attr.active_speed) { case IB_SPEED_DDR: speed = " DDR"; rate = 50; break; case IB_SPEED_QDR: speed = " QDR"; rate = 100; break; case IB_SPEED_FDR10: speed = " FDR10"; rate = 100; break; case IB_SPEED_FDR: speed = " FDR"; rate = 140; break; case IB_SPEED_EDR: speed = " EDR"; rate = 250; break; case IB_SPEED_HDR: speed = " HDR"; rate = 500; break; case IB_SPEED_NDR: speed = " NDR"; rate = 1000; break; case IB_SPEED_XDR: speed = " XDR"; rate = 2000; break; case IB_SPEED_SDR: default: /* default to SDR for invalid rates */ speed = " SDR"; rate = 25; break; } rate *= ib_width_enum_to_int(attr.active_width); if (rate < 0) return -EINVAL; return sysfs_emit(buf, "%d%s Gb/sec (%dX%s)\n", rate / 10, rate % 10 ? ".5" : "", ib_width_enum_to_int(attr.active_width), speed); } static const char *phys_state_to_str(enum ib_port_phys_state phys_state) { static const char *phys_state_str[] = { "<unknown>", "Sleep", "Polling", "Disabled", "PortConfigurationTraining", "LinkUp", "LinkErrorRecovery", "Phy Test", }; if (phys_state < ARRAY_SIZE(phys_state_str)) return phys_state_str[phys_state]; return "<unknown>"; } static ssize_t phys_state_show(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *unused, char *buf) { struct ib_port_attr attr; ssize_t ret; ret = ib_query_port(ibdev, port_num, &attr); if (ret) return ret; return sysfs_emit(buf, "%u: %s\n", attr.phys_state, phys_state_to_str(attr.phys_state)); } static ssize_t link_layer_show(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *unused, char *buf) { const char *output; switch (rdma_port_get_link_layer(ibdev, port_num)) { case IB_LINK_LAYER_INFINIBAND: output = "InfiniBand"; break; case IB_LINK_LAYER_ETHERNET: output = "Ethernet"; break; default: output = "Unknown"; break; } return sysfs_emit(buf, "%s\n", output); } static IB_PORT_ATTR_RO(state); static IB_PORT_ATTR_RO(lid); static IB_PORT_ATTR_RO(lid_mask_count); static IB_PORT_ATTR_RO(sm_lid); static IB_PORT_ATTR_RO(sm_sl); static IB_PORT_ATTR_RO(cap_mask); static IB_PORT_ATTR_RO(rate); static IB_PORT_ATTR_RO(phys_state); static IB_PORT_ATTR_RO(link_layer); static struct attribute *port_default_attrs[] = { &ib_port_attr_state.attr, &ib_port_attr_lid.attr, &ib_port_attr_lid_mask_count.attr, &ib_port_attr_sm_lid.attr, &ib_port_attr_sm_sl.attr, &ib_port_attr_cap_mask.attr, &ib_port_attr_rate.attr, &ib_port_attr_phys_state.attr, &ib_port_attr_link_layer.attr, NULL }; ATTRIBUTE_GROUPS(port_default); static ssize_t print_ndev(const struct ib_gid_attr *gid_attr, char *buf) { struct net_device *ndev; int ret = -EINVAL; rcu_read_lock(); ndev = rcu_dereference(gid_attr->ndev); if (ndev) ret = sysfs_emit(buf, "%s\n", ndev->name); rcu_read_unlock(); return ret; } static ssize_t print_gid_type(const struct ib_gid_attr *gid_attr, char *buf) { return sysfs_emit(buf, "%s\n", ib_cache_gid_type_str(gid_attr->gid_type)); } static ssize_t _show_port_gid_attr( struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *attr, char *buf, ssize_t (*print)(const struct ib_gid_attr *gid_attr, char *buf)) { struct port_table_attribute *tab_attr = container_of(attr, struct port_table_attribute, attr); const struct ib_gid_attr *gid_attr; ssize_t ret; gid_attr = rdma_get_gid_attr(ibdev, port_num, tab_attr->index); if (IS_ERR(gid_attr)) /* -EINVAL is returned for user space compatibility reasons. */ return -EINVAL; ret = print(gid_attr, buf); rdma_put_gid_attr(gid_attr); return ret; } static ssize_t show_port_gid(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *attr, char *buf) { struct port_table_attribute *tab_attr = container_of(attr, struct port_table_attribute, attr); const struct ib_gid_attr *gid_attr; int len; gid_attr = rdma_get_gid_attr(ibdev, port_num, tab_attr->index); if (IS_ERR(gid_attr)) { const union ib_gid zgid = {}; /* If reading GID fails, it is likely due to GID entry being * empty (invalid) or reserved GID in the table. User space * expects to read GID table entries as long as it given index * is within GID table size. Administrative/debugging tool * fails to query rest of the GID entries if it hits error * while querying a GID of the given index. To avoid user * space throwing such error on fail to read gid, return zero * GID as before. This maintains backward compatibility. */ return sysfs_emit(buf, "%pI6\n", zgid.raw); } len = sysfs_emit(buf, "%pI6\n", gid_attr->gid.raw); rdma_put_gid_attr(gid_attr); return len; } static ssize_t show_port_gid_attr_ndev(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *attr, char *buf) { return _show_port_gid_attr(ibdev, port_num, attr, buf, print_ndev); } static ssize_t show_port_gid_attr_gid_type(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *attr, char *buf) { return _show_port_gid_attr(ibdev, port_num, attr, buf, print_gid_type); } static ssize_t show_port_pkey(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *attr, char *buf) { struct port_table_attribute *tab_attr = container_of(attr, struct port_table_attribute, attr); u16 pkey; int ret; ret = ib_query_pkey(ibdev, port_num, tab_attr->index, &pkey); if (ret) return ret; return sysfs_emit(buf, "0x%04x\n", pkey); } #define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ struct port_table_attribute port_pma_attr_##_name = { \ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ .index = (_offset) | ((_width) << 16) | ((_counter) << 24), \ .attr_id = IB_PMA_PORT_COUNTERS, \ } #define PORT_PMA_ATTR_EXT(_name, _width, _offset) \ struct port_table_attribute port_pma_attr_ext_##_name = { \ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ .index = (_offset) | ((_width) << 16), \ .attr_id = IB_PMA_PORT_COUNTERS_EXT, \ } /* * Get a Perfmgmt MAD block of data. * Returns error code or the number of bytes retrieved. */ static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr, void *data, int offset, size_t size) { struct ib_mad *in_mad; struct ib_mad *out_mad; size_t mad_size = sizeof(*out_mad); u16 out_mad_pkey_index = 0; ssize_t ret; if (!dev->ops.process_mad) return -ENOSYS; in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL); out_mad = kzalloc(sizeof(*out_mad), GFP_KERNEL); if (!in_mad || !out_mad) { ret = -ENOMEM; goto out; } in_mad->mad_hdr.base_version = 1; in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT; in_mad->mad_hdr.class_version = 1; in_mad->mad_hdr.method = IB_MGMT_METHOD_GET; in_mad->mad_hdr.attr_id = attr; if (attr != IB_PMA_CLASS_PORT_INFO) in_mad->data[41] = port_num; /* PortSelect field */ if ((dev->ops.process_mad(dev, IB_MAD_IGNORE_MKEY, port_num, NULL, NULL, in_mad, out_mad, &mad_size, &out_mad_pkey_index) & (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) != (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) { ret = -EINVAL; goto out; } memcpy(data, out_mad->data + offset, size); ret = size; out: kfree(in_mad); kfree(out_mad); return ret; } static ssize_t show_pma_counter(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *attr, char *buf) { struct port_table_attribute *tab_attr = container_of(attr, struct port_table_attribute, attr); int offset = tab_attr->index & 0xffff; int width = (tab_attr->index >> 16) & 0xff; int ret; u8 data[8]; int len; ret = get_perf_mad(ibdev, port_num, tab_attr->attr_id, &data, 40 + offset / 8, sizeof(data)); if (ret < 0) return ret; switch (width) { case 4: len = sysfs_emit(buf, "%d\n", (*data >> (4 - (offset % 8))) & 0xf); break; case 8: len = sysfs_emit(buf, "%u\n", *data); break; case 16: len = sysfs_emit(buf, "%u\n", be16_to_cpup((__be16 *)data)); break; case 32: len = sysfs_emit(buf, "%u\n", be32_to_cpup((__be32 *)data)); break; case 64: len = sysfs_emit(buf, "%llu\n", be64_to_cpup((__be64 *)data)); break; default: len = 0; break; } return len; } static PORT_PMA_ATTR(symbol_error , 0, 16, 32); static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48); static PORT_PMA_ATTR(link_downed , 2, 8, 56); static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64); static PORT_PMA_ATTR(port_rcv_remote_physical_errors, 4, 16, 80); static PORT_PMA_ATTR(port_rcv_switch_relay_errors , 5, 16, 96); static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112); static PORT_PMA_ATTR(port_xmit_constraint_errors , 7, 8, 128); static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136); static PORT_PMA_ATTR(local_link_integrity_errors , 9, 4, 152); static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10, 4, 156); static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176); static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192); static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); static PORT_PMA_ATTR(port_xmit_wait , 0, 32, 320); /* * Counters added by extended set */ static PORT_PMA_ATTR_EXT(port_xmit_data , 64, 64); static PORT_PMA_ATTR_EXT(port_rcv_data , 64, 128); static PORT_PMA_ATTR_EXT(port_xmit_packets , 64, 192); static PORT_PMA_ATTR_EXT(port_rcv_packets , 64, 256); static PORT_PMA_ATTR_EXT(unicast_xmit_packets , 64, 320); static PORT_PMA_ATTR_EXT(unicast_rcv_packets , 64, 384); static PORT_PMA_ATTR_EXT(multicast_xmit_packets , 64, 448); static PORT_PMA_ATTR_EXT(multicast_rcv_packets , 64, 512); static struct attribute *pma_attrs[] = { &port_pma_attr_symbol_error.attr.attr, &port_pma_attr_link_error_recovery.attr.attr, &port_pma_attr_link_downed.attr.attr, &port_pma_attr_port_rcv_errors.attr.attr, &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, &port_pma_attr_port_xmit_discards.attr.attr, &port_pma_attr_port_xmit_constraint_errors.attr.attr, &port_pma_attr_port_rcv_constraint_errors.attr.attr, &port_pma_attr_local_link_integrity_errors.attr.attr, &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, &port_pma_attr_VL15_dropped.attr.attr, &port_pma_attr_port_xmit_data.attr.attr, &port_pma_attr_port_rcv_data.attr.attr, &port_pma_attr_port_xmit_packets.attr.attr, &port_pma_attr_port_rcv_packets.attr.attr, &port_pma_attr_port_xmit_wait.attr.attr, NULL }; static struct attribute *pma_attrs_ext[] = { &port_pma_attr_symbol_error.attr.attr, &port_pma_attr_link_error_recovery.attr.attr, &port_pma_attr_link_downed.attr.attr, &port_pma_attr_port_rcv_errors.attr.attr, &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, &port_pma_attr_port_xmit_discards.attr.attr, &port_pma_attr_port_xmit_constraint_errors.attr.attr, &port_pma_attr_port_rcv_constraint_errors.attr.attr, &port_pma_attr_local_link_integrity_errors.attr.attr, &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, &port_pma_attr_VL15_dropped.attr.attr, &port_pma_attr_ext_port_xmit_data.attr.attr, &port_pma_attr_ext_port_rcv_data.attr.attr, &port_pma_attr_ext_port_xmit_packets.attr.attr, &port_pma_attr_port_xmit_wait.attr.attr, &port_pma_attr_ext_port_rcv_packets.attr.attr, &port_pma_attr_ext_unicast_rcv_packets.attr.attr, &port_pma_attr_ext_unicast_xmit_packets.attr.attr, &port_pma_attr_ext_multicast_rcv_packets.attr.attr, &port_pma_attr_ext_multicast_xmit_packets.attr.attr, NULL }; static struct attribute *pma_attrs_noietf[] = { &port_pma_attr_symbol_error.attr.attr, &port_pma_attr_link_error_recovery.attr.attr, &port_pma_attr_link_downed.attr.attr, &port_pma_attr_port_rcv_errors.attr.attr, &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, &port_pma_attr_port_xmit_discards.attr.attr, &port_pma_attr_port_xmit_constraint_errors.attr.attr, &port_pma_attr_port_rcv_constraint_errors.attr.attr, &port_pma_attr_local_link_integrity_errors.attr.attr, &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, &port_pma_attr_VL15_dropped.attr.attr, &port_pma_attr_ext_port_xmit_data.attr.attr, &port_pma_attr_ext_port_rcv_data.attr.attr, &port_pma_attr_ext_port_xmit_packets.attr.attr, &port_pma_attr_ext_port_rcv_packets.attr.attr, &port_pma_attr_port_xmit_wait.attr.attr, NULL }; static const struct attribute_group pma_group = { .name = "counters", .attrs = pma_attrs }; static const struct attribute_group pma_group_ext = { .name = "counters", .attrs = pma_attrs_ext }; static const struct attribute_group pma_group_noietf = { .name = "counters", .attrs = pma_attrs_noietf }; static void ib_port_release(struct kobject *kobj) { struct ib_port *port = container_of(kobj, struct ib_port, kobj); int i; for (i = 0; i != ARRAY_SIZE(port->groups); i++) kfree(port->groups[i].attrs); if (port->hw_stats_data) rdma_free_hw_stats_struct(port->hw_stats_data->stats); kfree(port->hw_stats_data); kvfree(port); } static void ib_port_gid_attr_release(struct kobject *kobj) { struct gid_attr_group *gid_attr_group = container_of(kobj, struct gid_attr_group, kobj); int i; for (i = 0; i != ARRAY_SIZE(gid_attr_group->groups); i++) kfree(gid_attr_group->groups[i].attrs); kfree(gid_attr_group); } static struct kobj_type port_type = { .release = ib_port_release, .sysfs_ops = &port_sysfs_ops, .default_groups = port_default_groups, }; static struct kobj_type gid_attr_type = { .sysfs_ops = &gid_attr_sysfs_ops, .release = ib_port_gid_attr_release }; /* * Figure out which counter table to use depending on * the device capabilities. */ static const struct attribute_group *get_counter_table(struct ib_device *dev, int port_num) { struct ib_class_port_info cpi; if (get_perf_mad(dev, port_num, IB_PMA_CLASS_PORT_INFO, &cpi, 40, sizeof(cpi)) >= 0) { if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH) /* We have extended counters */ return &pma_group_ext; if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF) /* But not the IETF ones */ return &pma_group_noietf; } /* Fall back to normal counters */ return &pma_group; } static int update_hw_stats(struct ib_device *dev, struct rdma_hw_stats *stats, u32 port_num, int index) { int ret; if (time_is_after_eq_jiffies(stats->timestamp + stats->lifespan)) return 0; ret = dev->ops.get_hw_stats(dev, stats, port_num, index); if (ret < 0) return ret; if (ret == stats->num_counters) stats->timestamp = jiffies; return 0; } static int print_hw_stat(struct ib_device *dev, int port_num, struct rdma_hw_stats *stats, int index, char *buf) { u64 v = rdma_counter_get_hwstat_value(dev, port_num, index); return sysfs_emit(buf, "%llu\n", stats->value[index] + v); } static ssize_t show_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats, unsigned int index, unsigned int port_num, char *buf) { int ret; mutex_lock(&stats->lock); ret = update_hw_stats(ibdev, stats, port_num, index); if (ret) goto unlock; ret = print_hw_stat(ibdev, port_num, stats, index, buf); unlock: mutex_unlock(&stats->lock); return ret; } static ssize_t show_stats_lifespan(struct ib_device *ibdev, struct rdma_hw_stats *stats, unsigned int index, unsigned int port_num, char *buf) { int msecs; mutex_lock(&stats->lock); msecs = jiffies_to_msecs(stats->lifespan); mutex_unlock(&stats->lock); return sysfs_emit(buf, "%d\n", msecs); } static ssize_t set_stats_lifespan(struct ib_device *ibdev, struct rdma_hw_stats *stats, unsigned int index, unsigned int port_num, const char *buf, size_t count) { int msecs; int jiffies; int ret; ret = kstrtoint(buf, 10, &msecs); if (ret) return ret; if (msecs < 0 || msecs > 10000) return -EINVAL; jiffies = msecs_to_jiffies(msecs); mutex_lock(&stats->lock); stats->lifespan = jiffies; mutex_unlock(&stats->lock); return count; } static struct hw_stats_device_data * alloc_hw_stats_device(struct ib_device *ibdev) { struct hw_stats_device_data *data; struct rdma_hw_stats *stats; if (!ibdev->ops.alloc_hw_device_stats) return ERR_PTR(-EOPNOTSUPP); stats = ibdev->ops.alloc_hw_device_stats(ibdev); if (!stats) return ERR_PTR(-ENOMEM); if (!stats->descs || stats->num_counters <= 0) goto err_free_stats; /* * Two extra attribue elements here, one for the lifespan entry and * one to NULL terminate the list for the sysfs core code */ data = kzalloc(struct_size(data, attrs, size_add(stats->num_counters, 1)), GFP_KERNEL); if (!data) goto err_free_stats; data->group.attrs = kcalloc(stats->num_counters + 2, sizeof(*data->group.attrs), GFP_KERNEL); if (!data->group.attrs) goto err_free_data; data->group.name = "hw_counters"; data->stats = stats; return data; err_free_data: kfree(data); err_free_stats: rdma_free_hw_stats_struct(stats); return ERR_PTR(-ENOMEM); } void ib_device_release_hw_stats(struct hw_stats_device_data *data) { kfree(data->group.attrs); rdma_free_hw_stats_struct(data->stats); kfree(data); } int ib_setup_device_attrs(struct ib_device *ibdev) { struct hw_stats_device_attribute *attr; struct hw_stats_device_data *data; bool opstat_skipped = false; int i, ret, pos = 0; data = alloc_hw_stats_device(ibdev); if (IS_ERR(data)) { if (PTR_ERR(data) == -EOPNOTSUPP) return 0; return PTR_ERR(data); } ibdev->hw_stats_data = data; ret = ibdev->ops.get_hw_stats(ibdev, data->stats, 0, data->stats->num_counters); if (ret != data->stats->num_counters) { if (WARN_ON(ret >= 0)) return -EINVAL; return ret; } data->stats->timestamp = jiffies; for (i = 0; i < data->stats->num_counters; i++) { if (data->stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) { opstat_skipped = true; continue; } WARN_ON(opstat_skipped); attr = &data->attrs[pos]; sysfs_attr_init(&attr->attr.attr); attr->attr.attr.name = data->stats->descs[i].name; attr->attr.attr.mode = 0444; attr->attr.show = hw_stat_device_show; attr->show = show_hw_stats; data->group.attrs[pos] = &attr->attr.attr; pos++; } attr = &data->attrs[pos]; sysfs_attr_init(&attr->attr.attr); attr->attr.attr.name = "lifespan"; attr->attr.attr.mode = 0644; attr->attr.show = hw_stat_device_show; attr->show = show_stats_lifespan; attr->attr.store = hw_stat_device_store; attr->store = set_stats_lifespan; data->group.attrs[pos] = &attr->attr.attr; for (i = 0; i != ARRAY_SIZE(ibdev->groups); i++) if (!ibdev->groups[i]) { ibdev->groups[i] = &data->group; return 0; } WARN(true, "struct ib_device->groups is too small"); return -EINVAL; } static struct hw_stats_port_data * alloc_hw_stats_port(struct ib_port *port, struct attribute_group *group) { struct ib_device *ibdev = port->ibdev; struct hw_stats_port_data *data; struct rdma_hw_stats *stats; if (!ibdev->ops.alloc_hw_port_stats) return ERR_PTR(-EOPNOTSUPP); stats = ibdev->ops.alloc_hw_port_stats(port->ibdev, port->port_num); if (!stats) return ERR_PTR(-ENOMEM); if (!stats->descs || stats->num_counters <= 0) goto err_free_stats; /* * Two extra attribue elements here, one for the lifespan entry and * one to NULL terminate the list for the sysfs core code */ data = kzalloc(struct_size(data, attrs, size_add(stats->num_counters, 1)), GFP_KERNEL); if (!data) goto err_free_stats; group->attrs = kcalloc(stats->num_counters + 2, sizeof(*group->attrs), GFP_KERNEL); if (!group->attrs) goto err_free_data; group->name = "hw_counters"; data->stats = stats; return data; err_free_data: kfree(data); err_free_stats: rdma_free_hw_stats_struct(stats); return ERR_PTR(-ENOMEM); } static int setup_hw_port_stats(struct ib_port *port, struct attribute_group *group) { struct hw_stats_port_attribute *attr; struct hw_stats_port_data *data; bool opstat_skipped = false; int i, ret, pos = 0; data = alloc_hw_stats_port(port, group); if (IS_ERR(data)) return PTR_ERR(data); ret = port->ibdev->ops.get_hw_stats(port->ibdev, data->stats, port->port_num, data->stats->num_counters); if (ret != data->stats->num_counters) { if (WARN_ON(ret >= 0)) return -EINVAL; return ret; } data->stats->timestamp = jiffies; for (i = 0; i < data->stats->num_counters; i++) { if (data->stats->descs[i].flags & IB_STAT_FLAG_OPTIONAL) { opstat_skipped = true; continue; } WARN_ON(opstat_skipped); attr = &data->attrs[pos]; sysfs_attr_init(&attr->attr.attr); attr->attr.attr.name = data->stats->descs[i].name; attr->attr.attr.mode = 0444; attr->attr.show = hw_stat_port_show; attr->show = show_hw_stats; group->attrs[pos] = &attr->attr.attr; pos++; } attr = &data->attrs[pos]; sysfs_attr_init(&attr->attr.attr); attr->attr.attr.name = "lifespan"; attr->attr.attr.mode = 0644; attr->attr.show = hw_stat_port_show; attr->show = show_stats_lifespan; attr->attr.store = hw_stat_port_store; attr->store = set_stats_lifespan; group->attrs[pos] = &attr->attr.attr; port->hw_stats_data = data; return 0; } struct rdma_hw_stats *ib_get_hw_stats_port(struct ib_device *ibdev, u32 port_num) { if (!ibdev->port_data || !rdma_is_port_valid(ibdev, port_num) || !ibdev->port_data[port_num].sysfs->hw_stats_data) return NULL; return ibdev->port_data[port_num].sysfs->hw_stats_data->stats; } static int alloc_port_table_group(const char *name, struct attribute_group *group, struct port_table_attribute *attrs, size_t num, ssize_t (*show)(struct ib_device *ibdev, u32 port_num, struct ib_port_attribute *, char *buf)) { struct attribute **attr_list; int i; attr_list = kcalloc(num + 1, sizeof(*attr_list), GFP_KERNEL); if (!attr_list) return -ENOMEM; for (i = 0; i < num; i++) { struct port_table_attribute *element = &attrs[i]; if (snprintf(element->name, sizeof(element->name), "%d", i) >= sizeof(element->name)) goto err; sysfs_attr_init(&element->attr.attr); element->attr.attr.name = element->name; element->attr.attr.mode = 0444; element->attr.show = show; element->index = i; attr_list[i] = &element->attr.attr; } group->name = name; group->attrs = attr_list; return 0; err: kfree(attr_list); return -EINVAL; } /* * Create the sysfs: * ibp0s9/ports/XX/gid_attrs/{ndevs,types}/YYY * YYY is the gid table index in decimal */ static int setup_gid_attrs(struct ib_port *port, const struct ib_port_attr *attr) { struct gid_attr_group *gid_attr_group; int ret; gid_attr_group = kzalloc(struct_size(gid_attr_group, attrs_list, size_mul(attr->gid_tbl_len, 2)), GFP_KERNEL); if (!gid_attr_group) return -ENOMEM; gid_attr_group->port = port; kobject_init(&gid_attr_group->kobj, &gid_attr_type); ret = alloc_port_table_group("ndevs", &gid_attr_group->groups[0], gid_attr_group->attrs_list, attr->gid_tbl_len, show_port_gid_attr_ndev); if (ret) goto err_put; gid_attr_group->groups_list[0] = &gid_attr_group->groups[0]; ret = alloc_port_table_group( "types", &gid_attr_group->groups[1], gid_attr_group->attrs_list + attr->gid_tbl_len, attr->gid_tbl_len, show_port_gid_attr_gid_type); if (ret) goto err_put; gid_attr_group->groups_list[1] = &gid_attr_group->groups[1]; ret = kobject_add(&gid_attr_group->kobj, &port->kobj, "gid_attrs"); if (ret) goto err_put; ret = sysfs_create_groups(&gid_attr_group->kobj, gid_attr_group->groups_list); if (ret) goto err_del; port->gid_attr_group = gid_attr_group; return 0; err_del: kobject_del(&gid_attr_group->kobj); err_put: kobject_put(&gid_attr_group->kobj); return ret; } static void destroy_gid_attrs(struct ib_port *port) { struct gid_attr_group *gid_attr_group = port->gid_attr_group; if (!gid_attr_group) return; sysfs_remove_groups(&gid_attr_group->kobj, gid_attr_group->groups_list); kobject_del(&gid_attr_group->kobj); kobject_put(&gid_attr_group->kobj); } /* * Create the sysfs: * ibp0s9/ports/XX/{gids,pkeys,counters}/YYY */ static struct ib_port *setup_port(struct ib_core_device *coredev, int port_num, const struct ib_port_attr *attr) { struct ib_device *device = rdma_device_to_ibdev(&coredev->dev); bool is_full_dev = &device->coredev == coredev; const struct attribute_group **cur_group; struct ib_port *p; int ret; p = kvzalloc(struct_size(p, attrs_list, size_add(attr->gid_tbl_len, attr->pkey_tbl_len)), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); p->ibdev = device; p->port_num = port_num; kobject_init(&p->kobj, &port_type); if (device->port_data && is_full_dev) device->port_data[port_num].sysfs = p; cur_group = p->groups_list; ret = alloc_port_table_group("gids", &p->groups[0], p->attrs_list, attr->gid_tbl_len, show_port_gid); if (ret) goto err_put; *cur_group++ = &p->groups[0]; if (attr->pkey_tbl_len) { ret = alloc_port_table_group("pkeys", &p->groups[1], p->attrs_list + attr->gid_tbl_len, attr->pkey_tbl_len, show_port_pkey); if (ret) goto err_put; *cur_group++ = &p->groups[1]; } /* * If port == 0, it means hw_counters are per device and not per * port, so holder should be device. Therefore skip per port * counter initialization. */ if (port_num && is_full_dev) { ret = setup_hw_port_stats(p, &p->groups[2]); if (ret && ret != -EOPNOTSUPP) goto err_put; if (!ret) *cur_group++ = &p->groups[2]; } if (device->ops.process_mad && is_full_dev) *cur_group++ = get_counter_table(device, port_num); ret = kobject_add(&p->kobj, coredev->ports_kobj, "%d", port_num); if (ret) goto err_put; ret = sysfs_create_groups(&p->kobj, p->groups_list); if (ret) goto err_del; if (is_full_dev) { ret = sysfs_create_groups(&p->kobj, device->ops.port_groups); if (ret) goto err_groups; } list_add_tail(&p->kobj.entry, &coredev->port_list); return p; err_groups: sysfs_remove_groups(&p->kobj, p->groups_list); err_del: kobject_del(&p->kobj); err_put: if (device->port_data && is_full_dev) device->port_data[port_num].sysfs = NULL; kobject_put(&p->kobj); return ERR_PTR(ret); } static void destroy_port(struct ib_core_device *coredev, struct ib_port *port) { bool is_full_dev = &port->ibdev->coredev == coredev; list_del(&port->kobj.entry); if (is_full_dev) sysfs_remove_groups(&port->kobj, port->ibdev->ops.port_groups); sysfs_remove_groups(&port->kobj, port->groups_list); kobject_del(&port->kobj); if (port->ibdev->port_data && port->ibdev->port_data[port->port_num].sysfs == port) port->ibdev->port_data[port->port_num].sysfs = NULL; kobject_put(&port->kobj); } static const char *node_type_string(int node_type) { switch (node_type) { case RDMA_NODE_IB_CA: return "CA"; case RDMA_NODE_IB_SWITCH: return "switch"; case RDMA_NODE_IB_ROUTER: return "router"; case RDMA_NODE_RNIC: return "RNIC"; case RDMA_NODE_USNIC: return "usNIC"; case RDMA_NODE_USNIC_UDP: return "usNIC UDP"; case RDMA_NODE_UNSPECIFIED: return "unspecified"; } return "<unknown>"; } static ssize_t node_type_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = rdma_device_to_ibdev(device); return sysfs_emit(buf, "%u: %s\n", dev->node_type, node_type_string(dev->node_type)); } static DEVICE_ATTR_RO(node_type); static ssize_t sys_image_guid_show(struct device *device, struct device_attribute *dev_attr, char *buf) { struct ib_device *dev = rdma_device_to_ibdev(device); __be16 *guid = (__be16 *)&dev->attrs.sys_image_guid; return sysfs_emit(buf, "%04x:%04x:%04x:%04x\n", be16_to_cpu(guid[0]), be16_to_cpu(guid[1]), be16_to_cpu(guid[2]), be16_to_cpu(guid[3])); } static DEVICE_ATTR_RO(sys_image_guid); static ssize_t node_guid_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = rdma_device_to_ibdev(device); __be16 *node_guid = (__be16 *)&dev->node_guid; return sysfs_emit(buf, "%04x:%04x:%04x:%04x\n", be16_to_cpu(node_guid[0]), be16_to_cpu(node_guid[1]), be16_to_cpu(node_guid[2]), be16_to_cpu(node_guid[3])); } static DEVICE_ATTR_RO(node_guid); static ssize_t node_desc_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = rdma_device_to_ibdev(device); return sysfs_emit(buf, "%.64s\n", dev->node_desc); } static ssize_t node_desc_store(struct device *device, struct device_attribute *attr, const char *buf, size_t count) { struct ib_device *dev = rdma_device_to_ibdev(device); struct ib_device_modify desc = {}; int ret; if (!dev->ops.modify_device) return -EOPNOTSUPP; memcpy(desc.node_desc, buf, min_t(int, count, IB_DEVICE_NODE_DESC_MAX)); ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc); if (ret) return ret; return count; } static DEVICE_ATTR_RW(node_desc); static ssize_t fw_ver_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = rdma_device_to_ibdev(device); char version[IB_FW_VERSION_NAME_MAX] = {}; ib_get_device_fw_str(dev, version); return sysfs_emit(buf, "%s\n", version); } static DEVICE_ATTR_RO(fw_ver); static struct attribute *ib_dev_attrs[] = { &dev_attr_node_type.attr, &dev_attr_node_guid.attr, &dev_attr_sys_image_guid.attr, &dev_attr_fw_ver.attr, &dev_attr_node_desc.attr, NULL, }; const struct attribute_group ib_dev_attr_group = { .attrs = ib_dev_attrs, }; void ib_free_port_attrs(struct ib_core_device *coredev) { struct kobject *p, *t; list_for_each_entry_safe(p, t, &coredev->port_list, entry) { struct ib_port *port = container_of(p, struct ib_port, kobj); destroy_gid_attrs(port); destroy_port(coredev, port); } kobject_put(coredev->ports_kobj); } int ib_setup_port_attrs(struct ib_core_device *coredev) { struct ib_device *device = rdma_device_to_ibdev(&coredev->dev); u32 port_num; int ret; coredev->ports_kobj = kobject_create_and_add("ports", &coredev->dev.kobj); if (!coredev->ports_kobj) return -ENOMEM; rdma_for_each_port (device, port_num) { struct ib_port_attr attr; struct ib_port *port; ret = ib_query_port(device, port_num, &attr); if (ret) goto err_put; port = setup_port(coredev, port_num, &attr); if (IS_ERR(port)) { ret = PTR_ERR(port); goto err_put; } ret = setup_gid_attrs(port, &attr); if (ret) goto err_put; } return 0; err_put: ib_free_port_attrs(coredev); return ret; } /** * ib_port_register_client_groups - Add an ib_client's attributes to the port * * @ibdev: IB device to add counters * @port_num: valid port number * @groups: Group list of attributes * * Do not use. Only for legacy sysfs compatibility. */ int ib_port_register_client_groups(struct ib_device *ibdev, u32 port_num, const struct attribute_group **groups) { return sysfs_create_groups(&ibdev->port_data[port_num].sysfs->kobj, groups); } EXPORT_SYMBOL(ib_port_register_client_groups); void ib_port_unregister_client_groups(struct ib_device *ibdev, u32 port_num, const struct attribute_group **groups) { return sysfs_remove_groups(&ibdev->port_data[port_num].sysfs->kobj, groups); } EXPORT_SYMBOL(ib_port_unregister_client_groups); |
251 33 476 2 529 890 895 1084 1072 25 1084 1084 1069 25 934 534 1069 17 31 17 17 556 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 | // SPDX-License-Identifier: GPL-2.0-or-later /* * fs/inotify_user.c - inotify support for userspace * * Authors: * John McCutchan <ttb@tentacle.dhs.org> * Robert Love <rml@novell.com> * * Copyright (C) 2005 John McCutchan * Copyright 2006 Hewlett-Packard Development Company, L.P. * * Copyright (C) 2009 Eric Paris <Red Hat Inc> * inotify was largely rewriten to make use of the fsnotify infrastructure */ #include <linux/dcache.h> /* d_unlinked */ #include <linux/fs.h> /* struct inode */ #include <linux/fsnotify_backend.h> #include <linux/inotify.h> #include <linux/path.h> /* struct path */ #include <linux/slab.h> /* kmem_* */ #include <linux/types.h> #include <linux/sched.h> #include <linux/sched/user.h> #include <linux/sched/mm.h> #include "inotify.h" /* * Check if 2 events contain the same information. */ static bool event_compare(struct fsnotify_event *old_fsn, struct fsnotify_event *new_fsn) { struct inotify_event_info *old, *new; old = INOTIFY_E(old_fsn); new = INOTIFY_E(new_fsn); if (old->mask & FS_IN_IGNORED) return false; if ((old->mask == new->mask) && (old->wd == new->wd) && (old->name_len == new->name_len) && (!old->name_len || !strcmp(old->name, new->name))) return true; return false; } static int inotify_merge(struct fsnotify_group *group, struct fsnotify_event *event) { struct list_head *list = &group->notification_list; struct fsnotify_event *last_event; last_event = list_entry(list->prev, struct fsnotify_event, list); return event_compare(last_event, event); } int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *name, u32 cookie) { struct inotify_inode_mark *i_mark; struct inotify_event_info *event; struct fsnotify_event *fsn_event; struct fsnotify_group *group = inode_mark->group; int ret; int len = 0, wd; int alloc_len = sizeof(struct inotify_event_info); struct mem_cgroup *old_memcg; if (name) { len = name->len; alloc_len += len + 1; } pr_debug("%s: group=%p mark=%p mask=%x\n", __func__, group, inode_mark, mask); i_mark = container_of(inode_mark, struct inotify_inode_mark, fsn_mark); /* * We can be racing with mark being detached. Don't report event with * invalid wd. */ wd = READ_ONCE(i_mark->wd); if (wd == -1) return 0; /* * Whoever is interested in the event, pays for the allocation. Do not * trigger OOM killer in the target monitoring memcg as it may have * security repercussion. */ old_memcg = set_active_memcg(group->memcg); event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); set_active_memcg(old_memcg); if (unlikely(!event)) { /* * Treat lost event due to ENOMEM the same way as queue * overflow to let userspace know event was lost. */ fsnotify_queue_overflow(group); return -ENOMEM; } /* * We now report FS_ISDIR flag with MOVE_SELF and DELETE_SELF events * for fanotify. inotify never reported IN_ISDIR with those events. * It looks like an oversight, but to avoid the risk of breaking * existing inotify programs, mask the flag out from those events. */ if (mask & (IN_MOVE_SELF | IN_DELETE_SELF)) mask &= ~IN_ISDIR; fsn_event = &event->fse; fsnotify_init_event(fsn_event); event->mask = mask; event->wd = wd; event->sync_cookie = cookie; event->name_len = len; if (len) strcpy(event->name, name->name); ret = fsnotify_add_event(group, fsn_event, inotify_merge); if (ret) { /* Our event wasn't used in the end. Free it. */ fsnotify_destroy_event(group, fsn_event); } if (inode_mark->flags & FSNOTIFY_MARK_FLAG_IN_ONESHOT) fsnotify_destroy_mark(inode_mark, group); return 0; } static void inotify_freeing_mark(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group) { inotify_ignored_and_remove_idr(fsn_mark, group); } /* * This is NEVER supposed to be called. Inotify marks should either have been * removed from the idr when the watch was removed or in the * fsnotify_destroy_mark_by_group() call when the inotify instance was being * torn down. This is only called if the idr is about to be freed but there * are still marks in it. */ static int idr_callback(int id, void *p, void *data) { struct fsnotify_mark *fsn_mark; struct inotify_inode_mark *i_mark; static bool warned = false; if (warned) return 0; warned = true; fsn_mark = p; i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); WARN(1, "inotify closing but id=%d for fsn_mark=%p in group=%p still in " "idr. Probably leaking memory\n", id, p, data); /* * I'm taking the liberty of assuming that the mark in question is a * valid address and I'm dereferencing it. This might help to figure * out why we got here and the panic is no worse than the original * BUG() that was here. */ if (fsn_mark) printk(KERN_WARNING "fsn_mark->group=%p wd=%d\n", fsn_mark->group, i_mark->wd); return 0; } static void inotify_free_group_priv(struct fsnotify_group *group) { /* ideally the idr is empty and we won't hit the BUG in the callback */ idr_for_each(&group->inotify_data.idr, idr_callback, group); idr_destroy(&group->inotify_data.idr); if (group->inotify_data.ucounts) dec_inotify_instances(group->inotify_data.ucounts); } static void inotify_free_event(struct fsnotify_group *group, struct fsnotify_event *fsn_event) { kfree(INOTIFY_E(fsn_event)); } /* ding dong the mark is dead */ static void inotify_free_mark(struct fsnotify_mark *fsn_mark) { struct inotify_inode_mark *i_mark; i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); kmem_cache_free(inotify_inode_mark_cachep, i_mark); } const struct fsnotify_ops inotify_fsnotify_ops = { .handle_inode_event = inotify_handle_inode_event, .free_group_priv = inotify_free_group_priv, .free_event = inotify_free_event, .freeing_mark = inotify_freeing_mark, .free_mark = inotify_free_mark, }; |
3 29 2091 2079 2091 2 3 12 2 2 6 4 1 1 10 1019 1128 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 | // SPDX-License-Identifier: GPL-2.0-only /* * Landlock LSM - Network management and hooks * * Copyright © 2022-2023 Huawei Tech. Co., Ltd. * Copyright © 2022-2023 Microsoft Corporation */ #include <linux/in.h> #include <linux/net.h> #include <linux/socket.h> #include <net/ipv6.h> #include "common.h" #include "cred.h" #include "limits.h" #include "net.h" #include "ruleset.h" int landlock_append_net_rule(struct landlock_ruleset *const ruleset, const u16 port, access_mask_t access_rights) { int err; const struct landlock_id id = { .key.data = (__force uintptr_t)htons(port), .type = LANDLOCK_KEY_NET_PORT, }; BUILD_BUG_ON(sizeof(port) > sizeof(id.key.data)); /* Transforms relative access rights to absolute ones. */ access_rights |= LANDLOCK_MASK_ACCESS_NET & ~landlock_get_net_access_mask(ruleset, 0); mutex_lock(&ruleset->lock); err = landlock_insert_rule(ruleset, id, access_rights); mutex_unlock(&ruleset->lock); return err; } static access_mask_t get_raw_handled_net_accesses(const struct landlock_ruleset *const domain) { access_mask_t access_dom = 0; size_t layer_level; for (layer_level = 0; layer_level < domain->num_layers; layer_level++) access_dom |= landlock_get_net_access_mask(domain, layer_level); return access_dom; } static const struct landlock_ruleset *get_current_net_domain(void) { const struct landlock_ruleset *const dom = landlock_get_current_domain(); if (!dom || !get_raw_handled_net_accesses(dom)) return NULL; return dom; } static int current_check_access_socket(struct socket *const sock, struct sockaddr *const address, const int addrlen, access_mask_t access_request) { __be16 port; layer_mask_t layer_masks[LANDLOCK_NUM_ACCESS_NET] = {}; const struct landlock_rule *rule; struct landlock_id id = { .type = LANDLOCK_KEY_NET_PORT, }; const struct landlock_ruleset *const dom = get_current_net_domain(); if (!dom) return 0; if (WARN_ON_ONCE(dom->num_layers < 1)) return -EACCES; /* Checks if it's a (potential) TCP socket. */ if (sock->type != SOCK_STREAM) return 0; /* Checks for minimal header length to safely read sa_family. */ if (addrlen < offsetofend(typeof(*address), sa_family)) return -EINVAL; switch (address->sa_family) { case AF_UNSPEC: case AF_INET: if (addrlen < sizeof(struct sockaddr_in)) return -EINVAL; port = ((struct sockaddr_in *)address)->sin_port; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: if (addrlen < SIN6_LEN_RFC2133) return -EINVAL; port = ((struct sockaddr_in6 *)address)->sin6_port; break; #endif /* IS_ENABLED(CONFIG_IPV6) */ default: return 0; } /* Specific AF_UNSPEC handling. */ if (address->sa_family == AF_UNSPEC) { /* * Connecting to an address with AF_UNSPEC dissolves the TCP * association, which have the same effect as closing the * connection while retaining the socket object (i.e., the file * descriptor). As for dropping privileges, closing * connections is always allowed. * * For a TCP access control system, this request is legitimate. * Let the network stack handle potential inconsistencies and * return -EINVAL if needed. */ if (access_request == LANDLOCK_ACCESS_NET_CONNECT_TCP) return 0; /* * For compatibility reason, accept AF_UNSPEC for bind * accesses (mapped to AF_INET) only if the address is * INADDR_ANY (cf. __inet_bind). Checking the address is * required to not wrongfully return -EACCES instead of * -EAFNOSUPPORT. * * We could return 0 and let the network stack handle these * checks, but it is safer to return a proper error and test * consistency thanks to kselftest. */ if (access_request == LANDLOCK_ACCESS_NET_BIND_TCP) { /* addrlen has already been checked for AF_UNSPEC. */ const struct sockaddr_in *const sockaddr = (struct sockaddr_in *)address; if (sock->sk->__sk_common.skc_family != AF_INET) return -EINVAL; if (sockaddr->sin_addr.s_addr != htonl(INADDR_ANY)) return -EAFNOSUPPORT; } } else { /* * Checks sa_family consistency to not wrongfully return * -EACCES instead of -EINVAL. Valid sa_family changes are * only (from AF_INET or AF_INET6) to AF_UNSPEC. * * We could return 0 and let the network stack handle this * check, but it is safer to return a proper error and test * consistency thanks to kselftest. */ if (address->sa_family != sock->sk->__sk_common.skc_family) return -EINVAL; } id.key.data = (__force uintptr_t)port; BUILD_BUG_ON(sizeof(port) > sizeof(id.key.data)); rule = landlock_find_rule(dom, id); access_request = landlock_init_layer_masks( dom, access_request, &layer_masks, LANDLOCK_KEY_NET_PORT); if (landlock_unmask_layers(rule, access_request, &layer_masks, ARRAY_SIZE(layer_masks))) return 0; return -EACCES; } static int hook_socket_bind(struct socket *const sock, struct sockaddr *const address, const int addrlen) { return current_check_access_socket(sock, address, addrlen, LANDLOCK_ACCESS_NET_BIND_TCP); } static int hook_socket_connect(struct socket *const sock, struct sockaddr *const address, const int addrlen) { return current_check_access_socket(sock, address, addrlen, LANDLOCK_ACCESS_NET_CONNECT_TCP); } static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(socket_bind, hook_socket_bind), LSM_HOOK_INIT(socket_connect, hook_socket_connect), }; __init void landlock_add_net_hooks(void) { security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), &landlock_lsmid); } |
37 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 | // SPDX-License-Identifier: GPL-2.0+ /* * drivers/of/property.c - Procedures for accessing and interpreting * Devicetree properties and graphs. * * Initially created by copying procedures from drivers/of/base.c. This * file contains the OF property as well as the OF graph interface * functions. * * Paul Mackerras August 1996. * Copyright (C) 1996-2005 Paul Mackerras. * * Adapted for 64bit PowerPC by Dave Engebretsen and Peter Bergner. * {engebret|bergner}@us.ibm.com * * Adapted for sparc and sparc64 by David S. Miller davem@davemloft.net * * Reconsolidated from arch/x/kernel/prom.c by Stephen Rothwell and * Grant Likely. */ #define pr_fmt(fmt) "OF: " fmt #include <linux/of.h> #include <linux/of_address.h> #include <linux/of_device.h> #include <linux/of_graph.h> #include <linux/of_irq.h> #include <linux/string.h> #include <linux/moduleparam.h> #include "of_private.h" /** * of_graph_is_present() - check graph's presence * @node: pointer to device_node containing graph port * * Return: True if @node has a port or ports (with a port) sub-node, * false otherwise. */ bool of_graph_is_present(const struct device_node *node) { struct device_node *ports __free(device_node) = of_get_child_by_name(node, "ports"); if (ports) node = ports; struct device_node *port __free(device_node) = of_get_child_by_name(node, "port"); return !!port; } EXPORT_SYMBOL(of_graph_is_present); /** * of_property_count_elems_of_size - Count the number of elements in a property * * @np: device node from which the property value is to be read. * @propname: name of the property to be searched. * @elem_size: size of the individual element * * Search for a property in a device node and count the number of elements of * size elem_size in it. * * Return: The number of elements on sucess, -EINVAL if the property does not * exist or its length does not match a multiple of elem_size and -ENODATA if * the property does not have a value. */ int of_property_count_elems_of_size(const struct device_node *np, const char *propname, int elem_size) { struct property *prop = of_find_property(np, propname, NULL); if (!prop) return -EINVAL; if (!prop->value) return -ENODATA; if (prop->length % elem_size != 0) { pr_err("size of %s in node %pOF is not a multiple of %d\n", propname, np, elem_size); return -EINVAL; } return prop->length / elem_size; } EXPORT_SYMBOL_GPL(of_property_count_elems_of_size); /** * of_find_property_value_of_size * * @np: device node from which the property value is to be read. * @propname: name of the property to be searched. * @min: minimum allowed length of property value * @max: maximum allowed length of property value (0 means unlimited) * @len: if !=NULL, actual length is written to here * * Search for a property in a device node and valid the requested size. * * Return: The property value on success, -EINVAL if the property does not * exist, -ENODATA if property does not have a value, and -EOVERFLOW if the * property data is too small or too large. * */ static void *of_find_property_value_of_size(const struct device_node *np, const char *propname, u32 min, u32 max, size_t *len) { struct property *prop = of_find_property(np, propname, NULL); if (!prop) return ERR_PTR(-EINVAL); if (!prop->value) return ERR_PTR(-ENODATA); if (prop->length < min) return ERR_PTR(-EOVERFLOW); if (max && prop->length > max) return ERR_PTR(-EOVERFLOW); if (len) *len = prop->length; return prop->value; } /** * of_property_read_u32_index - Find and read a u32 from a multi-value property. * * @np: device node from which the property value is to be read. * @propname: name of the property to be searched. * @index: index of the u32 in the list of values * @out_value: pointer to return value, modified only if no error. * * Search for a property in a device node and read nth 32-bit value from * it. * * Return: 0 on success, -EINVAL if the property does not exist, * -ENODATA if property does not have a value, and -EOVERFLOW if the * property data isn't large enough. * * The out_value is modified only if a valid u32 value can be decoded. */ int of_property_read_u32_index(const struct device_node *np, const char *propname, u32 index, u32 *out_value) { const u32 *val = of_find_property_value_of_size(np, propname, ((index + 1) * sizeof(*out_value)), 0, NULL); if (IS_ERR(val)) return PTR_ERR(val); *out_value = be32_to_cpup(((__be32 *)val) + index); return 0; } EXPORT_SYMBOL_GPL(of_property_read_u32_index); /** * of_property_read_u64_index - Find and read a u64 from a multi-value property. * * @np: device node from which the property value is to be read. * @propname: name of the property to be searched. * @index: index of the u64 in the list of values * @out_value: pointer to return value, modified only if no error. * * Search for a property in a device node and read nth 64-bit value from * it. * * Return: 0 on success, -EINVAL if the property does not exist, * -ENODATA if property does not have a value, and -EOVERFLOW if the * property data isn't large enough. * * The out_value is modified only if a valid u64 value can be decoded. */ int of_property_read_u64_index(const struct device_node *np, const char *propname, u32 index, u64 *out_value) { const u64 *val = of_find_property_value_of_size(np, propname, ((index + 1) * sizeof(*out_value)), 0, NULL); if (IS_ERR(val)) return PTR_ERR(val); *out_value = be64_to_cpup(((__be64 *)val) + index); return 0; } EXPORT_SYMBOL_GPL(of_property_read_u64_index); /** * of_property_read_variable_u8_array - Find and read an array of u8 from a * property, with bounds on the minimum and maximum array size. * * @np: device node from which the property value is to be read. * @propname: name of the property to be searched. * @out_values: pointer to found values. * @sz_min: minimum number of array elements to read * @sz_max: maximum number of array elements to read, if zero there is no * upper limit on the number of elements in the dts entry but only * sz_min will be read. * * Search for a property in a device node and read 8-bit value(s) from * it. * * dts entry of array should be like: * ``property = /bits/ 8 <0x50 0x60 0x70>;`` * * Return: The number of elements read on success, -EINVAL if the property * does not exist, -ENODATA if property does not have a value, and -EOVERFLOW * if the property data is smaller than sz_min or longer than sz_max. * * The out_values is modified only if a valid u8 value can be decoded. */ int of_property_read_variable_u8_array(const struct device_node *np, const char *propname, u8 *out_values, size_t sz_min, size_t sz_max) { size_t sz, count; const u8 *val = of_find_property_value_of_size(np, propname, (sz_min * sizeof(*out_values)), (sz_max * sizeof(*out_values)), &sz); if (IS_ERR(val)) return PTR_ERR(val); if (!sz_max) sz = sz_min; else sz /= sizeof(*out_values); count = sz; while (count--) *out_values++ = *val++; return sz; } EXPORT_SYMBOL_GPL(of_property_read_variable_u8_array); /** * of_property_read_variable_u16_array - Find and read an array of u16 from a * property, with bounds on the minimum and maximum array size. * * @np: device node from which the property value is to be read. * @propname: name of the property to be searched. * @out_values: pointer to found values. * @sz_min: minimum number of array elements to read * @sz_max: maximum number of array elements to read, if zero there is no * upper limit on the number of elements in the dts entry but only * sz_min will be read. * * Search for a property in a device node and read 16-bit value(s) from * it. * * dts entry of array should be like: * ``property = /bits/ 16 <0x5000 0x6000 0x7000>;`` * * Return: The number of elements read on success, -EINVAL if the property * does not exist, -ENODATA if property does not have a value, and -EOVERFLOW * if the property data is smaller than sz_min or longer than sz_max. * * The out_values is modified only if a valid u16 value can be decoded. */ int of_property_read_variable_u16_array(const struct device_node *np, const char *propname, u16 *out_values, size_t sz_min, size_t sz_max) { size_t sz, count; const __be16 *val = of_find_property_value_of_size(np, propname, (sz_min * sizeof(*out_values)), (sz_max * sizeof(*out_values)), &sz); if (IS_ERR(val)) return PTR_ERR(val); if (!sz_max) sz = sz_min; else sz /= sizeof(*out_values); count = sz; while (count--) *out_values++ = be16_to_cpup(val++); return sz; } EXPORT_SYMBOL_GPL(of_property_read_variable_u16_array); /** * of_property_read_variable_u32_array - Find and read an array of 32 bit * integers from a property, with bounds on the minimum and maximum array size. * * @np: device node from which the property value is to be read. * @propname: name of the property to be searched. * @out_values: pointer to return found values. * @sz_min: minimum number of array elements to read * @sz_max: maximum number of array elements to read, if zero there is no * upper limit on the number of elements in the dts entry but only * sz_min will be read. * * Search for a property in a device node and read 32-bit value(s) from * it. * * Return: The number of elements read on success, -EINVAL if the property * does not exist, -ENODATA if property does not have a value, and -EOVERFLOW * if the property data is smaller than sz_min or longer than sz_max. * * The out_values is modified only if a valid u32 value can be decoded. */ int of_property_read_variable_u32_array(const struct device_node *np, const char *propname, u32 *out_values, size_t sz_min, size_t sz_max) { size_t sz, count; const __be32 *val = of_find_property_value_of_size(np, propname, (sz_min * sizeof(*out_values)), (sz_max * sizeof(*out_values)), &sz); if (IS_ERR(val)) return PTR_ERR(val); if (!sz_max) sz = sz_min; else sz /= sizeof(*out_values); count = sz; while (count--) *out_values++ = be32_to_cpup(val++); return sz; } EXPORT_SYMBOL_GPL(of_property_read_variable_u32_array); /** * of_property_read_u64 - Find and read a 64 bit integer from a property * @np: device node from which the property value is to be read. * @propname: name of the property to be searched. * @out_value: pointer to return value, modified only if return value is 0. * * Search for a property in a device node and read a 64-bit value from * it. * * Return: 0 on success, -EINVAL if the property does not exist, * -ENODATA if property does not have a value, and -EOVERFLOW if the * property data isn't large enough. * * The out_value is modified only if a valid u64 value can be decoded. */ int of_property_read_u64(const struct device_node *np, const char *propname, u64 *out_value) { const __be32 *val = of_find_property_value_of_size(np, propname, sizeof(*out_value), 0, NULL); if (IS_ERR(val)) return PTR_ERR(val); *out_value = of_read_number(val, 2); return 0; } EXPORT_SYMBOL_GPL(of_property_read_u64); /** * of_property_read_variable_u64_array - Find and read an array of 64 bit * integers from a property, with bounds on the minimum and maximum array size. * * @np: device node from which the property value is to be read. * @propname: name of the property to be searched. * @out_values: pointer to found values. * @sz_min: minimum number of array elements to read * @sz_max: maximum number of array elements to read, if zero there is no * upper limit on the number of elements in the dts entry but only * sz_min will be read. * * Search for a property in a device node and read 64-bit value(s) from * it. * * Return: The number of elements read on success, -EINVAL if the property * does not exist, -ENODATA if property does not have a value, and -EOVERFLOW * if the property data is smaller than sz_min or longer than sz_max. * * The out_values is modified only if a valid u64 value can be decoded. */ int of_property_read_variable_u64_array(const struct device_node *np, const char *propname, u64 *out_values, size_t sz_min, size_t sz_max) { size_t sz, count; const __be32 *val = of_find_property_value_of_size(np, propname, (sz_min * sizeof(*out_values)), (sz_max * sizeof(*out_values)), &sz); if (IS_ERR(val)) return PTR_ERR(val); if (!sz_max) sz = sz_min; else sz /= sizeof(*out_values); count = sz; while (count--) { *out_values++ = of_read_number(val, 2); val += 2; } return sz; } EXPORT_SYMBOL_GPL(of_property_read_variable_u64_array); /** * of_property_read_string - Find and read a string from a property * @np: device node from which the property value is to be read. * @propname: name of the property to be searched. * @out_string: pointer to null terminated return string, modified only if * return value is 0. * * Search for a property in a device tree node and retrieve a null * terminated string value (pointer to data, not a copy). * * Return: 0 on success, -EINVAL if the property does not exist, -ENODATA if * property does not have a value, and -EILSEQ if the string is not * null-terminated within the length of the property data. * * Note that the empty string "" has length of 1, thus -ENODATA cannot * be interpreted as an empty string. * * The out_string pointer is modified only if a valid string can be decoded. */ int of_property_read_string(const struct device_node *np, const char *propname, const char **out_string) { const struct property *prop = of_find_property(np, propname, NULL); if (!prop) return -EINVAL; if (!prop->length) return -ENODATA; if (strnlen(prop->value, prop->length) >= prop->length) return -EILSEQ; *out_string = prop->value; return 0; } EXPORT_SYMBOL_GPL(of_property_read_string); /** * of_property_match_string() - Find string in a list and return index * @np: pointer to node containing string list property * @propname: string list property name * @string: pointer to string to search for in string list * * This function searches a string list property and returns the index * of a specific string value. */ int of_property_match_string(const struct device_node *np, const char *propname, const char *string) { const struct property *prop = of_find_property(np, propname, NULL); size_t l; int i; const char *p, *end; if (!prop) return -EINVAL; if (!prop->value) return -ENODATA; p = prop->value; end = p + prop->length; for (i = 0; p < end; i++, p += l) { l = strnlen(p, end - p) + 1; if (p + l > end) return -EILSEQ; pr_debug("comparing %s with %s\n", string, p); if (strcmp(string, p) == 0) return i; /* Found it; return index */ } return -ENODATA; } EXPORT_SYMBOL_GPL(of_property_match_string); /** * of_property_read_string_helper() - Utility helper for parsing string properties * @np: device node from which the property value is to be read. * @propname: name of the property to be searched. * @out_strs: output array of string pointers. * @sz: number of array elements to read. * @skip: Number of strings to skip over at beginning of list. * * Don't call this function directly. It is a utility helper for the * of_property_read_string*() family of functions. */ int of_property_read_string_helper(const struct device_node *np, const char *propname, const char **out_strs, size_t sz, int skip) { const struct property *prop = of_find_property(np, propname, NULL); int l = 0, i = 0; const char *p, *end; if (!prop) return -EINVAL; if (!prop->value) return -ENODATA; p = prop->value; end = p + prop->length; for (i = 0; p < end && (!out_strs || i < skip + sz); i++, p += l) { l = strnlen(p, end - p) + 1; if (p + l > end) return -EILSEQ; if (out_strs && i >= skip) *out_strs++ = p; } i -= skip; return i <= 0 ? -ENODATA : i; } EXPORT_SYMBOL_GPL(of_property_read_string_helper); const __be32 *of_prop_next_u32(struct property *prop, const __be32 *cur, u32 *pu) { const void *curv = cur; if (!prop) return NULL; if (!cur) { curv = prop->value; goto out_val; } curv += sizeof(*cur); if (curv >= prop->value + prop->length) return NULL; out_val: *pu = be32_to_cpup(curv); return curv; } EXPORT_SYMBOL_GPL(of_prop_next_u32); const char *of_prop_next_string(struct property *prop, const char *cur) { const void *curv = cur; if (!prop) return NULL; if (!cur) return prop->value; curv += strlen(cur) + 1; if (curv >= prop->value + prop->length) return NULL; return curv; } EXPORT_SYMBOL_GPL(of_prop_next_string); /** * of_graph_parse_endpoint() - parse common endpoint node properties * @node: pointer to endpoint device_node * @endpoint: pointer to the OF endpoint data structure * * The caller should hold a reference to @node. */ int of_graph_parse_endpoint(const struct device_node *node, struct of_endpoint *endpoint) { struct device_node *port_node __free(device_node) = of_get_parent(node); WARN_ONCE(!port_node, "%s(): endpoint %pOF has no parent node\n", __func__, node); memset(endpoint, 0, sizeof(*endpoint)); endpoint->local_node = node; /* * It doesn't matter whether the two calls below succeed. * If they don't then the default value 0 is used. */ of_property_read_u32(port_node, "reg", &endpoint->port); of_property_read_u32(node, "reg", &endpoint->id); return 0; } EXPORT_SYMBOL(of_graph_parse_endpoint); /** * of_graph_get_port_by_id() - get the port matching a given id * @parent: pointer to the parent device node * @id: id of the port * * Return: A 'port' node pointer with refcount incremented. The caller * has to use of_node_put() on it when done. */ struct device_node *of_graph_get_port_by_id(struct device_node *parent, u32 id) { struct device_node *node __free(device_node) = of_get_child_by_name(parent, "ports"); if (node) parent = node; for_each_child_of_node_scoped(parent, port) { u32 port_id = 0; if (!of_node_name_eq(port, "port")) continue; of_property_read_u32(port, "reg", &port_id); if (id == port_id) return_ptr(port); } return NULL; } EXPORT_SYMBOL(of_graph_get_port_by_id); /** * of_graph_get_next_endpoint() - get next endpoint node * @parent: pointer to the parent device node * @prev: previous endpoint node, or NULL to get first * * Return: An 'endpoint' node pointer with refcount incremented. Refcount * of the passed @prev node is decremented. */ struct device_node *of_graph_get_next_endpoint(const struct device_node *parent, struct device_node *prev) { struct device_node *endpoint; struct device_node *port; if (!parent) return NULL; /* * Start by locating the port node. If no previous endpoint is specified * search for the first port node, otherwise get the previous endpoint * parent port node. */ if (!prev) { struct device_node *node __free(device_node) = of_get_child_by_name(parent, "ports"); if (node) parent = node; port = of_get_child_by_name(parent, "port"); if (!port) { pr_debug("graph: no port node found in %pOF\n", parent); return NULL; } } else { port = of_get_parent(prev); if (WARN_ONCE(!port, "%s(): endpoint %pOF has no parent node\n", __func__, prev)) return NULL; } while (1) { /* * Now that we have a port node, get the next endpoint by * getting the next child. If the previous endpoint is NULL this * will return the first child. */ endpoint = of_get_next_child(port, prev); if (endpoint) { of_node_put(port); return endpoint; } /* No more endpoints under this port, try the next one. */ prev = NULL; do { port = of_get_next_child(parent, port); if (!port) return NULL; } while (!of_node_name_eq(port, "port")); } } EXPORT_SYMBOL(of_graph_get_next_endpoint); /** * of_graph_get_endpoint_by_regs() - get endpoint node of specific identifiers * @parent: pointer to the parent device node * @port_reg: identifier (value of reg property) of the parent port node * @reg: identifier (value of reg property) of the endpoint node * * Return: An 'endpoint' node pointer which is identified by reg and at the same * is the child of a port node identified by port_reg. reg and port_reg are * ignored when they are -1. Use of_node_put() on the pointer when done. */ struct device_node *of_graph_get_endpoint_by_regs( const struct device_node *parent, int port_reg, int reg) { struct of_endpoint endpoint; struct device_node *node = NULL; for_each_endpoint_of_node(parent, node) { of_graph_parse_endpoint(node, &endpoint); if (((port_reg == -1) || (endpoint.port == port_reg)) && ((reg == -1) || (endpoint.id == reg))) return node; } return NULL; } EXPORT_SYMBOL(of_graph_get_endpoint_by_regs); /** * of_graph_get_remote_endpoint() - get remote endpoint node * @node: pointer to a local endpoint device_node * * Return: Remote endpoint node associated with remote endpoint node linked * to @node. Use of_node_put() on it when done. */ struct device_node *of_graph_get_remote_endpoint(const struct device_node *node) { /* Get remote endpoint node. */ return of_parse_phandle(node, "remote-endpoint", 0); } EXPORT_SYMBOL(of_graph_get_remote_endpoint); /** * of_graph_get_port_parent() - get port's parent node * @node: pointer to a local endpoint device_node * * Return: device node associated with endpoint node linked * to @node. Use of_node_put() on it when done. */ struct device_node *of_graph_get_port_parent(struct device_node *node) { unsigned int depth; if (!node) return NULL; /* * Preserve usecount for passed in node as of_get_next_parent() * will do of_node_put() on it. */ of_node_get(node); /* Walk 3 levels up only if there is 'ports' node. */ for (depth = 3; depth && node; depth--) { node = of_get_next_parent(node); if (depth == 2 && !of_node_name_eq(node, "ports") && !of_node_name_eq(node, "in-ports") && !of_node_name_eq(node, "out-ports")) break; } return node; } EXPORT_SYMBOL(of_graph_get_port_parent); /** * of_graph_get_remote_port_parent() - get remote port's parent node * @node: pointer to a local endpoint device_node * * Return: Remote device node associated with remote endpoint node linked * to @node. Use of_node_put() on it when done. */ struct device_node *of_graph_get_remote_port_parent( const struct device_node *node) { struct device_node *np, *pp; /* Get remote endpoint node. */ np = of_graph_get_remote_endpoint(node); pp = of_graph_get_port_parent(np); of_node_put(np); return pp; } EXPORT_SYMBOL(of_graph_get_remote_port_parent); /** * of_graph_get_remote_port() - get remote port node * @node: pointer to a local endpoint device_node * * Return: Remote port node associated with remote endpoint node linked * to @node. Use of_node_put() on it when done. */ struct device_node *of_graph_get_remote_port(const struct device_node *node) { struct device_node *np; /* Get remote endpoint node. */ np = of_graph_get_remote_endpoint(node); if (!np) return NULL; return of_get_next_parent(np); } EXPORT_SYMBOL(of_graph_get_remote_port); /** * of_graph_get_endpoint_count() - get the number of endpoints in a device node * @np: parent device node containing ports and endpoints * * Return: count of endpoint of this device node */ unsigned int of_graph_get_endpoint_count(const struct device_node *np) { struct device_node *endpoint; unsigned int num = 0; for_each_endpoint_of_node(np, endpoint) num++; return num; } EXPORT_SYMBOL(of_graph_get_endpoint_count); /** * of_graph_get_remote_node() - get remote parent device_node for given port/endpoint * @node: pointer to parent device_node containing graph port/endpoint * @port: identifier (value of reg property) of the parent port node * @endpoint: identifier (value of reg property) of the endpoint node * * Return: Remote device node associated with remote endpoint node linked * to @node. Use of_node_put() on it when done. */ struct device_node *of_graph_get_remote_node(const struct device_node *node, u32 port, u32 endpoint) { struct device_node *endpoint_node, *remote; endpoint_node = of_graph_get_endpoint_by_regs(node, port, endpoint); if (!endpoint_node) { pr_debug("no valid endpoint (%d, %d) for node %pOF\n", port, endpoint, node); return NULL; } remote = of_graph_get_remote_port_parent(endpoint_node); of_node_put(endpoint_node); if (!remote) { pr_debug("no valid remote node\n"); return NULL; } if (!of_device_is_available(remote)) { pr_debug("not available for remote node\n"); of_node_put(remote); return NULL; } return remote; } EXPORT_SYMBOL(of_graph_get_remote_node); static struct fwnode_handle *of_fwnode_get(struct fwnode_handle *fwnode) { return of_fwnode_handle(of_node_get(to_of_node(fwnode))); } static void of_fwnode_put(struct fwnode_handle *fwnode) { of_node_put(to_of_node(fwnode)); } static bool of_fwnode_device_is_available(const struct fwnode_handle *fwnode) { return of_device_is_available(to_of_node(fwnode)); } static bool of_fwnode_device_dma_supported(const struct fwnode_handle *fwnode) { return true; } static enum dev_dma_attr of_fwnode_device_get_dma_attr(const struct fwnode_handle *fwnode) { if (of_dma_is_coherent(to_of_node(fwnode))) return DEV_DMA_COHERENT; else return DEV_DMA_NON_COHERENT; } static bool of_fwnode_property_present(const struct fwnode_handle *fwnode, const char *propname) { return of_property_read_bool(to_of_node(fwnode), propname); } static int of_fwnode_property_read_int_array(const struct fwnode_handle *fwnode, const char *propname, unsigned int elem_size, void *val, size_t nval) { const struct device_node *node = to_of_node(fwnode); if (!val) return of_property_count_elems_of_size(node, propname, elem_size); switch (elem_size) { case sizeof(u8): return of_property_read_u8_array(node, propname, val, nval); case sizeof(u16): return of_property_read_u16_array(node, propname, val, nval); case sizeof(u32): return of_property_read_u32_array(node, propname, val, nval); case sizeof(u64): return of_property_read_u64_array(node, propname, val, nval); } return -ENXIO; } static int of_fwnode_property_read_string_array(const struct fwnode_handle *fwnode, const char *propname, const char **val, size_t nval) { const struct device_node *node = to_of_node(fwnode); return val ? of_property_read_string_array(node, propname, val, nval) : of_property_count_strings(node, propname); } static const char *of_fwnode_get_name(const struct fwnode_handle *fwnode) { return kbasename(to_of_node(fwnode)->full_name); } static const char *of_fwnode_get_name_prefix(const struct fwnode_handle *fwnode) { /* Root needs no prefix here (its name is "/"). */ if (!to_of_node(fwnode)->parent) return ""; return "/"; } static struct fwnode_handle * of_fwnode_get_parent(const struct fwnode_handle *fwnode) { return of_fwnode_handle(of_get_parent(to_of_node(fwnode))); } static struct fwnode_handle * of_fwnode_get_next_child_node(const struct fwnode_handle *fwnode, struct fwnode_handle *child) { return of_fwnode_handle(of_get_next_available_child(to_of_node(fwnode), to_of_node(child))); } static struct fwnode_handle * of_fwnode_get_named_child_node(const struct fwnode_handle *fwnode, const char *childname) { const struct device_node *node = to_of_node(fwnode); struct device_node *child; for_each_available_child_of_node(node, child) if (of_node_name_eq(child, childname)) return of_fwnode_handle(child); return NULL; } static int of_fwnode_get_reference_args(const struct fwnode_handle *fwnode, const char *prop, const char *nargs_prop, unsigned int nargs, unsigned int index, struct fwnode_reference_args *args) { struct of_phandle_args of_args; unsigned int i; int ret; if (nargs_prop) ret = of_parse_phandle_with_args(to_of_node(fwnode), prop, nargs_prop, index, &of_args); else ret = of_parse_phandle_with_fixed_args(to_of_node(fwnode), prop, nargs, index, &of_args); if (ret < 0) return ret; if (!args) { of_node_put(of_args.np); return 0; } args->nargs = of_args.args_count; args->fwnode = of_fwnode_handle(of_args.np); for (i = 0; i < NR_FWNODE_REFERENCE_ARGS; i++) args->args[i] = i < of_args.args_count ? of_args.args[i] : 0; return 0; } static struct fwnode_handle * of_fwnode_graph_get_next_endpoint(const struct fwnode_handle *fwnode, struct fwnode_handle *prev) { return of_fwnode_handle(of_graph_get_next_endpoint(to_of_node(fwnode), to_of_node(prev))); } static struct fwnode_handle * of_fwnode_graph_get_remote_endpoint(const struct fwnode_handle *fwnode) { return of_fwnode_handle( of_graph_get_remote_endpoint(to_of_node(fwnode))); } static struct fwnode_handle * of_fwnode_graph_get_port_parent(struct fwnode_handle *fwnode) { struct device_node *np; /* Get the parent of the port */ np = of_get_parent(to_of_node(fwnode)); if (!np) return NULL; /* Is this the "ports" node? If not, it's the port parent. */ if (!of_node_name_eq(np, "ports")) return of_fwnode_handle(np); return of_fwnode_handle(of_get_next_parent(np)); } static int of_fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, struct fwnode_endpoint *endpoint) { const struct device_node *node = to_of_node(fwnode); struct device_node *port_node __free(device_node) = of_get_parent(node); endpoint->local_fwnode = fwnode; of_property_read_u32(port_node, "reg", &endpoint->port); of_property_read_u32(node, "reg", &endpoint->id); return 0; } static const void * of_fwnode_device_get_match_data(const struct fwnode_handle *fwnode, const struct device *dev) { return of_device_get_match_data(dev); } static void of_link_to_phandle(struct device_node *con_np, struct device_node *sup_np, u8 flags) { struct device_node *tmp_np = of_node_get(sup_np); /* Check that sup_np and its ancestors are available. */ while (tmp_np) { if (of_fwnode_handle(tmp_np)->dev) { of_node_put(tmp_np); break; } if (!of_device_is_available(tmp_np)) { of_node_put(tmp_np); return; } tmp_np = of_get_next_parent(tmp_np); } fwnode_link_add(of_fwnode_handle(con_np), of_fwnode_handle(sup_np), flags); } /** * parse_prop_cells - Property parsing function for suppliers * * @np: Pointer to device tree node containing a list * @prop_name: Name of property to be parsed. Expected to hold phandle values * @index: For properties holding a list of phandles, this is the index * into the list. * @list_name: Property name that is known to contain list of phandle(s) to * supplier(s) * @cells_name: property name that specifies phandles' arguments count * * This is a helper function to parse properties that have a known fixed name * and are a list of phandles and phandle arguments. * * Returns: * - phandle node pointer with refcount incremented. Caller must of_node_put() * on it when done. * - NULL if no phandle found at index */ static struct device_node *parse_prop_cells(struct device_node *np, const char *prop_name, int index, const char *list_name, const char *cells_name) { struct of_phandle_args sup_args; if (strcmp(prop_name, list_name)) return NULL; if (__of_parse_phandle_with_args(np, list_name, cells_name, 0, index, &sup_args)) return NULL; return sup_args.np; } #define DEFINE_SIMPLE_PROP(fname, name, cells) \ static struct device_node *parse_##fname(struct device_node *np, \ const char *prop_name, int index) \ { \ return parse_prop_cells(np, prop_name, index, name, cells); \ } static int strcmp_suffix(const char *str, const char *suffix) { unsigned int len, suffix_len; len = strlen(str); suffix_len = strlen(suffix); if (len <= suffix_len) return -1; return strcmp(str + len - suffix_len, suffix); } /** * parse_suffix_prop_cells - Suffix property parsing function for suppliers * * @np: Pointer to device tree node containing a list * @prop_name: Name of property to be parsed. Expected to hold phandle values * @index: For properties holding a list of phandles, this is the index * into the list. * @suffix: Property suffix that is known to contain list of phandle(s) to * supplier(s) * @cells_name: property name that specifies phandles' arguments count * * This is a helper function to parse properties that have a known fixed suffix * and are a list of phandles and phandle arguments. * * Returns: * - phandle node pointer with refcount incremented. Caller must of_node_put() * on it when done. * - NULL if no phandle found at index */ static struct device_node *parse_suffix_prop_cells(struct device_node *np, const char *prop_name, int index, const char *suffix, const char *cells_name) { struct of_phandle_args sup_args; if (strcmp_suffix(prop_name, suffix)) return NULL; if (of_parse_phandle_with_args(np, prop_name, cells_name, index, &sup_args)) return NULL; return sup_args.np; } #define DEFINE_SUFFIX_PROP(fname, suffix, cells) \ static struct device_node *parse_##fname(struct device_node *np, \ const char *prop_name, int index) \ { \ return parse_suffix_prop_cells(np, prop_name, index, suffix, cells); \ } /** * struct supplier_bindings - Property parsing functions for suppliers * * @parse_prop: function name * parse_prop() finds the node corresponding to a supplier phandle * parse_prop.np: Pointer to device node holding supplier phandle property * parse_prop.prop_name: Name of property holding a phandle value * parse_prop.index: For properties holding a list of phandles, this is the * index into the list * @get_con_dev: If the consumer node containing the property is never converted * to a struct device, implement this ops so fw_devlink can use it * to find the true consumer. * @optional: Describes whether a supplier is mandatory or not * @fwlink_flags: Optional fwnode link flags to use when creating a fwnode link * for this property. * * Returns: * parse_prop() return values are * - phandle node pointer with refcount incremented. Caller must of_node_put() * on it when done. * - NULL if no phandle found at index */ struct supplier_bindings { struct device_node *(*parse_prop)(struct device_node *np, const char *prop_name, int index); struct device_node *(*get_con_dev)(struct device_node *np); bool optional; u8 fwlink_flags; }; DEFINE_SIMPLE_PROP(clocks, "clocks", "#clock-cells") DEFINE_SIMPLE_PROP(interconnects, "interconnects", "#interconnect-cells") DEFINE_SIMPLE_PROP(iommus, "iommus", "#iommu-cells") DEFINE_SIMPLE_PROP(mboxes, "mboxes", "#mbox-cells") DEFINE_SIMPLE_PROP(io_channels, "io-channels", "#io-channel-cells") DEFINE_SIMPLE_PROP(io_backends, "io-backends", "#io-backend-cells") DEFINE_SIMPLE_PROP(interrupt_parent, "interrupt-parent", NULL) DEFINE_SIMPLE_PROP(dmas, "dmas", "#dma-cells") DEFINE_SIMPLE_PROP(power_domains, "power-domains", "#power-domain-cells") DEFINE_SIMPLE_PROP(hwlocks, "hwlocks", "#hwlock-cells") DEFINE_SIMPLE_PROP(extcon, "extcon", NULL) DEFINE_SIMPLE_PROP(nvmem_cells, "nvmem-cells", "#nvmem-cell-cells") DEFINE_SIMPLE_PROP(phys, "phys", "#phy-cells") DEFINE_SIMPLE_PROP(wakeup_parent, "wakeup-parent", NULL) DEFINE_SIMPLE_PROP(pinctrl0, "pinctrl-0", NULL) DEFINE_SIMPLE_PROP(pinctrl1, "pinctrl-1", NULL) DEFINE_SIMPLE_PROP(pinctrl2, "pinctrl-2", NULL) DEFINE_SIMPLE_PROP(pinctrl3, "pinctrl-3", NULL) DEFINE_SIMPLE_PROP(pinctrl4, "pinctrl-4", NULL) DEFINE_SIMPLE_PROP(pinctrl5, "pinctrl-5", NULL) DEFINE_SIMPLE_PROP(pinctrl6, "pinctrl-6", NULL) DEFINE_SIMPLE_PROP(pinctrl7, "pinctrl-7", NULL) DEFINE_SIMPLE_PROP(pinctrl8, "pinctrl-8", NULL) DEFINE_SIMPLE_PROP(pwms, "pwms", "#pwm-cells") DEFINE_SIMPLE_PROP(resets, "resets", "#reset-cells") DEFINE_SIMPLE_PROP(leds, "leds", NULL) DEFINE_SIMPLE_PROP(backlight, "backlight", NULL) DEFINE_SIMPLE_PROP(panel, "panel", NULL) DEFINE_SIMPLE_PROP(msi_parent, "msi-parent", "#msi-cells") DEFINE_SIMPLE_PROP(post_init_providers, "post-init-providers", NULL) DEFINE_SIMPLE_PROP(access_controllers, "access-controllers", "#access-controller-cells") DEFINE_SIMPLE_PROP(pses, "pses", "#pse-cells") DEFINE_SIMPLE_PROP(power_supplies, "power-supplies", NULL) DEFINE_SUFFIX_PROP(regulators, "-supply", NULL) DEFINE_SUFFIX_PROP(gpio, "-gpio", "#gpio-cells") static struct device_node *parse_gpios(struct device_node *np, const char *prop_name, int index) { if (!strcmp_suffix(prop_name, ",nr-gpios")) return NULL; return parse_suffix_prop_cells(np, prop_name, index, "-gpios", "#gpio-cells"); } static struct device_node *parse_iommu_maps(struct device_node *np, const char *prop_name, int index) { if (strcmp(prop_name, "iommu-map")) return NULL; return of_parse_phandle(np, prop_name, (index * 4) + 1); } static struct device_node *parse_gpio_compat(struct device_node *np, const char *prop_name, int index) { struct of_phandle_args sup_args; if (strcmp(prop_name, "gpio") && strcmp(prop_name, "gpios")) return NULL; /* * Ignore node with gpio-hog property since its gpios are all provided * by its parent. */ if (of_property_read_bool(np, "gpio-hog")) return NULL; if (of_parse_phandle_with_args(np, prop_name, "#gpio-cells", index, &sup_args)) return NULL; return sup_args.np; } static struct device_node *parse_interrupts(struct device_node *np, const char *prop_name, int index) { struct of_phandle_args sup_args; if (!IS_ENABLED(CONFIG_OF_IRQ) || IS_ENABLED(CONFIG_PPC)) return NULL; if (strcmp(prop_name, "interrupts") && strcmp(prop_name, "interrupts-extended")) return NULL; return of_irq_parse_one(np, index, &sup_args) ? NULL : sup_args.np; } static struct device_node *parse_interrupt_map(struct device_node *np, const char *prop_name, int index) { const __be32 *imap, *imap_end; struct of_phandle_args sup_args; u32 addrcells, intcells; int imaplen; if (!IS_ENABLED(CONFIG_OF_IRQ)) return NULL; if (strcmp(prop_name, "interrupt-map")) return NULL; if (of_property_read_u32(np, "#interrupt-cells", &intcells)) return NULL; addrcells = of_bus_n_addr_cells(np); imap = of_get_property(np, "interrupt-map", &imaplen); imaplen /= sizeof(*imap); if (!imap) return NULL; imap_end = imap + imaplen; for (int i = 0; imap + addrcells + intcells + 1 < imap_end; i++) { imap += addrcells + intcells; imap = of_irq_parse_imap_parent(imap, imap_end - imap, &sup_args); if (!imap) return NULL; if (i == index) return sup_args.np; of_node_put(sup_args.np); } return NULL; } static struct device_node *parse_remote_endpoint(struct device_node *np, const char *prop_name, int index) { /* Return NULL for index > 0 to signify end of remote-endpoints. */ if (index > 0 || strcmp(prop_name, "remote-endpoint")) return NULL; return of_graph_get_remote_port_parent(np); } static const struct supplier_bindings of_supplier_bindings[] = { { .parse_prop = parse_clocks, }, { .parse_prop = parse_interconnects, }, { .parse_prop = parse_iommus, .optional = true, }, { .parse_prop = parse_iommu_maps, .optional = true, }, { .parse_prop = parse_mboxes, }, { .parse_prop = parse_io_channels, }, { .parse_prop = parse_io_backends, }, { .parse_prop = parse_interrupt_parent, }, { .parse_prop = parse_dmas, .optional = true, }, { .parse_prop = parse_power_domains, }, { .parse_prop = parse_hwlocks, }, { .parse_prop = parse_extcon, }, { .parse_prop = parse_nvmem_cells, }, { .parse_prop = parse_phys, }, { .parse_prop = parse_wakeup_parent, }, { .parse_prop = parse_pinctrl0, }, { .parse_prop = parse_pinctrl1, }, { .parse_prop = parse_pinctrl2, }, { .parse_prop = parse_pinctrl3, }, { .parse_prop = parse_pinctrl4, }, { .parse_prop = parse_pinctrl5, }, { .parse_prop = parse_pinctrl6, }, { .parse_prop = parse_pinctrl7, }, { .parse_prop = parse_pinctrl8, }, { .parse_prop = parse_remote_endpoint, .get_con_dev = of_graph_get_port_parent, }, { .parse_prop = parse_pwms, }, { .parse_prop = parse_resets, }, { .parse_prop = parse_leds, }, { .parse_prop = parse_backlight, }, { .parse_prop = parse_panel, }, { .parse_prop = parse_msi_parent, }, { .parse_prop = parse_pses, }, { .parse_prop = parse_power_supplies, }, { .parse_prop = parse_gpio_compat, }, { .parse_prop = parse_interrupts, }, { .parse_prop = parse_interrupt_map, }, { .parse_prop = parse_access_controllers, }, { .parse_prop = parse_regulators, }, { .parse_prop = parse_gpio, }, { .parse_prop = parse_gpios, }, { .parse_prop = parse_post_init_providers, .fwlink_flags = FWLINK_FLAG_IGNORE, }, {} }; /** * of_link_property - Create device links to suppliers listed in a property * @con_np: The consumer device tree node which contains the property * @prop_name: Name of property to be parsed * * This function checks if the property @prop_name that is present in the * @con_np device tree node is one of the known common device tree bindings * that list phandles to suppliers. If @prop_name isn't one, this function * doesn't do anything. * * If @prop_name is one, this function attempts to create fwnode links from the * consumer device tree node @con_np to all the suppliers device tree nodes * listed in @prop_name. * * Any failed attempt to create a fwnode link will NOT result in an immediate * return. of_link_property() must create links to all the available supplier * device tree nodes even when attempts to create a link to one or more * suppliers fail. */ static int of_link_property(struct device_node *con_np, const char *prop_name) { struct device_node *phandle; const struct supplier_bindings *s = of_supplier_bindings; unsigned int i = 0; bool matched = false; /* Do not stop at first failed link, link all available suppliers. */ while (!matched && s->parse_prop) { if (s->optional && !fw_devlink_is_strict()) { s++; continue; } while ((phandle = s->parse_prop(con_np, prop_name, i))) { struct device_node *con_dev_np; con_dev_np = s->get_con_dev ? s->get_con_dev(con_np) : of_node_get(con_np); matched = true; i++; of_link_to_phandle(con_dev_np, phandle, s->fwlink_flags); of_node_put(phandle); of_node_put(con_dev_np); } s++; } return 0; } static void __iomem *of_fwnode_iomap(struct fwnode_handle *fwnode, int index) { #ifdef CONFIG_OF_ADDRESS return of_iomap(to_of_node(fwnode), index); #else return NULL; #endif } static int of_fwnode_irq_get(const struct fwnode_handle *fwnode, unsigned int index) { return of_irq_get(to_of_node(fwnode), index); } static int of_fwnode_add_links(struct fwnode_handle *fwnode) { struct property *p; struct device_node *con_np = to_of_node(fwnode); if (IS_ENABLED(CONFIG_X86)) return 0; if (!con_np) return -EINVAL; for_each_property_of_node(con_np, p) of_link_property(con_np, p->name); return 0; } const struct fwnode_operations of_fwnode_ops = { .get = of_fwnode_get, .put = of_fwnode_put, .device_is_available = of_fwnode_device_is_available, .device_get_match_data = of_fwnode_device_get_match_data, .device_dma_supported = of_fwnode_device_dma_supported, .device_get_dma_attr = of_fwnode_device_get_dma_attr, .property_present = of_fwnode_property_present, .property_read_int_array = of_fwnode_property_read_int_array, .property_read_string_array = of_fwnode_property_read_string_array, .get_name = of_fwnode_get_name, .get_name_prefix = of_fwnode_get_name_prefix, .get_parent = of_fwnode_get_parent, .get_next_child_node = of_fwnode_get_next_child_node, .get_named_child_node = of_fwnode_get_named_child_node, .get_reference_args = of_fwnode_get_reference_args, .graph_get_next_endpoint = of_fwnode_graph_get_next_endpoint, .graph_get_remote_endpoint = of_fwnode_graph_get_remote_endpoint, .graph_get_port_parent = of_fwnode_graph_get_port_parent, .graph_parse_endpoint = of_fwnode_graph_parse_endpoint, .iomap = of_fwnode_iomap, .irq_get = of_fwnode_irq_get, .add_links = of_fwnode_add_links, }; EXPORT_SYMBOL_GPL(of_fwnode_ops); |
8 1 7 15 1 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 | // SPDX-License-Identifier: GPL-2.0-only /* * TTL modification target for IP tables * (C) 2000,2005 by Harald Welte <laforge@netfilter.org> * * Hop Limit modification target for ip6tables * Maciej Soltysiak <solt@dns.toxicfilms.tv> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/checksum.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ipt_TTL.h> #include <linux/netfilter_ipv6/ip6t_HL.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>"); MODULE_DESCRIPTION("Xtables: Hoplimit/TTL Limit field modification target"); MODULE_LICENSE("GPL"); static unsigned int ttl_tg(struct sk_buff *skb, const struct xt_action_param *par) { struct iphdr *iph; const struct ipt_TTL_info *info = par->targinfo; int new_ttl; if (skb_ensure_writable(skb, sizeof(*iph))) return NF_DROP; iph = ip_hdr(skb); switch (info->mode) { case IPT_TTL_SET: new_ttl = info->ttl; break; case IPT_TTL_INC: new_ttl = iph->ttl + info->ttl; if (new_ttl > 255) new_ttl = 255; break; case IPT_TTL_DEC: new_ttl = iph->ttl - info->ttl; if (new_ttl < 0) new_ttl = 0; break; default: new_ttl = iph->ttl; break; } if (new_ttl != iph->ttl) { csum_replace2(&iph->check, htons(iph->ttl << 8), htons(new_ttl << 8)); iph->ttl = new_ttl; } return XT_CONTINUE; } static unsigned int hl_tg6(struct sk_buff *skb, const struct xt_action_param *par) { struct ipv6hdr *ip6h; const struct ip6t_HL_info *info = par->targinfo; int new_hl; if (skb_ensure_writable(skb, sizeof(*ip6h))) return NF_DROP; ip6h = ipv6_hdr(skb); switch (info->mode) { case IP6T_HL_SET: new_hl = info->hop_limit; break; case IP6T_HL_INC: new_hl = ip6h->hop_limit + info->hop_limit; if (new_hl > 255) new_hl = 255; break; case IP6T_HL_DEC: new_hl = ip6h->hop_limit - info->hop_limit; if (new_hl < 0) new_hl = 0; break; default: new_hl = ip6h->hop_limit; break; } ip6h->hop_limit = new_hl; return XT_CONTINUE; } static int ttl_tg_check(const struct xt_tgchk_param *par) { const struct ipt_TTL_info *info = par->targinfo; if (info->mode > IPT_TTL_MAXMODE) return -EINVAL; if (info->mode != IPT_TTL_SET && info->ttl == 0) return -EINVAL; return 0; } static int hl_tg6_check(const struct xt_tgchk_param *par) { const struct ip6t_HL_info *info = par->targinfo; if (info->mode > IP6T_HL_MAXMODE) return -EINVAL; if (info->mode != IP6T_HL_SET && info->hop_limit == 0) return -EINVAL; return 0; } static struct xt_target hl_tg_reg[] __read_mostly = { { .name = "TTL", .revision = 0, .family = NFPROTO_IPV4, .target = ttl_tg, .targetsize = sizeof(struct ipt_TTL_info), .table = "mangle", .checkentry = ttl_tg_check, .me = THIS_MODULE, }, { .name = "HL", .revision = 0, .family = NFPROTO_IPV6, .target = hl_tg6, .targetsize = sizeof(struct ip6t_HL_info), .table = "mangle", .checkentry = hl_tg6_check, .me = THIS_MODULE, }, }; static int __init hl_tg_init(void) { return xt_register_targets(hl_tg_reg, ARRAY_SIZE(hl_tg_reg)); } static void __exit hl_tg_exit(void) { xt_unregister_targets(hl_tg_reg, ARRAY_SIZE(hl_tg_reg)); } module_init(hl_tg_init); module_exit(hl_tg_exit); MODULE_ALIAS("ipt_TTL"); MODULE_ALIAS("ip6t_HL"); |
887 888 831 189 188 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2008 IBM Corporation * Author: Mimi Zohar <zohar@us.ibm.com> * * File: integrity_audit.c * Audit calls for the integrity subsystem */ #include <linux/fs.h> #include <linux/gfp.h> #include <linux/audit.h> #include "integrity.h" static int integrity_audit_info; /* ima_audit_setup - enable informational auditing messages */ static int __init integrity_audit_setup(char *str) { unsigned long audit; if (!kstrtoul(str, 0, &audit)) integrity_audit_info = audit ? 1 : 0; return 1; } __setup("integrity_audit=", integrity_audit_setup); void integrity_audit_msg(int audit_msgno, struct inode *inode, const unsigned char *fname, const char *op, const char *cause, int result, int audit_info) { integrity_audit_message(audit_msgno, inode, fname, op, cause, result, audit_info, 0); } void integrity_audit_message(int audit_msgno, struct inode *inode, const unsigned char *fname, const char *op, const char *cause, int result, int audit_info, int errno) { struct audit_buffer *ab; char name[TASK_COMM_LEN]; if (!integrity_audit_info && audit_info == 1) /* Skip info messages */ return; ab = audit_log_start(audit_context(), GFP_KERNEL, audit_msgno); if (!ab) return; audit_log_format(ab, "pid=%d uid=%u auid=%u ses=%u", task_pid_nr(current), from_kuid(&init_user_ns, current_uid()), from_kuid(&init_user_ns, audit_get_loginuid(current)), audit_get_sessionid(current)); audit_log_task_context(ab); audit_log_format(ab, " op=%s cause=%s comm=", op, cause); audit_log_untrustedstring(ab, get_task_comm(name, current)); if (fname) { audit_log_format(ab, " name="); audit_log_untrustedstring(ab, fname); } if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } audit_log_format(ab, " res=%d errno=%d", !result, errno); audit_log_end(ab); } |
70 7 64 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 | // SPDX-License-Identifier: GPL-2.0 /* * fs/partitions/mac.c * * Code extracted from drivers/block/genhd.c * Copyright (C) 1991-1998 Linus Torvalds * Re-organised Feb 1998 Russell King */ #include <linux/ctype.h> #include "check.h" #include "mac.h" #ifdef CONFIG_PPC_PMAC #include <asm/machdep.h> extern void note_bootable_part(dev_t dev, int part, int goodness); #endif /* * Code to understand MacOS partition tables. */ #ifdef CONFIG_PPC_PMAC static inline void mac_fix_string(char *stg, int len) { int i; for (i = len - 1; i >= 0 && stg[i] == ' '; i--) stg[i] = 0; } #endif int mac_partition(struct parsed_partitions *state) { Sector sect; unsigned char *data; int slot, blocks_in_map; unsigned secsize, datasize, partoffset; #ifdef CONFIG_PPC_PMAC int found_root = 0; int found_root_goodness = 0; #endif struct mac_partition *part; struct mac_driver_desc *md; /* Get 0th block and look at the first partition map entry. */ md = read_part_sector(state, 0, §); if (!md) return -1; if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { put_dev_sector(sect); return 0; } secsize = be16_to_cpu(md->block_size); put_dev_sector(sect); datasize = round_down(secsize, 512); data = read_part_sector(state, datasize / 512, §); if (!data) return -1; partoffset = secsize % 512; if (partoffset + sizeof(*part) > datasize) return -1; part = (struct mac_partition *) (data + partoffset); if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { put_dev_sector(sect); return 0; /* not a MacOS disk */ } blocks_in_map = be32_to_cpu(part->map_count); if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) { put_dev_sector(sect); return 0; } if (blocks_in_map >= state->limit) blocks_in_map = state->limit - 1; strlcat(state->pp_buf, " [mac]", PAGE_SIZE); for (slot = 1; slot <= blocks_in_map; ++slot) { int pos = slot * secsize; put_dev_sector(sect); data = read_part_sector(state, pos/512, §); if (!data) return -1; part = (struct mac_partition *) (data + pos%512); if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) break; put_partition(state, slot, be32_to_cpu(part->start_block) * (secsize/512), be32_to_cpu(part->block_count) * (secsize/512)); if (!strncasecmp(part->type, "Linux_RAID", 10)) state->parts[slot].flags = ADDPART_FLAG_RAID; #ifdef CONFIG_PPC_PMAC /* * If this is the first bootable partition, tell the * setup code, in case it wants to make this the root. */ if (machine_is(powermac)) { int goodness = 0; mac_fix_string(part->processor, 16); mac_fix_string(part->name, 32); mac_fix_string(part->type, 32); if ((be32_to_cpu(part->status) & MAC_STATUS_BOOTABLE) && strcasecmp(part->processor, "powerpc") == 0) goodness++; if (strcasecmp(part->type, "Apple_UNIX_SVR2") == 0 || (strncasecmp(part->type, "Linux", 5) == 0 && strcasecmp(part->type, "Linux_swap") != 0)) { int i, l; goodness++; l = strlen(part->name); if (strcmp(part->name, "/") == 0) goodness++; for (i = 0; i <= l - 4; ++i) { if (strncasecmp(part->name + i, "root", 4) == 0) { goodness += 2; break; } } if (strncasecmp(part->name, "swap", 4) == 0) goodness--; } if (goodness > found_root_goodness) { found_root = slot; found_root_goodness = goodness; } } #endif /* CONFIG_PPC_PMAC */ } #ifdef CONFIG_PPC_PMAC if (found_root_goodness) note_bootable_part(state->disk->part0->bd_dev, found_root, found_root_goodness); #endif put_dev_sector(sect); strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; } |
11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Tap functions for AF_VSOCK sockets. * * Code based on net/netlink/af_netlink.c tap functions. */ #include <linux/module.h> #include <net/sock.h> #include <net/af_vsock.h> #include <linux/if_arp.h> static DEFINE_SPINLOCK(vsock_tap_lock); static struct list_head vsock_tap_all __read_mostly = LIST_HEAD_INIT(vsock_tap_all); int vsock_add_tap(struct vsock_tap *vt) { if (unlikely(vt->dev->type != ARPHRD_VSOCKMON)) return -EINVAL; __module_get(vt->module); spin_lock(&vsock_tap_lock); list_add_rcu(&vt->list, &vsock_tap_all); spin_unlock(&vsock_tap_lock); return 0; } EXPORT_SYMBOL_GPL(vsock_add_tap); int vsock_remove_tap(struct vsock_tap *vt) { struct vsock_tap *tmp; bool found = false; spin_lock(&vsock_tap_lock); list_for_each_entry(tmp, &vsock_tap_all, list) { if (vt == tmp) { list_del_rcu(&vt->list); found = true; goto out; } } pr_warn("vsock_remove_tap: %p not found\n", vt); out: spin_unlock(&vsock_tap_lock); synchronize_net(); if (found) module_put(vt->module); return found ? 0 : -ENODEV; } EXPORT_SYMBOL_GPL(vsock_remove_tap); static int __vsock_deliver_tap_skb(struct sk_buff *skb, struct net_device *dev) { int ret = 0; struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); if (nskb) { dev_hold(dev); nskb->dev = dev; ret = dev_queue_xmit(nskb); if (unlikely(ret > 0)) ret = net_xmit_errno(ret); dev_put(dev); } return ret; } static void __vsock_deliver_tap(struct sk_buff *skb) { int ret; struct vsock_tap *tmp; list_for_each_entry_rcu(tmp, &vsock_tap_all, list) { ret = __vsock_deliver_tap_skb(skb, tmp->dev); if (unlikely(ret)) break; } } void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque) { struct sk_buff *skb; rcu_read_lock(); if (likely(list_empty(&vsock_tap_all))) goto out; skb = build_skb(opaque); if (skb) { __vsock_deliver_tap(skb); consume_skb(skb); } out: rcu_read_unlock(); } EXPORT_SYMBOL_GPL(vsock_deliver_tap); |
53 12 14 2 14 8 2 2 2 5 29 29 6 2 3 459 460 3 459 1956 1956 460 105 105 104 2 265 217 51 51 3 6 51 12 4 4 9 6 3 2 2 10 7 4 2 1 30 10 23 23 1 1 6 21 13 8 2 1 14 2 2 3 8 10 12 6 15 4 59 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 | // SPDX-License-Identifier: GPL-2.0-only /* * This is a module which is used for queueing packets and communicating with * userspace via nfnetlink. * * (C) 2005 by Harald Welte <laforge@netfilter.org> * (C) 2007 by Patrick McHardy <kaber@trash.net> * * Based on the old ipv4-only ip_queue.c: * (C) 2000-2002 James Morris <jmorris@intercode.com.au> * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/netfilter.h> #include <linux/proc_fs.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_queue.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/list.h> #include <linux/cgroup-defs.h> #include <net/gso.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/netfilter/nf_queue.h> #include <net/netns/generic.h> #include <linux/atomic.h> #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) #include "../bridge/br_private.h" #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif #define NFQNL_QMAX_DEFAULT 1024 /* We're using struct nlattr which has 16bit nla_len. Note that nla_len * includes the header length. Thus, the maximum packet length that we * support is 65531 bytes. We send truncated packets if the specified length * is larger than that. Userspace can check for presence of NFQA_CAP_LEN * attribute to detect truncation. */ #define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN) struct nfqnl_instance { struct hlist_node hlist; /* global list of queues */ struct rcu_head rcu; u32 peer_portid; unsigned int queue_maxlen; unsigned int copy_range; unsigned int queue_dropped; unsigned int queue_user_dropped; u_int16_t queue_num; /* number of this queue */ u_int8_t copy_mode; u_int32_t flags; /* Set using NFQA_CFG_FLAGS */ /* * Following fields are dirtied for each queued packet, * keep them in same cache line if possible. */ spinlock_t lock ____cacheline_aligned_in_smp; unsigned int queue_total; unsigned int id_sequence; /* 'sequence' of pkt ids */ struct list_head queue_list; /* packets in queue */ }; typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); static unsigned int nfnl_queue_net_id __read_mostly; #define INSTANCE_BUCKETS 16 struct nfnl_queue_net { spinlock_t instances_lock; struct hlist_head instance_table[INSTANCE_BUCKETS]; }; static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net) { return net_generic(net, nfnl_queue_net_id); } static inline u_int8_t instance_hashfn(u_int16_t queue_num) { return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS; } static struct nfqnl_instance * instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num) { struct hlist_head *head; struct nfqnl_instance *inst; head = &q->instance_table[instance_hashfn(queue_num)]; hlist_for_each_entry_rcu(inst, head, hlist) { if (inst->queue_num == queue_num) return inst; } return NULL; } static struct nfqnl_instance * instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) { struct nfqnl_instance *inst; unsigned int h; int err; spin_lock(&q->instances_lock); if (instance_lookup(q, queue_num)) { err = -EEXIST; goto out_unlock; } inst = kzalloc(sizeof(*inst), GFP_ATOMIC); if (!inst) { err = -ENOMEM; goto out_unlock; } inst->queue_num = queue_num; inst->peer_portid = portid; inst->queue_maxlen = NFQNL_QMAX_DEFAULT; inst->copy_range = NFQNL_MAX_COPY_RANGE; inst->copy_mode = NFQNL_COPY_NONE; spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); if (!try_module_get(THIS_MODULE)) { err = -EAGAIN; goto out_free; } h = instance_hashfn(queue_num); hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]); spin_unlock(&q->instances_lock); return inst; out_free: kfree(inst); out_unlock: spin_unlock(&q->instances_lock); return ERR_PTR(err); } static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data); static void instance_destroy_rcu(struct rcu_head *head) { struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, rcu); rcu_read_lock(); nfqnl_flush(inst, NULL, 0); rcu_read_unlock(); kfree(inst); module_put(THIS_MODULE); } static void __instance_destroy(struct nfqnl_instance *inst) { hlist_del_rcu(&inst->hlist); call_rcu(&inst->rcu, instance_destroy_rcu); } static void instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst) { spin_lock(&q->instances_lock); __instance_destroy(inst); spin_unlock(&q->instances_lock); } static inline void __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) { list_add_tail(&entry->list, &queue->queue_list); queue->queue_total++; } static void __dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) { list_del(&entry->list); queue->queue_total--; } static struct nf_queue_entry * find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) { struct nf_queue_entry *entry = NULL, *i; spin_lock_bh(&queue->lock); list_for_each_entry(i, &queue->queue_list, list) { if (i->id == id) { entry = i; break; } } if (entry) __dequeue_entry(queue, entry); spin_unlock_bh(&queue->lock); return entry; } static unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state, const struct nf_hook_entries *hooks, unsigned int *index) { const struct nf_hook_entry *hook; unsigned int verdict, i = *index; while (i < hooks->num_hook_entries) { hook = &hooks->hooks[i]; repeat: verdict = nf_hook_entry_hookfn(hook, skb, state); if (verdict != NF_ACCEPT) { *index = i; if (verdict != NF_REPEAT) return verdict; goto repeat; } i++; } *index = i; return NF_ACCEPT; } static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum) { switch (pf) { #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE case NFPROTO_BRIDGE: return rcu_dereference(net->nf.hooks_bridge[hooknum]); #endif case NFPROTO_IPV4: return rcu_dereference(net->nf.hooks_ipv4[hooknum]); case NFPROTO_IPV6: return rcu_dereference(net->nf.hooks_ipv6[hooknum]); default: WARN_ON_ONCE(1); return NULL; } return NULL; } static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) { #ifdef CONFIG_INET const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); if (entry->state.hook == NF_INET_LOCAL_OUT) { const struct iphdr *iph = ip_hdr(skb); if (!(iph->tos == rt_info->tos && skb->mark == rt_info->mark && iph->daddr == rt_info->daddr && iph->saddr == rt_info->saddr)) return ip_route_me_harder(entry->state.net, entry->state.sk, skb, RTN_UNSPEC); } #endif return 0; } static int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry) { const struct nf_ipv6_ops *v6ops; int ret = 0; switch (entry->state.pf) { case AF_INET: ret = nf_ip_reroute(skb, entry); break; case AF_INET6: v6ops = rcu_dereference(nf_ipv6_ops); if (v6ops) ret = v6ops->reroute(skb, entry); break; } return ret; } /* caller must hold rcu read-side lock */ static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) { const struct nf_hook_entry *hook_entry; const struct nf_hook_entries *hooks; struct sk_buff *skb = entry->skb; const struct net *net; unsigned int i; int err; u8 pf; net = entry->state.net; pf = entry->state.pf; hooks = nf_hook_entries_head(net, pf, entry->state.hook); i = entry->hook_index; if (!hooks || i >= hooks->num_hook_entries) { kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP); nf_queue_entry_free(entry); return; } hook_entry = &hooks->hooks[i]; /* Continue traversal iff userspace said ok... */ if (verdict == NF_REPEAT) verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state); if (verdict == NF_ACCEPT) { if (nf_reroute(skb, entry) < 0) verdict = NF_DROP; } if (verdict == NF_ACCEPT) { next_hook: ++i; verdict = nf_iterate(skb, &entry->state, hooks, &i); } switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: case NF_STOP: local_bh_disable(); entry->state.okfn(entry->state.net, entry->state.sk, skb); local_bh_enable(); break; case NF_QUEUE: err = nf_queue(skb, &entry->state, i, verdict); if (err == 1) goto next_hook; break; case NF_STOLEN: break; default: kfree_skb(skb); } nf_queue_entry_free(entry); } static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict) { const struct nf_ct_hook *ct_hook; if (verdict == NF_ACCEPT || verdict == NF_REPEAT || verdict == NF_STOP) { unsigned int ct_verdict = verdict; rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); if (ct_hook) ct_verdict = ct_hook->update(entry->state.net, entry->skb); rcu_read_unlock(); switch (ct_verdict & NF_VERDICT_MASK) { case NF_ACCEPT: /* follow userspace verdict, could be REPEAT */ break; case NF_STOLEN: nf_queue_entry_free(entry); return; default: verdict = ct_verdict & NF_VERDICT_MASK; break; } } nf_reinject(entry, verdict); } static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) { struct nf_queue_entry *entry, *next; spin_lock_bh(&queue->lock); list_for_each_entry_safe(entry, next, &queue->queue_list, list) { if (!cmpfn || cmpfn(entry, data)) { list_del(&entry->list); queue->queue_total--; nfqnl_reinject(entry, NF_DROP); } } spin_unlock_bh(&queue->lock); } static int nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet, bool csum_verify) { __u32 flags = 0; if (packet->ip_summed == CHECKSUM_PARTIAL) flags = NFQA_SKB_CSUMNOTREADY; else if (csum_verify) flags = NFQA_SKB_CSUM_NOTVERIFIED; if (skb_is_gso(packet)) flags |= NFQA_SKB_GSO; return flags ? nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags)) : 0; } static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk) { const struct cred *cred; if (!sk_fullsock(sk)) return 0; read_lock_bh(&sk->sk_callback_lock); if (sk->sk_socket && sk->sk_socket->file) { cred = sk->sk_socket->file->f_cred; if (nla_put_be32(skb, NFQA_UID, htonl(from_kuid_munged(&init_user_ns, cred->fsuid)))) goto nla_put_failure; if (nla_put_be32(skb, NFQA_GID, htonl(from_kgid_munged(&init_user_ns, cred->fsgid)))) goto nla_put_failure; } read_unlock_bh(&sk->sk_callback_lock); return 0; nla_put_failure: read_unlock_bh(&sk->sk_callback_lock); return -1; } static int nfqnl_put_sk_classid(struct sk_buff *skb, struct sock *sk) { #if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID) if (sk && sk_fullsock(sk)) { u32 classid = sock_cgroup_classid(&sk->sk_cgrp_data); if (classid && nla_put_be32(skb, NFQA_CGROUP_CLASSID, htonl(classid))) return -1; } #endif return 0; } static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata) { u32 seclen = 0; #if IS_ENABLED(CONFIG_NETWORK_SECMARK) if (!skb || !sk_fullsock(skb->sk)) return 0; read_lock_bh(&skb->sk->sk_callback_lock); if (skb->secmark) security_secid_to_secctx(skb->secmark, secdata, &seclen); read_unlock_bh(&skb->sk->sk_callback_lock); #endif return seclen; } static u32 nfqnl_get_bridge_size(struct nf_queue_entry *entry) { struct sk_buff *entskb = entry->skb; u32 nlalen = 0; if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb)) return 0; if (skb_vlan_tag_present(entskb)) nlalen += nla_total_size(nla_total_size(sizeof(__be16)) + nla_total_size(sizeof(__be16))); if (entskb->network_header > entskb->mac_header) nlalen += nla_total_size((entskb->network_header - entskb->mac_header)); return nlalen; } static int nfqnl_put_bridge(struct nf_queue_entry *entry, struct sk_buff *skb) { struct sk_buff *entskb = entry->skb; if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb)) return 0; if (skb_vlan_tag_present(entskb)) { struct nlattr *nest; nest = nla_nest_start(skb, NFQA_VLAN); if (!nest) goto nla_put_failure; if (nla_put_be16(skb, NFQA_VLAN_TCI, htons(entskb->vlan_tci)) || nla_put_be16(skb, NFQA_VLAN_PROTO, entskb->vlan_proto)) goto nla_put_failure; nla_nest_end(skb, nest); } if (entskb->mac_header < entskb->network_header) { int len = (int)(entskb->network_header - entskb->mac_header); if (nla_put(skb, NFQA_L2HDR, len, skb_mac_header(entskb))) goto nla_put_failure; } return 0; nla_put_failure: return -1; } static struct sk_buff * nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry, __be32 **packet_id_ptr) { size_t size; size_t data_len = 0, cap_len = 0; unsigned int hlen = 0; struct sk_buff *skb; struct nlattr *nla; struct nfqnl_msg_packet_hdr *pmsg; struct nlmsghdr *nlh; struct sk_buff *entskb = entry->skb; struct net_device *indev; struct net_device *outdev; struct nf_conn *ct = NULL; enum ip_conntrack_info ctinfo = 0; const struct nfnl_ct_hook *nfnl_ct; bool csum_verify; char *secdata = NULL; u32 seclen = 0; ktime_t tstamp; size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #endif + nla_total_size(sizeof(u_int32_t)) /* mark */ + nla_total_size(sizeof(u_int32_t)) /* priority */ + nla_total_size(sizeof(struct nfqnl_msg_packet_hw)) + nla_total_size(sizeof(u_int32_t)) /* skbinfo */ #if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID) + nla_total_size(sizeof(u_int32_t)) /* classid */ #endif + nla_total_size(sizeof(u_int32_t)); /* cap_len */ tstamp = skb_tstamp_cond(entskb, false); if (tstamp) size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); size += nfqnl_get_bridge_size(entry); if (entry->state.hook <= NF_INET_FORWARD || (entry->state.hook == NF_INET_POST_ROUTING && entskb->sk == NULL)) csum_verify = !skb_csum_unnecessary(entskb); else csum_verify = false; outdev = entry->state.out; switch ((enum nfqnl_config_mode)READ_ONCE(queue->copy_mode)) { case NFQNL_COPY_META: case NFQNL_COPY_NONE: break; case NFQNL_COPY_PACKET: if (!(queue->flags & NFQA_CFG_F_GSO) && entskb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(entskb)) return NULL; data_len = READ_ONCE(queue->copy_range); if (data_len > entskb->len) data_len = entskb->len; hlen = skb_zerocopy_headlen(entskb); hlen = min_t(unsigned int, hlen, data_len); size += sizeof(struct nlattr) + hlen; cap_len = entskb->len; break; } nfnl_ct = rcu_dereference(nfnl_ct_hook); #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (queue->flags & NFQA_CFG_F_CONNTRACK) { if (nfnl_ct != NULL) { ct = nf_ct_get(entskb, &ctinfo); if (ct != NULL) size += nfnl_ct->build_size(ct); } } #endif if (queue->flags & NFQA_CFG_F_UID_GID) { size += (nla_total_size(sizeof(u_int32_t)) /* uid */ + nla_total_size(sizeof(u_int32_t))); /* gid */ } if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) { seclen = nfqnl_get_sk_secctx(entskb, &secdata); if (seclen) size += nla_total_size(seclen); } skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { skb_tx_error(entskb); goto nlmsg_failure; } nlh = nfnl_msg_put(skb, 0, 0, nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET), 0, entry->state.pf, NFNETLINK_V0, htons(queue->queue_num)); if (!nlh) { skb_tx_error(entskb); kfree_skb(skb); goto nlmsg_failure; } nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg)); pmsg = nla_data(nla); pmsg->hw_protocol = entskb->protocol; pmsg->hook = entry->state.hook; *packet_id_ptr = &pmsg->packet_id; indev = entry->state.in; if (indev) { #if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; #else if (entry->state.pf == PF_BRIDGE) { /* Case 1: indev is physical input device, we need to * look for bridge group (when called from * netfilter_bridge) */ if (nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, htonl(indev->ifindex)) || /* this is the bridge group "brX" */ /* rcu_read_lock()ed by __nf_queue */ nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(br_port_get_rcu(indev)->br->dev->ifindex))) goto nla_put_failure; } else { int physinif; /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; physinif = nf_bridge_get_physinif(entskb); if (physinif && nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, htonl(physinif))) goto nla_put_failure; } #endif } if (outdev) { #if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; #else if (entry->state.pf == PF_BRIDGE) { /* Case 1: outdev is physical output device, we need to * look for bridge group (when called from * netfilter_bridge) */ if (nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, htonl(outdev->ifindex)) || /* this is the bridge group "brX" */ /* rcu_read_lock()ed by __nf_queue */ nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) goto nla_put_failure; } else { int physoutif; /* Case 2: outdev is bridge group, we need to look for * physical output device (when called from ipv4) */ if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; physoutif = nf_bridge_get_physoutif(entskb); if (physoutif && nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, htonl(physoutif))) goto nla_put_failure; } #endif } if (entskb->mark && nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark))) goto nla_put_failure; if (entskb->priority && nla_put_be32(skb, NFQA_PRIORITY, htonl(entskb->priority))) goto nla_put_failure; if (indev && entskb->dev && skb_mac_header_was_set(entskb) && skb_mac_header_len(entskb) != 0) { struct nfqnl_msg_packet_hw phw; int len; memset(&phw, 0, sizeof(phw)); len = dev_parse_header(entskb, phw.hw_addr); if (len) { phw.hw_addrlen = htons(len); if (nla_put(skb, NFQA_HWADDR, sizeof(phw), &phw)) goto nla_put_failure; } } if (nfqnl_put_bridge(entry, skb) < 0) goto nla_put_failure; if (entry->state.hook <= NF_INET_FORWARD && tstamp) { struct nfqnl_msg_packet_timestamp ts; struct timespec64 kts = ktime_to_timespec64(tstamp); ts.sec = cpu_to_be64(kts.tv_sec); ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); if (nla_put(skb, NFQA_TIMESTAMP, sizeof(ts), &ts)) goto nla_put_failure; } if ((queue->flags & NFQA_CFG_F_UID_GID) && entskb->sk && nfqnl_put_sk_uidgid(skb, entskb->sk) < 0) goto nla_put_failure; if (nfqnl_put_sk_classid(skb, entskb->sk) < 0) goto nla_put_failure; if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata)) goto nla_put_failure; if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0) goto nla_put_failure; if (cap_len > data_len && nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len))) goto nla_put_failure; if (nfqnl_put_packet_info(skb, entskb, csum_verify)) goto nla_put_failure; if (data_len) { struct nlattr *nla; if (skb_tailroom(skb) < sizeof(*nla) + hlen) goto nla_put_failure; nla = skb_put(skb, sizeof(*nla)); nla->nla_type = NFQA_PAYLOAD; nla->nla_len = nla_attr_size(data_len); if (skb_zerocopy(skb, entskb, data_len, hlen)) goto nla_put_failure; } nlh->nlmsg_len = skb->len; if (seclen) security_release_secctx(secdata, seclen); return skb; nla_put_failure: skb_tx_error(entskb); kfree_skb(skb); net_err_ratelimited("nf_queue: error creating packet message\n"); nlmsg_failure: if (seclen) security_release_secctx(secdata, seclen); return NULL; } static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) static const unsigned long flags = IPS_CONFIRMED | IPS_DYING; struct nf_conn *ct = (void *)skb_nfct(entry->skb); unsigned long status; unsigned int use; if (!ct) return false; status = READ_ONCE(ct->status); if ((status & flags) == IPS_DYING) return true; if (status & IPS_CONFIRMED) return false; /* in some cases skb_clone() can occur after initial conntrack * pickup, but conntrack assumes exclusive skb->_nfct ownership for * unconfirmed entries. * * This happens for br_netfilter and with ip multicast routing. * We can't be solved with serialization here because one clone could * have been queued for local delivery. */ use = refcount_read(&ct->ct_general.use); if (likely(use == 1)) return false; /* Can't decrement further? Exclusive ownership. */ if (!refcount_dec_not_one(&ct->ct_general.use)) return false; skb_set_nfct(entry->skb, 0); /* No nf_ct_put(): we already decremented .use and it cannot * drop down to 0. */ return true; #endif return false; } static int __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry) { struct sk_buff *nskb; int err = -ENOBUFS; __be32 *packet_id_ptr; int failopen = 0; nskb = nfqnl_build_packet_message(net, queue, entry, &packet_id_ptr); if (nskb == NULL) { err = -ENOMEM; goto err_out; } spin_lock_bh(&queue->lock); if (nf_ct_drop_unconfirmed(entry)) goto err_out_free_nskb; if (queue->queue_total >= queue->queue_maxlen) { if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { failopen = 1; err = 0; } else { queue->queue_dropped++; net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n", queue->queue_total); } goto err_out_free_nskb; } entry->id = ++queue->id_sequence; *packet_id_ptr = htonl(entry->id); /* nfnetlink_unicast will either free the nskb or add it to a socket */ err = nfnetlink_unicast(nskb, net, queue->peer_portid); if (err < 0) { if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { failopen = 1; err = 0; } else { queue->queue_user_dropped++; } goto err_out_unlock; } __enqueue_entry(queue, entry); spin_unlock_bh(&queue->lock); return 0; err_out_free_nskb: kfree_skb(nskb); err_out_unlock: spin_unlock_bh(&queue->lock); if (failopen) nfqnl_reinject(entry, NF_ACCEPT); err_out: return err; } static struct nf_queue_entry * nf_queue_entry_dup(struct nf_queue_entry *e) { struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); if (!entry) return NULL; if (nf_queue_entry_get_refs(entry)) return entry; kfree(entry); return NULL; } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* When called from bridge netfilter, skb->data must point to MAC header * before calling skb_gso_segment(). Else, original MAC header is lost * and segmented skbs will be sent to wrong destination. */ static void nf_bridge_adjust_skb_data(struct sk_buff *skb) { if (nf_bridge_info_get(skb)) __skb_push(skb, skb->network_header - skb->mac_header); } static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) { if (nf_bridge_info_get(skb)) __skb_pull(skb, skb->network_header - skb->mac_header); } #else #define nf_bridge_adjust_skb_data(s) do {} while (0) #define nf_bridge_adjust_segmented_data(s) do {} while (0) #endif static int __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, struct sk_buff *skb, struct nf_queue_entry *entry) { int ret = -ENOMEM; struct nf_queue_entry *entry_seg; nf_bridge_adjust_segmented_data(skb); if (skb->next == NULL) { /* last packet, no need to copy entry */ struct sk_buff *gso_skb = entry->skb; entry->skb = skb; ret = __nfqnl_enqueue_packet(net, queue, entry); if (ret) entry->skb = gso_skb; return ret; } skb_mark_not_on_list(skb); entry_seg = nf_queue_entry_dup(entry); if (entry_seg) { entry_seg->skb = skb; ret = __nfqnl_enqueue_packet(net, queue, entry_seg); if (ret) nf_queue_entry_free(entry_seg); } return ret; } static int nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) { unsigned int queued; struct nfqnl_instance *queue; struct sk_buff *skb, *segs, *nskb; int err = -ENOBUFS; struct net *net = entry->state.net; struct nfnl_queue_net *q = nfnl_queue_pernet(net); /* rcu_read_lock()ed by nf_hook_thresh */ queue = instance_lookup(q, queuenum); if (!queue) return -ESRCH; if (queue->copy_mode == NFQNL_COPY_NONE) return -EINVAL; skb = entry->skb; switch (entry->state.pf) { case NFPROTO_IPV4: skb->protocol = htons(ETH_P_IP); break; case NFPROTO_IPV6: skb->protocol = htons(ETH_P_IPV6); break; } if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb)) return __nfqnl_enqueue_packet(net, queue, entry); nf_bridge_adjust_skb_data(skb); segs = skb_gso_segment(skb, 0); /* Does not use PTR_ERR to limit the number of error codes that can be * returned by nf_queue. For instance, callers rely on -ESRCH to * mean 'ignore this hook'. */ if (IS_ERR_OR_NULL(segs)) goto out_err; queued = 0; err = 0; skb_list_walk_safe(segs, segs, nskb) { if (err == 0) err = __nfqnl_enqueue_packet_gso(net, queue, segs, entry); if (err == 0) queued++; else kfree_skb(segs); } if (queued) { if (err) /* some segments are already queued */ nf_queue_entry_free(entry); kfree_skb(skb); return 0; } out_err: nf_bridge_adjust_segmented_data(skb); return err; } static int nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int diff) { struct sk_buff *nskb; if (diff < 0) { unsigned int min_len = skb_transport_offset(e->skb); if (data_len < min_len) return -EINVAL; if (pskb_trim(e->skb, data_len)) return -ENOMEM; } else if (diff > 0) { if (data_len > 0xFFFF) return -EINVAL; if (diff > skb_tailroom(e->skb)) { nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), diff, GFP_ATOMIC); if (!nskb) return -ENOMEM; kfree_skb(e->skb); e->skb = nskb; } skb_put(e->skb, diff); } if (skb_ensure_writable(e->skb, data_len)) return -ENOMEM; skb_copy_to_linear_data(e->skb, data, data_len); e->skb->ip_summed = CHECKSUM_NONE; return 0; } static int nfqnl_set_mode(struct nfqnl_instance *queue, unsigned char mode, unsigned int range) { int status = 0; spin_lock_bh(&queue->lock); switch (mode) { case NFQNL_COPY_NONE: case NFQNL_COPY_META: queue->copy_mode = mode; queue->copy_range = 0; break; case NFQNL_COPY_PACKET: queue->copy_mode = mode; if (range == 0 || range > NFQNL_MAX_COPY_RANGE) queue->copy_range = NFQNL_MAX_COPY_RANGE; else queue->copy_range = range; break; default: status = -EINVAL; } spin_unlock_bh(&queue->lock); return status; } static int dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) int physinif, physoutif; physinif = nf_bridge_get_physinif(entry->skb); physoutif = nf_bridge_get_physoutif(entry->skb); if (physinif == ifindex || physoutif == ifindex) return 1; #endif if (entry->state.in) if (entry->state.in->ifindex == ifindex) return 1; if (entry->state.out) if (entry->state.out->ifindex == ifindex) return 1; return 0; } /* drop all packets with either indev or outdev == ifindex from all queue * instances */ static void nfqnl_dev_drop(struct net *net, int ifindex) { int i; struct nfnl_queue_net *q = nfnl_queue_pernet(net); rcu_read_lock(); for (i = 0; i < INSTANCE_BUCKETS; i++) { struct nfqnl_instance *inst; struct hlist_head *head = &q->instance_table[i]; hlist_for_each_entry_rcu(inst, head, hlist) nfqnl_flush(inst, dev_cmp, ifindex); } rcu_read_unlock(); } static int nfqnl_rcv_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) nfqnl_dev_drop(dev_net(dev), dev->ifindex); return NOTIFY_DONE; } static struct notifier_block nfqnl_dev_notifier = { .notifier_call = nfqnl_rcv_dev_event, }; static void nfqnl_nf_hook_drop(struct net *net) { struct nfnl_queue_net *q = nfnl_queue_pernet(net); int i; /* This function is also called on net namespace error unwind, * when pernet_ops->init() failed and ->exit() functions of the * previous pernet_ops gets called. * * This may result in a call to nfqnl_nf_hook_drop() before * struct nfnl_queue_net was allocated. */ if (!q) return; for (i = 0; i < INSTANCE_BUCKETS; i++) { struct nfqnl_instance *inst; struct hlist_head *head = &q->instance_table[i]; hlist_for_each_entry_rcu(inst, head, hlist) nfqnl_flush(inst, NULL, 0); } } static int nfqnl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; struct nfnl_queue_net *q = nfnl_queue_pernet(n->net); if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { int i; /* destroy all instances for this portid */ spin_lock(&q->instances_lock); for (i = 0; i < INSTANCE_BUCKETS; i++) { struct hlist_node *t2; struct nfqnl_instance *inst; struct hlist_head *head = &q->instance_table[i]; hlist_for_each_entry_safe(inst, t2, head, hlist) { if (n->portid == inst->peer_portid) __instance_destroy(inst); } } spin_unlock(&q->instances_lock); } return NOTIFY_DONE; } static struct notifier_block nfqnl_rtnl_notifier = { .notifier_call = nfqnl_rcv_nl_event, }; static const struct nla_policy nfqa_vlan_policy[NFQA_VLAN_MAX + 1] = { [NFQA_VLAN_TCI] = { .type = NLA_U16}, [NFQA_VLAN_PROTO] = { .type = NLA_U16}, }; static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, [NFQA_MARK] = { .type = NLA_U32 }, [NFQA_PAYLOAD] = { .type = NLA_UNSPEC }, [NFQA_CT] = { .type = NLA_UNSPEC }, [NFQA_EXP] = { .type = NLA_UNSPEC }, [NFQA_VLAN] = { .type = NLA_NESTED }, [NFQA_PRIORITY] = { .type = NLA_U32 }, }; static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, [NFQA_MARK] = { .type = NLA_U32 }, [NFQA_PRIORITY] = { .type = NLA_U32 }, }; static struct nfqnl_instance * verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, u32 nlportid) { struct nfqnl_instance *queue; queue = instance_lookup(q, queue_num); if (!queue) return ERR_PTR(-ENODEV); if (queue->peer_portid != nlportid) return ERR_PTR(-EPERM); return queue; } static struct nfqnl_msg_verdict_hdr* verdicthdr_get(const struct nlattr * const nfqa[]) { struct nfqnl_msg_verdict_hdr *vhdr; unsigned int verdict; if (!nfqa[NFQA_VERDICT_HDR]) return NULL; vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]); verdict = ntohl(vhdr->verdict) & NF_VERDICT_MASK; if (verdict > NF_MAX_VERDICT || verdict == NF_STOLEN) return NULL; return vhdr; } static int nfq_id_after(unsigned int id, unsigned int max) { return (int)(id - max) > 0; } static int nfqnl_recv_verdict_batch(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); u16 queue_num = ntohs(info->nfmsg->res_id); struct nf_queue_entry *entry, *tmp; struct nfqnl_msg_verdict_hdr *vhdr; struct nfqnl_instance *queue; unsigned int verdict, maxid; LIST_HEAD(batch_list); queue = verdict_instance_lookup(q, queue_num, NETLINK_CB(skb).portid); if (IS_ERR(queue)) return PTR_ERR(queue); vhdr = verdicthdr_get(nfqa); if (!vhdr) return -EINVAL; verdict = ntohl(vhdr->verdict); maxid = ntohl(vhdr->id); spin_lock_bh(&queue->lock); list_for_each_entry_safe(entry, tmp, &queue->queue_list, list) { if (nfq_id_after(entry->id, maxid)) break; __dequeue_entry(queue, entry); list_add_tail(&entry->list, &batch_list); } spin_unlock_bh(&queue->lock); if (list_empty(&batch_list)) return -ENOENT; list_for_each_entry_safe(entry, tmp, &batch_list, list) { if (nfqa[NFQA_MARK]) entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); if (nfqa[NFQA_PRIORITY]) entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY])); nfqnl_reinject(entry, verdict); } return 0; } static struct nf_conn *nfqnl_ct_parse(const struct nfnl_ct_hook *nfnl_ct, const struct nlmsghdr *nlh, const struct nlattr * const nfqa[], struct nf_queue_entry *entry, enum ip_conntrack_info *ctinfo) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct nf_conn *ct; ct = nf_ct_get(entry->skb, ctinfo); if (ct == NULL) return NULL; if (nfnl_ct->parse(nfqa[NFQA_CT], ct) < 0) return NULL; if (nfqa[NFQA_EXP]) nfnl_ct->attach_expect(nfqa[NFQA_EXP], ct, NETLINK_CB(entry->skb).portid, nlmsg_report(nlh)); return ct; #else return NULL; #endif } static int nfqa_parse_bridge(struct nf_queue_entry *entry, const struct nlattr * const nfqa[]) { if (nfqa[NFQA_VLAN]) { struct nlattr *tb[NFQA_VLAN_MAX + 1]; int err; err = nla_parse_nested_deprecated(tb, NFQA_VLAN_MAX, nfqa[NFQA_VLAN], nfqa_vlan_policy, NULL); if (err < 0) return err; if (!tb[NFQA_VLAN_TCI] || !tb[NFQA_VLAN_PROTO]) return -EINVAL; __vlan_hwaccel_put_tag(entry->skb, nla_get_be16(tb[NFQA_VLAN_PROTO]), ntohs(nla_get_be16(tb[NFQA_VLAN_TCI]))); } if (nfqa[NFQA_L2HDR]) { int mac_header_len = entry->skb->network_header - entry->skb->mac_header; if (mac_header_len != nla_len(nfqa[NFQA_L2HDR])) return -EINVAL; else if (mac_header_len > 0) memcpy(skb_mac_header(entry->skb), nla_data(nfqa[NFQA_L2HDR]), mac_header_len); } return 0; } static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); u_int16_t queue_num = ntohs(info->nfmsg->res_id); const struct nfnl_ct_hook *nfnl_ct; struct nfqnl_msg_verdict_hdr *vhdr; enum ip_conntrack_info ctinfo; struct nfqnl_instance *queue; struct nf_queue_entry *entry; struct nf_conn *ct = NULL; unsigned int verdict; int err; queue = verdict_instance_lookup(q, queue_num, NETLINK_CB(skb).portid); if (IS_ERR(queue)) return PTR_ERR(queue); vhdr = verdicthdr_get(nfqa); if (!vhdr) return -EINVAL; verdict = ntohl(vhdr->verdict); entry = find_dequeue_entry(queue, ntohl(vhdr->id)); if (entry == NULL) return -ENOENT; /* rcu lock already held from nfnl->call_rcu. */ nfnl_ct = rcu_dereference(nfnl_ct_hook); if (nfqa[NFQA_CT]) { if (nfnl_ct != NULL) ct = nfqnl_ct_parse(nfnl_ct, info->nlh, nfqa, entry, &ctinfo); } if (entry->state.pf == PF_BRIDGE) { err = nfqa_parse_bridge(entry, nfqa); if (err < 0) return err; } if (nfqa[NFQA_PAYLOAD]) { u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]); int diff = payload_len - entry->skb->len; if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]), payload_len, entry, diff) < 0) verdict = NF_DROP; if (ct && diff) nfnl_ct->seq_adjust(entry->skb, ct, ctinfo, diff); } if (nfqa[NFQA_MARK]) entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); if (nfqa[NFQA_PRIORITY]) entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY])); nfqnl_reinject(entry, verdict); return 0; } static int nfqnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { return -ENOTSUPP; } static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { [NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) }, [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) }, [NFQA_CFG_QUEUE_MAXLEN] = { .type = NLA_U32 }, [NFQA_CFG_MASK] = { .type = NLA_U32 }, [NFQA_CFG_FLAGS] = { .type = NLA_U32 }, }; static const struct nf_queue_handler nfqh = { .outfn = nfqnl_enqueue_packet, .nf_hook_drop = nfqnl_nf_hook_drop, }; static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); u_int16_t queue_num = ntohs(info->nfmsg->res_id); struct nfqnl_msg_config_cmd *cmd = NULL; struct nfqnl_instance *queue; __u32 flags = 0, mask = 0; int ret = 0; if (nfqa[NFQA_CFG_CMD]) { cmd = nla_data(nfqa[NFQA_CFG_CMD]); /* Obsolete commands without queue context */ switch (cmd->command) { case NFQNL_CFG_CMD_PF_BIND: return 0; case NFQNL_CFG_CMD_PF_UNBIND: return 0; } } /* Check if we support these flags in first place, dependencies should * be there too not to break atomicity. */ if (nfqa[NFQA_CFG_FLAGS]) { if (!nfqa[NFQA_CFG_MASK]) { /* A mask is needed to specify which flags are being * changed. */ return -EINVAL; } flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS])); mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK])); if (flags >= NFQA_CFG_F_MAX) return -EOPNOTSUPP; #if !IS_ENABLED(CONFIG_NETWORK_SECMARK) if (flags & mask & NFQA_CFG_F_SECCTX) return -EOPNOTSUPP; #endif if ((flags & mask & NFQA_CFG_F_CONNTRACK) && !rcu_access_pointer(nfnl_ct_hook)) { #ifdef CONFIG_MODULES nfnl_unlock(NFNL_SUBSYS_QUEUE); request_module("ip_conntrack_netlink"); nfnl_lock(NFNL_SUBSYS_QUEUE); if (rcu_access_pointer(nfnl_ct_hook)) return -EAGAIN; #endif return -EOPNOTSUPP; } } rcu_read_lock(); queue = instance_lookup(q, queue_num); if (queue && queue->peer_portid != NETLINK_CB(skb).portid) { ret = -EPERM; goto err_out_unlock; } if (cmd != NULL) { switch (cmd->command) { case NFQNL_CFG_CMD_BIND: if (queue) { ret = -EBUSY; goto err_out_unlock; } queue = instance_create(q, queue_num, NETLINK_CB(skb).portid); if (IS_ERR(queue)) { ret = PTR_ERR(queue); goto err_out_unlock; } break; case NFQNL_CFG_CMD_UNBIND: if (!queue) { ret = -ENODEV; goto err_out_unlock; } instance_destroy(q, queue); goto err_out_unlock; case NFQNL_CFG_CMD_PF_BIND: case NFQNL_CFG_CMD_PF_UNBIND: break; default: ret = -ENOTSUPP; goto err_out_unlock; } } if (!queue) { ret = -ENODEV; goto err_out_unlock; } if (nfqa[NFQA_CFG_PARAMS]) { struct nfqnl_msg_config_params *params = nla_data(nfqa[NFQA_CFG_PARAMS]); nfqnl_set_mode(queue, params->copy_mode, ntohl(params->copy_range)); } if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) { __be32 *queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]); spin_lock_bh(&queue->lock); queue->queue_maxlen = ntohl(*queue_maxlen); spin_unlock_bh(&queue->lock); } if (nfqa[NFQA_CFG_FLAGS]) { spin_lock_bh(&queue->lock); queue->flags &= ~mask; queue->flags |= flags & mask; spin_unlock_bh(&queue->lock); } err_out_unlock: rcu_read_unlock(); return ret; } static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp, .type = NFNL_CB_RCU, .attr_count = NFQA_MAX, }, [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict, .type = NFNL_CB_RCU, .attr_count = NFQA_MAX, .policy = nfqa_verdict_policy }, [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, .type = NFNL_CB_MUTEX, .attr_count = NFQA_CFG_MAX, .policy = nfqa_cfg_policy }, [NFQNL_MSG_VERDICT_BATCH] = { .call = nfqnl_recv_verdict_batch, .type = NFNL_CB_RCU, .attr_count = NFQA_MAX, .policy = nfqa_verdict_batch_policy }, }; static const struct nfnetlink_subsystem nfqnl_subsys = { .name = "nf_queue", .subsys_id = NFNL_SUBSYS_QUEUE, .cb_count = NFQNL_MSG_MAX, .cb = nfqnl_cb, }; #ifdef CONFIG_PROC_FS struct iter_state { struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *get_first(struct seq_file *seq) { struct iter_state *st = seq->private; struct net *net; struct nfnl_queue_net *q; if (!st) return NULL; net = seq_file_net(seq); q = nfnl_queue_pernet(net); for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { if (!hlist_empty(&q->instance_table[st->bucket])) return q->instance_table[st->bucket].first; } return NULL; } static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) { struct iter_state *st = seq->private; struct net *net = seq_file_net(seq); h = h->next; while (!h) { struct nfnl_queue_net *q; if (++st->bucket >= INSTANCE_BUCKETS) return NULL; q = nfnl_queue_pernet(net); h = q->instance_table[st->bucket].first; } return h; } static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) { struct hlist_node *head; head = get_first(seq); if (head) while (pos && (head = get_next(seq, head))) pos--; return pos ? NULL : head; } static void *seq_start(struct seq_file *s, loff_t *pos) __acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock) { spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); return get_idx(s, *pos); } static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { (*pos)++; return get_next(s, v); } static void seq_stop(struct seq_file *s, void *v) __releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock) { spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); } static int seq_show(struct seq_file *s, void *v) { const struct nfqnl_instance *inst = v; seq_printf(s, "%5u %6u %5u %1u %5u %5u %5u %8u %2d\n", inst->queue_num, inst->peer_portid, inst->queue_total, inst->copy_mode, inst->copy_range, inst->queue_dropped, inst->queue_user_dropped, inst->id_sequence, 1); return 0; } static const struct seq_operations nfqnl_seq_ops = { .start = seq_start, .next = seq_next, .stop = seq_stop, .show = seq_show, }; #endif /* PROC_FS */ static int __net_init nfnl_queue_net_init(struct net *net) { unsigned int i; struct nfnl_queue_net *q = nfnl_queue_pernet(net); for (i = 0; i < INSTANCE_BUCKETS; i++) INIT_HLIST_HEAD(&q->instance_table[i]); spin_lock_init(&q->instances_lock); #ifdef CONFIG_PROC_FS if (!proc_create_net("nfnetlink_queue", 0440, net->nf.proc_netfilter, &nfqnl_seq_ops, sizeof(struct iter_state))) return -ENOMEM; #endif return 0; } static void __net_exit nfnl_queue_net_exit(struct net *net) { struct nfnl_queue_net *q = nfnl_queue_pernet(net); unsigned int i; #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); #endif for (i = 0; i < INSTANCE_BUCKETS; i++) WARN_ON_ONCE(!hlist_empty(&q->instance_table[i])); } static struct pernet_operations nfnl_queue_net_ops = { .init = nfnl_queue_net_init, .exit = nfnl_queue_net_exit, .id = &nfnl_queue_net_id, .size = sizeof(struct nfnl_queue_net), }; static int __init nfnetlink_queue_init(void) { int status; status = register_pernet_subsys(&nfnl_queue_net_ops); if (status < 0) { pr_err("failed to register pernet ops\n"); goto out; } netlink_register_notifier(&nfqnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfqnl_subsys); if (status < 0) { pr_err("failed to create netlink socket\n"); goto cleanup_netlink_notifier; } status = register_netdevice_notifier(&nfqnl_dev_notifier); if (status < 0) { pr_err("failed to register netdevice notifier\n"); goto cleanup_netlink_subsys; } nf_register_queue_handler(&nfqh); return status; cleanup_netlink_subsys: nfnetlink_subsys_unregister(&nfqnl_subsys); cleanup_netlink_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_queue_net_ops); out: return status; } static void __exit nfnetlink_queue_fini(void) { nf_unregister_queue_handler(); unregister_netdevice_notifier(&nfqnl_dev_notifier); nfnetlink_subsys_unregister(&nfqnl_subsys); netlink_unregister_notifier(&nfqnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_queue_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ } MODULE_DESCRIPTION("netfilter packet queue handler"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE); module_init(nfnetlink_queue_init); module_exit(nfnetlink_queue_fini); |
2 1 2 1 1 1 2 2 4 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 | // SPDX-License-Identifier: GPL-2.0 /* * super.c * * Copyright (c) 1999 Al Smith * * Portions derived from work (c) 1995,1996 Christian Vogelgsang. */ #include <linux/init.h> #include <linux/module.h> #include <linux/exportfs.h> #include <linux/slab.h> #include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/blkdev.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include "efs.h" #include <linux/efs_vh.h> #include <linux/efs_fs_sb.h> static int efs_statfs(struct dentry *dentry, struct kstatfs *buf); static int efs_init_fs_context(struct fs_context *fc); static void efs_kill_sb(struct super_block *s) { struct efs_sb_info *sbi = SUPER_INFO(s); kill_block_super(s); kfree(sbi); } static struct pt_types sgi_pt_types[] = { {0x00, "SGI vh"}, {0x01, "SGI trkrepl"}, {0x02, "SGI secrepl"}, {0x03, "SGI raw"}, {0x04, "SGI bsd"}, {SGI_SYSV, "SGI sysv"}, {0x06, "SGI vol"}, {SGI_EFS, "SGI efs"}, {0x08, "SGI lv"}, {0x09, "SGI rlv"}, {0x0A, "SGI xfs"}, {0x0B, "SGI xfslog"}, {0x0C, "SGI xlv"}, {0x82, "Linux swap"}, {0x83, "Linux native"}, {0, NULL} }; enum { Opt_explicit_open, }; static const struct fs_parameter_spec efs_param_spec[] = { fsparam_flag ("explicit-open", Opt_explicit_open), {} }; /* * File system definition and registration. */ static struct file_system_type efs_fs_type = { .owner = THIS_MODULE, .name = "efs", .kill_sb = efs_kill_sb, .fs_flags = FS_REQUIRES_DEV, .init_fs_context = efs_init_fs_context, .parameters = efs_param_spec, }; MODULE_ALIAS_FS("efs"); static struct kmem_cache * efs_inode_cachep; static struct inode *efs_alloc_inode(struct super_block *sb) { struct efs_inode_info *ei; ei = alloc_inode_sb(sb, efs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; } static void efs_free_inode(struct inode *inode) { kmem_cache_free(efs_inode_cachep, INODE_INFO(inode)); } static void init_once(void *foo) { struct efs_inode_info *ei = (struct efs_inode_info *) foo; inode_init_once(&ei->vfs_inode); } static int __init init_inodecache(void) { efs_inode_cachep = kmem_cache_create("efs_inode_cache", sizeof(struct efs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, init_once); if (efs_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(efs_inode_cachep); } static const struct super_operations efs_superblock_operations = { .alloc_inode = efs_alloc_inode, .free_inode = efs_free_inode, .statfs = efs_statfs, }; static const struct export_operations efs_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = efs_fh_to_dentry, .fh_to_parent = efs_fh_to_parent, .get_parent = efs_get_parent, }; static int __init init_efs_fs(void) { int err; pr_info(EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n"); err = init_inodecache(); if (err) goto out1; err = register_filesystem(&efs_fs_type); if (err) goto out; return 0; out: destroy_inodecache(); out1: return err; } static void __exit exit_efs_fs(void) { unregister_filesystem(&efs_fs_type); destroy_inodecache(); } module_init(init_efs_fs) module_exit(exit_efs_fs) static efs_block_t efs_validate_vh(struct volume_header *vh) { int i; __be32 cs, *ui; int csum; efs_block_t sblock = 0; /* shuts up gcc */ struct pt_types *pt_entry; int pt_type, slice = -1; if (be32_to_cpu(vh->vh_magic) != VHMAGIC) { /* * assume that we're dealing with a partition and allow * read_super() to try and detect a valid superblock * on the next block. */ return 0; } ui = ((__be32 *) (vh + 1)) - 1; for(csum = 0; ui >= ((__be32 *) vh);) { cs = *ui--; csum += be32_to_cpu(cs); } if (csum) { pr_warn("SGI disklabel: checksum bad, label corrupted\n"); return 0; } #ifdef DEBUG pr_debug("bf: \"%16s\"\n", vh->vh_bootfile); for(i = 0; i < NVDIR; i++) { int j; char name[VDNAMESIZE+1]; for(j = 0; j < VDNAMESIZE; j++) { name[j] = vh->vh_vd[i].vd_name[j]; } name[j] = (char) 0; if (name[0]) { pr_debug("vh: %8s block: 0x%08x size: 0x%08x\n", name, (int) be32_to_cpu(vh->vh_vd[i].vd_lbn), (int) be32_to_cpu(vh->vh_vd[i].vd_nbytes)); } } #endif for(i = 0; i < NPARTAB; i++) { pt_type = (int) be32_to_cpu(vh->vh_pt[i].pt_type); for(pt_entry = sgi_pt_types; pt_entry->pt_name; pt_entry++) { if (pt_type == pt_entry->pt_type) break; } #ifdef DEBUG if (be32_to_cpu(vh->vh_pt[i].pt_nblks)) { pr_debug("pt %2d: start: %08d size: %08d type: 0x%02x (%s)\n", i, (int)be32_to_cpu(vh->vh_pt[i].pt_firstlbn), (int)be32_to_cpu(vh->vh_pt[i].pt_nblks), pt_type, (pt_entry->pt_name) ? pt_entry->pt_name : "unknown"); } #endif if (IS_EFS(pt_type)) { sblock = be32_to_cpu(vh->vh_pt[i].pt_firstlbn); slice = i; } } if (slice == -1) { pr_notice("partition table contained no EFS partitions\n"); #ifdef DEBUG } else { pr_info("using slice %d (type %s, offset 0x%x)\n", slice, (pt_entry->pt_name) ? pt_entry->pt_name : "unknown", sblock); #endif } return sblock; } static int efs_validate_super(struct efs_sb_info *sb, struct efs_super *super) { if (!IS_EFS_MAGIC(be32_to_cpu(super->fs_magic))) return -1; sb->fs_magic = be32_to_cpu(super->fs_magic); sb->total_blocks = be32_to_cpu(super->fs_size); sb->first_block = be32_to_cpu(super->fs_firstcg); sb->group_size = be32_to_cpu(super->fs_cgfsize); sb->data_free = be32_to_cpu(super->fs_tfree); sb->inode_free = be32_to_cpu(super->fs_tinode); sb->inode_blocks = be16_to_cpu(super->fs_cgisize); sb->total_groups = be16_to_cpu(super->fs_ncg); return 0; } static int efs_fill_super(struct super_block *s, struct fs_context *fc) { struct efs_sb_info *sb; struct buffer_head *bh; struct inode *root; sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL); if (!sb) return -ENOMEM; s->s_fs_info = sb; s->s_time_min = 0; s->s_time_max = U32_MAX; s->s_magic = EFS_SUPER_MAGIC; if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { pr_err("device does not support %d byte blocks\n", EFS_BLOCKSIZE); return -EINVAL; } /* read the vh (volume header) block */ bh = sb_bread(s, 0); if (!bh) { pr_err("cannot read volume header\n"); return -EIO; } /* * if this returns zero then we didn't find any partition table. * this isn't (yet) an error - just assume for the moment that * the device is valid and go on to search for a superblock. */ sb->fs_start = efs_validate_vh((struct volume_header *) bh->b_data); brelse(bh); if (sb->fs_start == -1) { return -EINVAL; } bh = sb_bread(s, sb->fs_start + EFS_SUPER); if (!bh) { pr_err("cannot read superblock\n"); return -EIO; } if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { #ifdef DEBUG pr_warn("invalid superblock at block %u\n", sb->fs_start + EFS_SUPER); #endif brelse(bh); return -EINVAL; } brelse(bh); if (!sb_rdonly(s)) { #ifdef DEBUG pr_info("forcing read-only mode\n"); #endif s->s_flags |= SB_RDONLY; } s->s_op = &efs_superblock_operations; s->s_export_op = &efs_export_ops; root = efs_iget(s, EFS_ROOTINODE); if (IS_ERR(root)) { pr_err("get root inode failed\n"); return PTR_ERR(root); } s->s_root = d_make_root(root); if (!(s->s_root)) { pr_err("get root dentry failed\n"); return -ENOMEM; } return 0; } static void efs_free_fc(struct fs_context *fc) { kfree(fc->fs_private); } static int efs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, efs_fill_super); } static int efs_parse_param(struct fs_context *fc, struct fs_parameter *param) { int token; struct fs_parse_result result; token = fs_parse(fc, efs_param_spec, param, &result); if (token < 0) return token; return 0; } static int efs_reconfigure(struct fs_context *fc) { sync_filesystem(fc->root->d_sb); return 0; } struct efs_context { unsigned long s_mount_opts; }; static const struct fs_context_operations efs_context_opts = { .parse_param = efs_parse_param, .get_tree = efs_get_tree, .reconfigure = efs_reconfigure, .free = efs_free_fc, }; /* * Set up the filesystem mount context. */ static int efs_init_fs_context(struct fs_context *fc) { struct efs_context *ctx; ctx = kzalloc(sizeof(struct efs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; fc->fs_private = ctx; fc->ops = &efs_context_opts; return 0; } static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct efs_sb_info *sbi = SUPER_INFO(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = EFS_SUPER_MAGIC; /* efs magic number */ buf->f_bsize = EFS_BLOCKSIZE; /* blocksize */ buf->f_blocks = sbi->total_groups * /* total data blocks */ (sbi->group_size - sbi->inode_blocks); buf->f_bfree = sbi->data_free; /* free data blocks */ buf->f_bavail = sbi->data_free; /* free blocks for non-root */ buf->f_files = sbi->total_groups * /* total inodes */ sbi->inode_blocks * (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); buf->f_ffree = sbi->inode_free; /* free inodes */ buf->f_fsid = u64_to_fsid(id); buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ return 0; } |
18 49 67 66 67 67 67 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 | // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor ipc mediation * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2017 Canonical Ltd. */ #include <linux/gfp.h> #include "include/audit.h" #include "include/capability.h" #include "include/cred.h" #include "include/policy.h" #include "include/ipc.h" #include "include/sig_names.h" static inline int map_signal_num(int sig) { if (sig > SIGRTMAX) return SIGUNKNOWN; else if (sig >= SIGRTMIN) return sig - SIGRTMIN + SIGRT_BASE; else if (sig < MAXMAPPED_SIG) return sig_map[sig]; return SIGUNKNOWN; } /** * audit_signal_mask - convert mask to permission string * @mask: permission mask to convert * * Returns: pointer to static string */ static const char *audit_signal_mask(u32 mask) { if (mask & MAY_READ) return "receive"; if (mask & MAY_WRITE) return "send"; return ""; } /** * audit_signal_cb() - call back for signal specific audit fields * @ab: audit_buffer (NOT NULL) * @va: audit struct to audit values of (NOT NULL) */ static void audit_signal_cb(struct audit_buffer *ab, void *va) { struct common_audit_data *sa = va; struct apparmor_audit_data *ad = aad(sa); if (ad->request & AA_SIGNAL_PERM_MASK) { audit_log_format(ab, " requested_mask=\"%s\"", audit_signal_mask(ad->request)); if (ad->denied & AA_SIGNAL_PERM_MASK) { audit_log_format(ab, " denied_mask=\"%s\"", audit_signal_mask(ad->denied)); } } if (ad->signal == SIGUNKNOWN) audit_log_format(ab, "signal=unknown(%d)", ad->unmappedsig); else if (ad->signal < MAXMAPPED_SIGNAME) audit_log_format(ab, " signal=%s", sig_names[ad->signal]); else audit_log_format(ab, " signal=rtmin+%d", ad->signal - SIGRT_BASE); audit_log_format(ab, " peer="); aa_label_xaudit(ab, labels_ns(ad->subj_label), ad->peer, FLAGS_NONE, GFP_ATOMIC); } static int profile_signal_perm(const struct cred *cred, struct aa_profile *profile, struct aa_label *peer, u32 request, struct apparmor_audit_data *ad) { struct aa_ruleset *rules = list_first_entry(&profile->rules, typeof(*rules), list); struct aa_perms perms; aa_state_t state; if (profile_unconfined(profile) || !ANY_RULE_MEDIATES(&profile->rules, AA_CLASS_SIGNAL)) return 0; ad->subj_cred = cred; ad->peer = peer; /* TODO: secondary cache check <profile, profile, perm> */ state = aa_dfa_next(rules->policy->dfa, rules->policy->start[AA_CLASS_SIGNAL], ad->signal); aa_label_match(profile, rules, peer, state, false, request, &perms); aa_apply_modes_to_perms(profile, &perms); return aa_check_perms(profile, &perms, request, ad, audit_signal_cb); } int aa_may_signal(const struct cred *subj_cred, struct aa_label *sender, const struct cred *target_cred, struct aa_label *target, int sig) { struct aa_profile *profile; DEFINE_AUDIT_DATA(ad, LSM_AUDIT_DATA_NONE, AA_CLASS_SIGNAL, OP_SIGNAL); ad.signal = map_signal_num(sig); ad.unmappedsig = sig; return xcheck_labels(sender, target, profile, profile_signal_perm(subj_cred, profile, target, MAY_WRITE, &ad), profile_signal_perm(target_cred, profile, sender, MAY_READ, &ad)); } |
509 6 511 400 399 399 400 400 399 400 3 7 7 399 313 313 2 2 2 388 387 313 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 | #include <linux/atomic.h> #include <linux/export.h> #include <linux/generic-radix-tree.h> #include <linux/gfp.h> #include <linux/kmemleak.h> #define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *)) #define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY) struct genradix_node { union { /* Interior node: */ struct genradix_node *children[GENRADIX_ARY]; /* Leaf: */ u8 data[GENRADIX_NODE_SIZE]; }; }; static inline int genradix_depth_shift(unsigned depth) { return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth; } /* * Returns size (of data, in bytes) that a tree of a given depth holds: */ static inline size_t genradix_depth_size(unsigned depth) { return 1UL << genradix_depth_shift(depth); } /* depth that's needed for a genradix that can address up to ULONG_MAX: */ #define GENRADIX_MAX_DEPTH \ DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT) #define GENRADIX_DEPTH_MASK \ ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1)) static inline unsigned genradix_root_to_depth(struct genradix_root *r) { return (unsigned long) r & GENRADIX_DEPTH_MASK; } static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r) { return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK); } /* * Returns pointer to the specified byte @offset within @radix, or NULL if not * allocated */ void *__genradix_ptr(struct __genradix *radix, size_t offset) { struct genradix_root *r = READ_ONCE(radix->root); struct genradix_node *n = genradix_root_to_node(r); unsigned level = genradix_root_to_depth(r); if (ilog2(offset) >= genradix_depth_shift(level)) return NULL; while (1) { if (!n) return NULL; if (!level) break; level--; n = n->children[offset >> genradix_depth_shift(level)]; offset &= genradix_depth_size(level) - 1; } return &n->data[offset]; } EXPORT_SYMBOL(__genradix_ptr); static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask) { return kzalloc(GENRADIX_NODE_SIZE, gfp_mask); } static inline void genradix_free_node(struct genradix_node *node) { kfree(node); } /* * Returns pointer to the specified byte @offset within @radix, allocating it if * necessary - newly allocated slots are always zeroed out: */ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset, gfp_t gfp_mask) { struct genradix_root *v = READ_ONCE(radix->root); struct genradix_node *n, *new_node = NULL; unsigned level; /* Increase tree depth if necessary: */ while (1) { struct genradix_root *r = v, *new_root; n = genradix_root_to_node(r); level = genradix_root_to_depth(r); if (n && ilog2(offset) < genradix_depth_shift(level)) break; if (!new_node) { new_node = genradix_alloc_node(gfp_mask); if (!new_node) return NULL; } new_node->children[0] = n; new_root = ((struct genradix_root *) ((unsigned long) new_node | (n ? level + 1 : 0))); if ((v = cmpxchg_release(&radix->root, r, new_root)) == r) { v = new_root; new_node = NULL; } else { new_node->children[0] = NULL; } } while (level--) { struct genradix_node **p = &n->children[offset >> genradix_depth_shift(level)]; offset &= genradix_depth_size(level) - 1; n = READ_ONCE(*p); if (!n) { if (!new_node) { new_node = genradix_alloc_node(gfp_mask); if (!new_node) return NULL; } if (!(n = cmpxchg_release(p, NULL, new_node))) swap(n, new_node); } } if (new_node) genradix_free_node(new_node); return &n->data[offset]; } EXPORT_SYMBOL(__genradix_ptr_alloc); void *__genradix_iter_peek(struct genradix_iter *iter, struct __genradix *radix, size_t objs_per_page) { struct genradix_root *r; struct genradix_node *n; unsigned level, i; if (iter->offset == SIZE_MAX) return NULL; restart: r = READ_ONCE(radix->root); if (!r) return NULL; n = genradix_root_to_node(r); level = genradix_root_to_depth(r); if (ilog2(iter->offset) >= genradix_depth_shift(level)) return NULL; while (level) { level--; i = (iter->offset >> genradix_depth_shift(level)) & (GENRADIX_ARY - 1); while (!n->children[i]) { size_t objs_per_ptr = genradix_depth_size(level); if (iter->offset + objs_per_ptr < iter->offset) { iter->offset = SIZE_MAX; iter->pos = SIZE_MAX; return NULL; } i++; iter->offset = round_down(iter->offset + objs_per_ptr, objs_per_ptr); iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; if (i == GENRADIX_ARY) goto restart; } n = n->children[i]; } return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)]; } EXPORT_SYMBOL(__genradix_iter_peek); void *__genradix_iter_peek_prev(struct genradix_iter *iter, struct __genradix *radix, size_t objs_per_page, size_t obj_size_plus_page_remainder) { struct genradix_root *r; struct genradix_node *n; unsigned level, i; if (iter->offset == SIZE_MAX) return NULL; restart: r = READ_ONCE(radix->root); if (!r) return NULL; n = genradix_root_to_node(r); level = genradix_root_to_depth(r); if (ilog2(iter->offset) >= genradix_depth_shift(level)) { iter->offset = genradix_depth_size(level); iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; iter->offset -= obj_size_plus_page_remainder; iter->pos--; } while (level) { level--; i = (iter->offset >> genradix_depth_shift(level)) & (GENRADIX_ARY - 1); while (!n->children[i]) { size_t objs_per_ptr = genradix_depth_size(level); iter->offset = round_down(iter->offset, objs_per_ptr); iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; if (!iter->offset) return NULL; iter->offset -= obj_size_plus_page_remainder; iter->pos--; if (!i) goto restart; --i; } n = n->children[i]; } return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)]; } EXPORT_SYMBOL(__genradix_iter_peek_prev); static void genradix_free_recurse(struct genradix_node *n, unsigned level) { if (level) { unsigned i; for (i = 0; i < GENRADIX_ARY; i++) if (n->children[i]) genradix_free_recurse(n->children[i], level - 1); } genradix_free_node(n); } int __genradix_prealloc(struct __genradix *radix, size_t size, gfp_t gfp_mask) { size_t offset; for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE) if (!__genradix_ptr_alloc(radix, offset, gfp_mask)) return -ENOMEM; return 0; } EXPORT_SYMBOL(__genradix_prealloc); void __genradix_free(struct __genradix *radix) { struct genradix_root *r = xchg(&radix->root, NULL); genradix_free_recurse(genradix_root_to_node(r), genradix_root_to_depth(r)); } EXPORT_SYMBOL(__genradix_free); |
6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 | // SPDX-License-Identifier: GPL-2.0-only /* * ebt_pkttype * * Authors: * Bart De Schuymer <bdschuym@pandora.be> * * April, 2003 * */ #include <linux/module.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_pkttype.h> static bool ebt_pkttype_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct ebt_pkttype_info *info = par->matchinfo; return (skb->pkt_type == info->pkt_type) ^ info->invert; } static int ebt_pkttype_mt_check(const struct xt_mtchk_param *par) { const struct ebt_pkttype_info *info = par->matchinfo; if (info->invert != 0 && info->invert != 1) return -EINVAL; /* Allow any pkt_type value */ return 0; } static struct xt_match ebt_pkttype_mt_reg __read_mostly = { .name = "pkttype", .revision = 0, .family = NFPROTO_BRIDGE, .match = ebt_pkttype_mt, .checkentry = ebt_pkttype_mt_check, .matchsize = sizeof(struct ebt_pkttype_info), .me = THIS_MODULE, }; static int __init ebt_pkttype_init(void) { return xt_register_match(&ebt_pkttype_mt_reg); } static void __exit ebt_pkttype_fini(void) { xt_unregister_match(&ebt_pkttype_mt_reg); } module_init(ebt_pkttype_init); module_exit(ebt_pkttype_fini); MODULE_DESCRIPTION("Ebtables: Link layer packet type match"); MODULE_LICENSE("GPL"); |
4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 | // SPDX-License-Identifier: GPL-2.0 #ifndef _MEDIA_FRAME_VECTOR_H #define _MEDIA_FRAME_VECTOR_H /* Container for pinned pfns / pages in frame_vector.c */ struct frame_vector { unsigned int nr_allocated; /* Number of frames we have space for */ unsigned int nr_frames; /* Number of frames stored in ptrs array */ bool got_ref; /* Did we pin pages by getting page ref? */ bool is_pfns; /* Does array contain pages or pfns? */ void *ptrs[]; /* Array of pinned pfns / pages. Use * pfns_vector_pages() or pfns_vector_pfns() * for access */ }; struct frame_vector *frame_vector_create(unsigned int nr_frames); void frame_vector_destroy(struct frame_vector *vec); int get_vaddr_frames(unsigned long start, unsigned int nr_pfns, bool write, struct frame_vector *vec); void put_vaddr_frames(struct frame_vector *vec); int frame_vector_to_pages(struct frame_vector *vec); void frame_vector_to_pfns(struct frame_vector *vec); static inline unsigned int frame_vector_count(struct frame_vector *vec) { return vec->nr_frames; } static inline struct page **frame_vector_pages(struct frame_vector *vec) { if (vec->is_pfns) { int err = frame_vector_to_pages(vec); if (err) return ERR_PTR(err); } return (struct page **)(vec->ptrs); } static inline unsigned long *frame_vector_pfns(struct frame_vector *vec) { if (!vec->is_pfns) frame_vector_to_pfns(vec); return (unsigned long *)(vec->ptrs); } #endif /* _MEDIA_FRAME_VECTOR_H */ |
68 86 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* V4L2 device support header. Copyright (C) 2008 Hans Verkuil <hverkuil@xs4all.nl> */ #ifndef _V4L2_DEVICE_H #define _V4L2_DEVICE_H #include <media/media-device.h> #include <media/v4l2-subdev.h> #include <media/v4l2-dev.h> struct v4l2_ctrl_handler; /** * struct v4l2_device - main struct to for V4L2 device drivers * * @dev: pointer to struct device. * @mdev: pointer to struct media_device, may be NULL. * @subdevs: used to keep track of the registered subdevs * @lock: lock this struct; can be used by the driver as well * if this struct is embedded into a larger struct. * @name: unique device name, by default the driver name + bus ID * @notify: notify operation called by some sub-devices. * @ctrl_handler: The control handler. May be %NULL. * @prio: Device's priority state * @ref: Keep track of the references to this struct. * @release: Release function that is called when the ref count * goes to 0. * * Each instance of a V4L2 device should create the v4l2_device struct, * either stand-alone or embedded in a larger struct. * * It allows easy access to sub-devices (see v4l2-subdev.h) and provides * basic V4L2 device-level support. * * .. note:: * * #) @dev->driver_data points to this struct. * #) @dev might be %NULL if there is no parent device */ struct v4l2_device { struct device *dev; struct media_device *mdev; struct list_head subdevs; spinlock_t lock; char name[36]; void (*notify)(struct v4l2_subdev *sd, unsigned int notification, void *arg); struct v4l2_ctrl_handler *ctrl_handler; struct v4l2_prio_state prio; struct kref ref; void (*release)(struct v4l2_device *v4l2_dev); }; /** * v4l2_device_get - gets a V4L2 device reference * * @v4l2_dev: pointer to struct &v4l2_device * * This is an ancillary routine meant to increment the usage for the * struct &v4l2_device pointed by @v4l2_dev. */ static inline void v4l2_device_get(struct v4l2_device *v4l2_dev) { kref_get(&v4l2_dev->ref); } /** * v4l2_device_put - puts a V4L2 device reference * * @v4l2_dev: pointer to struct &v4l2_device * * This is an ancillary routine meant to decrement the usage for the * struct &v4l2_device pointed by @v4l2_dev. */ int v4l2_device_put(struct v4l2_device *v4l2_dev); /** * v4l2_device_register - Initialize v4l2_dev and make @dev->driver_data * point to @v4l2_dev. * * @dev: pointer to struct &device * @v4l2_dev: pointer to struct &v4l2_device * * .. note:: * @dev may be %NULL in rare cases (ISA devices). * In such case the caller must fill in the @v4l2_dev->name field * before calling this function. */ int __must_check v4l2_device_register(struct device *dev, struct v4l2_device *v4l2_dev); /** * v4l2_device_set_name - Optional function to initialize the * name field of struct &v4l2_device * * @v4l2_dev: pointer to struct &v4l2_device * @basename: base name for the device name * @instance: pointer to a static atomic_t var with the instance usage for * the device driver. * * v4l2_device_set_name() initializes the name field of struct &v4l2_device * using the driver name and a driver-global atomic_t instance. * * This function will increment the instance counter and returns the * instance value used in the name. * * Example: * * static atomic_t drv_instance = ATOMIC_INIT(0); * * ... * * instance = v4l2_device_set_name(&\ v4l2_dev, "foo", &\ drv_instance); * * The first time this is called the name field will be set to foo0 and * this function returns 0. If the name ends with a digit (e.g. cx18), * then the name will be set to cx18-0 since cx180 would look really odd. */ int v4l2_device_set_name(struct v4l2_device *v4l2_dev, const char *basename, atomic_t *instance); /** * v4l2_device_disconnect - Change V4L2 device state to disconnected. * * @v4l2_dev: pointer to struct v4l2_device * * Should be called when the USB parent disconnects. * Since the parent disappears, this ensures that @v4l2_dev doesn't have * an invalid parent pointer. * * .. note:: This function sets @v4l2_dev->dev to NULL. */ void v4l2_device_disconnect(struct v4l2_device *v4l2_dev); /** * v4l2_device_unregister - Unregister all sub-devices and any other * resources related to @v4l2_dev. * * @v4l2_dev: pointer to struct v4l2_device */ void v4l2_device_unregister(struct v4l2_device *v4l2_dev); /** * v4l2_device_register_subdev - Registers a subdev with a v4l2 device. * * @v4l2_dev: pointer to struct &v4l2_device * @sd: pointer to &struct v4l2_subdev * * While registered, the subdev module is marked as in-use. * * An error is returned if the module is no longer loaded on any attempts * to register it. */ #define v4l2_device_register_subdev(v4l2_dev, sd) \ __v4l2_device_register_subdev(v4l2_dev, sd, THIS_MODULE) int __must_check __v4l2_device_register_subdev(struct v4l2_device *v4l2_dev, struct v4l2_subdev *sd, struct module *module); /** * v4l2_device_unregister_subdev - Unregisters a subdev with a v4l2 device. * * @sd: pointer to &struct v4l2_subdev * * .. note :: * * Can also be called if the subdev wasn't registered. In such * case, it will do nothing. */ void v4l2_device_unregister_subdev(struct v4l2_subdev *sd); /** * __v4l2_device_register_subdev_nodes - Registers device nodes for * all subdevs of the v4l2 device that are marked with the * %V4L2_SUBDEV_FL_HAS_DEVNODE flag. * * @v4l2_dev: pointer to struct v4l2_device * @read_only: subdevices read-only flag. True to register the subdevices * device nodes in read-only mode, false to allow full access to the * subdevice userspace API. */ int __must_check __v4l2_device_register_subdev_nodes(struct v4l2_device *v4l2_dev, bool read_only); /** * v4l2_device_register_subdev_nodes - Registers subdevices device nodes with * unrestricted access to the subdevice userspace operations * * Internally calls __v4l2_device_register_subdev_nodes(). See its documentation * for more details. * * @v4l2_dev: pointer to struct v4l2_device */ static inline int __must_check v4l2_device_register_subdev_nodes(struct v4l2_device *v4l2_dev) { #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) return __v4l2_device_register_subdev_nodes(v4l2_dev, false); #else return 0; #endif } /** * v4l2_device_register_ro_subdev_nodes - Registers subdevices device nodes * in read-only mode * * Internally calls __v4l2_device_register_subdev_nodes(). See its documentation * for more details. * * @v4l2_dev: pointer to struct v4l2_device */ static inline int __must_check v4l2_device_register_ro_subdev_nodes(struct v4l2_device *v4l2_dev) { #if defined(CONFIG_VIDEO_V4L2_SUBDEV_API) return __v4l2_device_register_subdev_nodes(v4l2_dev, true); #else return 0; #endif } /** * v4l2_subdev_notify - Sends a notification to v4l2_device. * * @sd: pointer to &struct v4l2_subdev * @notification: type of notification. Please notice that the notification * type is driver-specific. * @arg: arguments for the notification. Those are specific to each * notification type. */ static inline void v4l2_subdev_notify(struct v4l2_subdev *sd, unsigned int notification, void *arg) { if (sd && sd->v4l2_dev && sd->v4l2_dev->notify) sd->v4l2_dev->notify(sd, notification, arg); } /** * v4l2_device_supports_requests - Test if requests are supported. * * @v4l2_dev: pointer to struct v4l2_device */ static inline bool v4l2_device_supports_requests(struct v4l2_device *v4l2_dev) { return v4l2_dev->mdev && v4l2_dev->mdev->ops && v4l2_dev->mdev->ops->req_queue; } /* Helper macros to iterate over all subdevs. */ /** * v4l2_device_for_each_subdev - Helper macro that interates over all * sub-devices of a given &v4l2_device. * * @sd: pointer that will be filled by the macro with all * &struct v4l2_subdev pointer used as an iterator by the loop. * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * * This macro iterates over all sub-devices owned by the @v4l2_dev device. * It acts as a for loop iterator and executes the next statement with * the @sd variable pointing to each sub-device in turn. */ #define v4l2_device_for_each_subdev(sd, v4l2_dev) \ list_for_each_entry(sd, &(v4l2_dev)->subdevs, list) /** * __v4l2_device_call_subdevs_p - Calls the specified operation for * all subdevs matching the condition. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @sd: pointer that will be filled by the macro with all * &struct v4l2_subdev pointer used as an iterator by the loop. * @cond: condition to be match * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Ignore any errors. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define __v4l2_device_call_subdevs_p(v4l2_dev, sd, cond, o, f, args...) \ do { \ list_for_each_entry((sd), &(v4l2_dev)->subdevs, list) \ if ((cond) && (sd)->ops->o && (sd)->ops->o->f) \ (sd)->ops->o->f((sd) , ##args); \ } while (0) /** * __v4l2_device_call_subdevs - Calls the specified operation for * all subdevs matching the condition. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @cond: condition to be match * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Ignore any errors. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define __v4l2_device_call_subdevs(v4l2_dev, cond, o, f, args...) \ do { \ struct v4l2_subdev *__sd; \ \ __v4l2_device_call_subdevs_p(v4l2_dev, __sd, cond, o, \ f , ##args); \ } while (0) /** * __v4l2_device_call_subdevs_until_err_p - Calls the specified operation for * all subdevs matching the condition. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @sd: pointer that will be filled by the macro with all * &struct v4l2_subdev sub-devices associated with @v4l2_dev. * @cond: condition to be match * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Return: * * If the operation returns an error other than 0 or ``-ENOIOCTLCMD`` * for any subdevice, then abort and return with that error code, zero * otherwise. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define __v4l2_device_call_subdevs_until_err_p(v4l2_dev, sd, cond, o, f, args...) \ ({ \ long __err = 0; \ \ list_for_each_entry((sd), &(v4l2_dev)->subdevs, list) { \ if ((cond) && (sd)->ops->o && (sd)->ops->o->f) \ __err = (sd)->ops->o->f((sd) , ##args); \ if (__err && __err != -ENOIOCTLCMD) \ break; \ } \ (__err == -ENOIOCTLCMD) ? 0 : __err; \ }) /** * __v4l2_device_call_subdevs_until_err - Calls the specified operation for * all subdevs matching the condition. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @cond: condition to be match * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Return: * * If the operation returns an error other than 0 or ``-ENOIOCTLCMD`` * for any subdevice, then abort and return with that error code, * zero otherwise. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define __v4l2_device_call_subdevs_until_err(v4l2_dev, cond, o, f, args...) \ ({ \ struct v4l2_subdev *__sd; \ __v4l2_device_call_subdevs_until_err_p(v4l2_dev, __sd, cond, o, \ f , ##args); \ }) /** * v4l2_device_call_all - Calls the specified operation for * all subdevs matching the &v4l2_subdev.grp_id, as assigned * by the bridge driver. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpid: &struct v4l2_subdev->grp_id group ID to match. * Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Ignore any errors. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define v4l2_device_call_all(v4l2_dev, grpid, o, f, args...) \ do { \ struct v4l2_subdev *__sd; \ \ __v4l2_device_call_subdevs_p(v4l2_dev, __sd, \ (grpid) == 0 || __sd->grp_id == (grpid), o, f , \ ##args); \ } while (0) /** * v4l2_device_call_until_err - Calls the specified operation for * all subdevs matching the &v4l2_subdev.grp_id, as assigned * by the bridge driver, until an error occurs. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpid: &struct v4l2_subdev->grp_id group ID to match. * Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Return: * * If the operation returns an error other than 0 or ``-ENOIOCTLCMD`` * for any subdevice, then abort and return with that error code, * zero otherwise. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define v4l2_device_call_until_err(v4l2_dev, grpid, o, f, args...) \ ({ \ struct v4l2_subdev *__sd; \ __v4l2_device_call_subdevs_until_err_p(v4l2_dev, __sd, \ (grpid) == 0 || __sd->grp_id == (grpid), o, f , \ ##args); \ }) /** * v4l2_device_mask_call_all - Calls the specified operation for * all subdevices where a group ID matches a specified bitmask. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpmsk: bitmask to be checked against &struct v4l2_subdev->grp_id * group ID to be matched. Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Ignore any errors. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define v4l2_device_mask_call_all(v4l2_dev, grpmsk, o, f, args...) \ do { \ struct v4l2_subdev *__sd; \ \ __v4l2_device_call_subdevs_p(v4l2_dev, __sd, \ (grpmsk) == 0 || (__sd->grp_id & (grpmsk)), o, \ f , ##args); \ } while (0) /** * v4l2_device_mask_call_until_err - Calls the specified operation for * all subdevices where a group ID matches a specified bitmask. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpmsk: bitmask to be checked against &struct v4l2_subdev->grp_id * group ID to be matched. Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. * @args: arguments for @f. * * Return: * * If the operation returns an error other than 0 or ``-ENOIOCTLCMD`` * for any subdevice, then abort and return with that error code, * zero otherwise. * * Note: subdevs cannot be added or deleted while walking * the subdevs list. */ #define v4l2_device_mask_call_until_err(v4l2_dev, grpmsk, o, f, args...) \ ({ \ struct v4l2_subdev *__sd; \ __v4l2_device_call_subdevs_until_err_p(v4l2_dev, __sd, \ (grpmsk) == 0 || (__sd->grp_id & (grpmsk)), o, \ f , ##args); \ }) /** * v4l2_device_has_op - checks if any subdev with matching grpid has a * given ops. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpid: &struct v4l2_subdev->grp_id group ID to match. * Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. */ #define v4l2_device_has_op(v4l2_dev, grpid, o, f) \ ({ \ struct v4l2_subdev *__sd; \ bool __result = false; \ list_for_each_entry(__sd, &(v4l2_dev)->subdevs, list) { \ if ((grpid) && __sd->grp_id != (grpid)) \ continue; \ if (v4l2_subdev_has_op(__sd, o, f)) { \ __result = true; \ break; \ } \ } \ __result; \ }) /** * v4l2_device_mask_has_op - checks if any subdev with matching group * mask has a given ops. * * @v4l2_dev: &struct v4l2_device owning the sub-devices to iterate over. * @grpmsk: bitmask to be checked against &struct v4l2_subdev->grp_id * group ID to be matched. Use 0 to match them all. * @o: name of the element at &struct v4l2_subdev_ops that contains @f. * Each element there groups a set of operations functions. * @f: operation function that will be called if @cond matches. * The operation functions are defined in groups, according to * each element at &struct v4l2_subdev_ops. */ #define v4l2_device_mask_has_op(v4l2_dev, grpmsk, o, f) \ ({ \ struct v4l2_subdev *__sd; \ bool __result = false; \ list_for_each_entry(__sd, &(v4l2_dev)->subdevs, list) { \ if ((grpmsk) && !(__sd->grp_id & (grpmsk))) \ continue; \ if (v4l2_subdev_has_op(__sd, o, f)) { \ __result = true; \ break; \ } \ } \ __result; \ }) #endif |
1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 | /* SPDX-License-Identifier: GPL-2.0-or-later * * Copyright (C) 2005 David Brownell */ #ifndef __LINUX_SPI_H #define __LINUX_SPI_H #include <linux/acpi.h> #include <linux/bits.h> #include <linux/completion.h> #include <linux/device.h> #include <linux/gpio/consumer.h> #include <linux/kthread.h> #include <linux/mod_devicetable.h> #include <linux/overflow.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/u64_stats_sync.h> #include <uapi/linux/spi/spi.h> /* Max no. of CS supported per spi device */ #define SPI_CS_CNT_MAX 16 struct dma_chan; struct software_node; struct ptp_system_timestamp; struct spi_controller; struct spi_transfer; struct spi_controller_mem_ops; struct spi_controller_mem_caps; struct spi_message; /* * INTERFACES between SPI master-side drivers and SPI slave protocol handlers, * and SPI infrastructure. */ extern const struct bus_type spi_bus_type; /** * struct spi_statistics - statistics for spi transfers * @syncp: seqcount to protect members in this struct for per-cpu update * on 32-bit systems * * @messages: number of spi-messages handled * @transfers: number of spi_transfers handled * @errors: number of errors during spi_transfer * @timedout: number of timeouts during spi_transfer * * @spi_sync: number of times spi_sync is used * @spi_sync_immediate: * number of times spi_sync is executed immediately * in calling context without queuing and scheduling * @spi_async: number of times spi_async is used * * @bytes: number of bytes transferred to/from device * @bytes_tx: number of bytes sent to device * @bytes_rx: number of bytes received from device * * @transfer_bytes_histo: * transfer bytes histogram * * @transfers_split_maxsize: * number of transfers that have been split because of * maxsize limit */ struct spi_statistics { struct u64_stats_sync syncp; u64_stats_t messages; u64_stats_t transfers; u64_stats_t errors; u64_stats_t timedout; u64_stats_t spi_sync; u64_stats_t spi_sync_immediate; u64_stats_t spi_async; u64_stats_t bytes; u64_stats_t bytes_rx; u64_stats_t bytes_tx; #define SPI_STATISTICS_HISTO_SIZE 17 u64_stats_t transfer_bytes_histo[SPI_STATISTICS_HISTO_SIZE]; u64_stats_t transfers_split_maxsize; }; #define SPI_STATISTICS_ADD_TO_FIELD(pcpu_stats, field, count) \ do { \ struct spi_statistics *__lstats; \ get_cpu(); \ __lstats = this_cpu_ptr(pcpu_stats); \ u64_stats_update_begin(&__lstats->syncp); \ u64_stats_add(&__lstats->field, count); \ u64_stats_update_end(&__lstats->syncp); \ put_cpu(); \ } while (0) #define SPI_STATISTICS_INCREMENT_FIELD(pcpu_stats, field) \ do { \ struct spi_statistics *__lstats; \ get_cpu(); \ __lstats = this_cpu_ptr(pcpu_stats); \ u64_stats_update_begin(&__lstats->syncp); \ u64_stats_inc(&__lstats->field); \ u64_stats_update_end(&__lstats->syncp); \ put_cpu(); \ } while (0) /** * struct spi_delay - SPI delay information * @value: Value for the delay * @unit: Unit for the delay */ struct spi_delay { #define SPI_DELAY_UNIT_USECS 0 #define SPI_DELAY_UNIT_NSECS 1 #define SPI_DELAY_UNIT_SCK 2 u16 value; u8 unit; }; extern int spi_delay_to_ns(struct spi_delay *_delay, struct spi_transfer *xfer); extern int spi_delay_exec(struct spi_delay *_delay, struct spi_transfer *xfer); extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, struct spi_transfer *xfer); /** * struct spi_device - Controller side proxy for an SPI slave device * @dev: Driver model representation of the device. * @controller: SPI controller used with the device. * @max_speed_hz: Maximum clock rate to be used with this chip * (on this board); may be changed by the device's driver. * The spi_transfer.speed_hz can override this for each transfer. * @chip_select: Array of physical chipselect, spi->chipselect[i] gives * the corresponding physical CS for logical CS i. * @mode: The spi mode defines how data is clocked out and in. * This may be changed by the device's driver. * The "active low" default for chipselect mode can be overridden * (by specifying SPI_CS_HIGH) as can the "MSB first" default for * each word in a transfer (by specifying SPI_LSB_FIRST). * @bits_per_word: Data transfers involve one or more words; word sizes * like eight or 12 bits are common. In-memory wordsizes are * powers of two bytes (e.g. 20 bit samples use 32 bits). * This may be changed by the device's driver, or left at the * default (0) indicating protocol words are eight bit bytes. * The spi_transfer.bits_per_word can override this for each transfer. * @rt: Make the pump thread real time priority. * @irq: Negative, or the number passed to request_irq() to receive * interrupts from this device. * @controller_state: Controller's runtime state * @controller_data: Board-specific definitions for controller, such as * FIFO initialization parameters; from board_info.controller_data * @modalias: Name of the driver to use with this device, or an alias * for that name. This appears in the sysfs "modalias" attribute * for driver coldplugging, and in uevents used for hotplugging * @driver_override: If the name of a driver is written to this attribute, then * the device will bind to the named driver and only the named driver. * Do not set directly, because core frees it; use driver_set_override() to * set or clear it. * @cs_gpiod: Array of GPIO descriptors of the corresponding chipselect lines * (optional, NULL when not using a GPIO line) * @word_delay: delay to be inserted between consecutive * words of a transfer * @cs_setup: delay to be introduced by the controller after CS is asserted * @cs_hold: delay to be introduced by the controller before CS is deasserted * @cs_inactive: delay to be introduced by the controller after CS is * deasserted. If @cs_change_delay is used from @spi_transfer, then the * two delays will be added up. * @pcpu_statistics: statistics for the spi_device * @cs_index_mask: Bit mask of the active chipselect(s) in the chipselect array * * A @spi_device is used to interchange data between an SPI slave * (usually a discrete chip) and CPU memory. * * In @dev, the platform_data is used to hold information about this * device that's meaningful to the device's protocol driver, but not * to its controller. One example might be an identifier for a chip * variant with slightly different functionality; another might be * information about how this particular board wires the chip's pins. */ struct spi_device { struct device dev; struct spi_controller *controller; u32 max_speed_hz; u8 chip_select[SPI_CS_CNT_MAX]; u8 bits_per_word; bool rt; #define SPI_NO_TX BIT(31) /* No transmit wire */ #define SPI_NO_RX BIT(30) /* No receive wire */ /* * TPM specification defines flow control over SPI. Client device * can insert a wait state on MISO when address is transmitted by * controller on MOSI. Detecting the wait state in software is only * possible for full duplex controllers. For controllers that support * only half-duplex, the wait state detection needs to be implemented * in hardware. TPM devices would set this flag when hardware flow * control is expected from SPI controller. */ #define SPI_TPM_HW_FLOW BIT(29) /* TPM HW flow control */ /* * All bits defined above should be covered by SPI_MODE_KERNEL_MASK. * The SPI_MODE_KERNEL_MASK has the SPI_MODE_USER_MASK counterpart, * which is defined in 'include/uapi/linux/spi/spi.h'. * The bits defined here are from bit 31 downwards, while in * SPI_MODE_USER_MASK are from 0 upwards. * These bits must not overlap. A static assert check should make sure of that. * If adding extra bits, make sure to decrease the bit index below as well. */ #define SPI_MODE_KERNEL_MASK (~(BIT(29) - 1)) u32 mode; int irq; void *controller_state; void *controller_data; char modalias[SPI_NAME_SIZE]; const char *driver_override; struct gpio_desc *cs_gpiod[SPI_CS_CNT_MAX]; /* Chip select gpio desc */ struct spi_delay word_delay; /* Inter-word delay */ /* CS delays */ struct spi_delay cs_setup; struct spi_delay cs_hold; struct spi_delay cs_inactive; /* The statistics */ struct spi_statistics __percpu *pcpu_statistics; /* Bit mask of the chipselect(s) that the driver need to use from * the chipselect array.When the controller is capable to handle * multiple chip selects & memories are connected in parallel * then more than one bit need to be set in cs_index_mask. */ u32 cs_index_mask : SPI_CS_CNT_MAX; /* * Likely need more hooks for more protocol options affecting how * the controller talks to each chip, like: * - memory packing (12 bit samples into low bits, others zeroed) * - priority * - chipselect delays * - ... */ }; /* Make sure that SPI_MODE_KERNEL_MASK & SPI_MODE_USER_MASK don't overlap */ static_assert((SPI_MODE_KERNEL_MASK & SPI_MODE_USER_MASK) == 0, "SPI_MODE_USER_MASK & SPI_MODE_KERNEL_MASK must not overlap"); static inline struct spi_device *to_spi_device(const struct device *dev) { return dev ? container_of(dev, struct spi_device, dev) : NULL; } /* Most drivers won't need to care about device refcounting */ static inline struct spi_device *spi_dev_get(struct spi_device *spi) { return (spi && get_device(&spi->dev)) ? spi : NULL; } static inline void spi_dev_put(struct spi_device *spi) { if (spi) put_device(&spi->dev); } /* ctldata is for the bus_controller driver's runtime state */ static inline void *spi_get_ctldata(const struct spi_device *spi) { return spi->controller_state; } static inline void spi_set_ctldata(struct spi_device *spi, void *state) { spi->controller_state = state; } /* Device driver data */ static inline void spi_set_drvdata(struct spi_device *spi, void *data) { dev_set_drvdata(&spi->dev, data); } static inline void *spi_get_drvdata(const struct spi_device *spi) { return dev_get_drvdata(&spi->dev); } static inline u8 spi_get_chipselect(const struct spi_device *spi, u8 idx) { return spi->chip_select[idx]; } static inline void spi_set_chipselect(struct spi_device *spi, u8 idx, u8 chipselect) { spi->chip_select[idx] = chipselect; } static inline struct gpio_desc *spi_get_csgpiod(const struct spi_device *spi, u8 idx) { return spi->cs_gpiod[idx]; } static inline void spi_set_csgpiod(struct spi_device *spi, u8 idx, struct gpio_desc *csgpiod) { spi->cs_gpiod[idx] = csgpiod; } static inline bool spi_is_csgpiod(struct spi_device *spi) { u8 idx; for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { if (spi_get_csgpiod(spi, idx)) return true; } return false; } /** * struct spi_driver - Host side "protocol" driver * @id_table: List of SPI devices supported by this driver * @probe: Binds this driver to the SPI device. Drivers can verify * that the device is actually present, and may need to configure * characteristics (such as bits_per_word) which weren't needed for * the initial configuration done during system setup. * @remove: Unbinds this driver from the SPI device * @shutdown: Standard shutdown callback used during system state * transitions such as powerdown/halt and kexec * @driver: SPI device drivers should initialize the name and owner * field of this structure. * * This represents the kind of device driver that uses SPI messages to * interact with the hardware at the other end of a SPI link. It's called * a "protocol" driver because it works through messages rather than talking * directly to SPI hardware (which is what the underlying SPI controller * driver does to pass those messages). These protocols are defined in the * specification for the device(s) supported by the driver. * * As a rule, those device protocols represent the lowest level interface * supported by a driver, and it will support upper level interfaces too. * Examples of such upper levels include frameworks like MTD, networking, * MMC, RTC, filesystem character device nodes, and hardware monitoring. */ struct spi_driver { const struct spi_device_id *id_table; int (*probe)(struct spi_device *spi); void (*remove)(struct spi_device *spi); void (*shutdown)(struct spi_device *spi); struct device_driver driver; }; #define to_spi_driver(__drv) \ ( __drv ? container_of_const(__drv, struct spi_driver, driver) : NULL ) extern int __spi_register_driver(struct module *owner, struct spi_driver *sdrv); /** * spi_unregister_driver - reverse effect of spi_register_driver * @sdrv: the driver to unregister * Context: can sleep */ static inline void spi_unregister_driver(struct spi_driver *sdrv) { if (sdrv) driver_unregister(&sdrv->driver); } extern struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 chip_select); /* Use a define to avoid include chaining to get THIS_MODULE */ #define spi_register_driver(driver) \ __spi_register_driver(THIS_MODULE, driver) /** * module_spi_driver() - Helper macro for registering a SPI driver * @__spi_driver: spi_driver struct * * Helper macro for SPI drivers which do not do anything special in module * init/exit. This eliminates a lot of boilerplate. Each module may only * use this macro once, and calling it replaces module_init() and module_exit() */ #define module_spi_driver(__spi_driver) \ module_driver(__spi_driver, spi_register_driver, \ spi_unregister_driver) /** * struct spi_controller - interface to SPI master or slave controller * @dev: device interface to this driver * @list: link with the global spi_controller list * @bus_num: board-specific (and often SOC-specific) identifier for a * given SPI controller. * @num_chipselect: chipselects are used to distinguish individual * SPI slaves, and are numbered from zero to num_chipselects. * each slave has a chipselect signal, but it's common that not * every chipselect is connected to a slave. * @dma_alignment: SPI controller constraint on DMA buffers alignment. * @mode_bits: flags understood by this controller driver * @buswidth_override_bits: flags to override for this controller driver * @bits_per_word_mask: A mask indicating which values of bits_per_word are * supported by the driver. Bit n indicates that a bits_per_word n+1 is * supported. If set, the SPI core will reject any transfer with an * unsupported bits_per_word. If not set, this value is simply ignored, * and it's up to the individual driver to perform any validation. * @min_speed_hz: Lowest supported transfer speed * @max_speed_hz: Highest supported transfer speed * @flags: other constraints relevant to this driver * @slave: indicates that this is an SPI slave controller * @target: indicates that this is an SPI target controller * @devm_allocated: whether the allocation of this struct is devres-managed * @max_transfer_size: function that returns the max transfer size for * a &spi_device; may be %NULL, so the default %SIZE_MAX will be used. * @max_message_size: function that returns the max message size for * a &spi_device; may be %NULL, so the default %SIZE_MAX will be used. * @io_mutex: mutex for physical bus access * @add_lock: mutex to avoid adding devices to the same chipselect * @bus_lock_spinlock: spinlock for SPI bus locking * @bus_lock_mutex: mutex for exclusion of multiple callers * @bus_lock_flag: indicates that the SPI bus is locked for exclusive use * @setup: updates the device mode and clocking records used by a * device's SPI controller; protocol code may call this. This * must fail if an unrecognized or unsupported mode is requested. * It's always safe to call this unless transfers are pending on * the device whose settings are being modified. * @set_cs_timing: optional hook for SPI devices to request SPI master * controller for configuring specific CS setup time, hold time and inactive * delay interms of clock counts * @transfer: adds a message to the controller's transfer queue. * @cleanup: frees controller-specific state * @can_dma: determine whether this controller supports DMA * @dma_map_dev: device which can be used for DMA mapping * @cur_rx_dma_dev: device which is currently used for RX DMA mapping * @cur_tx_dma_dev: device which is currently used for TX DMA mapping * @queued: whether this controller is providing an internal message queue * @kworker: pointer to thread struct for message pump * @pump_messages: work struct for scheduling work to the message pump * @queue_lock: spinlock to synchronise access to message queue * @queue: message queue * @cur_msg: the currently in-flight message * @cur_msg_completion: a completion for the current in-flight message * @cur_msg_incomplete: Flag used internally to opportunistically skip * the @cur_msg_completion. This flag is used to check if the driver has * already called spi_finalize_current_message(). * @cur_msg_need_completion: Flag used internally to opportunistically skip * the @cur_msg_completion. This flag is used to signal the context that * is running spi_finalize_current_message() that it needs to complete() * @fallback: fallback to PIO if DMA transfer return failure with * SPI_TRANS_FAIL_NO_START. * @last_cs_mode_high: was (mode & SPI_CS_HIGH) true on the last call to set_cs. * @last_cs: the last chip_select that is recorded by set_cs, -1 on non chip * selected * @last_cs_index_mask: bit mask the last chip selects that were used * @xfer_completion: used by core transfer_one_message() * @busy: message pump is busy * @running: message pump is running * @rt: whether this queue is set to run as a realtime task * @auto_runtime_pm: the core should ensure a runtime PM reference is held * while the hardware is prepared, using the parent * device for the spidev * @max_dma_len: Maximum length of a DMA transfer for the device. * @prepare_transfer_hardware: a message will soon arrive from the queue * so the subsystem requests the driver to prepare the transfer hardware * by issuing this call * @transfer_one_message: the subsystem calls the driver to transfer a single * message while queuing transfers that arrive in the meantime. When the * driver is finished with this message, it must call * spi_finalize_current_message() so the subsystem can issue the next * message * @unprepare_transfer_hardware: there are currently no more messages on the * queue so the subsystem notifies the driver that it may relax the * hardware by issuing this call * * @set_cs: set the logic level of the chip select line. May be called * from interrupt context. * @optimize_message: optimize the message for reuse * @unoptimize_message: release resources allocated by optimize_message * @prepare_message: set up the controller to transfer a single message, * for example doing DMA mapping. Called from threaded * context. * @transfer_one: transfer a single spi_transfer. * * - return 0 if the transfer is finished, * - return 1 if the transfer is still in progress. When * the driver is finished with this transfer it must * call spi_finalize_current_transfer() so the subsystem * can issue the next transfer. If the transfer fails, the * driver must set the flag SPI_TRANS_FAIL_IO to * spi_transfer->error first, before calling * spi_finalize_current_transfer(). * Note: transfer_one and transfer_one_message are mutually * exclusive; when both are set, the generic subsystem does * not call your transfer_one callback. * @handle_err: the subsystem calls the driver to handle an error that occurs * in the generic implementation of transfer_one_message(). * @mem_ops: optimized/dedicated operations for interactions with SPI memory. * This field is optional and should only be implemented if the * controller has native support for memory like operations. * @mem_caps: controller capabilities for the handling of memory operations. * @unprepare_message: undo any work done by prepare_message(). * @slave_abort: abort the ongoing transfer request on an SPI slave controller * @target_abort: abort the ongoing transfer request on an SPI target controller * @cs_gpiods: Array of GPIO descriptors to use as chip select lines; one per CS * number. Any individual value may be NULL for CS lines that * are not GPIOs (driven by the SPI controller itself). * @use_gpio_descriptors: Turns on the code in the SPI core to parse and grab * GPIO descriptors. This will fill in @cs_gpiods and SPI devices will have * the cs_gpiod assigned if a GPIO line is found for the chipselect. * @unused_native_cs: When cs_gpiods is used, spi_register_controller() will * fill in this field with the first unused native CS, to be used by SPI * controller drivers that need to drive a native CS when using GPIO CS. * @max_native_cs: When cs_gpiods is used, and this field is filled in, * spi_register_controller() will validate all native CS (including the * unused native CS) against this value. * @pcpu_statistics: statistics for the spi_controller * @dma_tx: DMA transmit channel * @dma_rx: DMA receive channel * @dummy_rx: dummy receive buffer for full-duplex devices * @dummy_tx: dummy transmit buffer for full-duplex devices * @fw_translate_cs: If the boot firmware uses different numbering scheme * what Linux expects, this optional hook can be used to translate * between the two. * @ptp_sts_supported: If the driver sets this to true, it must provide a * time snapshot in @spi_transfer->ptp_sts as close as possible to the * moment in time when @spi_transfer->ptp_sts_word_pre and * @spi_transfer->ptp_sts_word_post were transmitted. * If the driver does not set this, the SPI core takes the snapshot as * close to the driver hand-over as possible. * @irq_flags: Interrupt enable state during PTP system timestamping * @queue_empty: signal green light for opportunistically skipping the queue * for spi_sync transfers. * @must_async: disable all fast paths in the core * @defer_optimize_message: set to true if controller cannot pre-optimize messages * and needs to defer the optimization step until the message is actually * being transferred * * Each SPI controller can communicate with one or more @spi_device * children. These make a small bus, sharing MOSI, MISO and SCK signals * but not chip select signals. Each device may be configured to use a * different clock rate, since those shared signals are ignored unless * the chip is selected. * * The driver for an SPI controller manages access to those devices through * a queue of spi_message transactions, copying data between CPU memory and * an SPI slave device. For each such message it queues, it calls the * message's completion function when the transaction completes. */ struct spi_controller { struct device dev; struct list_head list; /* * Other than negative (== assign one dynamically), bus_num is fully * board-specific. Usually that simplifies to being SoC-specific. * example: one SoC has three SPI controllers, numbered 0..2, * and one board's schematics might show it using SPI-2. Software * would normally use bus_num=2 for that controller. */ s16 bus_num; /* * Chipselects will be integral to many controllers; some others * might use board-specific GPIOs. */ u16 num_chipselect; /* Some SPI controllers pose alignment requirements on DMAable * buffers; let protocol drivers know about these requirements. */ u16 dma_alignment; /* spi_device.mode flags understood by this controller driver */ u32 mode_bits; /* spi_device.mode flags override flags for this controller */ u32 buswidth_override_bits; /* Bitmask of supported bits_per_word for transfers */ u32 bits_per_word_mask; #define SPI_BPW_MASK(bits) BIT((bits) - 1) #define SPI_BPW_RANGE_MASK(min, max) GENMASK((max) - 1, (min) - 1) /* Limits on transfer speed */ u32 min_speed_hz; u32 max_speed_hz; /* Other constraints relevant to this driver */ u16 flags; #define SPI_CONTROLLER_HALF_DUPLEX BIT(0) /* Can't do full duplex */ #define SPI_CONTROLLER_NO_RX BIT(1) /* Can't do buffer read */ #define SPI_CONTROLLER_NO_TX BIT(2) /* Can't do buffer write */ #define SPI_CONTROLLER_MUST_RX BIT(3) /* Requires rx */ #define SPI_CONTROLLER_MUST_TX BIT(4) /* Requires tx */ #define SPI_CONTROLLER_GPIO_SS BIT(5) /* GPIO CS must select slave */ #define SPI_CONTROLLER_SUSPENDED BIT(6) /* Currently suspended */ /* * The spi-controller has multi chip select capability and can * assert/de-assert more than one chip select at once. */ #define SPI_CONTROLLER_MULTI_CS BIT(7) /* Flag indicating if the allocation of this struct is devres-managed */ bool devm_allocated; union { /* Flag indicating this is an SPI slave controller */ bool slave; /* Flag indicating this is an SPI target controller */ bool target; }; /* * On some hardware transfer / message size may be constrained * the limit may depend on device transfer settings. */ size_t (*max_transfer_size)(struct spi_device *spi); size_t (*max_message_size)(struct spi_device *spi); /* I/O mutex */ struct mutex io_mutex; /* Used to avoid adding the same CS twice */ struct mutex add_lock; /* Lock and mutex for SPI bus locking */ spinlock_t bus_lock_spinlock; struct mutex bus_lock_mutex; /* Flag indicating that the SPI bus is locked for exclusive use */ bool bus_lock_flag; /* * Setup mode and clock, etc (SPI driver may call many times). * * IMPORTANT: this may be called when transfers to another * device are active. DO NOT UPDATE SHARED REGISTERS in ways * which could break those transfers. */ int (*setup)(struct spi_device *spi); /* * set_cs_timing() method is for SPI controllers that supports * configuring CS timing. * * This hook allows SPI client drivers to request SPI controllers * to configure specific CS timing through spi_set_cs_timing() after * spi_setup(). */ int (*set_cs_timing)(struct spi_device *spi); /* * Bidirectional bulk transfers * * + The transfer() method may not sleep; its main role is * just to add the message to the queue. * + For now there's no remove-from-queue operation, or * any other request management * + To a given spi_device, message queueing is pure FIFO * * + The controller's main job is to process its message queue, * selecting a chip (for masters), then transferring data * + If there are multiple spi_device children, the i/o queue * arbitration algorithm is unspecified (round robin, FIFO, * priority, reservations, preemption, etc) * * + Chipselect stays active during the entire message * (unless modified by spi_transfer.cs_change != 0). * + The message transfers use clock and SPI mode parameters * previously established by setup() for this device */ int (*transfer)(struct spi_device *spi, struct spi_message *mesg); /* Called on release() to free memory provided by spi_controller */ void (*cleanup)(struct spi_device *spi); /* * Used to enable core support for DMA handling, if can_dma() * exists and returns true then the transfer will be mapped * prior to transfer_one() being called. The driver should * not modify or store xfer and dma_tx and dma_rx must be set * while the device is prepared. */ bool (*can_dma)(struct spi_controller *ctlr, struct spi_device *spi, struct spi_transfer *xfer); struct device *dma_map_dev; struct device *cur_rx_dma_dev; struct device *cur_tx_dma_dev; /* * These hooks are for drivers that want to use the generic * controller transfer queueing mechanism. If these are used, the * transfer() function above must NOT be specified by the driver. * Over time we expect SPI drivers to be phased over to this API. */ bool queued; struct kthread_worker *kworker; struct kthread_work pump_messages; spinlock_t queue_lock; struct list_head queue; struct spi_message *cur_msg; struct completion cur_msg_completion; bool cur_msg_incomplete; bool cur_msg_need_completion; bool busy; bool running; bool rt; bool auto_runtime_pm; bool fallback; bool last_cs_mode_high; s8 last_cs[SPI_CS_CNT_MAX]; u32 last_cs_index_mask : SPI_CS_CNT_MAX; struct completion xfer_completion; size_t max_dma_len; int (*optimize_message)(struct spi_message *msg); int (*unoptimize_message)(struct spi_message *msg); int (*prepare_transfer_hardware)(struct spi_controller *ctlr); int (*transfer_one_message)(struct spi_controller *ctlr, struct spi_message *mesg); int (*unprepare_transfer_hardware)(struct spi_controller *ctlr); int (*prepare_message)(struct spi_controller *ctlr, struct spi_message *message); int (*unprepare_message)(struct spi_controller *ctlr, struct spi_message *message); union { int (*slave_abort)(struct spi_controller *ctlr); int (*target_abort)(struct spi_controller *ctlr); }; /* * These hooks are for drivers that use a generic implementation * of transfer_one_message() provided by the core. */ void (*set_cs)(struct spi_device *spi, bool enable); int (*transfer_one)(struct spi_controller *ctlr, struct spi_device *spi, struct spi_transfer *transfer); void (*handle_err)(struct spi_controller *ctlr, struct spi_message *message); /* Optimized handlers for SPI memory-like operations. */ const struct spi_controller_mem_ops *mem_ops; const struct spi_controller_mem_caps *mem_caps; /* GPIO chip select */ struct gpio_desc **cs_gpiods; bool use_gpio_descriptors; s8 unused_native_cs; s8 max_native_cs; /* Statistics */ struct spi_statistics __percpu *pcpu_statistics; /* DMA channels for use with core dmaengine helpers */ struct dma_chan *dma_tx; struct dma_chan *dma_rx; /* Dummy data for full duplex devices */ void *dummy_rx; void *dummy_tx; int (*fw_translate_cs)(struct spi_controller *ctlr, unsigned cs); /* * Driver sets this field to indicate it is able to snapshot SPI * transfers (needed e.g. for reading the time of POSIX clocks) */ bool ptp_sts_supported; /* Interrupt enable state during PTP system timestamping */ unsigned long irq_flags; /* Flag for enabling opportunistic skipping of the queue in spi_sync */ bool queue_empty; bool must_async; bool defer_optimize_message; }; static inline void *spi_controller_get_devdata(struct spi_controller *ctlr) { return dev_get_drvdata(&ctlr->dev); } static inline void spi_controller_set_devdata(struct spi_controller *ctlr, void *data) { dev_set_drvdata(&ctlr->dev, data); } static inline struct spi_controller *spi_controller_get(struct spi_controller *ctlr) { if (!ctlr || !get_device(&ctlr->dev)) return NULL; return ctlr; } static inline void spi_controller_put(struct spi_controller *ctlr) { if (ctlr) put_device(&ctlr->dev); } static inline bool spi_controller_is_slave(struct spi_controller *ctlr) { return IS_ENABLED(CONFIG_SPI_SLAVE) && ctlr->slave; } static inline bool spi_controller_is_target(struct spi_controller *ctlr) { return IS_ENABLED(CONFIG_SPI_SLAVE) && ctlr->target; } /* PM calls that need to be issued by the driver */ extern int spi_controller_suspend(struct spi_controller *ctlr); extern int spi_controller_resume(struct spi_controller *ctlr); /* Calls the driver make to interact with the message queue */ extern struct spi_message *spi_get_next_queued_message(struct spi_controller *ctlr); extern void spi_finalize_current_message(struct spi_controller *ctlr); extern void spi_finalize_current_transfer(struct spi_controller *ctlr); /* Helper calls for driver to timestamp transfer */ void spi_take_timestamp_pre(struct spi_controller *ctlr, struct spi_transfer *xfer, size_t progress, bool irqs_off); void spi_take_timestamp_post(struct spi_controller *ctlr, struct spi_transfer *xfer, size_t progress, bool irqs_off); /* The SPI driver core manages memory for the spi_controller classdev */ extern struct spi_controller *__spi_alloc_controller(struct device *host, unsigned int size, bool slave); static inline struct spi_controller *spi_alloc_master(struct device *host, unsigned int size) { return __spi_alloc_controller(host, size, false); } static inline struct spi_controller *spi_alloc_slave(struct device *host, unsigned int size) { if (!IS_ENABLED(CONFIG_SPI_SLAVE)) return NULL; return __spi_alloc_controller(host, size, true); } static inline struct spi_controller *spi_alloc_host(struct device *dev, unsigned int size) { return __spi_alloc_controller(dev, size, false); } static inline struct spi_controller *spi_alloc_target(struct device *dev, unsigned int size) { if (!IS_ENABLED(CONFIG_SPI_SLAVE)) return NULL; return __spi_alloc_controller(dev, size, true); } struct spi_controller *__devm_spi_alloc_controller(struct device *dev, unsigned int size, bool slave); static inline struct spi_controller *devm_spi_alloc_master(struct device *dev, unsigned int size) { return __devm_spi_alloc_controller(dev, size, false); } static inline struct spi_controller *devm_spi_alloc_slave(struct device *dev, unsigned int size) { if (!IS_ENABLED(CONFIG_SPI_SLAVE)) return NULL; return __devm_spi_alloc_controller(dev, size, true); } static inline struct spi_controller *devm_spi_alloc_host(struct device *dev, unsigned int size) { return __devm_spi_alloc_controller(dev, size, false); } static inline struct spi_controller *devm_spi_alloc_target(struct device *dev, unsigned int size) { if (!IS_ENABLED(CONFIG_SPI_SLAVE)) return NULL; return __devm_spi_alloc_controller(dev, size, true); } extern int spi_register_controller(struct spi_controller *ctlr); extern int devm_spi_register_controller(struct device *dev, struct spi_controller *ctlr); extern void spi_unregister_controller(struct spi_controller *ctlr); #if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_SPI_MASTER) extern struct spi_controller *acpi_spi_find_controller_by_adev(struct acpi_device *adev); extern struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr, struct acpi_device *adev, int index); int acpi_spi_count_resources(struct acpi_device *adev); #else static inline struct spi_controller *acpi_spi_find_controller_by_adev(struct acpi_device *adev) { return NULL; } static inline struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr, struct acpi_device *adev, int index) { return ERR_PTR(-ENODEV); } static inline int acpi_spi_count_resources(struct acpi_device *adev) { return 0; } #endif /* * SPI resource management while processing a SPI message */ typedef void (*spi_res_release_t)(struct spi_controller *ctlr, struct spi_message *msg, void *res); /** * struct spi_res - SPI resource management structure * @entry: list entry * @release: release code called prior to freeing this resource * @data: extra data allocated for the specific use-case * * This is based on ideas from devres, but focused on life-cycle * management during spi_message processing. */ struct spi_res { struct list_head entry; spi_res_release_t release; unsigned long long data[]; /* Guarantee ull alignment */ }; /*---------------------------------------------------------------------------*/ /* * I/O INTERFACE between SPI controller and protocol drivers * * Protocol drivers use a queue of spi_messages, each transferring data * between the controller and memory buffers. * * The spi_messages themselves consist of a series of read+write transfer * segments. Those segments always read the same number of bits as they * write; but one or the other is easily ignored by passing a NULL buffer * pointer. (This is unlike most types of I/O API, because SPI hardware * is full duplex.) * * NOTE: Allocation of spi_transfer and spi_message memory is entirely * up to the protocol driver, which guarantees the integrity of both (as * well as the data buffers) for as long as the message is queued. */ /** * struct spi_transfer - a read/write buffer pair * @tx_buf: data to be written (DMA-safe memory), or NULL * @rx_buf: data to be read (DMA-safe memory), or NULL * @tx_dma: DMA address of tx_buf, currently not for client use * @rx_dma: DMA address of rx_buf, currently not for client use * @tx_nbits: number of bits used for writing. If 0 the default * (SPI_NBITS_SINGLE) is used. * @rx_nbits: number of bits used for reading. If 0 the default * (SPI_NBITS_SINGLE) is used. * @len: size of rx and tx buffers (in bytes) * @speed_hz: Select a speed other than the device default for this * transfer. If 0 the default (from @spi_device) is used. * @bits_per_word: select a bits_per_word other than the device default * for this transfer. If 0 the default (from @spi_device) is used. * @dummy_data: indicates transfer is dummy bytes transfer. * @cs_off: performs the transfer with chipselect off. * @cs_change: affects chipselect after this transfer completes * @cs_change_delay: delay between cs deassert and assert when * @cs_change is set and @spi_transfer is not the last in @spi_message * @delay: delay to be introduced after this transfer before * (optionally) changing the chipselect status, then starting * the next transfer or completing this @spi_message. * @word_delay: inter word delay to be introduced after each word size * (set by bits_per_word) transmission. * @effective_speed_hz: the effective SCK-speed that was used to * transfer this transfer. Set to 0 if the SPI bus driver does * not support it. * @transfer_list: transfers are sequenced through @spi_message.transfers * @tx_sg_mapped: If true, the @tx_sg is mapped for DMA * @rx_sg_mapped: If true, the @rx_sg is mapped for DMA * @tx_sg: Scatterlist for transmit, currently not for client use * @rx_sg: Scatterlist for receive, currently not for client use * @ptp_sts_word_pre: The word (subject to bits_per_word semantics) offset * within @tx_buf for which the SPI device is requesting that the time * snapshot for this transfer begins. Upon completing the SPI transfer, * this value may have changed compared to what was requested, depending * on the available snapshotting resolution (DMA transfer, * @ptp_sts_supported is false, etc). * @ptp_sts_word_post: See @ptp_sts_word_post. The two can be equal (meaning * that a single byte should be snapshotted). * If the core takes care of the timestamp (if @ptp_sts_supported is false * for this controller), it will set @ptp_sts_word_pre to 0, and * @ptp_sts_word_post to the length of the transfer. This is done * purposefully (instead of setting to spi_transfer->len - 1) to denote * that a transfer-level snapshot taken from within the driver may still * be of higher quality. * @ptp_sts: Pointer to a memory location held by the SPI slave device where a * PTP system timestamp structure may lie. If drivers use PIO or their * hardware has some sort of assist for retrieving exact transfer timing, * they can (and should) assert @ptp_sts_supported and populate this * structure using the ptp_read_system_*ts helper functions. * The timestamp must represent the time at which the SPI slave device has * processed the word, i.e. the "pre" timestamp should be taken before * transmitting the "pre" word, and the "post" timestamp after receiving * transmit confirmation from the controller for the "post" word. * @timestamped: true if the transfer has been timestamped * @error: Error status logged by SPI controller driver. * * SPI transfers always write the same number of bytes as they read. * Protocol drivers should always provide @rx_buf and/or @tx_buf. * In some cases, they may also want to provide DMA addresses for * the data being transferred; that may reduce overhead, when the * underlying driver uses DMA. * * If the transmit buffer is NULL, zeroes will be shifted out * while filling @rx_buf. If the receive buffer is NULL, the data * shifted in will be discarded. Only "len" bytes shift out (or in). * It's an error to try to shift out a partial word. (For example, by * shifting out three bytes with word size of sixteen or twenty bits; * the former uses two bytes per word, the latter uses four bytes.) * * In-memory data values are always in native CPU byte order, translated * from the wire byte order (big-endian except with SPI_LSB_FIRST). So * for example when bits_per_word is sixteen, buffers are 2N bytes long * (@len = 2N) and hold N sixteen bit words in CPU byte order. * * When the word size of the SPI transfer is not a power-of-two multiple * of eight bits, those in-memory words include extra bits. In-memory * words are always seen by protocol drivers as right-justified, so the * undefined (rx) or unused (tx) bits are always the most significant bits. * * All SPI transfers start with the relevant chipselect active. Normally * it stays selected until after the last transfer in a message. Drivers * can affect the chipselect signal using cs_change. * * (i) If the transfer isn't the last one in the message, this flag is * used to make the chipselect briefly go inactive in the middle of the * message. Toggling chipselect in this way may be needed to terminate * a chip command, letting a single spi_message perform all of group of * chip transactions together. * * (ii) When the transfer is the last one in the message, the chip may * stay selected until the next transfer. On multi-device SPI busses * with nothing blocking messages going to other devices, this is just * a performance hint; starting a message to another device deselects * this one. But in other cases, this can be used to ensure correctness. * Some devices need protocol transactions to be built from a series of * spi_message submissions, where the content of one message is determined * by the results of previous messages and where the whole transaction * ends when the chipselect goes inactive. * * When SPI can transfer in 1x,2x or 4x. It can get this transfer information * from device through @tx_nbits and @rx_nbits. In Bi-direction, these * two should both be set. User can set transfer mode with SPI_NBITS_SINGLE(1x) * SPI_NBITS_DUAL(2x) and SPI_NBITS_QUAD(4x) to support these three transfer. * * The code that submits an spi_message (and its spi_transfers) * to the lower layers is responsible for managing its memory. * Zero-initialize every field you don't set up explicitly, to * insulate against future API updates. After you submit a message * and its transfers, ignore them until its completion callback. */ struct spi_transfer { /* * It's okay if tx_buf == rx_buf (right?). * For MicroWire, one buffer must be NULL. * Buffers must work with dma_*map_single() calls. */ const void *tx_buf; void *rx_buf; unsigned len; #define SPI_TRANS_FAIL_NO_START BIT(0) #define SPI_TRANS_FAIL_IO BIT(1) u16 error; bool tx_sg_mapped; bool rx_sg_mapped; struct sg_table tx_sg; struct sg_table rx_sg; dma_addr_t tx_dma; dma_addr_t rx_dma; unsigned dummy_data:1; unsigned cs_off:1; unsigned cs_change:1; unsigned tx_nbits:4; unsigned rx_nbits:4; unsigned timestamped:1; #define SPI_NBITS_SINGLE 0x01 /* 1-bit transfer */ #define SPI_NBITS_DUAL 0x02 /* 2-bit transfer */ #define SPI_NBITS_QUAD 0x04 /* 4-bit transfer */ #define SPI_NBITS_OCTAL 0x08 /* 8-bit transfer */ u8 bits_per_word; struct spi_delay delay; struct spi_delay cs_change_delay; struct spi_delay word_delay; u32 speed_hz; u32 effective_speed_hz; unsigned int ptp_sts_word_pre; unsigned int ptp_sts_word_post; struct ptp_system_timestamp *ptp_sts; struct list_head transfer_list; }; /** * struct spi_message - one multi-segment SPI transaction * @transfers: list of transfer segments in this transaction * @spi: SPI device to which the transaction is queued * @pre_optimized: peripheral driver pre-optimized the message * @optimized: the message is in the optimized state * @prepared: spi_prepare_message was called for the this message * @status: zero for success, else negative errno * @complete: called to report transaction completions * @context: the argument to complete() when it's called * @frame_length: the total number of bytes in the message * @actual_length: the total number of bytes that were transferred in all * successful segments * @queue: for use by whichever driver currently owns the message * @state: for use by whichever driver currently owns the message * @opt_state: for use by whichever driver currently owns the message * @resources: for resource management when the SPI message is processed * * A @spi_message is used to execute an atomic sequence of data transfers, * each represented by a struct spi_transfer. The sequence is "atomic" * in the sense that no other spi_message may use that SPI bus until that * sequence completes. On some systems, many such sequences can execute as * a single programmed DMA transfer. On all systems, these messages are * queued, and might complete after transactions to other devices. Messages * sent to a given spi_device are always executed in FIFO order. * * The code that submits an spi_message (and its spi_transfers) * to the lower layers is responsible for managing its memory. * Zero-initialize every field you don't set up explicitly, to * insulate against future API updates. After you submit a message * and its transfers, ignore them until its completion callback. */ struct spi_message { struct list_head transfers; struct spi_device *spi; /* spi_optimize_message() was called for this message */ bool pre_optimized; /* __spi_optimize_message() was called for this message */ bool optimized; /* spi_prepare_message() was called for this message */ bool prepared; /* * REVISIT: we might want a flag affecting the behavior of the * last transfer ... allowing things like "read 16 bit length L" * immediately followed by "read L bytes". Basically imposing * a specific message scheduling algorithm. * * Some controller drivers (message-at-a-time queue processing) * could provide that as their default scheduling algorithm. But * others (with multi-message pipelines) could need a flag to * tell them about such special cases. */ /* Completion is reported through a callback */ int status; void (*complete)(void *context); void *context; unsigned frame_length; unsigned actual_length; /* * For optional use by whatever driver currently owns the * spi_message ... between calls to spi_async and then later * complete(), that's the spi_controller controller driver. */ struct list_head queue; void *state; /* * Optional state for use by controller driver between calls to * __spi_optimize_message() and __spi_unoptimize_message(). */ void *opt_state; /* List of spi_res resources when the SPI message is processed */ struct list_head resources; }; static inline void spi_message_init_no_memset(struct spi_message *m) { INIT_LIST_HEAD(&m->transfers); INIT_LIST_HEAD(&m->resources); } static inline void spi_message_init(struct spi_message *m) { memset(m, 0, sizeof *m); spi_message_init_no_memset(m); } static inline void spi_message_add_tail(struct spi_transfer *t, struct spi_message *m) { list_add_tail(&t->transfer_list, &m->transfers); } static inline void spi_transfer_del(struct spi_transfer *t) { list_del(&t->transfer_list); } static inline int spi_transfer_delay_exec(struct spi_transfer *t) { return spi_delay_exec(&t->delay, t); } /** * spi_message_init_with_transfers - Initialize spi_message and append transfers * @m: spi_message to be initialized * @xfers: An array of SPI transfers * @num_xfers: Number of items in the xfer array * * This function initializes the given spi_message and adds each spi_transfer in * the given array to the message. */ static inline void spi_message_init_with_transfers(struct spi_message *m, struct spi_transfer *xfers, unsigned int num_xfers) { unsigned int i; spi_message_init(m); for (i = 0; i < num_xfers; ++i) spi_message_add_tail(&xfers[i], m); } /* * It's fine to embed message and transaction structures in other data * structures so long as you don't free them while they're in use. */ static inline struct spi_message *spi_message_alloc(unsigned ntrans, gfp_t flags) { struct spi_message_with_transfers { struct spi_message m; struct spi_transfer t[]; } *mwt; unsigned i; mwt = kzalloc(struct_size(mwt, t, ntrans), flags); if (!mwt) return NULL; spi_message_init_no_memset(&mwt->m); for (i = 0; i < ntrans; i++) spi_message_add_tail(&mwt->t[i], &mwt->m); return &mwt->m; } static inline void spi_message_free(struct spi_message *m) { kfree(m); } extern int spi_optimize_message(struct spi_device *spi, struct spi_message *msg); extern void spi_unoptimize_message(struct spi_message *msg); extern int devm_spi_optimize_message(struct device *dev, struct spi_device *spi, struct spi_message *msg); extern int spi_setup(struct spi_device *spi); extern int spi_async(struct spi_device *spi, struct spi_message *message); extern int spi_slave_abort(struct spi_device *spi); extern int spi_target_abort(struct spi_device *spi); static inline size_t spi_max_message_size(struct spi_device *spi) { struct spi_controller *ctlr = spi->controller; if (!ctlr->max_message_size) return SIZE_MAX; return ctlr->max_message_size(spi); } static inline size_t spi_max_transfer_size(struct spi_device *spi) { struct spi_controller *ctlr = spi->controller; size_t tr_max = SIZE_MAX; size_t msg_max = spi_max_message_size(spi); if (ctlr->max_transfer_size) tr_max = ctlr->max_transfer_size(spi); /* Transfer size limit must not be greater than message size limit */ return min(tr_max, msg_max); } /** * spi_is_bpw_supported - Check if bits per word is supported * @spi: SPI device * @bpw: Bits per word * * This function checks to see if the SPI controller supports @bpw. * * Returns: * True if @bpw is supported, false otherwise. */ static inline bool spi_is_bpw_supported(struct spi_device *spi, u32 bpw) { u32 bpw_mask = spi->controller->bits_per_word_mask; if (bpw == 8 || (bpw <= 32 && bpw_mask & SPI_BPW_MASK(bpw))) return true; return false; } /** * spi_controller_xfer_timeout - Compute a suitable timeout value * @ctlr: SPI device * @xfer: Transfer descriptor * * Compute a relevant timeout value for the given transfer. We derive the time * that it would take on a single data line and take twice this amount of time * with a minimum of 500ms to avoid false positives on loaded systems. * * Returns: Transfer timeout value in milliseconds. */ static inline unsigned int spi_controller_xfer_timeout(struct spi_controller *ctlr, struct spi_transfer *xfer) { return max(xfer->len * 8 * 2 / (xfer->speed_hz / 1000), 500U); } /*---------------------------------------------------------------------------*/ /* SPI transfer replacement methods which make use of spi_res */ struct spi_replaced_transfers; typedef void (*spi_replaced_release_t)(struct spi_controller *ctlr, struct spi_message *msg, struct spi_replaced_transfers *res); /** * struct spi_replaced_transfers - structure describing the spi_transfer * replacements that have occurred * so that they can get reverted * @release: some extra release code to get executed prior to * releasing this structure * @extradata: pointer to some extra data if requested or NULL * @replaced_transfers: transfers that have been replaced and which need * to get restored * @replaced_after: the transfer after which the @replaced_transfers * are to get re-inserted * @inserted: number of transfers inserted * @inserted_transfers: array of spi_transfers of array-size @inserted, * that have been replacing replaced_transfers * * Note: that @extradata will point to @inserted_transfers[@inserted] * if some extra allocation is requested, so alignment will be the same * as for spi_transfers. */ struct spi_replaced_transfers { spi_replaced_release_t release; void *extradata; struct list_head replaced_transfers; struct list_head *replaced_after; size_t inserted; struct spi_transfer inserted_transfers[]; }; /*---------------------------------------------------------------------------*/ /* SPI transfer transformation methods */ extern int spi_split_transfers_maxsize(struct spi_controller *ctlr, struct spi_message *msg, size_t maxsize); extern int spi_split_transfers_maxwords(struct spi_controller *ctlr, struct spi_message *msg, size_t maxwords); /*---------------------------------------------------------------------------*/ /* * All these synchronous SPI transfer routines are utilities layered * over the core async transfer primitive. Here, "synchronous" means * they will sleep uninterruptibly until the async transfer completes. */ extern int spi_sync(struct spi_device *spi, struct spi_message *message); extern int spi_sync_locked(struct spi_device *spi, struct spi_message *message); extern int spi_bus_lock(struct spi_controller *ctlr); extern int spi_bus_unlock(struct spi_controller *ctlr); /** * spi_sync_transfer - synchronous SPI data transfer * @spi: device with which data will be exchanged * @xfers: An array of spi_transfers * @num_xfers: Number of items in the xfer array * Context: can sleep * * Does a synchronous SPI data transfer of the given spi_transfer array. * * For more specific semantics see spi_sync(). * * Return: zero on success, else a negative error code. */ static inline int spi_sync_transfer(struct spi_device *spi, struct spi_transfer *xfers, unsigned int num_xfers) { struct spi_message msg; spi_message_init_with_transfers(&msg, xfers, num_xfers); return spi_sync(spi, &msg); } /** * spi_write - SPI synchronous write * @spi: device to which data will be written * @buf: data buffer * @len: data buffer size * Context: can sleep * * This function writes the buffer @buf. * Callable only from contexts that can sleep. * * Return: zero on success, else a negative error code. */ static inline int spi_write(struct spi_device *spi, const void *buf, size_t len) { struct spi_transfer t = { .tx_buf = buf, .len = len, }; return spi_sync_transfer(spi, &t, 1); } /** * spi_read - SPI synchronous read * @spi: device from which data will be read * @buf: data buffer * @len: data buffer size * Context: can sleep * * This function reads the buffer @buf. * Callable only from contexts that can sleep. * * Return: zero on success, else a negative error code. */ static inline int spi_read(struct spi_device *spi, void *buf, size_t len) { struct spi_transfer t = { .rx_buf = buf, .len = len, }; return spi_sync_transfer(spi, &t, 1); } /* This copies txbuf and rxbuf data; for small transfers only! */ extern int spi_write_then_read(struct spi_device *spi, const void *txbuf, unsigned n_tx, void *rxbuf, unsigned n_rx); /** * spi_w8r8 - SPI synchronous 8 bit write followed by 8 bit read * @spi: device with which data will be exchanged * @cmd: command to be written before data is read back * Context: can sleep * * Callable only from contexts that can sleep. * * Return: the (unsigned) eight bit number returned by the * device, or else a negative error code. */ static inline ssize_t spi_w8r8(struct spi_device *spi, u8 cmd) { ssize_t status; u8 result; status = spi_write_then_read(spi, &cmd, 1, &result, 1); /* Return negative errno or unsigned value */ return (status < 0) ? status : result; } /** * spi_w8r16 - SPI synchronous 8 bit write followed by 16 bit read * @spi: device with which data will be exchanged * @cmd: command to be written before data is read back * Context: can sleep * * The number is returned in wire-order, which is at least sometimes * big-endian. * * Callable only from contexts that can sleep. * * Return: the (unsigned) sixteen bit number returned by the * device, or else a negative error code. */ static inline ssize_t spi_w8r16(struct spi_device *spi, u8 cmd) { ssize_t status; u16 result; status = spi_write_then_read(spi, &cmd, 1, &result, 2); /* Return negative errno or unsigned value */ return (status < 0) ? status : result; } /** * spi_w8r16be - SPI synchronous 8 bit write followed by 16 bit big-endian read * @spi: device with which data will be exchanged * @cmd: command to be written before data is read back * Context: can sleep * * This function is similar to spi_w8r16, with the exception that it will * convert the read 16 bit data word from big-endian to native endianness. * * Callable only from contexts that can sleep. * * Return: the (unsigned) sixteen bit number returned by the device in CPU * endianness, or else a negative error code. */ static inline ssize_t spi_w8r16be(struct spi_device *spi, u8 cmd) { ssize_t status; __be16 result; status = spi_write_then_read(spi, &cmd, 1, &result, 2); if (status < 0) return status; return be16_to_cpu(result); } /*---------------------------------------------------------------------------*/ /* * INTERFACE between board init code and SPI infrastructure. * * No SPI driver ever sees these SPI device table segments, but * it's how the SPI core (or adapters that get hotplugged) grows * the driver model tree. * * As a rule, SPI devices can't be probed. Instead, board init code * provides a table listing the devices which are present, with enough * information to bind and set up the device's driver. There's basic * support for non-static configurations too; enough to handle adding * parport adapters, or microcontrollers acting as USB-to-SPI bridges. */ /** * struct spi_board_info - board-specific template for a SPI device * @modalias: Initializes spi_device.modalias; identifies the driver. * @platform_data: Initializes spi_device.platform_data; the particular * data stored there is driver-specific. * @swnode: Software node for the device. * @controller_data: Initializes spi_device.controller_data; some * controllers need hints about hardware setup, e.g. for DMA. * @irq: Initializes spi_device.irq; depends on how the board is wired. * @max_speed_hz: Initializes spi_device.max_speed_hz; based on limits * from the chip datasheet and board-specific signal quality issues. * @bus_num: Identifies which spi_controller parents the spi_device; unused * by spi_new_device(), and otherwise depends on board wiring. * @chip_select: Initializes spi_device.chip_select; depends on how * the board is wired. * @mode: Initializes spi_device.mode; based on the chip datasheet, board * wiring (some devices support both 3WIRE and standard modes), and * possibly presence of an inverter in the chipselect path. * * When adding new SPI devices to the device tree, these structures serve * as a partial device template. They hold information which can't always * be determined by drivers. Information that probe() can establish (such * as the default transfer wordsize) is not included here. * * These structures are used in two places. Their primary role is to * be stored in tables of board-specific device descriptors, which are * declared early in board initialization and then used (much later) to * populate a controller's device tree after the that controller's driver * initializes. A secondary (and atypical) role is as a parameter to * spi_new_device() call, which happens after those controller drivers * are active in some dynamic board configuration models. */ struct spi_board_info { /* * The device name and module name are coupled, like platform_bus; * "modalias" is normally the driver name. * * platform_data goes to spi_device.dev.platform_data, * controller_data goes to spi_device.controller_data, * IRQ is copied too. */ char modalias[SPI_NAME_SIZE]; const void *platform_data; const struct software_node *swnode; void *controller_data; int irq; /* Slower signaling on noisy or low voltage boards */ u32 max_speed_hz; /* * bus_num is board specific and matches the bus_num of some * spi_controller that will probably be registered later. * * chip_select reflects how this chip is wired to that master; * it's less than num_chipselect. */ u16 bus_num; u16 chip_select; /* * mode becomes spi_device.mode, and is essential for chips * where the default of SPI_CS_HIGH = 0 is wrong. */ u32 mode; /* * ... may need additional spi_device chip config data here. * avoid stuff protocol drivers can set; but include stuff * needed to behave without being bound to a driver: * - quirks like clock rate mattering when not selected */ }; #ifdef CONFIG_SPI extern int spi_register_board_info(struct spi_board_info const *info, unsigned n); #else /* Board init code may ignore whether SPI is configured or not */ static inline int spi_register_board_info(struct spi_board_info const *info, unsigned n) { return 0; } #endif /* * If you're hotplugging an adapter with devices (parport, USB, etc) * use spi_new_device() to describe each device. You can also call * spi_unregister_device() to start making that device vanish, but * normally that would be handled by spi_unregister_controller(). * * You can also use spi_alloc_device() and spi_add_device() to use a two * stage registration sequence for each spi_device. This gives the caller * some more control over the spi_device structure before it is registered, * but requires that caller to initialize fields that would otherwise * be defined using the board info. */ extern struct spi_device * spi_alloc_device(struct spi_controller *ctlr); extern int spi_add_device(struct spi_device *spi); extern struct spi_device * spi_new_device(struct spi_controller *, struct spi_board_info *); extern void spi_unregister_device(struct spi_device *spi); extern const struct spi_device_id * spi_get_device_id(const struct spi_device *sdev); extern const void * spi_get_device_match_data(const struct spi_device *sdev); static inline bool spi_transfer_is_last(struct spi_controller *ctlr, struct spi_transfer *xfer) { return list_is_last(&xfer->transfer_list, &ctlr->cur_msg->transfers); } #endif /* __LINUX_SPI_H */ |
1 1 1 1 4 3 2 4 3 1 6 1 4 2 4 8 8 8 6 3 1 3 9 1 1 1 1 1 1 1 2 2 1 11 10 1 3 3 1 2 2 3 1 70 1 1 3 9 2 1 2 54 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 21 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 | // SPDX-License-Identifier: GPL-2.0-only /* By Ross Biro 1/23/92 */ /* * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 */ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/mm.h> #include <linux/smp.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/ptrace.h> #include <linux/user.h> #include <linux/elf.h> #include <linux/security.h> #include <linux/audit.h> #include <linux/seccomp.h> #include <linux/signal.h> #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> #include <linux/rcupdate.h> #include <linux/export.h> #include <linux/context_tracking.h> #include <linux/nospec.h> #include <linux/uaccess.h> #include <asm/processor.h> #include <asm/fpu/signal.h> #include <asm/fpu/regset.h> #include <asm/fpu/xstate.h> #include <asm/debugreg.h> #include <asm/ldt.h> #include <asm/desc.h> #include <asm/prctl.h> #include <asm/proto.h> #include <asm/hw_breakpoint.h> #include <asm/traps.h> #include <asm/syscall.h> #include <asm/fsgsbase.h> #include <asm/io_bitmap.h> #include "tls.h" enum x86_regset_32 { REGSET32_GENERAL, REGSET32_FP, REGSET32_XFP, REGSET32_XSTATE, REGSET32_TLS, REGSET32_IOPERM, }; enum x86_regset_64 { REGSET64_GENERAL, REGSET64_FP, REGSET64_IOPERM, REGSET64_XSTATE, REGSET64_SSP, }; #define REGSET_GENERAL \ ({ \ BUILD_BUG_ON((int)REGSET32_GENERAL != (int)REGSET64_GENERAL); \ REGSET32_GENERAL; \ }) #define REGSET_FP \ ({ \ BUILD_BUG_ON((int)REGSET32_FP != (int)REGSET64_FP); \ REGSET32_FP; \ }) struct pt_regs_offset { const char *name; int offset; }; #define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} #define REG_OFFSET_END {.name = NULL, .offset = 0} static const struct pt_regs_offset regoffset_table[] = { #ifdef CONFIG_X86_64 REG_OFFSET_NAME(r15), REG_OFFSET_NAME(r14), REG_OFFSET_NAME(r13), REG_OFFSET_NAME(r12), REG_OFFSET_NAME(r11), REG_OFFSET_NAME(r10), REG_OFFSET_NAME(r9), REG_OFFSET_NAME(r8), #endif REG_OFFSET_NAME(bx), REG_OFFSET_NAME(cx), REG_OFFSET_NAME(dx), REG_OFFSET_NAME(si), REG_OFFSET_NAME(di), REG_OFFSET_NAME(bp), REG_OFFSET_NAME(ax), #ifdef CONFIG_X86_32 REG_OFFSET_NAME(ds), REG_OFFSET_NAME(es), REG_OFFSET_NAME(fs), REG_OFFSET_NAME(gs), #endif REG_OFFSET_NAME(orig_ax), REG_OFFSET_NAME(ip), REG_OFFSET_NAME(cs), REG_OFFSET_NAME(flags), REG_OFFSET_NAME(sp), REG_OFFSET_NAME(ss), REG_OFFSET_END, }; /** * regs_query_register_offset() - query register offset from its name * @name: the name of a register * * regs_query_register_offset() returns the offset of a register in struct * pt_regs from its name. If the name is invalid, this returns -EINVAL; */ int regs_query_register_offset(const char *name) { const struct pt_regs_offset *roff; for (roff = regoffset_table; roff->name != NULL; roff++) if (!strcmp(roff->name, name)) return roff->offset; return -EINVAL; } /** * regs_query_register_name() - query register name from its offset * @offset: the offset of a register in struct pt_regs. * * regs_query_register_name() returns the name of a register from its * offset in struct pt_regs. If the @offset is invalid, this returns NULL; */ const char *regs_query_register_name(unsigned int offset) { const struct pt_regs_offset *roff; for (roff = regoffset_table; roff->name != NULL; roff++) if (roff->offset == offset) return roff->name; return NULL; } /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. */ /* * Determines which flags the user has access to [1 = access, 0 = no access]. */ #define FLAG_MASK_32 ((unsigned long) \ (X86_EFLAGS_CF | X86_EFLAGS_PF | \ X86_EFLAGS_AF | X86_EFLAGS_ZF | \ X86_EFLAGS_SF | X86_EFLAGS_TF | \ X86_EFLAGS_DF | X86_EFLAGS_OF | \ X86_EFLAGS_RF | X86_EFLAGS_AC)) /* * Determines whether a value may be installed in a segment register. */ static inline bool invalid_selector(u16 value) { return unlikely(value != 0 && (value & SEGMENT_RPL_MASK) != USER_RPL); } #ifdef CONFIG_X86_32 #define FLAG_MASK FLAG_MASK_32 static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); return ®s->bx + (regno >> 2); } static u16 get_segment_reg(struct task_struct *task, unsigned long offset) { /* * Returning the value truncates it to 16 bits. */ unsigned int retval; if (offset != offsetof(struct user_regs_struct, gs)) retval = *pt_regs_access(task_pt_regs(task), offset); else { if (task == current) savesegment(gs, retval); else retval = task->thread.gs; } return retval; } static int set_segment_reg(struct task_struct *task, unsigned long offset, u16 value) { if (WARN_ON_ONCE(task == current)) return -EIO; /* * The value argument was already truncated to 16 bits. */ if (invalid_selector(value)) return -EIO; /* * For %cs and %ss we cannot permit a null selector. * We can permit a bogus selector as long as it has USER_RPL. * Null selectors are fine for other segment registers, but * we will never get back to user mode with invalid %cs or %ss * and will take the trap in iret instead. Much code relies * on user_mode() to distinguish a user trap frame (which can * safely use invalid selectors) from a kernel trap frame. */ switch (offset) { case offsetof(struct user_regs_struct, cs): case offsetof(struct user_regs_struct, ss): if (unlikely(value == 0)) return -EIO; fallthrough; default: *pt_regs_access(task_pt_regs(task), offset) = value; break; case offsetof(struct user_regs_struct, gs): task->thread.gs = value; } return 0; } #else /* CONFIG_X86_64 */ #define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long offset) { BUILD_BUG_ON(offsetof(struct pt_regs, r15) != 0); return ®s->r15 + (offset / sizeof(regs->r15)); } static u16 get_segment_reg(struct task_struct *task, unsigned long offset) { /* * Returning the value truncates it to 16 bits. */ unsigned int seg; switch (offset) { case offsetof(struct user_regs_struct, fs): if (task == current) { /* Older gas can't assemble movq %?s,%r?? */ asm("movl %%fs,%0" : "=r" (seg)); return seg; } return task->thread.fsindex; case offsetof(struct user_regs_struct, gs): if (task == current) { asm("movl %%gs,%0" : "=r" (seg)); return seg; } return task->thread.gsindex; case offsetof(struct user_regs_struct, ds): if (task == current) { asm("movl %%ds,%0" : "=r" (seg)); return seg; } return task->thread.ds; case offsetof(struct user_regs_struct, es): if (task == current) { asm("movl %%es,%0" : "=r" (seg)); return seg; } return task->thread.es; case offsetof(struct user_regs_struct, cs): case offsetof(struct user_regs_struct, ss): break; } return *pt_regs_access(task_pt_regs(task), offset); } static int set_segment_reg(struct task_struct *task, unsigned long offset, u16 value) { if (WARN_ON_ONCE(task == current)) return -EIO; /* * The value argument was already truncated to 16 bits. */ if (invalid_selector(value)) return -EIO; /* * Writes to FS and GS will change the stored selector. Whether * this changes the segment base as well depends on whether * FSGSBASE is enabled. */ switch (offset) { case offsetof(struct user_regs_struct,fs): task->thread.fsindex = value; break; case offsetof(struct user_regs_struct,gs): task->thread.gsindex = value; break; case offsetof(struct user_regs_struct,ds): task->thread.ds = value; break; case offsetof(struct user_regs_struct,es): task->thread.es = value; break; /* * Can't actually change these in 64-bit mode. */ case offsetof(struct user_regs_struct,cs): if (unlikely(value == 0)) return -EIO; task_pt_regs(task)->cs = value; break; case offsetof(struct user_regs_struct,ss): if (unlikely(value == 0)) return -EIO; task_pt_regs(task)->ss = value; break; } return 0; } #endif /* CONFIG_X86_32 */ static unsigned long get_flags(struct task_struct *task) { unsigned long retval = task_pt_regs(task)->flags; /* * If the debugger set TF, hide it from the readout. */ if (test_tsk_thread_flag(task, TIF_FORCED_TF)) retval &= ~X86_EFLAGS_TF; return retval; } static int set_flags(struct task_struct *task, unsigned long value) { struct pt_regs *regs = task_pt_regs(task); /* * If the user value contains TF, mark that * it was not "us" (the debugger) that set it. * If not, make sure it stays set if we had. */ if (value & X86_EFLAGS_TF) clear_tsk_thread_flag(task, TIF_FORCED_TF); else if (test_tsk_thread_flag(task, TIF_FORCED_TF)) value |= X86_EFLAGS_TF; regs->flags = (regs->flags & ~FLAG_MASK) | (value & FLAG_MASK); return 0; } static int putreg(struct task_struct *child, unsigned long offset, unsigned long value) { switch (offset) { case offsetof(struct user_regs_struct, cs): case offsetof(struct user_regs_struct, ds): case offsetof(struct user_regs_struct, es): case offsetof(struct user_regs_struct, fs): case offsetof(struct user_regs_struct, gs): case offsetof(struct user_regs_struct, ss): return set_segment_reg(child, offset, value); case offsetof(struct user_regs_struct, flags): return set_flags(child, value); #ifdef CONFIG_X86_64 case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_MAX) return -EIO; x86_fsbase_write_task(child, value); return 0; case offsetof(struct user_regs_struct,gs_base): if (value >= TASK_SIZE_MAX) return -EIO; x86_gsbase_write_task(child, value); return 0; #endif } *pt_regs_access(task_pt_regs(child), offset) = value; return 0; } static unsigned long getreg(struct task_struct *task, unsigned long offset) { switch (offset) { case offsetof(struct user_regs_struct, cs): case offsetof(struct user_regs_struct, ds): case offsetof(struct user_regs_struct, es): case offsetof(struct user_regs_struct, fs): case offsetof(struct user_regs_struct, gs): case offsetof(struct user_regs_struct, ss): return get_segment_reg(task, offset); case offsetof(struct user_regs_struct, flags): return get_flags(task); #ifdef CONFIG_X86_64 case offsetof(struct user_regs_struct, fs_base): return x86_fsbase_read_task(task); case offsetof(struct user_regs_struct, gs_base): return x86_gsbase_read_task(task); #endif } return *pt_regs_access(task_pt_regs(task), offset); } static int genregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { int reg; for (reg = 0; to.left; reg++) membuf_store(&to, getreg(target, reg * sizeof(unsigned long))); return 0; } static int genregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret = 0; if (kbuf) { const unsigned long *k = kbuf; while (count >= sizeof(*k) && !ret) { ret = putreg(target, pos, *k++); count -= sizeof(*k); pos += sizeof(*k); } } else { const unsigned long __user *u = ubuf; while (count >= sizeof(*u) && !ret) { unsigned long word; ret = __get_user(word, u++); if (ret) break; ret = putreg(target, pos, word); count -= sizeof(*u); pos += sizeof(*u); } } return ret; } static void ptrace_triggered(struct perf_event *bp, struct perf_sample_data *data, struct pt_regs *regs) { int i; struct thread_struct *thread = &(current->thread); /* * Store in the virtual DR6 register the fact that the breakpoint * was hit so the thread's debugger will see it. */ for (i = 0; i < HBP_NUM; i++) { if (thread->ptrace_bps[i] == bp) break; } thread->virtual_dr6 |= (DR_TRAP0 << i); } /* * Walk through every ptrace breakpoints for this thread and * build the dr7 value on top of their attributes. * */ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) { int i; int dr7 = 0; struct arch_hw_breakpoint *info; for (i = 0; i < HBP_NUM; i++) { if (bp[i] && !bp[i]->attr.disabled) { info = counter_arch_bp(bp[i]); dr7 |= encode_dr7(i, info->len, info->type); } } return dr7; } static int ptrace_fill_bp_fields(struct perf_event_attr *attr, int len, int type, bool disabled) { int err, bp_len, bp_type; err = arch_bp_generic_fields(len, type, &bp_len, &bp_type); if (!err) { attr->bp_len = bp_len; attr->bp_type = bp_type; attr->disabled = disabled; } return err; } static struct perf_event * ptrace_register_breakpoint(struct task_struct *tsk, int len, int type, unsigned long addr, bool disabled) { struct perf_event_attr attr; int err; ptrace_breakpoint_init(&attr); attr.bp_addr = addr; err = ptrace_fill_bp_fields(&attr, len, type, disabled); if (err) return ERR_PTR(err); return register_user_hw_breakpoint(&attr, ptrace_triggered, NULL, tsk); } static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, int disabled) { struct perf_event_attr attr = bp->attr; int err; err = ptrace_fill_bp_fields(&attr, len, type, disabled); if (err) return err; return modify_user_hw_breakpoint(bp, &attr); } /* * Handle ptrace writes to debug register 7. */ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data) { struct thread_struct *thread = &tsk->thread; unsigned long old_dr7; bool second_pass = false; int i, rc, ret = 0; data &= ~DR_CONTROL_RESERVED; old_dr7 = ptrace_get_dr7(thread->ptrace_bps); restore: rc = 0; for (i = 0; i < HBP_NUM; i++) { unsigned len, type; bool disabled = !decode_dr7(data, i, &len, &type); struct perf_event *bp = thread->ptrace_bps[i]; if (!bp) { if (disabled) continue; bp = ptrace_register_breakpoint(tsk, len, type, 0, disabled); if (IS_ERR(bp)) { rc = PTR_ERR(bp); break; } thread->ptrace_bps[i] = bp; continue; } rc = ptrace_modify_breakpoint(bp, len, type, disabled); if (rc) break; } /* Restore if the first pass failed, second_pass shouldn't fail. */ if (rc && !WARN_ON(second_pass)) { ret = rc; data = old_dr7; second_pass = true; goto restore; } return ret; } /* * Handle PTRACE_PEEKUSR calls for the debug register area. */ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n) { struct thread_struct *thread = &tsk->thread; unsigned long val = 0; if (n < HBP_NUM) { int index = array_index_nospec(n, HBP_NUM); struct perf_event *bp = thread->ptrace_bps[index]; if (bp) val = bp->hw.info.address; } else if (n == 6) { val = thread->virtual_dr6 ^ DR6_RESERVED; /* Flip back to arch polarity */ } else if (n == 7) { val = thread->ptrace_dr7; } return val; } static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, unsigned long addr) { struct thread_struct *t = &tsk->thread; struct perf_event *bp = t->ptrace_bps[nr]; int err = 0; if (!bp) { /* * Put stub len and type to create an inactive but correct bp. * * CHECKME: the previous code returned -EIO if the addr wasn't * a valid task virtual addr. The new one will return -EINVAL in * this case. * -EINVAL may be what we want for in-kernel breakpoints users, * but -EIO looks better for ptrace, since we refuse a register * writing for the user. And anyway this is the previous * behaviour. */ bp = ptrace_register_breakpoint(tsk, X86_BREAKPOINT_LEN_1, X86_BREAKPOINT_WRITE, addr, true); if (IS_ERR(bp)) err = PTR_ERR(bp); else t->ptrace_bps[nr] = bp; } else { struct perf_event_attr attr = bp->attr; attr.bp_addr = addr; err = modify_user_hw_breakpoint(bp, &attr); } return err; } /* * Handle PTRACE_POKEUSR calls for the debug register area. */ static int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) { struct thread_struct *thread = &tsk->thread; /* There are no DR4 or DR5 registers */ int rc = -EIO; if (n < HBP_NUM) { rc = ptrace_set_breakpoint_addr(tsk, n, val); } else if (n == 6) { thread->virtual_dr6 = val ^ DR6_RESERVED; /* Flip to positive polarity */ rc = 0; } else if (n == 7) { rc = ptrace_write_dr7(tsk, val); if (!rc) thread->ptrace_dr7 = val; } return rc; } /* * These access the current or another (stopped) task's io permission * bitmap for debugging or core dump. */ static int ioperm_active(struct task_struct *target, const struct user_regset *regset) { struct io_bitmap *iobm = target->thread.io_bitmap; return iobm ? DIV_ROUND_UP(iobm->max, regset->size) : 0; } static int ioperm_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { struct io_bitmap *iobm = target->thread.io_bitmap; if (!iobm) return -ENXIO; return membuf_write(&to, iobm->bitmap, IO_BITMAP_BYTES); } /* * Called by kernel/ptrace.c when detaching.. * * Make sure the single step bit is not set. */ void ptrace_disable(struct task_struct *child) { user_disable_single_step(child); } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION static const struct user_regset_view user_x86_32_view; /* Initialized below. */ #endif #ifdef CONFIG_X86_64 static const struct user_regset_view user_x86_64_view; /* Initialized below. */ #endif long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data) { int ret; unsigned long __user *datap = (unsigned long __user *)data; #ifdef CONFIG_X86_64 /* This is native 64-bit ptrace() */ const struct user_regset_view *regset_view = &user_x86_64_view; #else /* This is native 32-bit ptrace() */ const struct user_regset_view *regset_view = &user_x86_32_view; #endif switch (request) { /* read the word at location addr in the USER area. */ case PTRACE_PEEKUSR: { unsigned long tmp; ret = -EIO; if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) break; tmp = 0; /* Default return condition */ if (addr < sizeof(struct user_regs_struct)) tmp = getreg(child, addr); else if (addr >= offsetof(struct user, u_debugreg[0]) && addr <= offsetof(struct user, u_debugreg[7])) { addr -= offsetof(struct user, u_debugreg[0]); tmp = ptrace_get_debugreg(child, addr / sizeof(data)); } ret = put_user(tmp, datap); break; } case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ ret = -EIO; if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user)) break; if (addr < sizeof(struct user_regs_struct)) ret = putreg(child, addr, data); else if (addr >= offsetof(struct user, u_debugreg[0]) && addr <= offsetof(struct user, u_debugreg[7])) { addr -= offsetof(struct user, u_debugreg[0]); ret = ptrace_set_debugreg(child, addr / sizeof(data), data); } break; case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, regset_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user(child, regset_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); #ifdef CONFIG_X86_32 case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ return copy_regset_to_user(child, &user_x86_32_view, REGSET32_XFP, 0, sizeof(struct user_fxsr_struct), datap) ? -EIO : 0; case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ return copy_regset_from_user(child, &user_x86_32_view, REGSET32_XFP, 0, sizeof(struct user_fxsr_struct), datap) ? -EIO : 0; #endif #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION case PTRACE_GET_THREAD_AREA: if ((int) addr < 0) return -EIO; ret = do_get_thread_area(child, addr, (struct user_desc __user *)data); break; case PTRACE_SET_THREAD_AREA: if ((int) addr < 0) return -EIO; ret = do_set_thread_area(child, addr, (struct user_desc __user *)data, 0); break; #endif #ifdef CONFIG_X86_64 /* normal 64bit interface to access TLS data. Works just like arch_prctl, except that the arguments are reversed. */ case PTRACE_ARCH_PRCTL: ret = do_arch_prctl_64(child, data, addr); break; #endif default: ret = ptrace_request(child, request, addr, data); break; } return ret; } #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> #include <linux/syscalls.h> #include <asm/ia32.h> #include <asm/user32.h> #define R32(l,q) \ case offsetof(struct user32, regs.l): \ regs->q = value; break #define SEG32(rs) \ case offsetof(struct user32, regs.rs): \ return set_segment_reg(child, \ offsetof(struct user_regs_struct, rs), \ value); \ break static int putreg32(struct task_struct *child, unsigned regno, u32 value) { struct pt_regs *regs = task_pt_regs(child); int ret; switch (regno) { SEG32(cs); SEG32(ds); SEG32(es); /* * A 32-bit ptracer on a 64-bit kernel expects that writing * FS or GS will also update the base. This is needed for * operations like PTRACE_SETREGS to fully restore a saved * CPU state. */ case offsetof(struct user32, regs.fs): ret = set_segment_reg(child, offsetof(struct user_regs_struct, fs), value); if (ret == 0) child->thread.fsbase = x86_fsgsbase_read_task(child, value); return ret; case offsetof(struct user32, regs.gs): ret = set_segment_reg(child, offsetof(struct user_regs_struct, gs), value); if (ret == 0) child->thread.gsbase = x86_fsgsbase_read_task(child, value); return ret; SEG32(ss); R32(ebx, bx); R32(ecx, cx); R32(edx, dx); R32(edi, di); R32(esi, si); R32(ebp, bp); R32(eax, ax); R32(eip, ip); R32(esp, sp); case offsetof(struct user32, regs.orig_eax): /* * Warning: bizarre corner case fixup here. A 32-bit * debugger setting orig_eax to -1 wants to disable * syscall restart. Make sure that the syscall * restart code sign-extends orig_ax. Also make sure * we interpret the -ERESTART* codes correctly if * loaded into regs->ax in case the task is not * actually still sitting at the exit from a 32-bit * syscall with TS_COMPAT still set. */ regs->orig_ax = value; if (syscall_get_nr(child, regs) != -1) child->thread_info.status |= TS_I386_REGS_POKED; break; case offsetof(struct user32, regs.eflags): return set_flags(child, value); case offsetof(struct user32, u_debugreg[0]) ... offsetof(struct user32, u_debugreg[7]): regno -= offsetof(struct user32, u_debugreg[0]); return ptrace_set_debugreg(child, regno / 4, value); default: if (regno > sizeof(struct user32) || (regno & 3)) return -EIO; /* * Other dummy fields in the virtual user structure * are ignored */ break; } return 0; } #undef R32 #undef SEG32 #define R32(l,q) \ case offsetof(struct user32, regs.l): \ *val = regs->q; break #define SEG32(rs) \ case offsetof(struct user32, regs.rs): \ *val = get_segment_reg(child, \ offsetof(struct user_regs_struct, rs)); \ break static int getreg32(struct task_struct *child, unsigned regno, u32 *val) { struct pt_regs *regs = task_pt_regs(child); switch (regno) { SEG32(ds); SEG32(es); SEG32(fs); SEG32(gs); R32(cs, cs); R32(ss, ss); R32(ebx, bx); R32(ecx, cx); R32(edx, dx); R32(edi, di); R32(esi, si); R32(ebp, bp); R32(eax, ax); R32(orig_eax, orig_ax); R32(eip, ip); R32(esp, sp); case offsetof(struct user32, regs.eflags): *val = get_flags(child); break; case offsetof(struct user32, u_debugreg[0]) ... offsetof(struct user32, u_debugreg[7]): regno -= offsetof(struct user32, u_debugreg[0]); *val = ptrace_get_debugreg(child, regno / 4); break; default: if (regno > sizeof(struct user32) || (regno & 3)) return -EIO; /* * Other dummy fields in the virtual user structure * are ignored */ *val = 0; break; } return 0; } #undef R32 #undef SEG32 static int genregs32_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { int reg; for (reg = 0; to.left; reg++) { u32 val; getreg32(target, reg * 4, &val); membuf_store(&to, val); } return 0; } static int genregs32_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { int ret = 0; if (kbuf) { const compat_ulong_t *k = kbuf; while (count >= sizeof(*k) && !ret) { ret = putreg32(target, pos, *k++); count -= sizeof(*k); pos += sizeof(*k); } } else { const compat_ulong_t __user *u = ubuf; while (count >= sizeof(*u) && !ret) { compat_ulong_t word; ret = __get_user(word, u++); if (ret) break; ret = putreg32(target, pos, word); count -= sizeof(*u); pos += sizeof(*u); } } return ret; } static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { unsigned long addr = caddr; unsigned long data = cdata; void __user *datap = compat_ptr(data); int ret; __u32 val; switch (request) { case PTRACE_PEEKUSR: ret = getreg32(child, addr, &val); if (ret == 0) ret = put_user(val, (__u32 __user *)datap); break; case PTRACE_POKEUSR: ret = putreg32(child, addr, data); break; case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, &user_x86_32_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct32), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, &user_x86_32_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct32), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, &user_x86_32_view, REGSET_FP, 0, sizeof(struct user_i387_ia32_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user( child, &user_x86_32_view, REGSET_FP, 0, sizeof(struct user_i387_ia32_struct), datap); case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ return copy_regset_to_user(child, &user_x86_32_view, REGSET32_XFP, 0, sizeof(struct user32_fxsr_struct), datap); case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ return copy_regset_from_user(child, &user_x86_32_view, REGSET32_XFP, 0, sizeof(struct user32_fxsr_struct), datap); case PTRACE_GET_THREAD_AREA: case PTRACE_SET_THREAD_AREA: return arch_ptrace(child, request, addr, data); default: return compat_ptrace_request(child, request, addr, data); } return ret; } #endif /* CONFIG_IA32_EMULATION */ #ifdef CONFIG_X86_X32_ABI static long x32_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { unsigned long addr = caddr; unsigned long data = cdata; void __user *datap = compat_ptr(data); int ret; switch (request) { /* Read 32bits at location addr in the USER area. Only allow to return the lower 32bits of segment and debug registers. */ case PTRACE_PEEKUSR: { u32 tmp; ret = -EIO; if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || addr < offsetof(struct user_regs_struct, cs)) break; tmp = 0; /* Default return condition */ if (addr < sizeof(struct user_regs_struct)) tmp = getreg(child, addr); else if (addr >= offsetof(struct user, u_debugreg[0]) && addr <= offsetof(struct user, u_debugreg[7])) { addr -= offsetof(struct user, u_debugreg[0]); tmp = ptrace_get_debugreg(child, addr / sizeof(data)); } ret = put_user(tmp, (__u32 __user *)datap); break; } /* Write the word at location addr in the USER area. Only allow to update segment and debug registers with the upper 32bits zero-extended. */ case PTRACE_POKEUSR: ret = -EIO; if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user) || addr < offsetof(struct user_regs_struct, cs)) break; if (addr < sizeof(struct user_regs_struct)) ret = putreg(child, addr, data); else if (addr >= offsetof(struct user, u_debugreg[0]) && addr <= offsetof(struct user, u_debugreg[7])) { addr -= offsetof(struct user, u_debugreg[0]); ret = ptrace_set_debugreg(child, addr / sizeof(data), data); } break; case PTRACE_GETREGS: /* Get all gp regs from the child. */ return copy_regset_to_user(child, &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_SETREGS: /* Set all gp regs in the child. */ return copy_regset_from_user(child, &user_x86_64_view, REGSET_GENERAL, 0, sizeof(struct user_regs_struct), datap); case PTRACE_GETFPREGS: /* Get the child FPU state. */ return copy_regset_to_user(child, &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); case PTRACE_SETFPREGS: /* Set the child FPU state. */ return copy_regset_from_user(child, &user_x86_64_view, REGSET_FP, 0, sizeof(struct user_i387_struct), datap); default: return compat_ptrace_request(child, request, addr, data); } return ret; } #endif #ifdef CONFIG_COMPAT long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { #ifdef CONFIG_X86_X32_ABI if (!in_ia32_syscall()) return x32_arch_ptrace(child, request, caddr, cdata); #endif #ifdef CONFIG_IA32_EMULATION return ia32_arch_ptrace(child, request, caddr, cdata); #else return 0; #endif } #endif /* CONFIG_COMPAT */ #ifdef CONFIG_X86_64 static struct user_regset x86_64_regsets[] __ro_after_init = { [REGSET64_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct) / sizeof(long), .size = sizeof(long), .align = sizeof(long), .regset_get = genregs_get, .set = genregs_set }, [REGSET64_FP] = { .core_note_type = NT_PRFPREG, .n = sizeof(struct fxregs_state) / sizeof(long), .size = sizeof(long), .align = sizeof(long), .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, [REGSET64_XSTATE] = { .core_note_type = NT_X86_XSTATE, .size = sizeof(u64), .align = sizeof(u64), .active = xstateregs_active, .regset_get = xstateregs_get, .set = xstateregs_set }, [REGSET64_IOPERM] = { .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_LONGS, .size = sizeof(long), .align = sizeof(long), .active = ioperm_active, .regset_get = ioperm_get }, #ifdef CONFIG_X86_USER_SHADOW_STACK [REGSET64_SSP] = { .core_note_type = NT_X86_SHSTK, .n = 1, .size = sizeof(u64), .align = sizeof(u64), .active = ssp_active, .regset_get = ssp_get, .set = ssp_set }, #endif }; static const struct user_regset_view user_x86_64_view = { .name = "x86_64", .e_machine = EM_X86_64, .regsets = x86_64_regsets, .n = ARRAY_SIZE(x86_64_regsets) }; #else /* CONFIG_X86_32 */ #define user_regs_struct32 user_regs_struct #define genregs32_get genregs_get #define genregs32_set genregs_set #endif /* CONFIG_X86_64 */ #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION static struct user_regset x86_32_regsets[] __ro_after_init = { [REGSET32_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct32) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .regset_get = genregs32_get, .set = genregs32_set }, [REGSET32_FP] = { .core_note_type = NT_PRFPREG, .n = sizeof(struct user_i387_ia32_struct) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .active = regset_fpregs_active, .regset_get = fpregs_get, .set = fpregs_set }, [REGSET32_XFP] = { .core_note_type = NT_PRXFPREG, .n = sizeof(struct fxregs_state) / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .active = regset_xregset_fpregs_active, .regset_get = xfpregs_get, .set = xfpregs_set }, [REGSET32_XSTATE] = { .core_note_type = NT_X86_XSTATE, .size = sizeof(u64), .align = sizeof(u64), .active = xstateregs_active, .regset_get = xstateregs_get, .set = xstateregs_set }, [REGSET32_TLS] = { .core_note_type = NT_386_TLS, .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, .size = sizeof(struct user_desc), .align = sizeof(struct user_desc), .active = regset_tls_active, .regset_get = regset_tls_get, .set = regset_tls_set }, [REGSET32_IOPERM] = { .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_BYTES / sizeof(u32), .size = sizeof(u32), .align = sizeof(u32), .active = ioperm_active, .regset_get = ioperm_get }, }; static const struct user_regset_view user_x86_32_view = { .name = "i386", .e_machine = EM_386, .regsets = x86_32_regsets, .n = ARRAY_SIZE(x86_32_regsets) }; #endif /* * This represents bytes 464..511 in the memory layout exported through * the REGSET_XSTATE interface. */ u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) { #ifdef CONFIG_X86_64 x86_64_regsets[REGSET64_XSTATE].n = size / sizeof(u64); #endif #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION x86_32_regsets[REGSET32_XSTATE].n = size / sizeof(u64); #endif xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; } /* * This is used by the core dump code to decide which regset to dump. The * core dump code writes out the resulting .e_machine and the corresponding * regsets. This is suboptimal if the task is messing around with its CS.L * field, but at worst the core dump will end up missing some information. * * Unfortunately, it is also used by the broken PTRACE_GETREGSET and * PTRACE_SETREGSET APIs. These APIs look at the .regsets field but have * no way to make sure that the e_machine they use matches the caller's * expectations. The result is that the data format returned by * PTRACE_GETREGSET depends on the returned CS field (and even the offset * of the returned CS field depends on its value!) and the data format * accepted by PTRACE_SETREGSET is determined by the old CS value. The * upshot is that it is basically impossible to use these APIs correctly. * * The best way to fix it in the long run would probably be to add new * improved ptrace() APIs to read and write registers reliably, possibly by * allowing userspace to select the ELF e_machine variant that they expect. */ const struct user_regset_view *task_user_regset_view(struct task_struct *task) { #ifdef CONFIG_IA32_EMULATION if (!user_64bit_mode(task_pt_regs(task))) #endif #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION return &user_x86_32_view; #endif #ifdef CONFIG_X86_64 return &user_x86_64_view; #endif } void send_sigtrap(struct pt_regs *regs, int error_code, int si_code) { struct task_struct *tsk = current; tsk->thread.trap_nr = X86_TRAP_DB; tsk->thread.error_code = error_code; /* Send us the fake SIGTRAP */ force_sig_fault(SIGTRAP, si_code, user_mode(regs) ? (void __user *)regs->ip : NULL); } void user_single_step_report(struct pt_regs *regs) { send_sigtrap(regs, 0, TRAP_BRKPT); } |
4 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/xattr_trusted.c * * Vyacheslav Dubeyko <slava@dubeyko.com> * * Handler for trusted extended attributes. */ #include <linux/nls.h> #include "hfsplus_fs.h" #include "xattr.h" static int hfsplus_trusted_getxattr(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { return hfsplus_getxattr(inode, name, buffer, size, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } static int hfsplus_trusted_setxattr(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { return hfsplus_setxattr(inode, name, buffer, size, flags, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } const struct xattr_handler hfsplus_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = hfsplus_trusted_getxattr, .set = hfsplus_trusted_setxattr, }; |
145 145 7 49 89 83 55 64 1 136 58 105 56 39 7 7 25 27 1 4 10 10 27 3 1 6 30 2 39 8 6 4 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 4 9 2 4 3 11 1 11 5 2 1 7 6 6 1 1 4 4 1 4 4 4 4 4 4 5 5 2 1 3 3 1 1 8 2 6 2 5 7 1 1 1 10 2 1 7 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 9 4 1 4 4 4 4 2 2 1 1 1 1 1 1 1 1 1 1 2 2 2 1 1 1 2 5 4 5 4 4 3 63 5 5 5 5 27 5 4 4 4 7 3 7 18 16 1 1 3 12 2 2 1 2 2 2 2 2 1 19 2 2 9 2 2 1 1 1 15 2 11 2 1 1 2 2 4 4 6 2 2 2 2 1 1 15 1 2 10 2 13 2 1 8 2 2 1 1 4 3 3 4 4 5 12 12 11 11 174 175 39 12 31 135 19 129 2 130 130 161 31 2 128 161 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 | // SPDX-License-Identifier: GPL-2.0-only /* * * Authors: * Alexander Aring <aar@pengutronix.de> * * Based on: net/wireless/nl80211.c */ #include <linux/rtnetlink.h> #include <net/cfg802154.h> #include <net/genetlink.h> #include <net/mac802154.h> #include <net/netlink.h> #include <net/nl802154.h> #include <net/sock.h> #include "nl802154.h" #include "rdev-ops.h" #include "core.h" /* the netlink family */ static struct genl_family nl802154_fam; /* multicast groups */ enum nl802154_multicast_groups { NL802154_MCGRP_CONFIG, NL802154_MCGRP_SCAN, }; static const struct genl_multicast_group nl802154_mcgrps[] = { [NL802154_MCGRP_CONFIG] = { .name = "config", }, [NL802154_MCGRP_SCAN] = { .name = "scan", }, }; /* returns ERR_PTR values */ static struct wpan_dev * __cfg802154_wpan_dev_from_attrs(struct net *netns, struct nlattr **attrs) { struct cfg802154_registered_device *rdev; struct wpan_dev *result = NULL; bool have_ifidx = attrs[NL802154_ATTR_IFINDEX]; bool have_wpan_dev_id = attrs[NL802154_ATTR_WPAN_DEV]; u64 wpan_dev_id; int wpan_phy_idx = -1; int ifidx = -1; ASSERT_RTNL(); if (!have_ifidx && !have_wpan_dev_id) return ERR_PTR(-EINVAL); if (have_ifidx) ifidx = nla_get_u32(attrs[NL802154_ATTR_IFINDEX]); if (have_wpan_dev_id) { wpan_dev_id = nla_get_u64(attrs[NL802154_ATTR_WPAN_DEV]); wpan_phy_idx = wpan_dev_id >> 32; } list_for_each_entry(rdev, &cfg802154_rdev_list, list) { struct wpan_dev *wpan_dev; if (wpan_phy_net(&rdev->wpan_phy) != netns) continue; if (have_wpan_dev_id && rdev->wpan_phy_idx != wpan_phy_idx) continue; list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) { if (have_ifidx && wpan_dev->netdev && wpan_dev->netdev->ifindex == ifidx) { result = wpan_dev; break; } if (have_wpan_dev_id && wpan_dev->identifier == (u32)wpan_dev_id) { result = wpan_dev; break; } } if (result) break; } if (result) return result; return ERR_PTR(-ENODEV); } static struct cfg802154_registered_device * __cfg802154_rdev_from_attrs(struct net *netns, struct nlattr **attrs) { struct cfg802154_registered_device *rdev = NULL, *tmp; struct net_device *netdev; ASSERT_RTNL(); if (!attrs[NL802154_ATTR_WPAN_PHY] && !attrs[NL802154_ATTR_IFINDEX] && !attrs[NL802154_ATTR_WPAN_DEV]) return ERR_PTR(-EINVAL); if (attrs[NL802154_ATTR_WPAN_PHY]) rdev = cfg802154_rdev_by_wpan_phy_idx( nla_get_u32(attrs[NL802154_ATTR_WPAN_PHY])); if (attrs[NL802154_ATTR_WPAN_DEV]) { u64 wpan_dev_id = nla_get_u64(attrs[NL802154_ATTR_WPAN_DEV]); struct wpan_dev *wpan_dev; bool found = false; tmp = cfg802154_rdev_by_wpan_phy_idx(wpan_dev_id >> 32); if (tmp) { /* make sure wpan_dev exists */ list_for_each_entry(wpan_dev, &tmp->wpan_dev_list, list) { if (wpan_dev->identifier != (u32)wpan_dev_id) continue; found = true; break; } if (!found) tmp = NULL; if (rdev && tmp != rdev) return ERR_PTR(-EINVAL); rdev = tmp; } } if (attrs[NL802154_ATTR_IFINDEX]) { int ifindex = nla_get_u32(attrs[NL802154_ATTR_IFINDEX]); netdev = __dev_get_by_index(netns, ifindex); if (netdev) { if (netdev->ieee802154_ptr) tmp = wpan_phy_to_rdev( netdev->ieee802154_ptr->wpan_phy); else tmp = NULL; /* not wireless device -- return error */ if (!tmp) return ERR_PTR(-EINVAL); /* mismatch -- return error */ if (rdev && tmp != rdev) return ERR_PTR(-EINVAL); rdev = tmp; } } if (!rdev) return ERR_PTR(-ENODEV); if (netns != wpan_phy_net(&rdev->wpan_phy)) return ERR_PTR(-ENODEV); return rdev; } /* This function returns a pointer to the driver * that the genl_info item that is passed refers to. * * The result of this can be a PTR_ERR and hence must * be checked with IS_ERR() for errors. */ static struct cfg802154_registered_device * cfg802154_get_dev_from_info(struct net *netns, struct genl_info *info) { return __cfg802154_rdev_from_attrs(netns, info->attrs); } /* policy for the attributes */ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { [NL802154_ATTR_WPAN_PHY] = { .type = NLA_U32 }, [NL802154_ATTR_WPAN_PHY_NAME] = { .type = NLA_NUL_STRING, .len = 20-1 }, [NL802154_ATTR_IFINDEX] = { .type = NLA_U32 }, [NL802154_ATTR_IFTYPE] = { .type = NLA_U32 }, [NL802154_ATTR_IFNAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ-1 }, [NL802154_ATTR_WPAN_DEV] = { .type = NLA_U64 }, [NL802154_ATTR_PAGE] = NLA_POLICY_MAX(NLA_U8, IEEE802154_MAX_PAGE), [NL802154_ATTR_CHANNEL] = NLA_POLICY_MAX(NLA_U8, IEEE802154_MAX_CHANNEL), [NL802154_ATTR_TX_POWER] = { .type = NLA_S32, }, [NL802154_ATTR_CCA_MODE] = { .type = NLA_U32, }, [NL802154_ATTR_CCA_OPT] = { .type = NLA_U32, }, [NL802154_ATTR_CCA_ED_LEVEL] = { .type = NLA_S32, }, [NL802154_ATTR_SUPPORTED_CHANNEL] = { .type = NLA_U32, }, [NL802154_ATTR_PAN_ID] = { .type = NLA_U16, }, [NL802154_ATTR_EXTENDED_ADDR] = { .type = NLA_U64 }, [NL802154_ATTR_SHORT_ADDR] = { .type = NLA_U16, }, [NL802154_ATTR_MIN_BE] = { .type = NLA_U8, }, [NL802154_ATTR_MAX_BE] = { .type = NLA_U8, }, [NL802154_ATTR_MAX_CSMA_BACKOFFS] = { .type = NLA_U8, }, [NL802154_ATTR_MAX_FRAME_RETRIES] = { .type = NLA_S8, }, [NL802154_ATTR_LBT_MODE] = { .type = NLA_U8, }, [NL802154_ATTR_WPAN_PHY_CAPS] = { .type = NLA_NESTED }, [NL802154_ATTR_SUPPORTED_COMMANDS] = { .type = NLA_NESTED }, [NL802154_ATTR_ACKREQ_DEFAULT] = { .type = NLA_U8 }, [NL802154_ATTR_PID] = { .type = NLA_U32 }, [NL802154_ATTR_NETNS_FD] = { .type = NLA_U32 }, [NL802154_ATTR_COORDINATOR] = { .type = NLA_NESTED }, [NL802154_ATTR_SCAN_TYPE] = NLA_POLICY_RANGE(NLA_U8, NL802154_SCAN_ED, NL802154_SCAN_RIT_PASSIVE), [NL802154_ATTR_SCAN_CHANNELS] = NLA_POLICY_MASK(NLA_U32, GENMASK(IEEE802154_MAX_CHANNEL, 0)), [NL802154_ATTR_SCAN_PREAMBLE_CODES] = { .type = NLA_REJECT }, [NL802154_ATTR_SCAN_MEAN_PRF] = { .type = NLA_REJECT }, [NL802154_ATTR_SCAN_DURATION] = NLA_POLICY_MAX(NLA_U8, IEEE802154_MAX_SCAN_DURATION), [NL802154_ATTR_SCAN_DONE_REASON] = NLA_POLICY_RANGE(NLA_U8, NL802154_SCAN_DONE_REASON_FINISHED, NL802154_SCAN_DONE_REASON_ABORTED), [NL802154_ATTR_BEACON_INTERVAL] = NLA_POLICY_MAX(NLA_U8, IEEE802154_ACTIVE_SCAN_DURATION), [NL802154_ATTR_MAX_ASSOCIATIONS] = { .type = NLA_U32 }, [NL802154_ATTR_PEER] = { .type = NLA_NESTED }, #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL [NL802154_ATTR_SEC_ENABLED] = { .type = NLA_U8, }, [NL802154_ATTR_SEC_OUT_LEVEL] = { .type = NLA_U32, }, [NL802154_ATTR_SEC_OUT_KEY_ID] = { .type = NLA_NESTED, }, [NL802154_ATTR_SEC_FRAME_COUNTER] = { .type = NLA_U32 }, [NL802154_ATTR_SEC_LEVEL] = { .type = NLA_NESTED }, [NL802154_ATTR_SEC_DEVICE] = { .type = NLA_NESTED }, [NL802154_ATTR_SEC_DEVKEY] = { .type = NLA_NESTED }, [NL802154_ATTR_SEC_KEY] = { .type = NLA_NESTED }, #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; static int nl802154_prepare_wpan_dev_dump(struct sk_buff *skb, struct netlink_callback *cb, struct cfg802154_registered_device **rdev, struct wpan_dev **wpan_dev) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); int err; rtnl_lock(); if (!cb->args[0]) { *wpan_dev = __cfg802154_wpan_dev_from_attrs(sock_net(skb->sk), info->info.attrs); if (IS_ERR(*wpan_dev)) { err = PTR_ERR(*wpan_dev); goto out_unlock; } *rdev = wpan_phy_to_rdev((*wpan_dev)->wpan_phy); /* 0 is the first index - add 1 to parse only once */ cb->args[0] = (*rdev)->wpan_phy_idx + 1; cb->args[1] = (*wpan_dev)->identifier; } else { /* subtract the 1 again here */ struct wpan_phy *wpan_phy = wpan_phy_idx_to_wpan_phy(cb->args[0] - 1); struct wpan_dev *tmp; if (!wpan_phy) { err = -ENODEV; goto out_unlock; } *rdev = wpan_phy_to_rdev(wpan_phy); *wpan_dev = NULL; list_for_each_entry(tmp, &(*rdev)->wpan_dev_list, list) { if (tmp->identifier == cb->args[1]) { *wpan_dev = tmp; break; } } if (!*wpan_dev) { err = -ENODEV; goto out_unlock; } } return 0; out_unlock: rtnl_unlock(); return err; } static void nl802154_finish_wpan_dev_dump(struct cfg802154_registered_device *rdev) { rtnl_unlock(); } /* message building helper */ static inline void *nl802154hdr_put(struct sk_buff *skb, u32 portid, u32 seq, int flags, u8 cmd) { /* since there is no private header just add the generic one */ return genlmsg_put(skb, portid, seq, &nl802154_fam, flags, cmd); } static int nl802154_put_flags(struct sk_buff *msg, int attr, u32 mask) { struct nlattr *nl_flags = nla_nest_start_noflag(msg, attr); int i; if (!nl_flags) return -ENOBUFS; i = 0; while (mask) { if ((mask & 1) && nla_put_flag(msg, i)) return -ENOBUFS; mask >>= 1; i++; } nla_nest_end(msg, nl_flags); return 0; } static int nl802154_send_wpan_phy_channels(struct cfg802154_registered_device *rdev, struct sk_buff *msg) { struct nlattr *nl_page; unsigned long page; nl_page = nla_nest_start_noflag(msg, NL802154_ATTR_CHANNELS_SUPPORTED); if (!nl_page) return -ENOBUFS; for (page = 0; page <= IEEE802154_MAX_PAGE; page++) { if (nla_put_u32(msg, NL802154_ATTR_SUPPORTED_CHANNEL, rdev->wpan_phy.supported.channels[page])) return -ENOBUFS; } nla_nest_end(msg, nl_page); return 0; } static int nl802154_put_capabilities(struct sk_buff *msg, struct cfg802154_registered_device *rdev) { const struct wpan_phy_supported *caps = &rdev->wpan_phy.supported; struct nlattr *nl_caps, *nl_channels; int i; nl_caps = nla_nest_start_noflag(msg, NL802154_ATTR_WPAN_PHY_CAPS); if (!nl_caps) return -ENOBUFS; nl_channels = nla_nest_start_noflag(msg, NL802154_CAP_ATTR_CHANNELS); if (!nl_channels) return -ENOBUFS; for (i = 0; i <= IEEE802154_MAX_PAGE; i++) { if (caps->channels[i]) { if (nl802154_put_flags(msg, i, caps->channels[i])) return -ENOBUFS; } } nla_nest_end(msg, nl_channels); if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) { struct nlattr *nl_ed_lvls; nl_ed_lvls = nla_nest_start_noflag(msg, NL802154_CAP_ATTR_CCA_ED_LEVELS); if (!nl_ed_lvls) return -ENOBUFS; for (i = 0; i < caps->cca_ed_levels_size; i++) { if (nla_put_s32(msg, i, caps->cca_ed_levels[i])) return -ENOBUFS; } nla_nest_end(msg, nl_ed_lvls); } if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) { struct nlattr *nl_tx_pwrs; nl_tx_pwrs = nla_nest_start_noflag(msg, NL802154_CAP_ATTR_TX_POWERS); if (!nl_tx_pwrs) return -ENOBUFS; for (i = 0; i < caps->tx_powers_size; i++) { if (nla_put_s32(msg, i, caps->tx_powers[i])) return -ENOBUFS; } nla_nest_end(msg, nl_tx_pwrs); } if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) { if (nl802154_put_flags(msg, NL802154_CAP_ATTR_CCA_MODES, caps->cca_modes) || nl802154_put_flags(msg, NL802154_CAP_ATTR_CCA_OPTS, caps->cca_opts)) return -ENOBUFS; } if (nla_put_u8(msg, NL802154_CAP_ATTR_MIN_MINBE, caps->min_minbe) || nla_put_u8(msg, NL802154_CAP_ATTR_MAX_MINBE, caps->max_minbe) || nla_put_u8(msg, NL802154_CAP_ATTR_MIN_MAXBE, caps->min_maxbe) || nla_put_u8(msg, NL802154_CAP_ATTR_MAX_MAXBE, caps->max_maxbe) || nla_put_u8(msg, NL802154_CAP_ATTR_MIN_CSMA_BACKOFFS, caps->min_csma_backoffs) || nla_put_u8(msg, NL802154_CAP_ATTR_MAX_CSMA_BACKOFFS, caps->max_csma_backoffs) || nla_put_s8(msg, NL802154_CAP_ATTR_MIN_FRAME_RETRIES, caps->min_frame_retries) || nla_put_s8(msg, NL802154_CAP_ATTR_MAX_FRAME_RETRIES, caps->max_frame_retries) || nl802154_put_flags(msg, NL802154_CAP_ATTR_IFTYPES, caps->iftypes) || nla_put_u32(msg, NL802154_CAP_ATTR_LBT, caps->lbt)) return -ENOBUFS; nla_nest_end(msg, nl_caps); return 0; } static int nl802154_send_wpan_phy(struct cfg802154_registered_device *rdev, enum nl802154_commands cmd, struct sk_buff *msg, u32 portid, u32 seq, int flags) { struct nlattr *nl_cmds; void *hdr; int i; hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx) || nla_put_string(msg, NL802154_ATTR_WPAN_PHY_NAME, wpan_phy_name(&rdev->wpan_phy)) || nla_put_u32(msg, NL802154_ATTR_GENERATION, cfg802154_rdev_list_generation)) goto nla_put_failure; if (cmd != NL802154_CMD_NEW_WPAN_PHY) goto finish; /* DUMP PHY PIB */ /* current channel settings */ if (nla_put_u8(msg, NL802154_ATTR_PAGE, rdev->wpan_phy.current_page) || nla_put_u8(msg, NL802154_ATTR_CHANNEL, rdev->wpan_phy.current_channel)) goto nla_put_failure; /* TODO remove this behaviour, we still keep support it for a while * so users can change the behaviour to the new one. */ if (nl802154_send_wpan_phy_channels(rdev, msg)) goto nla_put_failure; /* cca mode */ if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) { if (nla_put_u32(msg, NL802154_ATTR_CCA_MODE, rdev->wpan_phy.cca.mode)) goto nla_put_failure; if (rdev->wpan_phy.cca.mode == NL802154_CCA_ENERGY_CARRIER) { if (nla_put_u32(msg, NL802154_ATTR_CCA_OPT, rdev->wpan_phy.cca.opt)) goto nla_put_failure; } } if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) { if (nla_put_s32(msg, NL802154_ATTR_TX_POWER, rdev->wpan_phy.transmit_power)) goto nla_put_failure; } if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) { if (nla_put_s32(msg, NL802154_ATTR_CCA_ED_LEVEL, rdev->wpan_phy.cca_ed_level)) goto nla_put_failure; } if (nl802154_put_capabilities(msg, rdev)) goto nla_put_failure; nl_cmds = nla_nest_start_noflag(msg, NL802154_ATTR_SUPPORTED_COMMANDS); if (!nl_cmds) goto nla_put_failure; i = 0; #define CMD(op, n) \ do { \ if (rdev->ops->op) { \ i++; \ if (nla_put_u32(msg, i, NL802154_CMD_ ## n)) \ goto nla_put_failure; \ } \ } while (0) CMD(add_virtual_intf, NEW_INTERFACE); CMD(del_virtual_intf, DEL_INTERFACE); CMD(set_channel, SET_CHANNEL); CMD(set_pan_id, SET_PAN_ID); CMD(set_short_addr, SET_SHORT_ADDR); CMD(set_backoff_exponent, SET_BACKOFF_EXPONENT); CMD(set_max_csma_backoffs, SET_MAX_CSMA_BACKOFFS); CMD(set_max_frame_retries, SET_MAX_FRAME_RETRIES); CMD(set_lbt_mode, SET_LBT_MODE); CMD(set_ackreq_default, SET_ACKREQ_DEFAULT); if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER) CMD(set_tx_power, SET_TX_POWER); if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL) CMD(set_cca_ed_level, SET_CCA_ED_LEVEL); if (rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE) CMD(set_cca_mode, SET_CCA_MODE); #undef CMD nla_nest_end(msg, nl_cmds); finish: genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } struct nl802154_dump_wpan_phy_state { s64 filter_wpan_phy; long start; }; static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb, struct netlink_callback *cb, struct nl802154_dump_wpan_phy_state *state) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct nlattr **tb = info->info.attrs; if (tb[NL802154_ATTR_WPAN_PHY]) state->filter_wpan_phy = nla_get_u32(tb[NL802154_ATTR_WPAN_PHY]); if (tb[NL802154_ATTR_WPAN_DEV]) state->filter_wpan_phy = nla_get_u64(tb[NL802154_ATTR_WPAN_DEV]) >> 32; if (tb[NL802154_ATTR_IFINDEX]) { struct net_device *netdev; struct cfg802154_registered_device *rdev; int ifidx = nla_get_u32(tb[NL802154_ATTR_IFINDEX]); netdev = __dev_get_by_index(&init_net, ifidx); if (!netdev) return -ENODEV; if (netdev->ieee802154_ptr) { rdev = wpan_phy_to_rdev( netdev->ieee802154_ptr->wpan_phy); state->filter_wpan_phy = rdev->wpan_phy_idx; } } return 0; } static int nl802154_dump_wpan_phy(struct sk_buff *skb, struct netlink_callback *cb) { int idx = 0, ret; struct nl802154_dump_wpan_phy_state *state = (void *)cb->args[0]; struct cfg802154_registered_device *rdev; rtnl_lock(); if (!state) { state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) { rtnl_unlock(); return -ENOMEM; } state->filter_wpan_phy = -1; ret = nl802154_dump_wpan_phy_parse(skb, cb, state); if (ret) { kfree(state); rtnl_unlock(); return ret; } cb->args[0] = (long)state; } list_for_each_entry(rdev, &cfg802154_rdev_list, list) { if (!net_eq(wpan_phy_net(&rdev->wpan_phy), sock_net(skb->sk))) continue; if (++idx <= state->start) continue; if (state->filter_wpan_phy != -1 && state->filter_wpan_phy != rdev->wpan_phy_idx) continue; /* attempt to fit multiple wpan_phy data chunks into the skb */ ret = nl802154_send_wpan_phy(rdev, NL802154_CMD_NEW_WPAN_PHY, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); if (ret < 0) { if ((ret == -ENOBUFS || ret == -EMSGSIZE) && !skb->len && cb->min_dump_alloc < 4096) { cb->min_dump_alloc = 4096; rtnl_unlock(); return 1; } idx--; break; } break; } rtnl_unlock(); state->start = idx; return skb->len; } static int nl802154_dump_wpan_phy_done(struct netlink_callback *cb) { kfree((void *)cb->args[0]); return 0; } static int nl802154_get_wpan_phy(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; struct cfg802154_registered_device *rdev = info->user_ptr[0]; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; if (nl802154_send_wpan_phy(rdev, NL802154_CMD_NEW_WPAN_PHY, msg, info->snd_portid, info->snd_seq, 0) < 0) { nlmsg_free(msg); return -ENOBUFS; } return genlmsg_reply(msg, info); } static inline u64 wpan_dev_id(struct wpan_dev *wpan_dev) { return (u64)wpan_dev->identifier | ((u64)wpan_phy_to_rdev(wpan_dev->wpan_phy)->wpan_phy_idx << 32); } #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL #include <net/ieee802154_netdev.h> static int ieee802154_llsec_send_key_id(struct sk_buff *msg, const struct ieee802154_llsec_key_id *desc) { struct nlattr *nl_dev_addr; if (nla_put_u32(msg, NL802154_KEY_ID_ATTR_MODE, desc->mode)) return -ENOBUFS; switch (desc->mode) { case NL802154_KEY_ID_MODE_IMPLICIT: nl_dev_addr = nla_nest_start_noflag(msg, NL802154_KEY_ID_ATTR_IMPLICIT); if (!nl_dev_addr) return -ENOBUFS; if (nla_put_le16(msg, NL802154_DEV_ADDR_ATTR_PAN_ID, desc->device_addr.pan_id) || nla_put_u32(msg, NL802154_DEV_ADDR_ATTR_MODE, desc->device_addr.mode)) return -ENOBUFS; switch (desc->device_addr.mode) { case NL802154_DEV_ADDR_SHORT: if (nla_put_le16(msg, NL802154_DEV_ADDR_ATTR_SHORT, desc->device_addr.short_addr)) return -ENOBUFS; break; case NL802154_DEV_ADDR_EXTENDED: if (nla_put_le64(msg, NL802154_DEV_ADDR_ATTR_EXTENDED, desc->device_addr.extended_addr, NL802154_DEV_ADDR_ATTR_PAD)) return -ENOBUFS; break; default: /* userspace should handle unknown */ break; } nla_nest_end(msg, nl_dev_addr); break; case NL802154_KEY_ID_MODE_INDEX: break; case NL802154_KEY_ID_MODE_INDEX_SHORT: /* TODO renmae short_source? */ if (nla_put_le32(msg, NL802154_KEY_ID_ATTR_SOURCE_SHORT, desc->short_source)) return -ENOBUFS; break; case NL802154_KEY_ID_MODE_INDEX_EXTENDED: if (nla_put_le64(msg, NL802154_KEY_ID_ATTR_SOURCE_EXTENDED, desc->extended_source, NL802154_KEY_ID_ATTR_PAD)) return -ENOBUFS; break; default: /* userspace should handle unknown */ break; } /* TODO key_id to key_idx ? Check naming */ if (desc->mode != NL802154_KEY_ID_MODE_IMPLICIT) { if (nla_put_u8(msg, NL802154_KEY_ID_ATTR_INDEX, desc->id)) return -ENOBUFS; } return 0; } static int nl802154_get_llsec_params(struct sk_buff *msg, struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev) { struct nlattr *nl_key_id; struct ieee802154_llsec_params params; int ret; ret = rdev_get_llsec_params(rdev, wpan_dev, ¶ms); if (ret < 0) return ret; if (nla_put_u8(msg, NL802154_ATTR_SEC_ENABLED, params.enabled) || nla_put_u32(msg, NL802154_ATTR_SEC_OUT_LEVEL, params.out_level) || nla_put_be32(msg, NL802154_ATTR_SEC_FRAME_COUNTER, params.frame_counter)) return -ENOBUFS; nl_key_id = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_OUT_KEY_ID); if (!nl_key_id) return -ENOBUFS; ret = ieee802154_llsec_send_key_id(msg, ¶ms.out_key); if (ret < 0) return ret; nla_nest_end(msg, nl_key_id); return 0; } #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ static int nl802154_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flags, struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev) { struct net_device *dev = wpan_dev->netdev; void *hdr; hdr = nl802154hdr_put(msg, portid, seq, flags, NL802154_CMD_NEW_INTERFACE); if (!hdr) return -1; if (dev && (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex) || nla_put_string(msg, NL802154_ATTR_IFNAME, dev->name))) goto nla_put_failure; if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx) || nla_put_u32(msg, NL802154_ATTR_IFTYPE, wpan_dev->iftype) || nla_put_u64_64bit(msg, NL802154_ATTR_WPAN_DEV, wpan_dev_id(wpan_dev), NL802154_ATTR_PAD) || nla_put_u32(msg, NL802154_ATTR_GENERATION, rdev->devlist_generation ^ (cfg802154_rdev_list_generation << 2))) goto nla_put_failure; /* address settings */ if (nla_put_le64(msg, NL802154_ATTR_EXTENDED_ADDR, wpan_dev->extended_addr, NL802154_ATTR_PAD) || nla_put_le16(msg, NL802154_ATTR_SHORT_ADDR, wpan_dev->short_addr) || nla_put_le16(msg, NL802154_ATTR_PAN_ID, wpan_dev->pan_id)) goto nla_put_failure; /* ARET handling */ if (nla_put_s8(msg, NL802154_ATTR_MAX_FRAME_RETRIES, wpan_dev->frame_retries) || nla_put_u8(msg, NL802154_ATTR_MAX_BE, wpan_dev->max_be) || nla_put_u8(msg, NL802154_ATTR_MAX_CSMA_BACKOFFS, wpan_dev->csma_retries) || nla_put_u8(msg, NL802154_ATTR_MIN_BE, wpan_dev->min_be)) goto nla_put_failure; /* listen before transmit */ if (nla_put_u8(msg, NL802154_ATTR_LBT_MODE, wpan_dev->lbt)) goto nla_put_failure; /* ackreq default behaviour */ if (nla_put_u8(msg, NL802154_ATTR_ACKREQ_DEFAULT, wpan_dev->ackreq)) goto nla_put_failure; #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) goto out; if (nl802154_get_llsec_params(msg, rdev, wpan_dev) < 0) goto nla_put_failure; out: #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nl802154_dump_interface(struct sk_buff *skb, struct netlink_callback *cb) { int wp_idx = 0; int if_idx = 0; int wp_start = cb->args[0]; int if_start = cb->args[1]; struct cfg802154_registered_device *rdev; struct wpan_dev *wpan_dev; rtnl_lock(); list_for_each_entry(rdev, &cfg802154_rdev_list, list) { if (!net_eq(wpan_phy_net(&rdev->wpan_phy), sock_net(skb->sk))) continue; if (wp_idx < wp_start) { wp_idx++; continue; } if_idx = 0; list_for_each_entry(wpan_dev, &rdev->wpan_dev_list, list) { if (if_idx < if_start) { if_idx++; continue; } if (nl802154_send_iface(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wpan_dev) < 0) { goto out; } if_idx++; } wp_idx++; } out: rtnl_unlock(); cb->args[0] = wp_idx; cb->args[1] = if_idx; return skb->len; } static int nl802154_get_interface(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct wpan_dev *wdev = info->user_ptr[1]; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; if (nl802154_send_iface(msg, info->snd_portid, info->snd_seq, 0, rdev, wdev) < 0) { nlmsg_free(msg); return -ENOBUFS; } return genlmsg_reply(msg, info); } static int nl802154_new_interface(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; enum nl802154_iftype type = NL802154_IFTYPE_UNSPEC; __le64 extended_addr = cpu_to_le64(0x0000000000000000ULL); /* TODO avoid failing a new interface * creation due to pending removal? */ if (!info->attrs[NL802154_ATTR_IFNAME]) return -EINVAL; if (info->attrs[NL802154_ATTR_IFTYPE]) { type = nla_get_u32(info->attrs[NL802154_ATTR_IFTYPE]); if (type > NL802154_IFTYPE_MAX || !(rdev->wpan_phy.supported.iftypes & BIT(type))) return -EINVAL; } if (info->attrs[NL802154_ATTR_EXTENDED_ADDR]) extended_addr = nla_get_le64(info->attrs[NL802154_ATTR_EXTENDED_ADDR]); if (!rdev->ops->add_virtual_intf) return -EOPNOTSUPP; return rdev_add_virtual_intf(rdev, nla_data(info->attrs[NL802154_ATTR_IFNAME]), NET_NAME_USER, type, extended_addr); } static int nl802154_del_interface(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct wpan_dev *wpan_dev = info->user_ptr[1]; if (!rdev->ops->del_virtual_intf) return -EOPNOTSUPP; /* If we remove a wpan device without a netdev then clear * user_ptr[1] so that nl802154_post_doit won't dereference it * to check if it needs to do dev_put(). Otherwise it crashes * since the wpan_dev has been freed, unlike with a netdev where * we need the dev_put() for the netdev to really be freed. */ if (!wpan_dev->netdev) info->user_ptr[1] = NULL; return rdev_del_virtual_intf(rdev, wpan_dev); } static int nl802154_set_channel(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; u8 channel, page; if (!info->attrs[NL802154_ATTR_PAGE] || !info->attrs[NL802154_ATTR_CHANNEL]) return -EINVAL; page = nla_get_u8(info->attrs[NL802154_ATTR_PAGE]); channel = nla_get_u8(info->attrs[NL802154_ATTR_CHANNEL]); /* check 802.15.4 constraints */ if (!ieee802154_chan_is_valid(&rdev->wpan_phy, page, channel)) return -EINVAL; return rdev_set_channel(rdev, page, channel); } static int nl802154_set_cca_mode(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct wpan_phy_cca cca; if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_MODE)) return -EOPNOTSUPP; if (!info->attrs[NL802154_ATTR_CCA_MODE]) return -EINVAL; cca.mode = nla_get_u32(info->attrs[NL802154_ATTR_CCA_MODE]); /* checking 802.15.4 constraints */ if (cca.mode < NL802154_CCA_ENERGY || cca.mode > NL802154_CCA_ATTR_MAX || !(rdev->wpan_phy.supported.cca_modes & BIT(cca.mode))) return -EINVAL; if (cca.mode == NL802154_CCA_ENERGY_CARRIER) { if (!info->attrs[NL802154_ATTR_CCA_OPT]) return -EINVAL; cca.opt = nla_get_u32(info->attrs[NL802154_ATTR_CCA_OPT]); if (cca.opt > NL802154_CCA_OPT_ATTR_MAX || !(rdev->wpan_phy.supported.cca_opts & BIT(cca.opt))) return -EINVAL; } return rdev_set_cca_mode(rdev, &cca); } static int nl802154_set_cca_ed_level(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; s32 ed_level; int i; if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_CCA_ED_LEVEL)) return -EOPNOTSUPP; if (!info->attrs[NL802154_ATTR_CCA_ED_LEVEL]) return -EINVAL; ed_level = nla_get_s32(info->attrs[NL802154_ATTR_CCA_ED_LEVEL]); for (i = 0; i < rdev->wpan_phy.supported.cca_ed_levels_size; i++) { if (ed_level == rdev->wpan_phy.supported.cca_ed_levels[i]) return rdev_set_cca_ed_level(rdev, ed_level); } return -EINVAL; } static int nl802154_set_tx_power(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; s32 power; int i; if (!(rdev->wpan_phy.flags & WPAN_PHY_FLAG_TXPOWER)) return -EOPNOTSUPP; if (!info->attrs[NL802154_ATTR_TX_POWER]) return -EINVAL; power = nla_get_s32(info->attrs[NL802154_ATTR_TX_POWER]); for (i = 0; i < rdev->wpan_phy.supported.tx_powers_size; i++) { if (power == rdev->wpan_phy.supported.tx_powers[i]) return rdev_set_tx_power(rdev, power); } return -EINVAL; } static int nl802154_set_pan_id(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; __le16 pan_id; /* conflict here while tx/rx calls */ if (netif_running(dev)) return -EBUSY; if (wpan_dev->lowpan_dev) { if (netif_running(wpan_dev->lowpan_dev)) return -EBUSY; } /* don't change address fields on monitor */ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR || !info->attrs[NL802154_ATTR_PAN_ID]) return -EINVAL; pan_id = nla_get_le16(info->attrs[NL802154_ATTR_PAN_ID]); /* Only allow changing the PAN ID when the device has no more * associations ongoing to avoid confusing peers. */ if (cfg802154_device_is_associated(wpan_dev)) { NL_SET_ERR_MSG(info->extack, "Existing associations, changing PAN ID forbidden"); return -EINVAL; } return rdev_set_pan_id(rdev, wpan_dev, pan_id); } static int nl802154_set_short_addr(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; __le16 short_addr; /* conflict here while tx/rx calls */ if (netif_running(dev)) return -EBUSY; if (wpan_dev->lowpan_dev) { if (netif_running(wpan_dev->lowpan_dev)) return -EBUSY; } /* don't change address fields on monitor */ if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR || !info->attrs[NL802154_ATTR_SHORT_ADDR]) return -EINVAL; short_addr = nla_get_le16(info->attrs[NL802154_ATTR_SHORT_ADDR]); /* The short address only has a meaning when part of a PAN, after a * proper association procedure. However, we want to still offer the * possibility to create static networks so changing the short address * is only allowed when not already associated to other devices with * the official handshake. */ if (cfg802154_device_is_associated(wpan_dev)) { NL_SET_ERR_MSG(info->extack, "Existing associations, changing short address forbidden"); return -EINVAL; } return rdev_set_short_addr(rdev, wpan_dev, short_addr); } static int nl802154_set_backoff_exponent(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; u8 min_be, max_be; /* should be set on netif open inside phy settings */ if (netif_running(dev)) return -EBUSY; if (!info->attrs[NL802154_ATTR_MIN_BE] || !info->attrs[NL802154_ATTR_MAX_BE]) return -EINVAL; min_be = nla_get_u8(info->attrs[NL802154_ATTR_MIN_BE]); max_be = nla_get_u8(info->attrs[NL802154_ATTR_MAX_BE]); /* check 802.15.4 constraints */ if (min_be < rdev->wpan_phy.supported.min_minbe || min_be > rdev->wpan_phy.supported.max_minbe || max_be < rdev->wpan_phy.supported.min_maxbe || max_be > rdev->wpan_phy.supported.max_maxbe || min_be > max_be) return -EINVAL; return rdev_set_backoff_exponent(rdev, wpan_dev, min_be, max_be); } static int nl802154_set_max_csma_backoffs(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; u8 max_csma_backoffs; /* conflict here while other running iface settings */ if (netif_running(dev)) return -EBUSY; if (!info->attrs[NL802154_ATTR_MAX_CSMA_BACKOFFS]) return -EINVAL; max_csma_backoffs = nla_get_u8( info->attrs[NL802154_ATTR_MAX_CSMA_BACKOFFS]); /* check 802.15.4 constraints */ if (max_csma_backoffs < rdev->wpan_phy.supported.min_csma_backoffs || max_csma_backoffs > rdev->wpan_phy.supported.max_csma_backoffs) return -EINVAL; return rdev_set_max_csma_backoffs(rdev, wpan_dev, max_csma_backoffs); } static int nl802154_set_max_frame_retries(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; s8 max_frame_retries; if (netif_running(dev)) return -EBUSY; if (!info->attrs[NL802154_ATTR_MAX_FRAME_RETRIES]) return -EINVAL; max_frame_retries = nla_get_s8( info->attrs[NL802154_ATTR_MAX_FRAME_RETRIES]); /* check 802.15.4 constraints */ if (max_frame_retries < rdev->wpan_phy.supported.min_frame_retries || max_frame_retries > rdev->wpan_phy.supported.max_frame_retries) return -EINVAL; return rdev_set_max_frame_retries(rdev, wpan_dev, max_frame_retries); } static int nl802154_set_lbt_mode(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; int mode; if (netif_running(dev)) return -EBUSY; if (!info->attrs[NL802154_ATTR_LBT_MODE]) return -EINVAL; mode = nla_get_u8(info->attrs[NL802154_ATTR_LBT_MODE]); if (mode != 0 && mode != 1) return -EINVAL; if (!wpan_phy_supported_bool(mode, rdev->wpan_phy.supported.lbt)) return -EINVAL; return rdev_set_lbt_mode(rdev, wpan_dev, mode); } static int nl802154_set_ackreq_default(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; int ackreq; if (netif_running(dev)) return -EBUSY; if (!info->attrs[NL802154_ATTR_ACKREQ_DEFAULT]) return -EINVAL; ackreq = nla_get_u8(info->attrs[NL802154_ATTR_ACKREQ_DEFAULT]); if (ackreq != 0 && ackreq != 1) return -EINVAL; return rdev_set_ackreq_default(rdev, wpan_dev, ackreq); } static int nl802154_wpan_phy_netns(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net *net; int err; if (info->attrs[NL802154_ATTR_PID]) { u32 pid = nla_get_u32(info->attrs[NL802154_ATTR_PID]); net = get_net_ns_by_pid(pid); } else if (info->attrs[NL802154_ATTR_NETNS_FD]) { u32 fd = nla_get_u32(info->attrs[NL802154_ATTR_NETNS_FD]); net = get_net_ns_by_fd(fd); } else { return -EINVAL; } if (IS_ERR(net)) return PTR_ERR(net); err = 0; /* check if anything to do */ if (!net_eq(wpan_phy_net(&rdev->wpan_phy), net)) err = cfg802154_switch_netns(rdev, net); put_net(net); return err; } static int nl802154_prep_scan_event_msg(struct sk_buff *msg, struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, u32 portid, u32 seq, int flags, u8 cmd, struct ieee802154_coord_desc *desc) { struct nlattr *nla; void *hdr; hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx)) goto nla_put_failure; if (wpan_dev->netdev && nla_put_u32(msg, NL802154_ATTR_IFINDEX, wpan_dev->netdev->ifindex)) goto nla_put_failure; if (nla_put_u64_64bit(msg, NL802154_ATTR_WPAN_DEV, wpan_dev_id(wpan_dev), NL802154_ATTR_PAD)) goto nla_put_failure; nla = nla_nest_start_noflag(msg, NL802154_ATTR_COORDINATOR); if (!nla) goto nla_put_failure; if (nla_put(msg, NL802154_COORD_PANID, IEEE802154_PAN_ID_LEN, &desc->addr.pan_id)) goto nla_put_failure; if (desc->addr.mode == IEEE802154_ADDR_SHORT) { if (nla_put(msg, NL802154_COORD_ADDR, IEEE802154_SHORT_ADDR_LEN, &desc->addr.short_addr)) goto nla_put_failure; } else { if (nla_put(msg, NL802154_COORD_ADDR, IEEE802154_EXTENDED_ADDR_LEN, &desc->addr.extended_addr)) goto nla_put_failure; } if (nla_put_u8(msg, NL802154_COORD_CHANNEL, desc->channel)) goto nla_put_failure; if (nla_put_u8(msg, NL802154_COORD_PAGE, desc->page)) goto nla_put_failure; if (nla_put_u16(msg, NL802154_COORD_SUPERFRAME_SPEC, desc->superframe_spec)) goto nla_put_failure; if (nla_put_u8(msg, NL802154_COORD_LINK_QUALITY, desc->link_quality)) goto nla_put_failure; if (desc->gts_permit && nla_put_flag(msg, NL802154_COORD_GTS_PERMIT)) goto nla_put_failure; /* TODO: NL802154_COORD_PAYLOAD_DATA if any */ nla_nest_end(msg, nla); genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } int nl802154_scan_event(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, struct ieee802154_coord_desc *desc) { struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(wpan_phy); struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; ret = nl802154_prep_scan_event_msg(msg, rdev, wpan_dev, 0, 0, 0, NL802154_CMD_SCAN_EVENT, desc); if (ret < 0) { nlmsg_free(msg); return ret; } return genlmsg_multicast_netns(&nl802154_fam, wpan_phy_net(wpan_phy), msg, 0, NL802154_MCGRP_SCAN, GFP_ATOMIC); } EXPORT_SYMBOL_GPL(nl802154_scan_event); static int nl802154_trigger_scan(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct wpan_phy *wpan_phy = &rdev->wpan_phy; struct cfg802154_scan_request *request; u8 type; int err; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) { NL_SET_ERR_MSG(info->extack, "Monitors are not allowed to perform scans"); return -EOPNOTSUPP; } if (!info->attrs[NL802154_ATTR_SCAN_TYPE]) { NL_SET_ERR_MSG(info->extack, "Malformed request, missing scan type"); return -EINVAL; } if (wpan_phy->flags & WPAN_PHY_FLAG_DATAGRAMS_ONLY) { NL_SET_ERR_MSG(info->extack, "PHY only supports datagrams"); return -EOPNOTSUPP; } request = kzalloc(sizeof(*request), GFP_KERNEL); if (!request) return -ENOMEM; request->wpan_dev = wpan_dev; request->wpan_phy = wpan_phy; type = nla_get_u8(info->attrs[NL802154_ATTR_SCAN_TYPE]); switch (type) { case NL802154_SCAN_ACTIVE: case NL802154_SCAN_PASSIVE: request->type = type; break; default: NL_SET_ERR_MSG_FMT(info->extack, "Unsupported scan type: %d", type); err = -EINVAL; goto free_request; } /* Use current page by default */ if (info->attrs[NL802154_ATTR_PAGE]) request->page = nla_get_u8(info->attrs[NL802154_ATTR_PAGE]); else request->page = wpan_phy->current_page; /* Scan all supported channels by default */ if (info->attrs[NL802154_ATTR_SCAN_CHANNELS]) request->channels = nla_get_u32(info->attrs[NL802154_ATTR_SCAN_CHANNELS]); else request->channels = wpan_phy->supported.channels[request->page]; /* Use maximum duration order by default */ if (info->attrs[NL802154_ATTR_SCAN_DURATION]) request->duration = nla_get_u8(info->attrs[NL802154_ATTR_SCAN_DURATION]); else request->duration = IEEE802154_MAX_SCAN_DURATION; err = rdev_trigger_scan(rdev, request); if (err) { pr_err("Failure starting scanning (%d)\n", err); goto free_request; } return 0; free_request: kfree(request); return err; } static int nl802154_prep_scan_msg(struct sk_buff *msg, struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, u32 portid, u32 seq, int flags, u8 cmd, u8 arg) { void *hdr; hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx)) goto nla_put_failure; if (wpan_dev->netdev && nla_put_u32(msg, NL802154_ATTR_IFINDEX, wpan_dev->netdev->ifindex)) goto nla_put_failure; if (nla_put_u64_64bit(msg, NL802154_ATTR_WPAN_DEV, wpan_dev_id(wpan_dev), NL802154_ATTR_PAD)) goto nla_put_failure; if (cmd == NL802154_CMD_SCAN_DONE && nla_put_u8(msg, NL802154_ATTR_SCAN_DONE_REASON, arg)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nl802154_send_scan_msg(struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, u8 cmd, u8 arg) { struct sk_buff *msg; int ret; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = nl802154_prep_scan_msg(msg, rdev, wpan_dev, 0, 0, 0, cmd, arg); if (ret < 0) { nlmsg_free(msg); return ret; } return genlmsg_multicast_netns(&nl802154_fam, wpan_phy_net(&rdev->wpan_phy), msg, 0, NL802154_MCGRP_SCAN, GFP_KERNEL); } int nl802154_scan_started(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev) { struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(wpan_phy); int err; /* Ignore errors when there are no listeners */ err = nl802154_send_scan_msg(rdev, wpan_dev, NL802154_CMD_TRIGGER_SCAN, 0); if (err == -ESRCH) err = 0; return err; } EXPORT_SYMBOL_GPL(nl802154_scan_started); int nl802154_scan_done(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, enum nl802154_scan_done_reasons reason) { struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(wpan_phy); int err; /* Ignore errors when there are no listeners */ err = nl802154_send_scan_msg(rdev, wpan_dev, NL802154_CMD_SCAN_DONE, reason); if (err == -ESRCH) err = 0; return err; } EXPORT_SYMBOL_GPL(nl802154_scan_done); static int nl802154_abort_scan(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; /* Resources are released in the notification helper above */ return rdev_abort_scan(rdev, wpan_dev); } static int nl802154_send_beacons(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct wpan_phy *wpan_phy = &rdev->wpan_phy; struct cfg802154_beacon_request *request; int err; if (wpan_dev->iftype != NL802154_IFTYPE_COORD) { NL_SET_ERR_MSG(info->extack, "Only coordinators can send beacons"); return -EOPNOTSUPP; } if (wpan_dev->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST)) { NL_SET_ERR_MSG(info->extack, "Device is not part of any PAN"); return -EPERM; } if (wpan_phy->flags & WPAN_PHY_FLAG_DATAGRAMS_ONLY) { NL_SET_ERR_MSG(info->extack, "PHY only supports datagrams"); return -EOPNOTSUPP; } request = kzalloc(sizeof(*request), GFP_KERNEL); if (!request) return -ENOMEM; request->wpan_dev = wpan_dev; request->wpan_phy = wpan_phy; /* Use maximum duration order by default */ if (info->attrs[NL802154_ATTR_BEACON_INTERVAL]) request->interval = nla_get_u8(info->attrs[NL802154_ATTR_BEACON_INTERVAL]); else request->interval = IEEE802154_MAX_SCAN_DURATION; err = rdev_send_beacons(rdev, request); if (err) { pr_err("Failure starting sending beacons (%d)\n", err); goto free_request; } return 0; free_request: kfree(request); return err; } void nl802154_beaconing_done(struct wpan_dev *wpan_dev) { /* NOP */ } EXPORT_SYMBOL_GPL(nl802154_beaconing_done); static int nl802154_stop_beacons(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; /* Resources are released in the notification helper above */ return rdev_stop_beacons(rdev, wpan_dev); } static int nl802154_associate(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev; struct wpan_phy *wpan_phy; struct ieee802154_addr coord; int err; wpan_dev = dev->ieee802154_ptr; wpan_phy = &rdev->wpan_phy; if (wpan_phy->flags & WPAN_PHY_FLAG_DATAGRAMS_ONLY) { NL_SET_ERR_MSG(info->extack, "PHY only supports datagrams"); return -EOPNOTSUPP; } if (!info->attrs[NL802154_ATTR_PAN_ID] || !info->attrs[NL802154_ATTR_EXTENDED_ADDR]) return -EINVAL; coord.pan_id = nla_get_le16(info->attrs[NL802154_ATTR_PAN_ID]); coord.mode = IEEE802154_ADDR_LONG; coord.extended_addr = nla_get_le64(info->attrs[NL802154_ATTR_EXTENDED_ADDR]); mutex_lock(&wpan_dev->association_lock); err = rdev_associate(rdev, wpan_dev, &coord); mutex_unlock(&wpan_dev->association_lock); if (err) pr_err("Association with PAN ID 0x%x failed (%d)\n", le16_to_cpu(coord.pan_id), err); return err; } static int nl802154_disassociate(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct wpan_phy *wpan_phy = &rdev->wpan_phy; struct ieee802154_addr target; if (wpan_phy->flags & WPAN_PHY_FLAG_DATAGRAMS_ONLY) { NL_SET_ERR_MSG(info->extack, "PHY only supports datagrams"); return -EOPNOTSUPP; } target.pan_id = wpan_dev->pan_id; if (info->attrs[NL802154_ATTR_EXTENDED_ADDR]) { target.mode = IEEE802154_ADDR_LONG; target.extended_addr = nla_get_le64(info->attrs[NL802154_ATTR_EXTENDED_ADDR]); } else if (info->attrs[NL802154_ATTR_SHORT_ADDR]) { target.mode = IEEE802154_ADDR_SHORT; target.short_addr = nla_get_le16(info->attrs[NL802154_ATTR_SHORT_ADDR]); } else { NL_SET_ERR_MSG(info->extack, "Device address is missing"); return -EINVAL; } mutex_lock(&wpan_dev->association_lock); rdev_disassociate(rdev, wpan_dev, &target); mutex_unlock(&wpan_dev->association_lock); return 0; } static int nl802154_set_max_associations(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; unsigned int max_assoc; if (!info->attrs[NL802154_ATTR_MAX_ASSOCIATIONS]) { NL_SET_ERR_MSG(info->extack, "No maximum number of association given"); return -EINVAL; } max_assoc = nla_get_u32(info->attrs[NL802154_ATTR_MAX_ASSOCIATIONS]); mutex_lock(&wpan_dev->association_lock); cfg802154_set_max_associations(wpan_dev, max_assoc); mutex_unlock(&wpan_dev->association_lock); return 0; } static int nl802154_send_peer_info(struct sk_buff *msg, struct netlink_callback *cb, u32 seq, int flags, struct cfg802154_registered_device *rdev, struct wpan_dev *wpan_dev, struct ieee802154_pan_device *peer, enum nl802154_peer_type type) { struct nlattr *nla; void *hdr; ASSERT_RTNL(); hdr = nl802154hdr_put(msg, NETLINK_CB(cb->skb).portid, seq, flags, NL802154_CMD_LIST_ASSOCIATIONS); if (!hdr) return -ENOBUFS; genl_dump_check_consistent(cb, hdr); nla = nla_nest_start_noflag(msg, NL802154_ATTR_PEER); if (!nla) goto nla_put_failure; if (nla_put_u8(msg, NL802154_DEV_ADDR_ATTR_PEER_TYPE, type)) goto nla_put_failure; if (nla_put_u8(msg, NL802154_DEV_ADDR_ATTR_MODE, peer->mode)) goto nla_put_failure; if (nla_put(msg, NL802154_DEV_ADDR_ATTR_SHORT, IEEE802154_SHORT_ADDR_LEN, &peer->short_addr)) goto nla_put_failure; if (nla_put(msg, NL802154_DEV_ADDR_ATTR_EXTENDED, IEEE802154_EXTENDED_ADDR_LEN, &peer->extended_addr)) goto nla_put_failure; nla_nest_end(msg, nla); genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nl802154_list_associations(struct sk_buff *skb, struct netlink_callback *cb) { struct cfg802154_registered_device *rdev; struct ieee802154_pan_device *child; struct wpan_dev *wpan_dev; int err; err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); if (err) return err; mutex_lock(&wpan_dev->association_lock); if (cb->args[2]) goto out; if (wpan_dev->parent) { err = nl802154_send_peer_info(skb, cb, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wpan_dev, wpan_dev->parent, NL802154_PEER_TYPE_PARENT); if (err < 0) goto out_err; } list_for_each_entry(child, &wpan_dev->children, node) { err = nl802154_send_peer_info(skb, cb, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wpan_dev, child, NL802154_PEER_TYPE_CHILD); if (err < 0) goto out_err; } cb->args[2] = 1; out: err = skb->len; out_err: mutex_unlock(&wpan_dev->association_lock); nl802154_finish_wpan_dev_dump(rdev); return err; } #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL static const struct nla_policy nl802154_dev_addr_policy[NL802154_DEV_ADDR_ATTR_MAX + 1] = { [NL802154_DEV_ADDR_ATTR_PAN_ID] = { .type = NLA_U16 }, [NL802154_DEV_ADDR_ATTR_MODE] = { .type = NLA_U32 }, [NL802154_DEV_ADDR_ATTR_SHORT] = { .type = NLA_U16 }, [NL802154_DEV_ADDR_ATTR_EXTENDED] = { .type = NLA_U64 }, }; static int ieee802154_llsec_parse_dev_addr(struct nlattr *nla, struct ieee802154_addr *addr) { struct nlattr *attrs[NL802154_DEV_ADDR_ATTR_MAX + 1]; if (!nla || nla_parse_nested_deprecated(attrs, NL802154_DEV_ADDR_ATTR_MAX, nla, nl802154_dev_addr_policy, NULL)) return -EINVAL; if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] || !attrs[NL802154_DEV_ADDR_ATTR_MODE]) return -EINVAL; addr->pan_id = nla_get_le16(attrs[NL802154_DEV_ADDR_ATTR_PAN_ID]); addr->mode = nla_get_u32(attrs[NL802154_DEV_ADDR_ATTR_MODE]); switch (addr->mode) { case NL802154_DEV_ADDR_SHORT: if (!attrs[NL802154_DEV_ADDR_ATTR_SHORT]) return -EINVAL; addr->short_addr = nla_get_le16(attrs[NL802154_DEV_ADDR_ATTR_SHORT]); break; case NL802154_DEV_ADDR_EXTENDED: if (!attrs[NL802154_DEV_ADDR_ATTR_EXTENDED]) return -EINVAL; addr->extended_addr = nla_get_le64(attrs[NL802154_DEV_ADDR_ATTR_EXTENDED]); break; default: return -EINVAL; } return 0; } static const struct nla_policy nl802154_key_id_policy[NL802154_KEY_ID_ATTR_MAX + 1] = { [NL802154_KEY_ID_ATTR_MODE] = { .type = NLA_U32 }, [NL802154_KEY_ID_ATTR_INDEX] = { .type = NLA_U8 }, [NL802154_KEY_ID_ATTR_IMPLICIT] = { .type = NLA_NESTED }, [NL802154_KEY_ID_ATTR_SOURCE_SHORT] = { .type = NLA_U32 }, [NL802154_KEY_ID_ATTR_SOURCE_EXTENDED] = { .type = NLA_U64 }, }; static int ieee802154_llsec_parse_key_id(struct nlattr *nla, struct ieee802154_llsec_key_id *desc) { struct nlattr *attrs[NL802154_KEY_ID_ATTR_MAX + 1]; if (!nla || nla_parse_nested_deprecated(attrs, NL802154_KEY_ID_ATTR_MAX, nla, nl802154_key_id_policy, NULL)) return -EINVAL; if (!attrs[NL802154_KEY_ID_ATTR_MODE]) return -EINVAL; desc->mode = nla_get_u32(attrs[NL802154_KEY_ID_ATTR_MODE]); switch (desc->mode) { case NL802154_KEY_ID_MODE_IMPLICIT: if (!attrs[NL802154_KEY_ID_ATTR_IMPLICIT]) return -EINVAL; if (ieee802154_llsec_parse_dev_addr(attrs[NL802154_KEY_ID_ATTR_IMPLICIT], &desc->device_addr) < 0) return -EINVAL; break; case NL802154_KEY_ID_MODE_INDEX: break; case NL802154_KEY_ID_MODE_INDEX_SHORT: if (!attrs[NL802154_KEY_ID_ATTR_SOURCE_SHORT]) return -EINVAL; desc->short_source = nla_get_le32(attrs[NL802154_KEY_ID_ATTR_SOURCE_SHORT]); break; case NL802154_KEY_ID_MODE_INDEX_EXTENDED: if (!attrs[NL802154_KEY_ID_ATTR_SOURCE_EXTENDED]) return -EINVAL; desc->extended_source = nla_get_le64(attrs[NL802154_KEY_ID_ATTR_SOURCE_EXTENDED]); break; default: return -EINVAL; } if (desc->mode != NL802154_KEY_ID_MODE_IMPLICIT) { if (!attrs[NL802154_KEY_ID_ATTR_INDEX]) return -EINVAL; /* TODO change id to idx */ desc->id = nla_get_u8(attrs[NL802154_KEY_ID_ATTR_INDEX]); } return 0; } static int nl802154_set_llsec_params(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct ieee802154_llsec_params params; u32 changed = 0; int ret; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) return -EOPNOTSUPP; if (info->attrs[NL802154_ATTR_SEC_ENABLED]) { u8 enabled; enabled = nla_get_u8(info->attrs[NL802154_ATTR_SEC_ENABLED]); if (enabled != 0 && enabled != 1) return -EINVAL; params.enabled = nla_get_u8(info->attrs[NL802154_ATTR_SEC_ENABLED]); changed |= IEEE802154_LLSEC_PARAM_ENABLED; } if (info->attrs[NL802154_ATTR_SEC_OUT_KEY_ID]) { ret = ieee802154_llsec_parse_key_id(info->attrs[NL802154_ATTR_SEC_OUT_KEY_ID], ¶ms.out_key); if (ret < 0) return ret; changed |= IEEE802154_LLSEC_PARAM_OUT_KEY; } if (info->attrs[NL802154_ATTR_SEC_OUT_LEVEL]) { params.out_level = nla_get_u32(info->attrs[NL802154_ATTR_SEC_OUT_LEVEL]); if (params.out_level > NL802154_SECLEVEL_MAX) return -EINVAL; changed |= IEEE802154_LLSEC_PARAM_OUT_LEVEL; } if (info->attrs[NL802154_ATTR_SEC_FRAME_COUNTER]) { params.frame_counter = nla_get_be32(info->attrs[NL802154_ATTR_SEC_FRAME_COUNTER]); changed |= IEEE802154_LLSEC_PARAM_FRAME_COUNTER; } return rdev_set_llsec_params(rdev, wpan_dev, ¶ms, changed); } static int nl802154_send_key(struct sk_buff *msg, u32 cmd, u32 portid, u32 seq, int flags, struct cfg802154_registered_device *rdev, struct net_device *dev, const struct ieee802154_llsec_key_entry *key) { void *hdr; u32 commands[NL802154_CMD_FRAME_NR_IDS / 32]; struct nlattr *nl_key, *nl_key_id; hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; nl_key = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_KEY); if (!nl_key) goto nla_put_failure; nl_key_id = nla_nest_start_noflag(msg, NL802154_KEY_ATTR_ID); if (!nl_key_id) goto nla_put_failure; if (ieee802154_llsec_send_key_id(msg, &key->id) < 0) goto nla_put_failure; nla_nest_end(msg, nl_key_id); if (nla_put_u8(msg, NL802154_KEY_ATTR_USAGE_FRAMES, key->key->frame_types)) goto nla_put_failure; if (key->key->frame_types & BIT(NL802154_FRAME_CMD)) { /* TODO for each nested */ memset(commands, 0, sizeof(commands)); commands[7] = key->key->cmd_frame_ids; if (nla_put(msg, NL802154_KEY_ATTR_USAGE_CMDS, sizeof(commands), commands)) goto nla_put_failure; } if (nla_put(msg, NL802154_KEY_ATTR_BYTES, NL802154_KEY_SIZE, key->key->key)) goto nla_put_failure; nla_nest_end(msg, nl_key); genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nl802154_dump_llsec_key(struct sk_buff *skb, struct netlink_callback *cb) { struct cfg802154_registered_device *rdev = NULL; struct ieee802154_llsec_key_entry *key; struct ieee802154_llsec_table *table; struct wpan_dev *wpan_dev; int err; err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); if (err) return err; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) { err = skb->len; goto out_err; } if (!wpan_dev->netdev) { err = -EINVAL; goto out_err; } rdev_lock_llsec_table(rdev, wpan_dev); rdev_get_llsec_table(rdev, wpan_dev, &table); /* TODO make it like station dump */ if (cb->args[2]) goto out; list_for_each_entry(key, &table->keys, list) { if (nl802154_send_key(skb, NL802154_CMD_NEW_SEC_KEY, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wpan_dev->netdev, key) < 0) { /* TODO */ err = -EIO; rdev_unlock_llsec_table(rdev, wpan_dev); goto out_err; } } cb->args[2] = 1; out: rdev_unlock_llsec_table(rdev, wpan_dev); err = skb->len; out_err: nl802154_finish_wpan_dev_dump(rdev); return err; } static const struct nla_policy nl802154_key_policy[NL802154_KEY_ATTR_MAX + 1] = { [NL802154_KEY_ATTR_ID] = { NLA_NESTED }, /* TODO handle it as for_each_nested and NLA_FLAG? */ [NL802154_KEY_ATTR_USAGE_FRAMES] = { NLA_U8 }, /* TODO handle it as for_each_nested, not static array? */ [NL802154_KEY_ATTR_USAGE_CMDS] = { .len = NL802154_CMD_FRAME_NR_IDS / 8 }, [NL802154_KEY_ATTR_BYTES] = { .len = NL802154_KEY_SIZE }, }; static int nl802154_add_llsec_key(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct nlattr *attrs[NL802154_KEY_ATTR_MAX + 1]; struct ieee802154_llsec_key key = { }; struct ieee802154_llsec_key_id id = { }; u32 commands[NL802154_CMD_FRAME_NR_IDS / 32] = { }; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) return -EOPNOTSUPP; if (!info->attrs[NL802154_ATTR_SEC_KEY] || nla_parse_nested_deprecated(attrs, NL802154_KEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_KEY], nl802154_key_policy, info->extack)) return -EINVAL; if (!attrs[NL802154_KEY_ATTR_USAGE_FRAMES] || !attrs[NL802154_KEY_ATTR_BYTES]) return -EINVAL; if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0) return -ENOBUFS; key.frame_types = nla_get_u8(attrs[NL802154_KEY_ATTR_USAGE_FRAMES]); if (key.frame_types > BIT(NL802154_FRAME_MAX) || ((key.frame_types & BIT(NL802154_FRAME_CMD)) && !attrs[NL802154_KEY_ATTR_USAGE_CMDS])) return -EINVAL; if (attrs[NL802154_KEY_ATTR_USAGE_CMDS]) { /* TODO for each nested */ nla_memcpy(commands, attrs[NL802154_KEY_ATTR_USAGE_CMDS], NL802154_CMD_FRAME_NR_IDS / 8); /* TODO understand the -EINVAL logic here? last condition */ if (commands[0] || commands[1] || commands[2] || commands[3] || commands[4] || commands[5] || commands[6] || commands[7] > BIT(NL802154_CMD_FRAME_MAX)) return -EINVAL; key.cmd_frame_ids = commands[7]; } else { key.cmd_frame_ids = 0; } nla_memcpy(key.key, attrs[NL802154_KEY_ATTR_BYTES], NL802154_KEY_SIZE); if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0) return -ENOBUFS; return rdev_add_llsec_key(rdev, wpan_dev, &id, &key); } static int nl802154_del_llsec_key(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct nlattr *attrs[NL802154_KEY_ATTR_MAX + 1]; struct ieee802154_llsec_key_id id; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) return -EOPNOTSUPP; if (!info->attrs[NL802154_ATTR_SEC_KEY] || nla_parse_nested_deprecated(attrs, NL802154_KEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_KEY], nl802154_key_policy, info->extack)) return -EINVAL; if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0) return -ENOBUFS; return rdev_del_llsec_key(rdev, wpan_dev, &id); } static int nl802154_send_device(struct sk_buff *msg, u32 cmd, u32 portid, u32 seq, int flags, struct cfg802154_registered_device *rdev, struct net_device *dev, const struct ieee802154_llsec_device *dev_desc) { void *hdr; struct nlattr *nl_device; hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; nl_device = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_DEVICE); if (!nl_device) goto nla_put_failure; if (nla_put_u32(msg, NL802154_DEV_ATTR_FRAME_COUNTER, dev_desc->frame_counter) || nla_put_le16(msg, NL802154_DEV_ATTR_PAN_ID, dev_desc->pan_id) || nla_put_le16(msg, NL802154_DEV_ATTR_SHORT_ADDR, dev_desc->short_addr) || nla_put_le64(msg, NL802154_DEV_ATTR_EXTENDED_ADDR, dev_desc->hwaddr, NL802154_DEV_ATTR_PAD) || nla_put_u8(msg, NL802154_DEV_ATTR_SECLEVEL_EXEMPT, dev_desc->seclevel_exempt) || nla_put_u32(msg, NL802154_DEV_ATTR_KEY_MODE, dev_desc->key_mode)) goto nla_put_failure; nla_nest_end(msg, nl_device); genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nl802154_dump_llsec_dev(struct sk_buff *skb, struct netlink_callback *cb) { struct cfg802154_registered_device *rdev = NULL; struct ieee802154_llsec_device *dev; struct ieee802154_llsec_table *table; struct wpan_dev *wpan_dev; int err; err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); if (err) return err; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) { err = skb->len; goto out_err; } if (!wpan_dev->netdev) { err = -EINVAL; goto out_err; } rdev_lock_llsec_table(rdev, wpan_dev); rdev_get_llsec_table(rdev, wpan_dev, &table); /* TODO make it like station dump */ if (cb->args[2]) goto out; list_for_each_entry(dev, &table->devices, list) { if (nl802154_send_device(skb, NL802154_CMD_NEW_SEC_LEVEL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wpan_dev->netdev, dev) < 0) { /* TODO */ err = -EIO; rdev_unlock_llsec_table(rdev, wpan_dev); goto out_err; } } cb->args[2] = 1; out: rdev_unlock_llsec_table(rdev, wpan_dev); err = skb->len; out_err: nl802154_finish_wpan_dev_dump(rdev); return err; } static const struct nla_policy nl802154_dev_policy[NL802154_DEV_ATTR_MAX + 1] = { [NL802154_DEV_ATTR_FRAME_COUNTER] = { NLA_U32 }, [NL802154_DEV_ATTR_PAN_ID] = { .type = NLA_U16 }, [NL802154_DEV_ATTR_SHORT_ADDR] = { .type = NLA_U16 }, [NL802154_DEV_ATTR_EXTENDED_ADDR] = { .type = NLA_U64 }, [NL802154_DEV_ATTR_SECLEVEL_EXEMPT] = { NLA_U8 }, [NL802154_DEV_ATTR_KEY_MODE] = { NLA_U32 }, }; static int ieee802154_llsec_parse_device(struct nlattr *nla, struct ieee802154_llsec_device *dev) { struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1]; if (!nla || nla_parse_nested_deprecated(attrs, NL802154_DEV_ATTR_MAX, nla, nl802154_dev_policy, NULL)) return -EINVAL; memset(dev, 0, sizeof(*dev)); if (!attrs[NL802154_DEV_ATTR_FRAME_COUNTER] || !attrs[NL802154_DEV_ATTR_PAN_ID] || !attrs[NL802154_DEV_ATTR_SHORT_ADDR] || !attrs[NL802154_DEV_ATTR_EXTENDED_ADDR] || !attrs[NL802154_DEV_ATTR_SECLEVEL_EXEMPT] || !attrs[NL802154_DEV_ATTR_KEY_MODE]) return -EINVAL; /* TODO be32 */ dev->frame_counter = nla_get_u32(attrs[NL802154_DEV_ATTR_FRAME_COUNTER]); dev->pan_id = nla_get_le16(attrs[NL802154_DEV_ATTR_PAN_ID]); dev->short_addr = nla_get_le16(attrs[NL802154_DEV_ATTR_SHORT_ADDR]); /* TODO rename hwaddr to extended_addr */ dev->hwaddr = nla_get_le64(attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]); dev->seclevel_exempt = nla_get_u8(attrs[NL802154_DEV_ATTR_SECLEVEL_EXEMPT]); dev->key_mode = nla_get_u32(attrs[NL802154_DEV_ATTR_KEY_MODE]); if (dev->key_mode > NL802154_DEVKEY_MAX || (dev->seclevel_exempt != 0 && dev->seclevel_exempt != 1)) return -EINVAL; return 0; } static int nl802154_add_llsec_dev(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct ieee802154_llsec_device dev_desc; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) return -EOPNOTSUPP; if (ieee802154_llsec_parse_device(info->attrs[NL802154_ATTR_SEC_DEVICE], &dev_desc) < 0) return -EINVAL; return rdev_add_device(rdev, wpan_dev, &dev_desc); } static int nl802154_del_llsec_dev(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1]; __le64 extended_addr; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) return -EOPNOTSUPP; if (!info->attrs[NL802154_ATTR_SEC_DEVICE] || nla_parse_nested_deprecated(attrs, NL802154_DEV_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVICE], nl802154_dev_policy, info->extack)) return -EINVAL; if (!attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]) return -EINVAL; extended_addr = nla_get_le64(attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]); return rdev_del_device(rdev, wpan_dev, extended_addr); } static int nl802154_send_devkey(struct sk_buff *msg, u32 cmd, u32 portid, u32 seq, int flags, struct cfg802154_registered_device *rdev, struct net_device *dev, __le64 extended_addr, const struct ieee802154_llsec_device_key *devkey) { void *hdr; struct nlattr *nl_devkey, *nl_key_id; hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; nl_devkey = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_DEVKEY); if (!nl_devkey) goto nla_put_failure; if (nla_put_le64(msg, NL802154_DEVKEY_ATTR_EXTENDED_ADDR, extended_addr, NL802154_DEVKEY_ATTR_PAD) || nla_put_u32(msg, NL802154_DEVKEY_ATTR_FRAME_COUNTER, devkey->frame_counter)) goto nla_put_failure; nl_key_id = nla_nest_start_noflag(msg, NL802154_DEVKEY_ATTR_ID); if (!nl_key_id) goto nla_put_failure; if (ieee802154_llsec_send_key_id(msg, &devkey->key_id) < 0) goto nla_put_failure; nla_nest_end(msg, nl_key_id); nla_nest_end(msg, nl_devkey); genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nl802154_dump_llsec_devkey(struct sk_buff *skb, struct netlink_callback *cb) { struct cfg802154_registered_device *rdev = NULL; struct ieee802154_llsec_device_key *kpos; struct ieee802154_llsec_device *dpos; struct ieee802154_llsec_table *table; struct wpan_dev *wpan_dev; int err; err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); if (err) return err; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) { err = skb->len; goto out_err; } if (!wpan_dev->netdev) { err = -EINVAL; goto out_err; } rdev_lock_llsec_table(rdev, wpan_dev); rdev_get_llsec_table(rdev, wpan_dev, &table); /* TODO make it like station dump */ if (cb->args[2]) goto out; /* TODO look if remove devkey and do some nested attribute */ list_for_each_entry(dpos, &table->devices, list) { list_for_each_entry(kpos, &dpos->keys, list) { if (nl802154_send_devkey(skb, NL802154_CMD_NEW_SEC_LEVEL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wpan_dev->netdev, dpos->hwaddr, kpos) < 0) { /* TODO */ err = -EIO; rdev_unlock_llsec_table(rdev, wpan_dev); goto out_err; } } } cb->args[2] = 1; out: rdev_unlock_llsec_table(rdev, wpan_dev); err = skb->len; out_err: nl802154_finish_wpan_dev_dump(rdev); return err; } static const struct nla_policy nl802154_devkey_policy[NL802154_DEVKEY_ATTR_MAX + 1] = { [NL802154_DEVKEY_ATTR_FRAME_COUNTER] = { NLA_U32 }, [NL802154_DEVKEY_ATTR_EXTENDED_ADDR] = { NLA_U64 }, [NL802154_DEVKEY_ATTR_ID] = { NLA_NESTED }, }; static int nl802154_add_llsec_devkey(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct nlattr *attrs[NL802154_DEVKEY_ATTR_MAX + 1]; struct ieee802154_llsec_device_key key; __le64 extended_addr; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) return -EOPNOTSUPP; if (!info->attrs[NL802154_ATTR_SEC_DEVKEY] || nla_parse_nested_deprecated(attrs, NL802154_DEVKEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVKEY], nl802154_devkey_policy, info->extack) < 0) return -EINVAL; if (!attrs[NL802154_DEVKEY_ATTR_FRAME_COUNTER] || !attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]) return -EINVAL; /* TODO change key.id ? */ if (ieee802154_llsec_parse_key_id(attrs[NL802154_DEVKEY_ATTR_ID], &key.key_id) < 0) return -ENOBUFS; /* TODO be32 */ key.frame_counter = nla_get_u32(attrs[NL802154_DEVKEY_ATTR_FRAME_COUNTER]); /* TODO change naming hwaddr -> extended_addr * check unique identifier short+pan OR extended_addr */ extended_addr = nla_get_le64(attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]); return rdev_add_devkey(rdev, wpan_dev, extended_addr, &key); } static int nl802154_del_llsec_devkey(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct nlattr *attrs[NL802154_DEVKEY_ATTR_MAX + 1]; struct ieee802154_llsec_device_key key; __le64 extended_addr; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) return -EOPNOTSUPP; if (!info->attrs[NL802154_ATTR_SEC_DEVKEY] || nla_parse_nested_deprecated(attrs, NL802154_DEVKEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVKEY], nl802154_devkey_policy, info->extack)) return -EINVAL; if (!attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]) return -EINVAL; /* TODO change key.id ? */ if (ieee802154_llsec_parse_key_id(attrs[NL802154_DEVKEY_ATTR_ID], &key.key_id) < 0) return -ENOBUFS; /* TODO change naming hwaddr -> extended_addr * check unique identifier short+pan OR extended_addr */ extended_addr = nla_get_le64(attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]); return rdev_del_devkey(rdev, wpan_dev, extended_addr, &key); } static int nl802154_send_seclevel(struct sk_buff *msg, u32 cmd, u32 portid, u32 seq, int flags, struct cfg802154_registered_device *rdev, struct net_device *dev, const struct ieee802154_llsec_seclevel *sl) { void *hdr; struct nlattr *nl_seclevel; hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); if (!hdr) return -ENOBUFS; if (nla_put_u32(msg, NL802154_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; nl_seclevel = nla_nest_start_noflag(msg, NL802154_ATTR_SEC_LEVEL); if (!nl_seclevel) goto nla_put_failure; if (nla_put_u32(msg, NL802154_SECLEVEL_ATTR_FRAME, sl->frame_type) || nla_put_u32(msg, NL802154_SECLEVEL_ATTR_LEVELS, sl->sec_levels) || nla_put_u8(msg, NL802154_SECLEVEL_ATTR_DEV_OVERRIDE, sl->device_override)) goto nla_put_failure; if (sl->frame_type == NL802154_FRAME_CMD) { if (nla_put_u32(msg, NL802154_SECLEVEL_ATTR_CMD_FRAME, sl->cmd_frame_id)) goto nla_put_failure; } nla_nest_end(msg, nl_seclevel); genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nl802154_dump_llsec_seclevel(struct sk_buff *skb, struct netlink_callback *cb) { struct cfg802154_registered_device *rdev = NULL; struct ieee802154_llsec_seclevel *sl; struct ieee802154_llsec_table *table; struct wpan_dev *wpan_dev; int err; err = nl802154_prepare_wpan_dev_dump(skb, cb, &rdev, &wpan_dev); if (err) return err; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) { err = skb->len; goto out_err; } if (!wpan_dev->netdev) { err = -EINVAL; goto out_err; } rdev_lock_llsec_table(rdev, wpan_dev); rdev_get_llsec_table(rdev, wpan_dev, &table); /* TODO make it like station dump */ if (cb->args[2]) goto out; list_for_each_entry(sl, &table->security_levels, list) { if (nl802154_send_seclevel(skb, NL802154_CMD_NEW_SEC_LEVEL, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wpan_dev->netdev, sl) < 0) { /* TODO */ err = -EIO; rdev_unlock_llsec_table(rdev, wpan_dev); goto out_err; } } cb->args[2] = 1; out: rdev_unlock_llsec_table(rdev, wpan_dev); err = skb->len; out_err: nl802154_finish_wpan_dev_dump(rdev); return err; } static const struct nla_policy nl802154_seclevel_policy[NL802154_SECLEVEL_ATTR_MAX + 1] = { [NL802154_SECLEVEL_ATTR_LEVELS] = { .type = NLA_U8 }, [NL802154_SECLEVEL_ATTR_FRAME] = { .type = NLA_U32 }, [NL802154_SECLEVEL_ATTR_CMD_FRAME] = { .type = NLA_U32 }, [NL802154_SECLEVEL_ATTR_DEV_OVERRIDE] = { .type = NLA_U8 }, }; static int llsec_parse_seclevel(struct nlattr *nla, struct ieee802154_llsec_seclevel *sl) { struct nlattr *attrs[NL802154_SECLEVEL_ATTR_MAX + 1]; if (!nla || nla_parse_nested_deprecated(attrs, NL802154_SECLEVEL_ATTR_MAX, nla, nl802154_seclevel_policy, NULL)) return -EINVAL; memset(sl, 0, sizeof(*sl)); if (!attrs[NL802154_SECLEVEL_ATTR_LEVELS] || !attrs[NL802154_SECLEVEL_ATTR_FRAME] || !attrs[NL802154_SECLEVEL_ATTR_DEV_OVERRIDE]) return -EINVAL; sl->sec_levels = nla_get_u8(attrs[NL802154_SECLEVEL_ATTR_LEVELS]); sl->frame_type = nla_get_u32(attrs[NL802154_SECLEVEL_ATTR_FRAME]); sl->device_override = nla_get_u8(attrs[NL802154_SECLEVEL_ATTR_DEV_OVERRIDE]); if (sl->frame_type > NL802154_FRAME_MAX || (sl->device_override != 0 && sl->device_override != 1)) return -EINVAL; if (sl->frame_type == NL802154_FRAME_CMD) { if (!attrs[NL802154_SECLEVEL_ATTR_CMD_FRAME]) return -EINVAL; sl->cmd_frame_id = nla_get_u32(attrs[NL802154_SECLEVEL_ATTR_CMD_FRAME]); if (sl->cmd_frame_id > NL802154_CMD_FRAME_MAX) return -EINVAL; } return 0; } static int nl802154_add_llsec_seclevel(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct ieee802154_llsec_seclevel sl; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) return -EOPNOTSUPP; if (llsec_parse_seclevel(info->attrs[NL802154_ATTR_SEC_LEVEL], &sl) < 0) return -EINVAL; return rdev_add_seclevel(rdev, wpan_dev, &sl); } static int nl802154_del_llsec_seclevel(struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wpan_dev *wpan_dev = dev->ieee802154_ptr; struct ieee802154_llsec_seclevel sl; if (wpan_dev->iftype == NL802154_IFTYPE_MONITOR) return -EOPNOTSUPP; if (llsec_parse_seclevel(info->attrs[NL802154_ATTR_SEC_LEVEL], &sl) < 0) return -EINVAL; return rdev_del_seclevel(rdev, wpan_dev, &sl); } #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ #define NL802154_FLAG_NEED_WPAN_PHY 0x01 #define NL802154_FLAG_NEED_NETDEV 0x02 #define NL802154_FLAG_NEED_RTNL 0x04 #define NL802154_FLAG_CHECK_NETDEV_UP 0x08 #define NL802154_FLAG_NEED_WPAN_DEV 0x10 static int nl802154_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev; struct wpan_dev *wpan_dev; struct net_device *dev; bool rtnl = ops->internal_flags & NL802154_FLAG_NEED_RTNL; if (rtnl) rtnl_lock(); if (ops->internal_flags & NL802154_FLAG_NEED_WPAN_PHY) { rdev = cfg802154_get_dev_from_info(genl_info_net(info), info); if (IS_ERR(rdev)) { if (rtnl) rtnl_unlock(); return PTR_ERR(rdev); } info->user_ptr[0] = rdev; } else if (ops->internal_flags & NL802154_FLAG_NEED_NETDEV || ops->internal_flags & NL802154_FLAG_NEED_WPAN_DEV) { ASSERT_RTNL(); wpan_dev = __cfg802154_wpan_dev_from_attrs(genl_info_net(info), info->attrs); if (IS_ERR(wpan_dev)) { if (rtnl) rtnl_unlock(); return PTR_ERR(wpan_dev); } dev = wpan_dev->netdev; rdev = wpan_phy_to_rdev(wpan_dev->wpan_phy); if (ops->internal_flags & NL802154_FLAG_NEED_NETDEV) { if (!dev) { if (rtnl) rtnl_unlock(); return -EINVAL; } info->user_ptr[1] = dev; } else { info->user_ptr[1] = wpan_dev; } if (dev) { if (ops->internal_flags & NL802154_FLAG_CHECK_NETDEV_UP && !netif_running(dev)) { if (rtnl) rtnl_unlock(); return -ENETDOWN; } dev_hold(dev); } info->user_ptr[0] = rdev; } return 0; } static void nl802154_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { if (info->user_ptr[1]) { if (ops->internal_flags & NL802154_FLAG_NEED_WPAN_DEV) { struct wpan_dev *wpan_dev = info->user_ptr[1]; dev_put(wpan_dev->netdev); } else { dev_put(info->user_ptr[1]); } } if (ops->internal_flags & NL802154_FLAG_NEED_RTNL) rtnl_unlock(); } static const struct genl_ops nl802154_ops[] = { { .cmd = NL802154_CMD_GET_WPAN_PHY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP_STRICT, .doit = nl802154_get_wpan_phy, .dumpit = nl802154_dump_wpan_phy, .done = nl802154_dump_wpan_phy_done, /* can be retrieved by unprivileged users */ .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_GET_INTERFACE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_get_interface, .dumpit = nl802154_dump_interface, /* can be retrieved by unprivileged users */ .internal_flags = NL802154_FLAG_NEED_WPAN_DEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_NEW_INTERFACE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_new_interface, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_DEL_INTERFACE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_del_interface, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_DEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_CHANNEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_channel, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_CCA_MODE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_cca_mode, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_CCA_ED_LEVEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_cca_ed_level, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_TX_POWER, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_tx_power, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_WPAN_PHY_NETNS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_wpan_phy_netns, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_WPAN_PHY | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_PAN_ID, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_pan_id, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_SHORT_ADDR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_short_addr, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_BACKOFF_EXPONENT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_backoff_exponent, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_MAX_CSMA_BACKOFFS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_max_csma_backoffs, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_MAX_FRAME_RETRIES, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_max_frame_retries, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_LBT_MODE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_lbt_mode, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_ACKREQ_DEFAULT, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_ackreq_default, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_TRIGGER_SCAN, .doit = nl802154_trigger_scan, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_CHECK_NETDEV_UP | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_ABORT_SCAN, .doit = nl802154_abort_scan, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_CHECK_NETDEV_UP | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SEND_BEACONS, .doit = nl802154_send_beacons, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_CHECK_NETDEV_UP | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_STOP_BEACONS, .doit = nl802154_stop_beacons, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_CHECK_NETDEV_UP | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_ASSOCIATE, .doit = nl802154_associate, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_CHECK_NETDEV_UP | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_DISASSOCIATE, .doit = nl802154_disassociate, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_CHECK_NETDEV_UP | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_SET_MAX_ASSOCIATIONS, .doit = nl802154_set_max_associations, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_LIST_ASSOCIATIONS, .dumpit = nl802154_list_associations, /* can be retrieved by unprivileged users */ }, #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL { .cmd = NL802154_CMD_SET_SEC_PARAMS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_set_llsec_params, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_GET_SEC_KEY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO .doit by matching key id? */ .dumpit = nl802154_dump_llsec_key, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_NEW_SEC_KEY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_add_llsec_key, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_DEL_SEC_KEY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_del_llsec_key, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, /* TODO unique identifier must short+pan OR extended_addr */ { .cmd = NL802154_CMD_GET_SEC_DEV, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO .doit by matching extended_addr? */ .dumpit = nl802154_dump_llsec_dev, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_NEW_SEC_DEV, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_add_llsec_dev, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_DEL_SEC_DEV, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_del_llsec_dev, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, /* TODO remove complete devkey, put it as nested? */ { .cmd = NL802154_CMD_GET_SEC_DEVKEY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO doit by matching ??? */ .dumpit = nl802154_dump_llsec_devkey, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_NEW_SEC_DEVKEY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_add_llsec_devkey, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_DEL_SEC_DEVKEY, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_del_llsec_devkey, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_GET_SEC_LEVEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP_STRICT, /* TODO .doit by matching frame_type? */ .dumpit = nl802154_dump_llsec_seclevel, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_NEW_SEC_LEVEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl802154_add_llsec_seclevel, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, { .cmd = NL802154_CMD_DEL_SEC_LEVEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, /* TODO match frame_type only? */ .doit = nl802154_del_llsec_seclevel, .flags = GENL_ADMIN_PERM, .internal_flags = NL802154_FLAG_NEED_NETDEV | NL802154_FLAG_NEED_RTNL, }, #endif /* CONFIG_IEEE802154_NL802154_EXPERIMENTAL */ }; static struct genl_family nl802154_fam __ro_after_init = { .name = NL802154_GENL_NAME, /* have users key off the name instead */ .hdrsize = 0, /* no private header */ .version = 1, /* no particular meaning now */ .maxattr = NL802154_ATTR_MAX, .policy = nl802154_policy, .netnsok = true, .pre_doit = nl802154_pre_doit, .post_doit = nl802154_post_doit, .module = THIS_MODULE, .ops = nl802154_ops, .n_ops = ARRAY_SIZE(nl802154_ops), .resv_start_op = NL802154_CMD_DEL_SEC_LEVEL + 1, .mcgrps = nl802154_mcgrps, .n_mcgrps = ARRAY_SIZE(nl802154_mcgrps), }; /* initialisation/exit functions */ int __init nl802154_init(void) { return genl_register_family(&nl802154_fam); } void nl802154_exit(void) { genl_unregister_family(&nl802154_fam); } |
3 2 2 2 1 1 2 1 3 14 3 48 48 2 46 36 1 9 7 2 1 2 2 1 1 1 2 2 1 1 1 1 3 1 1 1 1 1 1 1 1 2 4 6 2 1 6 1 3 3 1 1 2 1 1 43 43 12 1 1 4 13 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Ioctl handler * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/capability.h> #include <linux/compat.h> #include <linux/kernel.h> #include <linux/if_bridge.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/times.h> #include <net/net_namespace.h> #include <linux/uaccess.h> #include "br_private.h" static int get_bridge_ifindices(struct net *net, int *indices, int num) { struct net_device *dev; int i = 0; rcu_read_lock(); for_each_netdev_rcu(net, dev) { if (i >= num) break; if (netif_is_bridge_master(dev)) indices[i++] = dev->ifindex; } rcu_read_unlock(); return i; } /* called with RTNL */ static void get_port_ifindices(struct net_bridge *br, int *ifindices, int num) { struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) { if (p->port_no < num) ifindices[p->port_no] = p->dev->ifindex; } } /* * Format up to a page worth of forwarding table entries * userbuf -- where to copy result * maxnum -- maximum number of entries desired * (limited to a page for sanity) * offset -- number of records to skip */ static int get_fdb_entries(struct net_bridge *br, void __user *userbuf, unsigned long maxnum, unsigned long offset) { int num; void *buf; size_t size; /* Clamp size to PAGE_SIZE, test maxnum to avoid overflow */ if (maxnum > PAGE_SIZE/sizeof(struct __fdb_entry)) maxnum = PAGE_SIZE/sizeof(struct __fdb_entry); size = maxnum * sizeof(struct __fdb_entry); buf = kmalloc(size, GFP_USER); if (!buf) return -ENOMEM; num = br_fdb_fillbuf(br, buf, maxnum, offset); if (num > 0) { if (copy_to_user(userbuf, buf, array_size(num, sizeof(struct __fdb_entry)))) num = -EFAULT; } kfree(buf); return num; } /* called with RTNL */ static int add_del_if(struct net_bridge *br, int ifindex, int isadd) { struct net *net = dev_net(br->dev); struct net_device *dev; int ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; dev = __dev_get_by_index(net, ifindex); if (dev == NULL) return -EINVAL; if (isadd) ret = br_add_if(br, dev, NULL); else ret = br_del_if(br, dev); return ret; } #define BR_UARGS_MAX 4 static int br_dev_read_uargs(unsigned long *args, size_t nr_args, void __user **argp, void __user *data) { int ret; if (nr_args < 2 || nr_args > BR_UARGS_MAX) return -EINVAL; if (in_compat_syscall()) { unsigned int cargs[BR_UARGS_MAX]; int i; ret = copy_from_user(cargs, data, nr_args * sizeof(*cargs)); if (ret) goto fault; for (i = 0; i < nr_args; ++i) args[i] = cargs[i]; *argp = compat_ptr(args[1]); } else { ret = copy_from_user(args, data, nr_args * sizeof(*args)); if (ret) goto fault; *argp = (void __user *)args[1]; } return 0; fault: return -EFAULT; } /* * Legacy ioctl's through SIOCDEVPRIVATE * This interface is deprecated because it was too difficult * to do the translation for 32/64bit ioctl compatibility. */ int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq, void __user *data, int cmd) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p = NULL; unsigned long args[4]; void __user *argp; int ret; ret = br_dev_read_uargs(args, ARRAY_SIZE(args), &argp, data); if (ret) return ret; switch (args[0]) { case BRCTL_ADD_IF: case BRCTL_DEL_IF: return add_del_if(br, args[1], args[0] == BRCTL_ADD_IF); case BRCTL_GET_BRIDGE_INFO: { struct __bridge_info b; memset(&b, 0, sizeof(struct __bridge_info)); rcu_read_lock(); memcpy(&b.designated_root, &br->designated_root, 8); memcpy(&b.bridge_id, &br->bridge_id, 8); b.root_path_cost = br->root_path_cost; b.max_age = jiffies_to_clock_t(br->max_age); b.hello_time = jiffies_to_clock_t(br->hello_time); b.forward_delay = br->forward_delay; b.bridge_max_age = br->bridge_max_age; b.bridge_hello_time = br->bridge_hello_time; b.bridge_forward_delay = jiffies_to_clock_t(br->bridge_forward_delay); b.topology_change = br->topology_change; b.topology_change_detected = br->topology_change_detected; b.root_port = br->root_port; b.stp_enabled = (br->stp_enabled != BR_NO_STP); b.ageing_time = jiffies_to_clock_t(br->ageing_time); b.hello_timer_value = br_timer_value(&br->hello_timer); b.tcn_timer_value = br_timer_value(&br->tcn_timer); b.topology_change_timer_value = br_timer_value(&br->topology_change_timer); b.gc_timer_value = br_timer_value(&br->gc_work.timer); rcu_read_unlock(); if (copy_to_user((void __user *)args[1], &b, sizeof(b))) return -EFAULT; return 0; } case BRCTL_GET_PORT_LIST: { int num, *indices; num = args[2]; if (num < 0) return -EINVAL; if (num == 0) num = 256; if (num > BR_MAX_PORTS) num = BR_MAX_PORTS; indices = kcalloc(num, sizeof(int), GFP_KERNEL); if (indices == NULL) return -ENOMEM; get_port_ifindices(br, indices, num); if (copy_to_user(argp, indices, array_size(num, sizeof(int)))) num = -EFAULT; kfree(indices); return num; } case BRCTL_SET_BRIDGE_FORWARD_DELAY: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = br_set_forward_delay(br, args[1]); break; case BRCTL_SET_BRIDGE_HELLO_TIME: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = br_set_hello_time(br, args[1]); break; case BRCTL_SET_BRIDGE_MAX_AGE: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = br_set_max_age(br, args[1]); break; case BRCTL_SET_AGEING_TIME: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = br_set_ageing_time(br, args[1]); break; case BRCTL_GET_PORT_INFO: { struct __port_info p; struct net_bridge_port *pt; rcu_read_lock(); if ((pt = br_get_port(br, args[2])) == NULL) { rcu_read_unlock(); return -EINVAL; } memset(&p, 0, sizeof(struct __port_info)); memcpy(&p.designated_root, &pt->designated_root, 8); memcpy(&p.designated_bridge, &pt->designated_bridge, 8); p.port_id = pt->port_id; p.designated_port = pt->designated_port; p.path_cost = pt->path_cost; p.designated_cost = pt->designated_cost; p.state = pt->state; p.top_change_ack = pt->topology_change_ack; p.config_pending = pt->config_pending; p.message_age_timer_value = br_timer_value(&pt->message_age_timer); p.forward_delay_timer_value = br_timer_value(&pt->forward_delay_timer); p.hold_timer_value = br_timer_value(&pt->hold_timer); rcu_read_unlock(); if (copy_to_user(argp, &p, sizeof(p))) return -EFAULT; return 0; } case BRCTL_SET_BRIDGE_STP_STATE: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = br_stp_set_enabled(br, args[1], NULL); break; case BRCTL_SET_BRIDGE_PRIORITY: if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; br_stp_set_bridge_priority(br, args[1]); ret = 0; break; case BRCTL_SET_PORT_PRIORITY: { if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; spin_lock_bh(&br->lock); if ((p = br_get_port(br, args[1])) == NULL) ret = -EINVAL; else ret = br_stp_set_port_priority(p, args[2]); spin_unlock_bh(&br->lock); break; } case BRCTL_SET_PATH_COST: { if (!ns_capable(dev_net(dev)->user_ns, CAP_NET_ADMIN)) return -EPERM; spin_lock_bh(&br->lock); if ((p = br_get_port(br, args[1])) == NULL) ret = -EINVAL; else ret = br_stp_set_path_cost(p, args[2]); spin_unlock_bh(&br->lock); break; } case BRCTL_GET_FDB_ENTRIES: return get_fdb_entries(br, argp, args[2], args[3]); default: ret = -EOPNOTSUPP; } if (!ret) { if (p) br_ifinfo_notify(RTM_NEWLINK, NULL, p); else netdev_state_change(br->dev); } return ret; } static int old_deviceless(struct net *net, void __user *data) { unsigned long args[3]; void __user *argp; int ret; ret = br_dev_read_uargs(args, ARRAY_SIZE(args), &argp, data); if (ret) return ret; switch (args[0]) { case BRCTL_GET_VERSION: return BRCTL_VERSION; case BRCTL_GET_BRIDGES: { int *indices; int ret = 0; if (args[2] >= 2048) return -ENOMEM; indices = kcalloc(args[2], sizeof(int), GFP_KERNEL); if (indices == NULL) return -ENOMEM; args[2] = get_bridge_ifindices(net, indices, args[2]); ret = copy_to_user(argp, indices, array_size(args[2], sizeof(int))) ? -EFAULT : args[2]; kfree(indices); return ret; } case BRCTL_ADD_BRIDGE: case BRCTL_DEL_BRIDGE: { char buf[IFNAMSIZ]; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(buf, argp, IFNAMSIZ)) return -EFAULT; buf[IFNAMSIZ-1] = 0; if (args[0] == BRCTL_ADD_BRIDGE) return br_add_bridge(net, buf); return br_del_bridge(net, buf); } } return -EOPNOTSUPP; } int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd, struct ifreq *ifr, void __user *uarg) { int ret = -EOPNOTSUPP; rtnl_lock(); switch (cmd) { case SIOCGIFBR: case SIOCSIFBR: ret = old_deviceless(net, uarg); break; case SIOCBRADDBR: case SIOCBRDELBR: { char buf[IFNAMSIZ]; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } if (copy_from_user(buf, uarg, IFNAMSIZ)) { ret = -EFAULT; break; } buf[IFNAMSIZ-1] = 0; if (cmd == SIOCBRADDBR) ret = br_add_bridge(net, buf); else ret = br_del_bridge(net, buf); } break; case SIOCBRADDIF: case SIOCBRDELIF: ret = add_del_if(br, ifr->ifr_ifindex, cmd == SIOCBRADDIF); break; } rtnl_unlock(); return ret; } |
12 22505 4260 79 46 80 167 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NODEMASK_H #define __LINUX_NODEMASK_H /* * Nodemasks provide a bitmap suitable for representing the * set of Node's in a system, one bit position per Node number. * * See detailed comments in the file linux/bitmap.h describing the * data type on which these nodemasks are based. * * For details of nodemask_parse_user(), see bitmap_parse_user() in * lib/bitmap.c. For details of nodelist_parse(), see bitmap_parselist(), * also in bitmap.c. For details of node_remap(), see bitmap_bitremap in * lib/bitmap.c. For details of nodes_remap(), see bitmap_remap in * lib/bitmap.c. For details of nodes_onto(), see bitmap_onto in * lib/bitmap.c. For details of nodes_fold(), see bitmap_fold in * lib/bitmap.c. * * The available nodemask operations are: * * void node_set(node, mask) turn on bit 'node' in mask * void node_clear(node, mask) turn off bit 'node' in mask * void nodes_setall(mask) set all bits * void nodes_clear(mask) clear all bits * int node_isset(node, mask) true iff bit 'node' set in mask * int node_test_and_set(node, mask) test and set bit 'node' in mask * * void nodes_and(dst, src1, src2) dst = src1 & src2 [intersection] * void nodes_or(dst, src1, src2) dst = src1 | src2 [union] * void nodes_xor(dst, src1, src2) dst = src1 ^ src2 * void nodes_andnot(dst, src1, src2) dst = src1 & ~src2 * void nodes_complement(dst, src) dst = ~src * * int nodes_equal(mask1, mask2) Does mask1 == mask2? * int nodes_intersects(mask1, mask2) Do mask1 and mask2 intersect? * int nodes_subset(mask1, mask2) Is mask1 a subset of mask2? * int nodes_empty(mask) Is mask empty (no bits sets)? * int nodes_full(mask) Is mask full (all bits sets)? * int nodes_weight(mask) Hamming weight - number of set bits * * void nodes_shift_right(dst, src, n) Shift right * void nodes_shift_left(dst, src, n) Shift left * * unsigned int first_node(mask) Number lowest set bit, or MAX_NUMNODES * unsigend int next_node(node, mask) Next node past 'node', or MAX_NUMNODES * unsigned int next_node_in(node, mask) Next node past 'node', or wrap to first, * or MAX_NUMNODES * unsigned int first_unset_node(mask) First node not set in mask, or * MAX_NUMNODES * * nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set * NODE_MASK_ALL Initializer - all bits set * NODE_MASK_NONE Initializer - no bits set * unsigned long *nodes_addr(mask) Array of unsigned long's in mask * * int nodemask_parse_user(ubuf, ulen, mask) Parse ascii string as nodemask * int nodelist_parse(buf, map) Parse ascii string as nodelist * int node_remap(oldbit, old, new) newbit = map(old, new)(oldbit) * void nodes_remap(dst, src, old, new) *dst = map(old, new)(src) * void nodes_onto(dst, orig, relmap) *dst = orig relative to relmap * void nodes_fold(dst, orig, sz) dst bits = orig bits mod sz * * for_each_node_mask(node, mask) for-loop node over mask * * int num_online_nodes() Number of online Nodes * int num_possible_nodes() Number of all possible Nodes * * int node_random(mask) Random node with set bit in mask * * int node_online(node) Is some node online? * int node_possible(node) Is some node possible? * * node_set_online(node) set bit 'node' in node_online_map * node_set_offline(node) clear bit 'node' in node_online_map * * for_each_node(node) for-loop node over node_possible_map * for_each_online_node(node) for-loop node over node_online_map * * Subtlety: * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway) * to generate slightly worse code. So use a simple one-line #define * for node_isset(), instead of wrapping an inline inside a macro, the * way we do the other calls. * * NODEMASK_SCRATCH * When doing above logical AND, OR, XOR, Remap operations the callers tend to * need temporary nodemask_t's on the stack. But if NODES_SHIFT is large, * nodemask_t's consume too much stack space. NODEMASK_SCRATCH is a helper * for such situations. See below and CPUMASK_ALLOC also. */ #include <linux/threads.h> #include <linux/bitmap.h> #include <linux/minmax.h> #include <linux/nodemask_types.h> #include <linux/numa.h> #include <linux/random.h> extern nodemask_t _unused_nodemask_arg_; /** * nodemask_pr_args - printf args to output a nodemask * @maskp: nodemask to be printed * * Can be used to provide arguments for '%*pb[l]' when printing a nodemask. */ #define nodemask_pr_args(maskp) __nodemask_pr_numnodes(maskp), \ __nodemask_pr_bits(maskp) static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) { return m ? MAX_NUMNODES : 0; } static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) { return m ? m->bits : NULL; } /* * The inline keyword gives the compiler room to decide to inline, or * not inline a function as it sees best. However, as these functions * are called in both __init and non-__init functions, if they are not * inlined we will end up with a section mismatch error (of the type of * freeable items not being freed). So we must use __always_inline here * to fix the problem. If other functions in the future also end up in * this situation they will also need to be annotated as __always_inline */ #define node_set(node, dst) __node_set((node), &(dst)) static __always_inline void __node_set(int node, volatile nodemask_t *dstp) { set_bit(node, dstp->bits); } #define node_clear(node, dst) __node_clear((node), &(dst)) static inline void __node_clear(int node, volatile nodemask_t *dstp) { clear_bit(node, dstp->bits); } #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) { bitmap_fill(dstp->bits, nbits); } #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) { bitmap_zero(dstp->bits, nbits); } /* No static inline type checking - see Subtlety (1) above. */ #define node_isset(node, nodemask) test_bit((node), (nodemask).bits) #define node_test_and_set(node, nodemask) \ __node_test_and_set((node), &(nodemask)) static inline bool __node_test_and_set(int node, nodemask_t *addr) { return test_and_set_bit(node, addr->bits); } #define nodes_and(dst, src1, src2) \ __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_or(dst, src1, src2) \ __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_xor(dst, src1, src2) \ __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_andnot(dst, src1, src2) \ __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_complement(dst, src) \ __nodes_complement(&(dst), &(src), MAX_NUMNODES) static inline void __nodes_complement(nodemask_t *dstp, const nodemask_t *srcp, unsigned int nbits) { bitmap_complement(dstp->bits, srcp->bits, nbits); } #define nodes_equal(src1, src2) \ __nodes_equal(&(src1), &(src2), MAX_NUMNODES) static inline bool __nodes_equal(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_equal(src1p->bits, src2p->bits, nbits); } #define nodes_intersects(src1, src2) \ __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) static inline bool __nodes_intersects(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_intersects(src1p->bits, src2p->bits, nbits); } #define nodes_subset(src1, src2) \ __nodes_subset(&(src1), &(src2), MAX_NUMNODES) static inline bool __nodes_subset(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_subset(src1p->bits, src2p->bits, nbits); } #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) static inline bool __nodes_empty(const nodemask_t *srcp, unsigned int nbits) { return bitmap_empty(srcp->bits, nbits); } #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) static inline bool __nodes_full(const nodemask_t *srcp, unsigned int nbits) { return bitmap_full(srcp->bits, nbits); } #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) { return bitmap_weight(srcp->bits, nbits); } #define nodes_shift_right(dst, src, n) \ __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES) static inline void __nodes_shift_right(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); } #define nodes_shift_left(dst, src, n) \ __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES) static inline void __nodes_shift_left(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); } /* FIXME: better would be to fix all architectures to never return > MAX_NUMNODES, then the silly min_ts could be dropped. */ #define first_node(src) __first_node(&(src)) static inline unsigned int __first_node(const nodemask_t *srcp) { return min_t(unsigned int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); } #define next_node(n, src) __next_node((n), &(src)) static inline unsigned int __next_node(int n, const nodemask_t *srcp) { return min_t(unsigned int, MAX_NUMNODES, find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); } /* * Find the next present node in src, starting after node n, wrapping around to * the first node in src if needed. Returns MAX_NUMNODES if src is empty. */ #define next_node_in(n, src) __next_node_in((n), &(src)) static inline unsigned int __next_node_in(int node, const nodemask_t *srcp) { unsigned int ret = __next_node(node, srcp); if (ret == MAX_NUMNODES) ret = __first_node(srcp); return ret; } static inline void init_nodemask_of_node(nodemask_t *mask, int node) { nodes_clear(*mask); node_set(node, *mask); } #define nodemask_of_node(node) \ ({ \ typeof(_unused_nodemask_arg_) m; \ if (sizeof(m) == sizeof(unsigned long)) { \ m.bits[0] = 1UL << (node); \ } else { \ init_nodemask_of_node(&m, (node)); \ } \ m; \ }) #define first_unset_node(mask) __first_unset_node(&(mask)) static inline unsigned int __first_unset_node(const nodemask_t *maskp) { return min_t(unsigned int, MAX_NUMNODES, find_first_zero_bit(maskp->bits, MAX_NUMNODES)); } #define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES) #if MAX_NUMNODES <= BITS_PER_LONG #define NODE_MASK_ALL \ ((nodemask_t) { { \ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ } }) #else #define NODE_MASK_ALL \ ((nodemask_t) { { \ [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL, \ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ } }) #endif #define NODE_MASK_NONE \ ((nodemask_t) { { \ [0 ... BITS_TO_LONGS(MAX_NUMNODES)-1] = 0UL \ } }) #define nodes_addr(src) ((src).bits) #define nodemask_parse_user(ubuf, ulen, dst) \ __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES) static inline int __nodemask_parse_user(const char __user *buf, int len, nodemask_t *dstp, int nbits) { return bitmap_parse_user(buf, len, dstp->bits, nbits); } #define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES) static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) { return bitmap_parselist(buf, dstp->bits, nbits); } #define node_remap(oldbit, old, new) \ __node_remap((oldbit), &(old), &(new), MAX_NUMNODES) static inline int __node_remap(int oldbit, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits); } #define nodes_remap(dst, src, old, new) \ __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES) static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits); } #define nodes_onto(dst, orig, relmap) \ __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES) static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, const nodemask_t *relmapp, int nbits) { bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits); } #define nodes_fold(dst, orig, sz) \ __nodes_fold(&(dst), &(orig), sz, MAX_NUMNODES) static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, int sz, int nbits) { bitmap_fold(dstp->bits, origp->bits, sz, nbits); } #if MAX_NUMNODES > 1 #define for_each_node_mask(node, mask) \ for ((node) = first_node(mask); \ (node) < MAX_NUMNODES; \ (node) = next_node((node), (mask))) #else /* MAX_NUMNODES == 1 */ #define for_each_node_mask(node, mask) \ for ((node) = 0; (node) < 1 && !nodes_empty(mask); (node)++) #endif /* MAX_NUMNODES */ /* * Bitmasks that are kept for all the nodes. */ enum node_states { N_POSSIBLE, /* The node could become online at some point */ N_ONLINE, /* The node is online */ N_NORMAL_MEMORY, /* The node has regular memory */ #ifdef CONFIG_HIGHMEM N_HIGH_MEMORY, /* The node has regular or high memory */ #else N_HIGH_MEMORY = N_NORMAL_MEMORY, #endif N_MEMORY, /* The node has memory(regular, high, movable) */ N_CPU, /* The node has one or more cpus */ N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */ NR_NODE_STATES }; /* * The following particular system nodemasks and operations * on them manage all possible and online nodes. */ extern nodemask_t node_states[NR_NODE_STATES]; #if MAX_NUMNODES > 1 static inline int node_state(int node, enum node_states state) { return node_isset(node, node_states[state]); } static inline void node_set_state(int node, enum node_states state) { __node_set(node, &node_states[state]); } static inline void node_clear_state(int node, enum node_states state) { __node_clear(node, &node_states[state]); } static inline int num_node_state(enum node_states state) { return nodes_weight(node_states[state]); } #define for_each_node_state(__node, __state) \ for_each_node_mask((__node), node_states[__state]) #define first_online_node first_node(node_states[N_ONLINE]) #define first_memory_node first_node(node_states[N_MEMORY]) static inline unsigned int next_online_node(int nid) { return next_node(nid, node_states[N_ONLINE]); } static inline unsigned int next_memory_node(int nid) { return next_node(nid, node_states[N_MEMORY]); } extern unsigned int nr_node_ids; extern unsigned int nr_online_nodes; static inline void node_set_online(int nid) { node_set_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); } static inline void node_set_offline(int nid) { node_clear_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); } #else static inline int node_state(int node, enum node_states state) { return node == 0; } static inline void node_set_state(int node, enum node_states state) { } static inline void node_clear_state(int node, enum node_states state) { } static inline int num_node_state(enum node_states state) { return 1; } #define for_each_node_state(node, __state) \ for ( (node) = 0; (node) == 0; (node) = 1) #define first_online_node 0 #define first_memory_node 0 #define next_online_node(nid) (MAX_NUMNODES) #define next_memory_node(nid) (MAX_NUMNODES) #define nr_node_ids 1U #define nr_online_nodes 1U #define node_set_online(node) node_set_state((node), N_ONLINE) #define node_set_offline(node) node_clear_state((node), N_ONLINE) #endif static inline int node_random(const nodemask_t *maskp) { #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) int w, bit; w = nodes_weight(*maskp); switch (w) { case 0: bit = NUMA_NO_NODE; break; case 1: bit = first_node(*maskp); break; default: bit = find_nth_bit(maskp->bits, MAX_NUMNODES, get_random_u32_below(w)); break; } return bit; #else return 0; #endif } #define node_online_map node_states[N_ONLINE] #define node_possible_map node_states[N_POSSIBLE] #define num_online_nodes() num_node_state(N_ONLINE) #define num_possible_nodes() num_node_state(N_POSSIBLE) #define node_online(node) node_state((node), N_ONLINE) #define node_possible(node) node_state((node), N_POSSIBLE) #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) /* * For nodemask scratch area. * NODEMASK_ALLOC(type, name) allocates an object with a specified type and * name. */ #if NODES_SHIFT > 8 /* nodemask_t > 32 bytes */ #define NODEMASK_ALLOC(type, name, gfp_flags) \ type *name = kmalloc(sizeof(*name), gfp_flags) #define NODEMASK_FREE(m) kfree(m) #else #define NODEMASK_ALLOC(type, name, gfp_flags) type _##name, *name = &_##name #define NODEMASK_FREE(m) do {} while (0) #endif /* Example structure for using NODEMASK_ALLOC, used in mempolicy. */ struct nodemask_scratch { nodemask_t mask1; nodemask_t mask2; }; #define NODEMASK_SCRATCH(x) \ NODEMASK_ALLOC(struct nodemask_scratch, x, \ GFP_KERNEL | __GFP_NORETRY) #define NODEMASK_SCRATCH_FREE(x) NODEMASK_FREE(x) #endif /* __LINUX_NODEMASK_H */ |
1 1 1 1 1 1 1 4 1 3 2 1 2 7 2 5 5 1 4 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 | // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/nospec.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "io_uring.h" #include "rsrc.h" #include "filetable.h" #include "alloc_cache.h" #include "msg_ring.h" /* All valid masks for MSG_RING */ #define IORING_MSG_RING_MASK (IORING_MSG_RING_CQE_SKIP | \ IORING_MSG_RING_FLAGS_PASS) struct io_msg { struct file *file; struct file *src_file; struct callback_head tw; u64 user_data; u32 len; u32 cmd; u32 src_fd; union { u32 dst_fd; u32 cqe_flags; }; u32 flags; }; static void io_double_unlock_ctx(struct io_ring_ctx *octx) { mutex_unlock(&octx->uring_lock); } static int io_double_lock_ctx(struct io_ring_ctx *octx, unsigned int issue_flags) { /* * To ensure proper ordering between the two ctxs, we can only * attempt a trylock on the target. If that fails and we already have * the source ctx lock, punt to io-wq. */ if (!(issue_flags & IO_URING_F_UNLOCKED)) { if (!mutex_trylock(&octx->uring_lock)) return -EAGAIN; return 0; } mutex_lock(&octx->uring_lock); return 0; } void io_msg_ring_cleanup(struct io_kiocb *req) { struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); if (WARN_ON_ONCE(!msg->src_file)) return; fput(msg->src_file); msg->src_file = NULL; } static inline bool io_msg_need_remote(struct io_ring_ctx *target_ctx) { return target_ctx->task_complete; } static void io_msg_tw_complete(struct io_kiocb *req, struct io_tw_state *ts) { struct io_ring_ctx *ctx = req->ctx; io_add_aux_cqe(ctx, req->cqe.user_data, req->cqe.res, req->cqe.flags); if (spin_trylock(&ctx->msg_lock)) { if (io_alloc_cache_put(&ctx->msg_cache, req)) req = NULL; spin_unlock(&ctx->msg_lock); } if (req) kmem_cache_free(req_cachep, req); percpu_ref_put(&ctx->refs); } static int io_msg_remote_post(struct io_ring_ctx *ctx, struct io_kiocb *req, int res, u32 cflags, u64 user_data) { req->task = READ_ONCE(ctx->submitter_task); if (!req->task) { kmem_cache_free(req_cachep, req); return -EOWNERDEAD; } req->cqe.user_data = user_data; io_req_set_res(req, res, cflags); percpu_ref_get(&ctx->refs); req->ctx = ctx; req->io_task_work.func = io_msg_tw_complete; io_req_task_work_add_remote(req, ctx, IOU_F_TWQ_LAZY_WAKE); return 0; } static struct io_kiocb *io_msg_get_kiocb(struct io_ring_ctx *ctx) { struct io_kiocb *req = NULL; if (spin_trylock(&ctx->msg_lock)) { req = io_alloc_cache_get(&ctx->msg_cache); spin_unlock(&ctx->msg_lock); if (req) return req; } return kmem_cache_alloc(req_cachep, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO); } static int io_msg_data_remote(struct io_kiocb *req) { struct io_ring_ctx *target_ctx = req->file->private_data; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); struct io_kiocb *target; u32 flags = 0; target = io_msg_get_kiocb(req->ctx); if (unlikely(!target)) return -ENOMEM; if (msg->flags & IORING_MSG_RING_FLAGS_PASS) flags = msg->cqe_flags; return io_msg_remote_post(target_ctx, target, msg->len, flags, msg->user_data); } static int io_msg_ring_data(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *target_ctx = req->file->private_data; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); u32 flags = 0; int ret; if (msg->src_fd || msg->flags & ~IORING_MSG_RING_FLAGS_PASS) return -EINVAL; if (!(msg->flags & IORING_MSG_RING_FLAGS_PASS) && msg->dst_fd) return -EINVAL; if (target_ctx->flags & IORING_SETUP_R_DISABLED) return -EBADFD; if (io_msg_need_remote(target_ctx)) return io_msg_data_remote(req); if (msg->flags & IORING_MSG_RING_FLAGS_PASS) flags = msg->cqe_flags; ret = -EOVERFLOW; if (target_ctx->flags & IORING_SETUP_IOPOLL) { if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) return -EAGAIN; } if (io_post_aux_cqe(target_ctx, msg->user_data, msg->len, flags)) ret = 0; if (target_ctx->flags & IORING_SETUP_IOPOLL) io_double_unlock_ctx(target_ctx); return ret; } static struct file *io_msg_grab_file(struct io_kiocb *req, unsigned int issue_flags) { struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); struct io_ring_ctx *ctx = req->ctx; struct file *file = NULL; int idx = msg->src_fd; io_ring_submit_lock(ctx, issue_flags); if (likely(idx < ctx->nr_user_files)) { idx = array_index_nospec(idx, ctx->nr_user_files); file = io_file_from_index(&ctx->file_table, idx); if (file) get_file(file); } io_ring_submit_unlock(ctx, issue_flags); return file; } static int io_msg_install_complete(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *target_ctx = req->file->private_data; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); struct file *src_file = msg->src_file; int ret; if (unlikely(io_double_lock_ctx(target_ctx, issue_flags))) return -EAGAIN; ret = __io_fixed_fd_install(target_ctx, src_file, msg->dst_fd); if (ret < 0) goto out_unlock; msg->src_file = NULL; req->flags &= ~REQ_F_NEED_CLEANUP; if (msg->flags & IORING_MSG_RING_CQE_SKIP) goto out_unlock; /* * If this fails, the target still received the file descriptor but * wasn't notified of the fact. This means that if this request * completes with -EOVERFLOW, then the sender must ensure that a * later IORING_OP_MSG_RING delivers the message. */ if (!io_post_aux_cqe(target_ctx, msg->user_data, ret, 0)) ret = -EOVERFLOW; out_unlock: io_double_unlock_ctx(target_ctx); return ret; } static void io_msg_tw_fd_complete(struct callback_head *head) { struct io_msg *msg = container_of(head, struct io_msg, tw); struct io_kiocb *req = cmd_to_io_kiocb(msg); int ret = -EOWNERDEAD; if (!(current->flags & PF_EXITING)) ret = io_msg_install_complete(req, IO_URING_F_UNLOCKED); if (ret < 0) req_set_fail(req); io_req_queue_tw_complete(req, ret); } static int io_msg_fd_remote(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->file->private_data; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); struct task_struct *task = READ_ONCE(ctx->submitter_task); if (unlikely(!task)) return -EOWNERDEAD; init_task_work(&msg->tw, io_msg_tw_fd_complete); if (task_work_add(task, &msg->tw, TWA_SIGNAL)) return -EOWNERDEAD; return IOU_ISSUE_SKIP_COMPLETE; } static int io_msg_send_fd(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *target_ctx = req->file->private_data; struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); struct io_ring_ctx *ctx = req->ctx; struct file *src_file = msg->src_file; if (msg->len) return -EINVAL; if (target_ctx == ctx) return -EINVAL; if (target_ctx->flags & IORING_SETUP_R_DISABLED) return -EBADFD; if (!src_file) { src_file = io_msg_grab_file(req, issue_flags); if (!src_file) return -EBADF; msg->src_file = src_file; req->flags |= REQ_F_NEED_CLEANUP; } if (io_msg_need_remote(target_ctx)) return io_msg_fd_remote(req); return io_msg_install_complete(req, issue_flags); } int io_msg_ring_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); if (unlikely(sqe->buf_index || sqe->personality)) return -EINVAL; msg->src_file = NULL; msg->user_data = READ_ONCE(sqe->off); msg->len = READ_ONCE(sqe->len); msg->cmd = READ_ONCE(sqe->addr); msg->src_fd = READ_ONCE(sqe->addr3); msg->dst_fd = READ_ONCE(sqe->file_index); msg->flags = READ_ONCE(sqe->msg_ring_flags); if (msg->flags & ~IORING_MSG_RING_MASK) return -EINVAL; return 0; } int io_msg_ring(struct io_kiocb *req, unsigned int issue_flags) { struct io_msg *msg = io_kiocb_to_cmd(req, struct io_msg); int ret; ret = -EBADFD; if (!io_is_uring_fops(req->file)) goto done; switch (msg->cmd) { case IORING_MSG_DATA: ret = io_msg_ring_data(req, issue_flags); break; case IORING_MSG_SEND_FD: ret = io_msg_send_fd(req, issue_flags); break; default: ret = -EINVAL; break; } done: if (ret < 0) { if (ret == -EAGAIN || ret == IOU_ISSUE_SKIP_COMPLETE) return ret; req_set_fail(req); } io_req_set_res(req, ret, 0); return IOU_OK; } void io_msg_cache_free(const void *entry) { struct io_kiocb *req = (struct io_kiocb *) entry; kmem_cache_free(req_cachep, req); } |
1 2 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Roccat Lua driver for Linux * * Copyright (c) 2012 Stefan Achatz <erazor_de@users.sourceforge.net> */ /* */ /* * Roccat Lua is a gamer mouse which cpi, button and light settings can be * configured. */ #include <linux/device.h> #include <linux/input.h> #include <linux/hid.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/hid-roccat.h> #include "hid-ids.h" #include "hid-roccat-common.h" #include "hid-roccat-lua.h" static ssize_t lua_sysfs_read(struct file *fp, struct kobject *kobj, char *buf, loff_t off, size_t count, size_t real_size, uint command) { struct device *dev = kobj_to_dev(kobj); struct lua_device *lua = hid_get_drvdata(dev_get_drvdata(dev)); struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); int retval; if (off >= real_size) return 0; if (off != 0 || count != real_size) return -EINVAL; mutex_lock(&lua->lua_lock); retval = roccat_common2_receive(usb_dev, command, buf, real_size); mutex_unlock(&lua->lua_lock); return retval ? retval : real_size; } static ssize_t lua_sysfs_write(struct file *fp, struct kobject *kobj, void const *buf, loff_t off, size_t count, size_t real_size, uint command) { struct device *dev = kobj_to_dev(kobj); struct lua_device *lua = hid_get_drvdata(dev_get_drvdata(dev)); struct usb_device *usb_dev = interface_to_usbdev(to_usb_interface(dev)); int retval; if (off != 0 || count != real_size) return -EINVAL; mutex_lock(&lua->lua_lock); retval = roccat_common2_send(usb_dev, command, buf, real_size); mutex_unlock(&lua->lua_lock); return retval ? retval : real_size; } #define LUA_SYSFS_W(thingy, THINGY) \ static ssize_t lua_sysfs_write_ ## thingy(struct file *fp, \ struct kobject *kobj, struct bin_attribute *attr, \ char *buf, loff_t off, size_t count) \ { \ return lua_sysfs_write(fp, kobj, buf, off, count, \ LUA_SIZE_ ## THINGY, LUA_COMMAND_ ## THINGY); \ } #define LUA_SYSFS_R(thingy, THINGY) \ static ssize_t lua_sysfs_read_ ## thingy(struct file *fp, \ struct kobject *kobj, struct bin_attribute *attr, \ char *buf, loff_t off, size_t count) \ { \ return lua_sysfs_read(fp, kobj, buf, off, count, \ LUA_SIZE_ ## THINGY, LUA_COMMAND_ ## THINGY); \ } #define LUA_BIN_ATTRIBUTE_RW(thingy, THINGY) \ LUA_SYSFS_W(thingy, THINGY) \ LUA_SYSFS_R(thingy, THINGY) \ static struct bin_attribute lua_ ## thingy ## _attr = { \ .attr = { .name = #thingy, .mode = 0660 }, \ .size = LUA_SIZE_ ## THINGY, \ .read = lua_sysfs_read_ ## thingy, \ .write = lua_sysfs_write_ ## thingy \ }; LUA_BIN_ATTRIBUTE_RW(control, CONTROL) static int lua_create_sysfs_attributes(struct usb_interface *intf) { return sysfs_create_bin_file(&intf->dev.kobj, &lua_control_attr); } static void lua_remove_sysfs_attributes(struct usb_interface *intf) { sysfs_remove_bin_file(&intf->dev.kobj, &lua_control_attr); } static int lua_init_lua_device_struct(struct usb_device *usb_dev, struct lua_device *lua) { mutex_init(&lua->lua_lock); return 0; } static int lua_init_specials(struct hid_device *hdev) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct usb_device *usb_dev = interface_to_usbdev(intf); struct lua_device *lua; int retval; lua = kzalloc(sizeof(*lua), GFP_KERNEL); if (!lua) { hid_err(hdev, "can't alloc device descriptor\n"); return -ENOMEM; } hid_set_drvdata(hdev, lua); retval = lua_init_lua_device_struct(usb_dev, lua); if (retval) { hid_err(hdev, "couldn't init struct lua_device\n"); goto exit; } retval = lua_create_sysfs_attributes(intf); if (retval) { hid_err(hdev, "cannot create sysfs files\n"); goto exit; } return 0; exit: kfree(lua); return retval; } static void lua_remove_specials(struct hid_device *hdev) { struct usb_interface *intf = to_usb_interface(hdev->dev.parent); struct lua_device *lua; lua_remove_sysfs_attributes(intf); lua = hid_get_drvdata(hdev); kfree(lua); } static int lua_probe(struct hid_device *hdev, const struct hid_device_id *id) { int retval; if (!hid_is_usb(hdev)) return -EINVAL; retval = hid_parse(hdev); if (retval) { hid_err(hdev, "parse failed\n"); goto exit; } retval = hid_hw_start(hdev, HID_CONNECT_DEFAULT); if (retval) { hid_err(hdev, "hw start failed\n"); goto exit; } retval = lua_init_specials(hdev); if (retval) { hid_err(hdev, "couldn't install mouse\n"); goto exit_stop; } return 0; exit_stop: hid_hw_stop(hdev); exit: return retval; } static void lua_remove(struct hid_device *hdev) { lua_remove_specials(hdev); hid_hw_stop(hdev); } static const struct hid_device_id lua_devices[] = { { HID_USB_DEVICE(USB_VENDOR_ID_ROCCAT, USB_DEVICE_ID_ROCCAT_LUA) }, { } }; MODULE_DEVICE_TABLE(hid, lua_devices); static struct hid_driver lua_driver = { .name = "lua", .id_table = lua_devices, .probe = lua_probe, .remove = lua_remove }; module_hid_driver(lua_driver); MODULE_AUTHOR("Stefan Achatz"); MODULE_DESCRIPTION("USB Roccat Lua driver"); MODULE_LICENSE("GPL v2"); |
13 13 13 13 13 12 13 13 13 13 13 13 13 13 10 18 12 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 | // SPDX-License-Identifier: GPL-2.0-only /* * handle transition of Linux booting another kernel * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> */ #define pr_fmt(fmt) "kexec: " fmt #include <linux/mm.h> #include <linux/kexec.h> #include <linux/string.h> #include <linux/gfp.h> #include <linux/reboot.h> #include <linux/numa.h> #include <linux/ftrace.h> #include <linux/io.h> #include <linux/suspend.h> #include <linux/vmalloc.h> #include <linux/efi.h> #include <linux/cc_platform.h> #include <asm/init.h> #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/io_apic.h> #include <asm/debugreg.h> #include <asm/kexec-bzimage64.h> #include <asm/setup.h> #include <asm/set_memory.h> #include <asm/cpu.h> #ifdef CONFIG_ACPI /* * Used while adding mapping for ACPI tables. * Can be reused when other iomem regions need be mapped */ struct init_pgtable_data { struct x86_mapping_info *info; pgd_t *level4p; }; static int mem_region_callback(struct resource *res, void *arg) { struct init_pgtable_data *data = arg; return kernel_ident_mapping_init(data->info, data->level4p, res->start, res->end + 1); } static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { struct init_pgtable_data data; unsigned long flags; int ret; data.info = info; data.level4p = level4p; flags = IORESOURCE_MEM | IORESOURCE_BUSY; ret = walk_iomem_res_desc(IORES_DESC_ACPI_TABLES, flags, 0, -1, &data, mem_region_callback); if (ret && ret != -EINVAL) return ret; /* ACPI tables could be located in ACPI Non-volatile Storage region */ ret = walk_iomem_res_desc(IORES_DESC_ACPI_NV_STORAGE, flags, 0, -1, &data, mem_region_callback); if (ret && ret != -EINVAL) return ret; return 0; } #else static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; } #endif #ifdef CONFIG_KEXEC_FILE const struct kexec_file_ops * const kexec_file_loaders[] = { &kexec_bzImage64_ops, NULL }; #endif static int map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p) { #ifdef CONFIG_EFI unsigned long mstart, mend; if (!efi_enabled(EFI_BOOT)) return 0; mstart = (boot_params.efi_info.efi_systab | ((u64)boot_params.efi_info.efi_systab_hi<<32)); if (efi_enabled(EFI_64BIT)) mend = mstart + sizeof(efi_system_table_64_t); else mend = mstart + sizeof(efi_system_table_32_t); if (!mstart) return 0; return kernel_ident_mapping_init(info, level4p, mstart, mend); #endif return 0; } static void free_transition_pgtable(struct kimage *image) { free_page((unsigned long)image->arch.p4d); image->arch.p4d = NULL; free_page((unsigned long)image->arch.pud); image->arch.pud = NULL; free_page((unsigned long)image->arch.pmd); image->arch.pmd = NULL; free_page((unsigned long)image->arch.pte); image->arch.pte = NULL; } static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) { pgprot_t prot = PAGE_KERNEL_EXEC_NOENC; unsigned long vaddr, paddr; int result = -ENOMEM; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; vaddr = (unsigned long)relocate_kernel; paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); pgd += pgd_index(vaddr); if (!pgd_present(*pgd)) { p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); if (!p4d) goto err; image->arch.p4d = p4d; set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); } p4d = p4d_offset(pgd, vaddr); if (!p4d_present(*p4d)) { pud = (pud_t *)get_zeroed_page(GFP_KERNEL); if (!pud) goto err; image->arch.pud = pud; set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); } pud = pud_offset(p4d, vaddr); if (!pud_present(*pud)) { pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd) goto err; image->arch.pmd = pmd; set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); } pmd = pmd_offset(pud, vaddr); if (!pmd_present(*pmd)) { pte = (pte_t *)get_zeroed_page(GFP_KERNEL); if (!pte) goto err; image->arch.pte = pte; set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); } pte = pte_offset_kernel(pmd, vaddr); if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) prot = PAGE_KERNEL_EXEC; set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot)); return 0; err: return result; } static void *alloc_pgt_page(void *data) { struct kimage *image = (struct kimage *)data; struct page *page; void *p = NULL; page = kimage_alloc_control_pages(image, 0); if (page) { p = page_address(page); clear_page(p); } return p; } static int init_pgtable(struct kimage *image, unsigned long start_pgtable) { struct x86_mapping_info info = { .alloc_pgt_page = alloc_pgt_page, .context = image, .page_flag = __PAGE_KERNEL_LARGE_EXEC, .kernpg_flag = _KERNPG_TABLE_NOENC, }; unsigned long mstart, mend; pgd_t *level4p; int result; int i; level4p = (pgd_t *)__va(start_pgtable); clear_page(level4p); if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) { info.page_flag |= _PAGE_ENC; info.kernpg_flag |= _PAGE_ENC; } if (direct_gbpages) info.direct_gbpages = true; for (i = 0; i < nr_pfn_mapped; i++) { mstart = pfn_mapped[i].start << PAGE_SHIFT; mend = pfn_mapped[i].end << PAGE_SHIFT; result = kernel_ident_mapping_init(&info, level4p, mstart, mend); if (result) return result; } /* * segments's mem ranges could be outside 0 ~ max_pfn, * for example when jump back to original kernel from kexeced kernel. * or first kernel is booted with user mem map, and second kernel * could be loaded out of that range. */ for (i = 0; i < image->nr_segments; i++) { mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; result = kernel_ident_mapping_init(&info, level4p, mstart, mend); if (result) return result; } /* * Prepare EFI systab and ACPI tables for kexec kernel since they are * not covered by pfn_mapped. */ result = map_efi_systab(&info, level4p); if (result) return result; result = map_acpi_tables(&info, level4p); if (result) return result; return init_transition_pgtable(image, level4p); } static void load_segments(void) { __asm__ __volatile__ ( "\tmovl %0,%%ds\n" "\tmovl %0,%%es\n" "\tmovl %0,%%ss\n" "\tmovl %0,%%fs\n" "\tmovl %0,%%gs\n" : : "a" (__KERNEL_DS) : "memory" ); } int machine_kexec_prepare(struct kimage *image) { unsigned long start_pgtable; int result; /* Calculate the offsets */ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; /* Setup the identity mapped 64bit page table */ result = init_pgtable(image, start_pgtable); if (result) return result; return 0; } void machine_kexec_cleanup(struct kimage *image) { free_transition_pgtable(image); } /* * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. */ void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; unsigned int host_mem_enc_active; int save_ftrace_enabled; void *control_page; /* * This must be done before load_segments() since if call depth tracking * is used then GS must be valid to make any function calls. */ host_mem_enc_active = cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) save_processor_state(); #endif save_ftrace_enabled = __ftrace_enabled_save(); /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); hw_breakpoint_disable(); cet_disable(); if (image->preserve_context) { #ifdef CONFIG_X86_IO_APIC /* * We need to put APICs in legacy mode so that we can * get timer interrupts in second kernel. kexec/kdump * paths already have calls to restore_boot_irq_mode() * in one form or other. kexec jump path also need one. */ clear_IO_APIC(); restore_boot_irq_mode(); #endif } control_page = page_address(image->control_code_page) + PAGE_SIZE; __memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page); page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; page_list[PA_TABLE_PAGE] = (unsigned long)__pa(page_address(image->control_code_page)); if (image->type == KEXEC_TYPE_DEFAULT) page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); /* * The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is * set to a specific selector, the invisible part is loaded * with from a table in memory. At no other time is the * descriptor table in memory accessed. * * I take advantage of this here by force loading the * segments, before I zap the gdt with an invalid value. */ load_segments(); /* * The gdt & idt are now invalid. * If you want to load them you must set up your own idt & gdt. */ native_idt_invalidate(); native_gdt_invalidate(); /* now call it */ image->start = relocate_kernel((unsigned long)image->head, (unsigned long)page_list, image->start, image->preserve_context, host_mem_enc_active); #ifdef CONFIG_KEXEC_JUMP if (image->preserve_context) restore_processor_state(); #endif __ftrace_enabled_restore(save_ftrace_enabled); } /* arch-dependent functionality related to kexec file-based syscall */ #ifdef CONFIG_KEXEC_FILE /* * Apply purgatory relocations. * * @pi: Purgatory to be relocated. * @section: Section relocations applying to. * @relsec: Section containing RELAs. * @symtabsec: Corresponding symtab. * * TODO: Some of the code belongs to generic code. Move that in kexec.c. */ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, const Elf_Shdr *relsec, const Elf_Shdr *symtabsec) { unsigned int i; Elf64_Rela *rel; Elf64_Sym *sym; void *location; unsigned long address, sec_base, value; const char *strtab, *name, *shstrtab; const Elf_Shdr *sechdrs; /* String & section header string table */ sechdrs = (void *)pi->ehdr + pi->ehdr->e_shoff; strtab = (char *)pi->ehdr + sechdrs[symtabsec->sh_link].sh_offset; shstrtab = (char *)pi->ehdr + sechdrs[pi->ehdr->e_shstrndx].sh_offset; rel = (void *)pi->ehdr + relsec->sh_offset; pr_debug("Applying relocate section %s to %u\n", shstrtab + relsec->sh_name, relsec->sh_info); for (i = 0; i < relsec->sh_size / sizeof(*rel); i++) { /* * rel[i].r_offset contains byte offset from beginning * of section to the storage unit affected. * * This is location to update. This is temporary buffer * where section is currently loaded. This will finally be * loaded to a different address later, pointed to by * ->sh_addr. kexec takes care of moving it * (kexec_load_segment()). */ location = pi->purgatory_buf; location += section->sh_offset; location += rel[i].r_offset; /* Final address of the location */ address = section->sh_addr + rel[i].r_offset; /* * rel[i].r_info contains information about symbol table index * w.r.t which relocation must be made and type of relocation * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get * these respectively. */ sym = (void *)pi->ehdr + symtabsec->sh_offset; sym += ELF64_R_SYM(rel[i].r_info); if (sym->st_name) name = strtab + sym->st_name; else name = shstrtab + sechdrs[sym->st_shndx].sh_name; pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n", name, sym->st_info, sym->st_shndx, sym->st_value, sym->st_size); if (sym->st_shndx == SHN_UNDEF) { pr_err("Undefined symbol: %s\n", name); return -ENOEXEC; } if (sym->st_shndx == SHN_COMMON) { pr_err("symbol '%s' in common section\n", name); return -ENOEXEC; } if (sym->st_shndx == SHN_ABS) sec_base = 0; else if (sym->st_shndx >= pi->ehdr->e_shnum) { pr_err("Invalid section %d for symbol %s\n", sym->st_shndx, name); return -ENOEXEC; } else sec_base = pi->sechdrs[sym->st_shndx].sh_addr; value = sym->st_value; value += sec_base; value += rel[i].r_addend; switch (ELF64_R_TYPE(rel[i].r_info)) { case R_X86_64_NONE: break; case R_X86_64_64: *(u64 *)location = value; break; case R_X86_64_32: *(u32 *)location = value; if (value != *(u32 *)location) goto overflow; break; case R_X86_64_32S: *(s32 *)location = value; if ((s64)value != *(s32 *)location) goto overflow; break; case R_X86_64_PC32: case R_X86_64_PLT32: value -= (u64)address; *(u32 *)location = value; break; default: pr_err("Unknown rela relocation: %llu\n", ELF64_R_TYPE(rel[i].r_info)); return -ENOEXEC; } } return 0; overflow: pr_err("Overflow in relocation type %d value 0x%lx\n", (int)ELF64_R_TYPE(rel[i].r_info), value); return -ENOEXEC; } int arch_kimage_file_post_load_cleanup(struct kimage *image) { vfree(image->elf_headers); image->elf_headers = NULL; image->elf_headers_sz = 0; return kexec_image_post_load_cleanup_default(image); } #endif /* CONFIG_KEXEC_FILE */ #ifdef CONFIG_CRASH_DUMP static int kexec_mark_range(unsigned long start, unsigned long end, bool protect) { struct page *page; unsigned int nr_pages; /* * For physical range: [start, end]. We must skip the unassigned * crashk resource with zero-valued "end" member. */ if (!end || start > end) return 0; page = pfn_to_page(start >> PAGE_SHIFT); nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; if (protect) return set_pages_ro(page, nr_pages); else return set_pages_rw(page, nr_pages); } static void kexec_mark_crashkres(bool protect) { unsigned long control; kexec_mark_range(crashk_low_res.start, crashk_low_res.end, protect); /* Don't touch the control code page used in crash_kexec().*/ control = PFN_PHYS(page_to_pfn(kexec_crash_image->control_code_page)); /* Control code page is located in the 2nd page. */ kexec_mark_range(crashk_res.start, control + PAGE_SIZE - 1, protect); control += KEXEC_CONTROL_PAGE_SIZE; kexec_mark_range(control, crashk_res.end, protect); } void arch_kexec_protect_crashkres(void) { kexec_mark_crashkres(true); } void arch_kexec_unprotect_crashkres(void) { kexec_mark_crashkres(false); } #endif /* * During a traditional boot under SME, SME will encrypt the kernel, * so the SME kexec kernel also needs to be un-encrypted in order to * replicate a normal SME boot. * * During a traditional boot under SEV, the kernel has already been * loaded encrypted, so the SEV kexec kernel needs to be encrypted in * order to replicate a normal SEV boot. */ int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) { if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) return 0; /* * If host memory encryption is active we need to be sure that kexec * pages are not encrypted because when we boot to the new kernel the * pages won't be accessed encrypted (initially). */ return set_memory_decrypted((unsigned long)vaddr, pages); } void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { if (!cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) return; /* * If host memory encryption is active we need to reset the pages back * to being an encrypted mapping before freeing them. */ set_memory_encrypted((unsigned long)vaddr, pages); } |
20 11 9 21 20 1 1 27 10 5 42 42 41 16 27 41 13 13 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 | /* * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2004 Infinicon Corporation. All rights reserved. * Copyright (c) 2004 Intel Corporation. All rights reserved. * Copyright (c) 2004 Topspin Corporation. All rights reserved. * Copyright (c) 2004 Voltaire Corporation. All rights reserved. * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved. * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/errno.h> #include <linux/err.h> #include <linux/export.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/in.h> #include <linux/in6.h> #include <net/addrconf.h> #include <linux/security.h> #include <rdma/ib_verbs.h> #include <rdma/ib_cache.h> #include <rdma/ib_addr.h> #include <rdma/rw.h> #include <rdma/lag.h> #include "core_priv.h" #include <trace/events/rdma_core.h> static int ib_resolve_eth_dmac(struct ib_device *device, struct rdma_ah_attr *ah_attr); static const char * const ib_events[] = { [IB_EVENT_CQ_ERR] = "CQ error", [IB_EVENT_QP_FATAL] = "QP fatal error", [IB_EVENT_QP_REQ_ERR] = "QP request error", [IB_EVENT_QP_ACCESS_ERR] = "QP access error", [IB_EVENT_COMM_EST] = "communication established", [IB_EVENT_SQ_DRAINED] = "send queue drained", [IB_EVENT_PATH_MIG] = "path migration successful", [IB_EVENT_PATH_MIG_ERR] = "path migration error", [IB_EVENT_DEVICE_FATAL] = "device fatal error", [IB_EVENT_PORT_ACTIVE] = "port active", [IB_EVENT_PORT_ERR] = "port error", [IB_EVENT_LID_CHANGE] = "LID change", [IB_EVENT_PKEY_CHANGE] = "P_key change", [IB_EVENT_SM_CHANGE] = "SM change", [IB_EVENT_SRQ_ERR] = "SRQ error", [IB_EVENT_SRQ_LIMIT_REACHED] = "SRQ limit reached", [IB_EVENT_QP_LAST_WQE_REACHED] = "last WQE reached", [IB_EVENT_CLIENT_REREGISTER] = "client reregister", [IB_EVENT_GID_CHANGE] = "GID changed", }; const char *__attribute_const__ ib_event_msg(enum ib_event_type event) { size_t index = event; return (index < ARRAY_SIZE(ib_events) && ib_events[index]) ? ib_events[index] : "unrecognized event"; } EXPORT_SYMBOL(ib_event_msg); static const char * const wc_statuses[] = { [IB_WC_SUCCESS] = "success", [IB_WC_LOC_LEN_ERR] = "local length error", [IB_WC_LOC_QP_OP_ERR] = "local QP operation error", [IB_WC_LOC_EEC_OP_ERR] = "local EE context operation error", [IB_WC_LOC_PROT_ERR] = "local protection error", [IB_WC_WR_FLUSH_ERR] = "WR flushed", [IB_WC_MW_BIND_ERR] = "memory bind operation error", [IB_WC_BAD_RESP_ERR] = "bad response error", [IB_WC_LOC_ACCESS_ERR] = "local access error", [IB_WC_REM_INV_REQ_ERR] = "remote invalid request error", [IB_WC_REM_ACCESS_ERR] = "remote access error", [IB_WC_REM_OP_ERR] = "remote operation error", [IB_WC_RETRY_EXC_ERR] = "transport retry counter exceeded", [IB_WC_RNR_RETRY_EXC_ERR] = "RNR retry counter exceeded", [IB_WC_LOC_RDD_VIOL_ERR] = "local RDD violation error", [IB_WC_REM_INV_RD_REQ_ERR] = "remote invalid RD request", [IB_WC_REM_ABORT_ERR] = "operation aborted", [IB_WC_INV_EECN_ERR] = "invalid EE context number", [IB_WC_INV_EEC_STATE_ERR] = "invalid EE context state", [IB_WC_FATAL_ERR] = "fatal error", [IB_WC_RESP_TIMEOUT_ERR] = "response timeout error", [IB_WC_GENERAL_ERR] = "general error", }; const char *__attribute_const__ ib_wc_status_msg(enum ib_wc_status status) { size_t index = status; return (index < ARRAY_SIZE(wc_statuses) && wc_statuses[index]) ? wc_statuses[index] : "unrecognized status"; } EXPORT_SYMBOL(ib_wc_status_msg); __attribute_const__ int ib_rate_to_mult(enum ib_rate rate) { switch (rate) { case IB_RATE_2_5_GBPS: return 1; case IB_RATE_5_GBPS: return 2; case IB_RATE_10_GBPS: return 4; case IB_RATE_20_GBPS: return 8; case IB_RATE_30_GBPS: return 12; case IB_RATE_40_GBPS: return 16; case IB_RATE_60_GBPS: return 24; case IB_RATE_80_GBPS: return 32; case IB_RATE_120_GBPS: return 48; case IB_RATE_14_GBPS: return 6; case IB_RATE_56_GBPS: return 22; case IB_RATE_112_GBPS: return 45; case IB_RATE_168_GBPS: return 67; case IB_RATE_25_GBPS: return 10; case IB_RATE_100_GBPS: return 40; case IB_RATE_200_GBPS: return 80; case IB_RATE_300_GBPS: return 120; case IB_RATE_28_GBPS: return 11; case IB_RATE_50_GBPS: return 20; case IB_RATE_400_GBPS: return 160; case IB_RATE_600_GBPS: return 240; case IB_RATE_800_GBPS: return 320; default: return -1; } } EXPORT_SYMBOL(ib_rate_to_mult); __attribute_const__ enum ib_rate mult_to_ib_rate(int mult) { switch (mult) { case 1: return IB_RATE_2_5_GBPS; case 2: return IB_RATE_5_GBPS; case 4: return IB_RATE_10_GBPS; case 8: return IB_RATE_20_GBPS; case 12: return IB_RATE_30_GBPS; case 16: return IB_RATE_40_GBPS; case 24: return IB_RATE_60_GBPS; case 32: return IB_RATE_80_GBPS; case 48: return IB_RATE_120_GBPS; case 6: return IB_RATE_14_GBPS; case 22: return IB_RATE_56_GBPS; case 45: return IB_RATE_112_GBPS; case 67: return IB_RATE_168_GBPS; case 10: return IB_RATE_25_GBPS; case 40: return IB_RATE_100_GBPS; case 80: return IB_RATE_200_GBPS; case 120: return IB_RATE_300_GBPS; case 11: return IB_RATE_28_GBPS; case 20: return IB_RATE_50_GBPS; case 160: return IB_RATE_400_GBPS; case 240: return IB_RATE_600_GBPS; case 320: return IB_RATE_800_GBPS; default: return IB_RATE_PORT_CURRENT; } } EXPORT_SYMBOL(mult_to_ib_rate); __attribute_const__ int ib_rate_to_mbps(enum ib_rate rate) { switch (rate) { case IB_RATE_2_5_GBPS: return 2500; case IB_RATE_5_GBPS: return 5000; case IB_RATE_10_GBPS: return 10000; case IB_RATE_20_GBPS: return 20000; case IB_RATE_30_GBPS: return 30000; case IB_RATE_40_GBPS: return 40000; case IB_RATE_60_GBPS: return 60000; case IB_RATE_80_GBPS: return 80000; case IB_RATE_120_GBPS: return 120000; case IB_RATE_14_GBPS: return 14062; case IB_RATE_56_GBPS: return 56250; case IB_RATE_112_GBPS: return 112500; case IB_RATE_168_GBPS: return 168750; case IB_RATE_25_GBPS: return 25781; case IB_RATE_100_GBPS: return 103125; case IB_RATE_200_GBPS: return 206250; case IB_RATE_300_GBPS: return 309375; case IB_RATE_28_GBPS: return 28125; case IB_RATE_50_GBPS: return 53125; case IB_RATE_400_GBPS: return 425000; case IB_RATE_600_GBPS: return 637500; case IB_RATE_800_GBPS: return 850000; default: return -1; } } EXPORT_SYMBOL(ib_rate_to_mbps); __attribute_const__ enum rdma_transport_type rdma_node_get_transport(unsigned int node_type) { if (node_type == RDMA_NODE_USNIC) return RDMA_TRANSPORT_USNIC; if (node_type == RDMA_NODE_USNIC_UDP) return RDMA_TRANSPORT_USNIC_UDP; if (node_type == RDMA_NODE_RNIC) return RDMA_TRANSPORT_IWARP; if (node_type == RDMA_NODE_UNSPECIFIED) return RDMA_TRANSPORT_UNSPECIFIED; return RDMA_TRANSPORT_IB; } EXPORT_SYMBOL(rdma_node_get_transport); enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u32 port_num) { enum rdma_transport_type lt; if (device->ops.get_link_layer) return device->ops.get_link_layer(device, port_num); lt = rdma_node_get_transport(device->node_type); if (lt == RDMA_TRANSPORT_IB) return IB_LINK_LAYER_INFINIBAND; return IB_LINK_LAYER_ETHERNET; } EXPORT_SYMBOL(rdma_port_get_link_layer); /* Protection domains */ /** * __ib_alloc_pd - Allocates an unused protection domain. * @device: The device on which to allocate the protection domain. * @flags: protection domain flags * @caller: caller's build-time module name * * A protection domain object provides an association between QPs, shared * receive queues, address handles, memory regions, and memory windows. * * Every PD has a local_dma_lkey which can be used as the lkey value for local * memory operations. */ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, const char *caller) { struct ib_pd *pd; int mr_access_flags = 0; int ret; pd = rdma_zalloc_drv_obj(device, ib_pd); if (!pd) return ERR_PTR(-ENOMEM); pd->device = device; pd->flags = flags; rdma_restrack_new(&pd->res, RDMA_RESTRACK_PD); rdma_restrack_set_name(&pd->res, caller); ret = device->ops.alloc_pd(pd, NULL); if (ret) { rdma_restrack_put(&pd->res); kfree(pd); return ERR_PTR(ret); } rdma_restrack_add(&pd->res); if (device->attrs.kernel_cap_flags & IBK_LOCAL_DMA_LKEY) pd->local_dma_lkey = device->local_dma_lkey; else mr_access_flags |= IB_ACCESS_LOCAL_WRITE; if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) { pr_warn("%s: enabling unsafe global rkey\n", caller); mr_access_flags |= IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; } if (mr_access_flags) { struct ib_mr *mr; mr = pd->device->ops.get_dma_mr(pd, mr_access_flags); if (IS_ERR(mr)) { ib_dealloc_pd(pd); return ERR_CAST(mr); } mr->device = pd->device; mr->pd = pd; mr->type = IB_MR_TYPE_DMA; mr->uobject = NULL; mr->need_inval = false; pd->__internal_mr = mr; if (!(device->attrs.kernel_cap_flags & IBK_LOCAL_DMA_LKEY)) pd->local_dma_lkey = pd->__internal_mr->lkey; if (flags & IB_PD_UNSAFE_GLOBAL_RKEY) pd->unsafe_global_rkey = pd->__internal_mr->rkey; } return pd; } EXPORT_SYMBOL(__ib_alloc_pd); /** * ib_dealloc_pd_user - Deallocates a protection domain. * @pd: The protection domain to deallocate. * @udata: Valid user data or NULL for kernel object * * It is an error to call this function while any resources in the pd still * exist. The caller is responsible to synchronously destroy them and * guarantee no new allocations will happen. */ int ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata) { int ret; if (pd->__internal_mr) { ret = pd->device->ops.dereg_mr(pd->__internal_mr, NULL); WARN_ON(ret); pd->__internal_mr = NULL; } ret = pd->device->ops.dealloc_pd(pd, udata); if (ret) return ret; rdma_restrack_del(&pd->res); kfree(pd); return ret; } EXPORT_SYMBOL(ib_dealloc_pd_user); /* Address handles */ /** * rdma_copy_ah_attr - Copy rdma ah attribute from source to destination. * @dest: Pointer to destination ah_attr. Contents of the destination * pointer is assumed to be invalid and attribute are overwritten. * @src: Pointer to source ah_attr. */ void rdma_copy_ah_attr(struct rdma_ah_attr *dest, const struct rdma_ah_attr *src) { *dest = *src; if (dest->grh.sgid_attr) rdma_hold_gid_attr(dest->grh.sgid_attr); } EXPORT_SYMBOL(rdma_copy_ah_attr); /** * rdma_replace_ah_attr - Replace valid ah_attr with new one. * @old: Pointer to existing ah_attr which needs to be replaced. * old is assumed to be valid or zero'd * @new: Pointer to the new ah_attr. * * rdma_replace_ah_attr() first releases any reference in the old ah_attr if * old the ah_attr is valid; after that it copies the new attribute and holds * the reference to the replaced ah_attr. */ void rdma_replace_ah_attr(struct rdma_ah_attr *old, const struct rdma_ah_attr *new) { rdma_destroy_ah_attr(old); *old = *new; if (old->grh.sgid_attr) rdma_hold_gid_attr(old->grh.sgid_attr); } EXPORT_SYMBOL(rdma_replace_ah_attr); /** * rdma_move_ah_attr - Move ah_attr pointed by source to destination. * @dest: Pointer to destination ah_attr to copy to. * dest is assumed to be valid or zero'd * @src: Pointer to the new ah_attr. * * rdma_move_ah_attr() first releases any reference in the destination ah_attr * if it is valid. This also transfers ownership of internal references from * src to dest, making src invalid in the process. No new reference of the src * ah_attr is taken. */ void rdma_move_ah_attr(struct rdma_ah_attr *dest, struct rdma_ah_attr *src) { rdma_destroy_ah_attr(dest); *dest = *src; src->grh.sgid_attr = NULL; } EXPORT_SYMBOL(rdma_move_ah_attr); /* * Validate that the rdma_ah_attr is valid for the device before passing it * off to the driver. */ static int rdma_check_ah_attr(struct ib_device *device, struct rdma_ah_attr *ah_attr) { if (!rdma_is_port_valid(device, ah_attr->port_num)) return -EINVAL; if ((rdma_is_grh_required(device, ah_attr->port_num) || ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) && !(ah_attr->ah_flags & IB_AH_GRH)) return -EINVAL; if (ah_attr->grh.sgid_attr) { /* * Make sure the passed sgid_attr is consistent with the * parameters */ if (ah_attr->grh.sgid_attr->index != ah_attr->grh.sgid_index || ah_attr->grh.sgid_attr->port_num != ah_attr->port_num) return -EINVAL; } return 0; } /* * If the ah requires a GRH then ensure that sgid_attr pointer is filled in. * On success the caller is responsible to call rdma_unfill_sgid_attr(). */ static int rdma_fill_sgid_attr(struct ib_device *device, struct rdma_ah_attr *ah_attr, const struct ib_gid_attr **old_sgid_attr) { const struct ib_gid_attr *sgid_attr; struct ib_global_route *grh; int ret; *old_sgid_attr = ah_attr->grh.sgid_attr; ret = rdma_check_ah_attr(device, ah_attr); if (ret) return ret; if (!(ah_attr->ah_flags & IB_AH_GRH)) return 0; grh = rdma_ah_retrieve_grh(ah_attr); if (grh->sgid_attr) return 0; sgid_attr = rdma_get_gid_attr(device, ah_attr->port_num, grh->sgid_index); if (IS_ERR(sgid_attr)) return PTR_ERR(sgid_attr); /* Move ownerhip of the kref into the ah_attr */ grh->sgid_attr = sgid_attr; return 0; } static void rdma_unfill_sgid_attr(struct rdma_ah_attr *ah_attr, const struct ib_gid_attr *old_sgid_attr) { /* * Fill didn't change anything, the caller retains ownership of * whatever it passed */ if (ah_attr->grh.sgid_attr == old_sgid_attr) return; /* * Otherwise, we need to undo what rdma_fill_sgid_attr so the caller * doesn't see any change in the rdma_ah_attr. If we get here * old_sgid_attr is NULL. */ rdma_destroy_ah_attr(ah_attr); } static const struct ib_gid_attr * rdma_update_sgid_attr(struct rdma_ah_attr *ah_attr, const struct ib_gid_attr *old_attr) { if (old_attr) rdma_put_gid_attr(old_attr); if (ah_attr->ah_flags & IB_AH_GRH) { rdma_hold_gid_attr(ah_attr->grh.sgid_attr); return ah_attr->grh.sgid_attr; } return NULL; } static struct ib_ah *_rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags, struct ib_udata *udata, struct net_device *xmit_slave) { struct rdma_ah_init_attr init_attr = {}; struct ib_device *device = pd->device; struct ib_ah *ah; int ret; might_sleep_if(flags & RDMA_CREATE_AH_SLEEPABLE); if (!udata && !device->ops.create_ah) return ERR_PTR(-EOPNOTSUPP); ah = rdma_zalloc_drv_obj_gfp( device, ib_ah, (flags & RDMA_CREATE_AH_SLEEPABLE) ? GFP_KERNEL : GFP_ATOMIC); if (!ah) return ERR_PTR(-ENOMEM); ah->device = device; ah->pd = pd; ah->type = ah_attr->type; ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL); init_attr.ah_attr = ah_attr; init_attr.flags = flags; init_attr.xmit_slave = xmit_slave; if (udata) ret = device->ops.create_user_ah(ah, &init_attr, udata); else ret = device->ops.create_ah(ah, &init_attr, NULL); if (ret) { if (ah->sgid_attr) rdma_put_gid_attr(ah->sgid_attr); kfree(ah); return ERR_PTR(ret); } atomic_inc(&pd->usecnt); return ah; } /** * rdma_create_ah - Creates an address handle for the * given address vector. * @pd: The protection domain associated with the address handle. * @ah_attr: The attributes of the address vector. * @flags: Create address handle flags (see enum rdma_create_ah_flags). * * It returns 0 on success and returns appropriate error code on error. * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, u32 flags) { const struct ib_gid_attr *old_sgid_attr; struct net_device *slave; struct ib_ah *ah; int ret; ret = rdma_fill_sgid_attr(pd->device, ah_attr, &old_sgid_attr); if (ret) return ERR_PTR(ret); slave = rdma_lag_get_ah_roce_slave(pd->device, ah_attr, (flags & RDMA_CREATE_AH_SLEEPABLE) ? GFP_KERNEL : GFP_ATOMIC); if (IS_ERR(slave)) { rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); return (void *)slave; } ah = _rdma_create_ah(pd, ah_attr, flags, NULL, slave); rdma_lag_put_ah_roce_slave(slave); rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); return ah; } EXPORT_SYMBOL(rdma_create_ah); /** * rdma_create_user_ah - Creates an address handle for the * given address vector. * It resolves destination mac address for ah attribute of RoCE type. * @pd: The protection domain associated with the address handle. * @ah_attr: The attributes of the address vector. * @udata: pointer to user's input output buffer information need by * provider driver. * * It returns 0 on success and returns appropriate error code on error. * The address handle is used to reference a local or global destination * in all UD QP post sends. */ struct ib_ah *rdma_create_user_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr, struct ib_udata *udata) { const struct ib_gid_attr *old_sgid_attr; struct ib_ah *ah; int err; err = rdma_fill_sgid_attr(pd->device, ah_attr, &old_sgid_attr); if (err) return ERR_PTR(err); if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) { err = ib_resolve_eth_dmac(pd->device, ah_attr); if (err) { ah = ERR_PTR(err); goto out; } } ah = _rdma_create_ah(pd, ah_attr, RDMA_CREATE_AH_SLEEPABLE, udata, NULL); out: rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); return ah; } EXPORT_SYMBOL(rdma_create_user_ah); int ib_get_rdma_header_version(const union rdma_network_hdr *hdr) { const struct iphdr *ip4h = (struct iphdr *)&hdr->roce4grh; struct iphdr ip4h_checked; const struct ipv6hdr *ip6h = (struct ipv6hdr *)&hdr->ibgrh; /* If it's IPv6, the version must be 6, otherwise, the first * 20 bytes (before the IPv4 header) are garbled. */ if (ip6h->version != 6) return (ip4h->version == 4) ? 4 : 0; /* version may be 6 or 4 because the first 20 bytes could be garbled */ /* RoCE v2 requires no options, thus header length * must be 5 words */ if (ip4h->ihl != 5) return 6; /* Verify checksum. * We can't write on scattered buffers so we need to copy to * temp buffer. */ memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked)); ip4h_checked.check = 0; ip4h_checked.check = ip_fast_csum((u8 *)&ip4h_checked, 5); /* if IPv4 header checksum is OK, believe it */ if (ip4h->check == ip4h_checked.check) return 4; return 6; } EXPORT_SYMBOL(ib_get_rdma_header_version); static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device, u32 port_num, const struct ib_grh *grh) { int grh_version; if (rdma_protocol_ib(device, port_num)) return RDMA_NETWORK_IB; grh_version = ib_get_rdma_header_version((union rdma_network_hdr *)grh); if (grh_version == 4) return RDMA_NETWORK_IPV4; if (grh->next_hdr == IPPROTO_UDP) return RDMA_NETWORK_IPV6; return RDMA_NETWORK_ROCE_V1; } struct find_gid_index_context { u16 vlan_id; enum ib_gid_type gid_type; }; static bool find_gid_index(const union ib_gid *gid, const struct ib_gid_attr *gid_attr, void *context) { struct find_gid_index_context *ctx = context; u16 vlan_id = 0xffff; int ret; if (ctx->gid_type != gid_attr->gid_type) return false; ret = rdma_read_gid_l2_fields(gid_attr, &vlan_id, NULL); if (ret) return false; return ctx->vlan_id == vlan_id; } static const struct ib_gid_attr * get_sgid_attr_from_eth(struct ib_device *device, u32 port_num, u16 vlan_id, const union ib_gid *sgid, enum ib_gid_type gid_type) { struct find_gid_index_context context = {.vlan_id = vlan_id, .gid_type = gid_type}; return rdma_find_gid_by_filter(device, sgid, port_num, find_gid_index, &context); } int ib_get_gids_from_rdma_hdr(const union rdma_network_hdr *hdr, enum rdma_network_type net_type, union ib_gid *sgid, union ib_gid *dgid) { struct sockaddr_in src_in; struct sockaddr_in dst_in; __be32 src_saddr, dst_saddr; if (!sgid || !dgid) return -EINVAL; if (net_type == RDMA_NETWORK_IPV4) { memcpy(&src_in.sin_addr.s_addr, &hdr->roce4grh.saddr, 4); memcpy(&dst_in.sin_addr.s_addr, &hdr->roce4grh.daddr, 4); src_saddr = src_in.sin_addr.s_addr; dst_saddr = dst_in.sin_addr.s_addr; ipv6_addr_set_v4mapped(src_saddr, (struct in6_addr *)sgid); ipv6_addr_set_v4mapped(dst_saddr, (struct in6_addr *)dgid); return 0; } else if (net_type == RDMA_NETWORK_IPV6 || net_type == RDMA_NETWORK_IB || RDMA_NETWORK_ROCE_V1) { *dgid = hdr->ibgrh.dgid; *sgid = hdr->ibgrh.sgid; return 0; } else { return -EINVAL; } } EXPORT_SYMBOL(ib_get_gids_from_rdma_hdr); /* Resolve destination mac address and hop limit for unicast destination * GID entry, considering the source GID entry as well. * ah_attribute must have valid port_num, sgid_index. */ static int ib_resolve_unicast_gid_dmac(struct ib_device *device, struct rdma_ah_attr *ah_attr) { struct ib_global_route *grh = rdma_ah_retrieve_grh(ah_attr); const struct ib_gid_attr *sgid_attr = grh->sgid_attr; int hop_limit = 0xff; int ret = 0; /* If destination is link local and source GID is RoCEv1, * IP stack is not used. */ if (rdma_link_local_addr((struct in6_addr *)grh->dgid.raw) && sgid_attr->gid_type == IB_GID_TYPE_ROCE) { rdma_get_ll_mac((struct in6_addr *)grh->dgid.raw, ah_attr->roce.dmac); return ret; } ret = rdma_addr_find_l2_eth_by_grh(&sgid_attr->gid, &grh->dgid, ah_attr->roce.dmac, sgid_attr, &hop_limit); grh->hop_limit = hop_limit; return ret; } /* * This function initializes address handle attributes from the incoming packet. * Incoming packet has dgid of the receiver node on which this code is * getting executed and, sgid contains the GID of the sender. * * When resolving mac address of destination, the arrived dgid is used * as sgid and, sgid is used as dgid because sgid contains destinations * GID whom to respond to. * * On success the caller is responsible to call rdma_destroy_ah_attr on the * attr. */ int ib_init_ah_attr_from_wc(struct ib_device *device, u32 port_num, const struct ib_wc *wc, const struct ib_grh *grh, struct rdma_ah_attr *ah_attr) { u32 flow_class; int ret; enum rdma_network_type net_type = RDMA_NETWORK_IB; enum ib_gid_type gid_type = IB_GID_TYPE_IB; const struct ib_gid_attr *sgid_attr; int hoplimit = 0xff; union ib_gid dgid; union ib_gid sgid; might_sleep(); memset(ah_attr, 0, sizeof *ah_attr); ah_attr->type = rdma_ah_find_type(device, port_num); if (rdma_cap_eth_ah(device, port_num)) { if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE) net_type = wc->network_hdr_type; else net_type = ib_get_net_type_by_grh(device, port_num, grh); gid_type = ib_network_to_gid_type(net_type); } ret = ib_get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type, &sgid, &dgid); if (ret) return ret; rdma_ah_set_sl(ah_attr, wc->sl); rdma_ah_set_port_num(ah_attr, port_num); if (rdma_protocol_roce(device, port_num)) { u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ? wc->vlan_id : 0xffff; if (!(wc->wc_flags & IB_WC_GRH)) return -EPROTOTYPE; sgid_attr = get_sgid_attr_from_eth(device, port_num, vlan_id, &dgid, gid_type); if (IS_ERR(sgid_attr)) return PTR_ERR(sgid_attr); flow_class = be32_to_cpu(grh->version_tclass_flow); rdma_move_grh_sgid_attr(ah_attr, &sgid, flow_class & 0xFFFFF, hoplimit, (flow_class >> 20) & 0xFF, sgid_attr); ret = ib_resolve_unicast_gid_dmac(device, ah_attr); if (ret) rdma_destroy_ah_attr(ah_attr); return ret; } else { rdma_ah_set_dlid(ah_attr, wc->slid); rdma_ah_set_path_bits(ah_attr, wc->dlid_path_bits); if ((wc->wc_flags & IB_WC_GRH) == 0) return 0; if (dgid.global.interface_id != cpu_to_be64(IB_SA_WELL_KNOWN_GUID)) { sgid_attr = rdma_find_gid_by_port( device, &dgid, IB_GID_TYPE_IB, port_num, NULL); } else sgid_attr = rdma_get_gid_attr(device, port_num, 0); if (IS_ERR(sgid_attr)) return PTR_ERR(sgid_attr); flow_class = be32_to_cpu(grh->version_tclass_flow); rdma_move_grh_sgid_attr(ah_attr, &sgid, flow_class & 0xFFFFF, hoplimit, (flow_class >> 20) & 0xFF, sgid_attr); return 0; } } EXPORT_SYMBOL(ib_init_ah_attr_from_wc); /** * rdma_move_grh_sgid_attr - Sets the sgid attribute of GRH, taking ownership * of the reference * * @attr: Pointer to AH attribute structure * @dgid: Destination GID * @flow_label: Flow label * @hop_limit: Hop limit * @traffic_class: traffic class * @sgid_attr: Pointer to SGID attribute * * This takes ownership of the sgid_attr reference. The caller must ensure * rdma_destroy_ah_attr() is called before destroying the rdma_ah_attr after * calling this function. */ void rdma_move_grh_sgid_attr(struct rdma_ah_attr *attr, union ib_gid *dgid, u32 flow_label, u8 hop_limit, u8 traffic_class, const struct ib_gid_attr *sgid_attr) { rdma_ah_set_grh(attr, dgid, flow_label, sgid_attr->index, hop_limit, traffic_class); attr->grh.sgid_attr = sgid_attr; } EXPORT_SYMBOL(rdma_move_grh_sgid_attr); /** * rdma_destroy_ah_attr - Release reference to SGID attribute of * ah attribute. * @ah_attr: Pointer to ah attribute * * Release reference to the SGID attribute of the ah attribute if it is * non NULL. It is safe to call this multiple times, and safe to call it on * a zero initialized ah_attr. */ void rdma_destroy_ah_attr(struct rdma_ah_attr *ah_attr) { if (ah_attr->grh.sgid_attr) { rdma_put_gid_attr(ah_attr->grh.sgid_attr); ah_attr->grh.sgid_attr = NULL; } } EXPORT_SYMBOL(rdma_destroy_ah_attr); struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, const struct ib_grh *grh, u32 port_num) { struct rdma_ah_attr ah_attr; struct ib_ah *ah; int ret; ret = ib_init_ah_attr_from_wc(pd->device, port_num, wc, grh, &ah_attr); if (ret) return ERR_PTR(ret); ah = rdma_create_ah(pd, &ah_attr, RDMA_CREATE_AH_SLEEPABLE); rdma_destroy_ah_attr(&ah_attr); return ah; } EXPORT_SYMBOL(ib_create_ah_from_wc); int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr) { const struct ib_gid_attr *old_sgid_attr; int ret; if (ah->type != ah_attr->type) return -EINVAL; ret = rdma_fill_sgid_attr(ah->device, ah_attr, &old_sgid_attr); if (ret) return ret; ret = ah->device->ops.modify_ah ? ah->device->ops.modify_ah(ah, ah_attr) : -EOPNOTSUPP; ah->sgid_attr = rdma_update_sgid_attr(ah_attr, ah->sgid_attr); rdma_unfill_sgid_attr(ah_attr, old_sgid_attr); return ret; } EXPORT_SYMBOL(rdma_modify_ah); int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr) { ah_attr->grh.sgid_attr = NULL; return ah->device->ops.query_ah ? ah->device->ops.query_ah(ah, ah_attr) : -EOPNOTSUPP; } EXPORT_SYMBOL(rdma_query_ah); int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata) { const struct ib_gid_attr *sgid_attr = ah->sgid_attr; struct ib_pd *pd; int ret; might_sleep_if(flags & RDMA_DESTROY_AH_SLEEPABLE); pd = ah->pd; ret = ah->device->ops.destroy_ah(ah, flags); if (ret) return ret; atomic_dec(&pd->usecnt); if (sgid_attr) rdma_put_gid_attr(sgid_attr); kfree(ah); return ret; } EXPORT_SYMBOL(rdma_destroy_ah_user); /* Shared receive queues */ /** * ib_create_srq_user - Creates a SRQ associated with the specified protection * domain. * @pd: The protection domain associated with the SRQ. * @srq_init_attr: A list of initial attributes required to create the * SRQ. If SRQ creation succeeds, then the attributes are updated to * the actual capabilities of the created SRQ. * @uobject: uobject pointer if this is not a kernel SRQ * @udata: udata pointer if this is not a kernel SRQ * * srq_attr->max_wr and srq_attr->max_sge are read the determine the * requested size of the SRQ, and set to the actual values allocated * on return. If ib_create_srq() succeeds, then max_wr and max_sge * will always be at least as large as the requested values. */ struct ib_srq *ib_create_srq_user(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr, struct ib_usrq_object *uobject, struct ib_udata *udata) { struct ib_srq *srq; int ret; srq = rdma_zalloc_drv_obj(pd->device, ib_srq); if (!srq) return ERR_PTR(-ENOMEM); srq->device = pd->device; srq->pd = pd; srq->event_handler = srq_init_attr->event_handler; srq->srq_context = srq_init_attr->srq_context; srq->srq_type = srq_init_attr->srq_type; srq->uobject = uobject; if (ib_srq_has_cq(srq->srq_type)) { srq->ext.cq = srq_init_attr->ext.cq; atomic_inc(&srq->ext.cq->usecnt); } if (srq->srq_type == IB_SRQT_XRC) { srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd; if (srq->ext.xrc.xrcd) atomic_inc(&srq->ext.xrc.xrcd->usecnt); } atomic_inc(&pd->usecnt); rdma_restrack_new(&srq->res, RDMA_RESTRACK_SRQ); rdma_restrack_parent_name(&srq->res, &pd->res); ret = pd->device->ops.create_srq(srq, srq_init_attr, udata); if (ret) { rdma_restrack_put(&srq->res); atomic_dec(&pd->usecnt); if (srq->srq_type == IB_SRQT_XRC && srq->ext.xrc.xrcd) atomic_dec(&srq->ext.xrc.xrcd->usecnt); if (ib_srq_has_cq(srq->srq_type)) atomic_dec(&srq->ext.cq->usecnt); kfree(srq); return ERR_PTR(ret); } rdma_restrack_add(&srq->res); return srq; } EXPORT_SYMBOL(ib_create_srq_user); int ib_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr, enum ib_srq_attr_mask srq_attr_mask) { return srq->device->ops.modify_srq ? srq->device->ops.modify_srq(srq, srq_attr, srq_attr_mask, NULL) : -EOPNOTSUPP; } EXPORT_SYMBOL(ib_modify_srq); int ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr) { return srq->device->ops.query_srq ? srq->device->ops.query_srq(srq, srq_attr) : -EOPNOTSUPP; } EXPORT_SYMBOL(ib_query_srq); int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata) { int ret; if (atomic_read(&srq->usecnt)) return -EBUSY; ret = srq->device->ops.destroy_srq(srq, udata); if (ret) return ret; atomic_dec(&srq->pd->usecnt); if (srq->srq_type == IB_SRQT_XRC && srq->ext.xrc.xrcd) atomic_dec(&srq->ext.xrc.xrcd->usecnt); if (ib_srq_has_cq(srq->srq_type)) atomic_dec(&srq->ext.cq->usecnt); rdma_restrack_del(&srq->res); kfree(srq); return ret; } EXPORT_SYMBOL(ib_destroy_srq_user); /* Queue pairs */ static void __ib_qp_event_handler(struct ib_event *event, void *context) { struct ib_qp *qp = event->element.qp; if (event->event == IB_EVENT_QP_LAST_WQE_REACHED) complete(&qp->srq_completion); if (qp->registered_event_handler) qp->registered_event_handler(event, qp->qp_context); } static void __ib_shared_qp_event_handler(struct ib_event *event, void *context) { struct ib_qp *qp = context; unsigned long flags; spin_lock_irqsave(&qp->device->qp_open_list_lock, flags); list_for_each_entry(event->element.qp, &qp->open_list, open_list) if (event->element.qp->event_handler) event->element.qp->event_handler(event, event->element.qp->qp_context); spin_unlock_irqrestore(&qp->device->qp_open_list_lock, flags); } static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp, void (*event_handler)(struct ib_event *, void *), void *qp_context) { struct ib_qp *qp; unsigned long flags; int err; qp = kzalloc(sizeof *qp, GFP_KERNEL); if (!qp) return ERR_PTR(-ENOMEM); qp->real_qp = real_qp; err = ib_open_shared_qp_security(qp, real_qp->device); if (err) { kfree(qp); return ERR_PTR(err); } qp->real_qp = real_qp; atomic_inc(&real_qp->usecnt); qp->device = real_qp->device; qp->event_handler = event_handler; qp->qp_context = qp_context; qp->qp_num = real_qp->qp_num; qp->qp_type = real_qp->qp_type; spin_lock_irqsave(&real_qp->device->qp_open_list_lock, flags); list_add(&qp->open_list, &real_qp->open_list); spin_unlock_irqrestore(&real_qp->device->qp_open_list_lock, flags); return qp; } struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd, struct ib_qp_open_attr *qp_open_attr) { struct ib_qp *qp, *real_qp; if (qp_open_attr->qp_type != IB_QPT_XRC_TGT) return ERR_PTR(-EINVAL); down_read(&xrcd->tgt_qps_rwsem); real_qp = xa_load(&xrcd->tgt_qps, qp_open_attr->qp_num); if (!real_qp) { up_read(&xrcd->tgt_qps_rwsem); return ERR_PTR(-EINVAL); } qp = __ib_open_qp(real_qp, qp_open_attr->event_handler, qp_open_attr->qp_context); up_read(&xrcd->tgt_qps_rwsem); return qp; } EXPORT_SYMBOL(ib_open_qp); static struct ib_qp *create_xrc_qp_user(struct ib_qp *qp, struct ib_qp_init_attr *qp_init_attr) { struct ib_qp *real_qp = qp; int err; qp->event_handler = __ib_shared_qp_event_handler; qp->qp_context = qp; qp->pd = NULL; qp->send_cq = qp->recv_cq = NULL; qp->srq = NULL; qp->xrcd = qp_init_attr->xrcd; atomic_inc(&qp_init_attr->xrcd->usecnt); INIT_LIST_HEAD(&qp->open_list); qp = __ib_open_qp(real_qp, qp_init_attr->event_handler, qp_init_attr->qp_context); if (IS_ERR(qp)) return qp; err = xa_err(xa_store(&qp_init_attr->xrcd->tgt_qps, real_qp->qp_num, real_qp, GFP_KERNEL)); if (err) { ib_close_qp(qp); return ERR_PTR(err); } return qp; } static struct ib_qp *create_qp(struct ib_device *dev, struct ib_pd *pd, struct ib_qp_init_attr *attr, struct ib_udata *udata, struct ib_uqp_object *uobj, const char *caller) { struct ib_udata dummy = {}; struct ib_qp *qp; int ret; if (!dev->ops.create_qp) return ERR_PTR(-EOPNOTSUPP); qp = rdma_zalloc_drv_obj_numa(dev, ib_qp); if (!qp) return ERR_PTR(-ENOMEM); qp->device = dev; qp->pd = pd; qp->uobject = uobj; qp->real_qp = qp; qp->qp_type = attr->qp_type; qp->rwq_ind_tbl = attr->rwq_ind_tbl; qp->srq = attr->srq; qp->event_handler = __ib_qp_event_handler; qp->registered_event_handler = attr->event_handler; qp->port = attr->port_num; qp->qp_context = attr->qp_context; spin_lock_init(&qp->mr_lock); INIT_LIST_HEAD(&qp->rdma_mrs); INIT_LIST_HEAD(&qp->sig_mrs); init_completion(&qp->srq_completion); qp->send_cq = attr->send_cq; qp->recv_cq = attr->recv_cq; rdma_restrack_new(&qp->res, RDMA_RESTRACK_QP); WARN_ONCE(!udata && !caller, "Missing kernel QP owner"); rdma_restrack_set_name(&qp->res, udata ? NULL : caller); ret = dev->ops.create_qp(qp, attr, udata); if (ret) goto err_create; /* * TODO: The mlx4 internally overwrites send_cq and recv_cq. * Unfortunately, it is not an easy task to fix that driver. */ qp->send_cq = attr->send_cq; qp->recv_cq = attr->recv_cq; ret = ib_create_qp_security(qp, dev); if (ret) goto err_security; rdma_restrack_add(&qp->res); return qp; err_security: qp->device->ops.destroy_qp(qp, udata ? &dummy : NULL); err_create: rdma_restrack_put(&qp->res); kfree(qp); return ERR_PTR(ret); } /** * ib_create_qp_user - Creates a QP associated with the specified protection * domain. * @dev: IB device * @pd: The protection domain associated with the QP. * @attr: A list of initial attributes required to create the * QP. If QP creation succeeds, then the attributes are updated to * the actual capabilities of the created QP. * @udata: User data * @uobj: uverbs obect * @caller: caller's build-time module name */ struct ib_qp *ib_create_qp_user(struct ib_device *dev, struct ib_pd *pd, struct ib_qp_init_attr *attr, struct ib_udata *udata, struct ib_uqp_object *uobj, const char *caller) { struct ib_qp *qp, *xrc_qp; if (attr->qp_type == IB_QPT_XRC_TGT) qp = create_qp(dev, pd, attr, NULL, NULL, caller); else qp = create_qp(dev, pd, attr, udata, uobj, NULL); if (attr->qp_type != IB_QPT_XRC_TGT || IS_ERR(qp)) return qp; xrc_qp = create_xrc_qp_user(qp, attr); if (IS_ERR(xrc_qp)) { ib_destroy_qp(qp); return xrc_qp; } xrc_qp->uobject = uobj; return xrc_qp; } EXPORT_SYMBOL(ib_create_qp_user); void ib_qp_usecnt_inc(struct ib_qp *qp) { if (qp->pd) atomic_inc(&qp->pd->usecnt); if (qp->send_cq) atomic_inc(&qp->send_cq->usecnt); if (qp->recv_cq) atomic_inc(&qp->recv_cq->usecnt); if (qp->srq) atomic_inc(&qp->srq->usecnt); if (qp->rwq_ind_tbl) atomic_inc(&qp->rwq_ind_tbl->usecnt); } EXPORT_SYMBOL(ib_qp_usecnt_inc); void ib_qp_usecnt_dec(struct ib_qp *qp) { if (qp->rwq_ind_tbl) atomic_dec(&qp->rwq_ind_tbl->usecnt); if (qp->srq) atomic_dec(&qp->srq->usecnt); if (qp->recv_cq) atomic_dec(&qp->recv_cq->usecnt); if (qp->send_cq) atomic_dec(&qp->send_cq->usecnt); if (qp->pd) atomic_dec(&qp->pd->usecnt); } EXPORT_SYMBOL(ib_qp_usecnt_dec); struct ib_qp *ib_create_qp_kernel(struct ib_pd *pd, struct ib_qp_init_attr *qp_init_attr, const char *caller) { struct ib_device *device = pd->device; struct ib_qp *qp; int ret; /* * If the callers is using the RDMA API calculate the resources * needed for the RDMA READ/WRITE operations. * * Note that these callers need to pass in a port number. */ if (qp_init_attr->cap.max_rdma_ctxs) rdma_rw_init_qp(device, qp_init_attr); qp = create_qp(device, pd, qp_init_attr, NULL, NULL, caller); if (IS_ERR(qp)) return qp; ib_qp_usecnt_inc(qp); if (qp_init_attr->cap.max_rdma_ctxs) { ret = rdma_rw_init_mrs(qp, qp_init_attr); if (ret) goto err; } /* * Note: all hw drivers guarantee that max_send_sge is lower than * the device RDMA WRITE SGE limit but not all hw drivers ensure that * max_send_sge <= max_sge_rd. */ qp->max_write_sge = qp_init_attr->cap.max_send_sge; qp->max_read_sge = min_t(u32, qp_init_attr->cap.max_send_sge, device->attrs.max_sge_rd); if (qp_init_attr->create_flags & IB_QP_CREATE_INTEGRITY_EN) qp->integrity_en = true; return qp; err: ib_destroy_qp(qp); return ERR_PTR(ret); } EXPORT_SYMBOL(ib_create_qp_kernel); static const struct { int valid; enum ib_qp_attr_mask req_param[IB_QPT_MAX]; enum ib_qp_attr_mask opt_param[IB_QPT_MAX]; } qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { [IB_QPS_RESET] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_INIT] = { .valid = 1, .req_param = { [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY), [IB_QPT_RAW_PACKET] = IB_QP_PORT, [IB_QPT_UC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), } }, }, [IB_QPS_INIT] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_ERR] = { .valid = 1 }, [IB_QPS_INIT] = { .valid = 1, .opt_param = { [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_RC] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_ACCESS_FLAGS), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), } }, [IB_QPS_RTR] = { .valid = 1, .req_param = { [IB_QPT_UC] = (IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN), [IB_QPT_RC] = (IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER), [IB_QPT_XRC_INI] = (IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN), [IB_QPT_XRC_TGT] = (IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN | IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER), }, .opt_param = { [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), [IB_QPT_RC] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), }, }, }, [IB_QPS_RTR] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_ERR] = { .valid = 1 }, [IB_QPS_RTS] = { .valid = 1, .req_param = { [IB_QPT_UD] = IB_QP_SQ_PSN, [IB_QPT_UC] = IB_QP_SQ_PSN, [IB_QPT_RC] = (IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_SQ_PSN | IB_QP_MAX_QP_RD_ATOMIC), [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_SQ_PSN | IB_QP_MAX_QP_RD_ATOMIC), [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT | IB_QP_SQ_PSN), [IB_QPT_SMI] = IB_QP_SQ_PSN, [IB_QPT_GSI] = IB_QP_SQ_PSN, }, .opt_param = { [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PATH_MIG_STATE), [IB_QPT_RC] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_RAW_PACKET] = IB_QP_RATE_LIMIT, } } }, [IB_QPS_RTS] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_ERR] = { .valid = 1 }, [IB_QPS_RTS] = { .valid = 1, .opt_param = { [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_CUR_STATE | IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE), [IB_QPT_RC] = (IB_QP_CUR_STATE | IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE | IB_QP_MIN_RNR_TIMER), [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | IB_QP_ACCESS_FLAGS | IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE | IB_QP_MIN_RNR_TIMER), [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_RAW_PACKET] = IB_QP_RATE_LIMIT, } }, [IB_QPS_SQD] = { .valid = 1, .opt_param = { [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */ [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY, [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY } }, }, [IB_QPS_SQD] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_ERR] = { .valid = 1 }, [IB_QPS_RTS] = { .valid = 1, .opt_param = { [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PATH_MIG_STATE), [IB_QPT_RC] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY), } }, [IB_QPS_SQD] = { .valid = 1, .opt_param = { [IB_QPT_UD] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_AV | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PATH_MIG_STATE), [IB_QPT_RC] = (IB_QP_PORT | IB_QP_AV | IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_INI] = (IB_QP_PORT | IB_QP_AV | IB_QP_TIMEOUT | IB_QP_RETRY_CNT | IB_QP_RNR_RETRY | IB_QP_MAX_QP_RD_ATOMIC | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PATH_MIG_STATE), [IB_QPT_XRC_TGT] = (IB_QP_PORT | IB_QP_AV | IB_QP_TIMEOUT | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_ALT_PATH | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_MIN_RNR_TIMER | IB_QP_PATH_MIG_STATE), [IB_QPT_SMI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_PKEY_INDEX | IB_QP_QKEY), } } }, [IB_QPS_SQE] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_ERR] = { .valid = 1 }, [IB_QPS_RTS] = { .valid = 1, .opt_param = { [IB_QPT_UD] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_UC] = (IB_QP_CUR_STATE | IB_QP_ACCESS_FLAGS), [IB_QPT_SMI] = (IB_QP_CUR_STATE | IB_QP_QKEY), [IB_QPT_GSI] = (IB_QP_CUR_STATE | IB_QP_QKEY), } } }, [IB_QPS_ERR] = { [IB_QPS_RESET] = { .valid = 1 }, [IB_QPS_ERR] = { .valid = 1 } } }; bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, enum ib_qp_type type, enum ib_qp_attr_mask mask) { enum ib_qp_attr_mask req_param, opt_param; if (mask & IB_QP_CUR_STATE && cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS && cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE) return false; if (!qp_state_table[cur_state][next_state].valid) return false; req_param = qp_state_table[cur_state][next_state].req_param[type]; opt_param = qp_state_table[cur_state][next_state].opt_param[type]; if ((mask & req_param) != req_param) return false; if (mask & ~(req_param | opt_param | IB_QP_STATE)) return false; return true; } EXPORT_SYMBOL(ib_modify_qp_is_ok); /** * ib_resolve_eth_dmac - Resolve destination mac address * @device: Device to consider * @ah_attr: address handle attribute which describes the * source and destination parameters * ib_resolve_eth_dmac() resolves destination mac address and L3 hop limit It * returns 0 on success or appropriate error code. It initializes the * necessary ah_attr fields when call is successful. */ static int ib_resolve_eth_dmac(struct ib_device *device, struct rdma_ah_attr *ah_attr) { int ret = 0; if (rdma_is_multicast_addr((struct in6_addr *)ah_attr->grh.dgid.raw)) { if (ipv6_addr_v4mapped((struct in6_addr *)ah_attr->grh.dgid.raw)) { __be32 addr = 0; memcpy(&addr, ah_attr->grh.dgid.raw + 12, 4); ip_eth_mc_map(addr, (char *)ah_attr->roce.dmac); } else { ipv6_eth_mc_map((struct in6_addr *)ah_attr->grh.dgid.raw, (char *)ah_attr->roce.dmac); } } else { ret = ib_resolve_unicast_gid_dmac(device, ah_attr); } return ret; } static bool is_qp_type_connected(const struct ib_qp *qp) { return (qp->qp_type == IB_QPT_UC || qp->qp_type == IB_QPT_RC || qp->qp_type == IB_QPT_XRC_INI || qp->qp_type == IB_QPT_XRC_TGT); } /* * IB core internal function to perform QP attributes modification. */ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { u32 port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; const struct ib_gid_attr *old_sgid_attr_av; const struct ib_gid_attr *old_sgid_attr_alt_av; int ret; attr->xmit_slave = NULL; if (attr_mask & IB_QP_AV) { ret = rdma_fill_sgid_attr(qp->device, &attr->ah_attr, &old_sgid_attr_av); if (ret) return ret; if (attr->ah_attr.type == RDMA_AH_ATTR_TYPE_ROCE && is_qp_type_connected(qp)) { struct net_device *slave; /* * If the user provided the qp_attr then we have to * resolve it. Kerne users have to provide already * resolved rdma_ah_attr's. */ if (udata) { ret = ib_resolve_eth_dmac(qp->device, &attr->ah_attr); if (ret) goto out_av; } slave = rdma_lag_get_ah_roce_slave(qp->device, &attr->ah_attr, GFP_KERNEL); if (IS_ERR(slave)) { ret = PTR_ERR(slave); goto out_av; } attr->xmit_slave = slave; } } if (attr_mask & IB_QP_ALT_PATH) { /* * FIXME: This does not track the migration state, so if the * user loads a new alternate path after the HW has migrated * from primary->alternate we will keep the wrong * references. This is OK for IB because the reference * counting does not serve any functional purpose. */ ret = rdma_fill_sgid_attr(qp->device, &attr->alt_ah_attr, &old_sgid_attr_alt_av); if (ret) goto out_av; /* * Today the core code can only handle alternate paths and APM * for IB. Ban them in roce mode. */ if (!(rdma_protocol_ib(qp->device, attr->alt_ah_attr.port_num) && rdma_protocol_ib(qp->device, port))) { ret = -EINVAL; goto out; } } if (rdma_ib_or_roce(qp->device, port)) { if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) { dev_warn(&qp->device->dev, "%s rq_psn overflow, masking to 24 bits\n", __func__); attr->rq_psn &= 0xffffff; } if (attr_mask & IB_QP_SQ_PSN && attr->sq_psn & ~0xffffff) { dev_warn(&qp->device->dev, " %s sq_psn overflow, masking to 24 bits\n", __func__); attr->sq_psn &= 0xffffff; } } /* * Bind this qp to a counter automatically based on the rdma counter * rules. This only set in RST2INIT with port specified */ if (!qp->counter && (attr_mask & IB_QP_PORT) && ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT)) rdma_counter_bind_qp_auto(qp, attr->port_num); ret = ib_security_modify_qp(qp, attr, attr_mask, udata); if (ret) goto out; if (attr_mask & IB_QP_PORT) qp->port = attr->port_num; if (attr_mask & IB_QP_AV) qp->av_sgid_attr = rdma_update_sgid_attr(&attr->ah_attr, qp->av_sgid_attr); if (attr_mask & IB_QP_ALT_PATH) qp->alt_path_sgid_attr = rdma_update_sgid_attr( &attr->alt_ah_attr, qp->alt_path_sgid_attr); out: if (attr_mask & IB_QP_ALT_PATH) rdma_unfill_sgid_attr(&attr->alt_ah_attr, old_sgid_attr_alt_av); out_av: if (attr_mask & IB_QP_AV) { rdma_lag_put_ah_roce_slave(attr->xmit_slave); rdma_unfill_sgid_attr(&attr->ah_attr, old_sgid_attr_av); } return ret; } /** * ib_modify_qp_with_udata - Modifies the attributes for the specified QP. * @ib_qp: The QP to modify. * @attr: On input, specifies the QP attributes to modify. On output, * the current values of selected QP attributes are returned. * @attr_mask: A bit-mask used to specify which attributes of the QP * are being modified. * @udata: pointer to user's input output buffer information * are being modified. * It returns 0 on success and returns appropriate error code on error. */ int ib_modify_qp_with_udata(struct ib_qp *ib_qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { return _ib_modify_qp(ib_qp->real_qp, attr, attr_mask, udata); } EXPORT_SYMBOL(ib_modify_qp_with_udata); static void ib_get_width_and_speed(u32 netdev_speed, u32 lanes, u16 *speed, u8 *width) { if (!lanes) { if (netdev_speed <= SPEED_1000) { *width = IB_WIDTH_1X; *speed = IB_SPEED_SDR; } else if (netdev_speed <= SPEED_10000) { *width = IB_WIDTH_1X; *speed = IB_SPEED_FDR10; } else if (netdev_speed <= SPEED_20000) { *width = IB_WIDTH_4X; *speed = IB_SPEED_DDR; } else if (netdev_speed <= SPEED_25000) { *width = IB_WIDTH_1X; *speed = IB_SPEED_EDR; } else if (netdev_speed <= SPEED_40000) { *width = IB_WIDTH_4X; *speed = IB_SPEED_FDR10; } else if (netdev_speed <= SPEED_50000) { *width = IB_WIDTH_2X; *speed = IB_SPEED_EDR; } else if (netdev_speed <= SPEED_100000) { *width = IB_WIDTH_4X; *speed = IB_SPEED_EDR; } else if (netdev_speed <= SPEED_200000) { *width = IB_WIDTH_4X; *speed = IB_SPEED_HDR; } else { *width = IB_WIDTH_4X; *speed = IB_SPEED_NDR; } return; } switch (lanes) { case 1: *width = IB_WIDTH_1X; break; case 2: *width = IB_WIDTH_2X; break; case 4: *width = IB_WIDTH_4X; break; case 8: *width = IB_WIDTH_8X; break; case 12: *width = IB_WIDTH_12X; break; default: *width = IB_WIDTH_1X; } switch (netdev_speed / lanes) { case SPEED_2500: *speed = IB_SPEED_SDR; break; case SPEED_5000: *speed = IB_SPEED_DDR; break; case SPEED_10000: *speed = IB_SPEED_FDR10; break; case SPEED_14000: *speed = IB_SPEED_FDR; break; case SPEED_25000: *speed = IB_SPEED_EDR; break; case SPEED_50000: *speed = IB_SPEED_HDR; break; case SPEED_100000: *speed = IB_SPEED_NDR; break; default: *speed = IB_SPEED_SDR; } } int ib_get_eth_speed(struct ib_device *dev, u32 port_num, u16 *speed, u8 *width) { int rc; u32 netdev_speed; struct net_device *netdev; struct ethtool_link_ksettings lksettings = {}; if (rdma_port_get_link_layer(dev, port_num) != IB_LINK_LAYER_ETHERNET) return -EINVAL; netdev = ib_device_get_netdev(dev, port_num); if (!netdev) return -ENODEV; rtnl_lock(); rc = __ethtool_get_link_ksettings(netdev, &lksettings); rtnl_unlock(); dev_put(netdev); if (!rc && lksettings.base.speed != (u32)SPEED_UNKNOWN) { netdev_speed = lksettings.base.speed; } else { netdev_speed = SPEED_1000; if (rc) pr_warn("%s speed is unknown, defaulting to %u\n", netdev->name, netdev_speed); } ib_get_width_and_speed(netdev_speed, lksettings.lanes, speed, width); return 0; } EXPORT_SYMBOL(ib_get_eth_speed); int ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask) { return _ib_modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL); } EXPORT_SYMBOL(ib_modify_qp); int ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) { qp_attr->ah_attr.grh.sgid_attr = NULL; qp_attr->alt_ah_attr.grh.sgid_attr = NULL; return qp->device->ops.query_qp ? qp->device->ops.query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) : -EOPNOTSUPP; } EXPORT_SYMBOL(ib_query_qp); int ib_close_qp(struct ib_qp *qp) { struct ib_qp *real_qp; unsigned long flags; real_qp = qp->real_qp; if (real_qp == qp) return -EINVAL; spin_lock_irqsave(&real_qp->device->qp_open_list_lock, flags); list_del(&qp->open_list); spin_unlock_irqrestore(&real_qp->device->qp_open_list_lock, flags); atomic_dec(&real_qp->usecnt); if (qp->qp_sec) ib_close_shared_qp_security(qp->qp_sec); kfree(qp); return 0; } EXPORT_SYMBOL(ib_close_qp); static int __ib_destroy_shared_qp(struct ib_qp *qp) { struct ib_xrcd *xrcd; struct ib_qp *real_qp; int ret; real_qp = qp->real_qp; xrcd = real_qp->xrcd; down_write(&xrcd->tgt_qps_rwsem); ib_close_qp(qp); if (atomic_read(&real_qp->usecnt) == 0) xa_erase(&xrcd->tgt_qps, real_qp->qp_num); else real_qp = NULL; up_write(&xrcd->tgt_qps_rwsem); if (real_qp) { ret = ib_destroy_qp(real_qp); if (!ret) atomic_dec(&xrcd->usecnt); } return 0; } int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata) { const struct ib_gid_attr *alt_path_sgid_attr = qp->alt_path_sgid_attr; const struct ib_gid_attr *av_sgid_attr = qp->av_sgid_attr; struct ib_qp_security *sec; int ret; WARN_ON_ONCE(qp->mrs_used > 0); if (atomic_read(&qp->usecnt)) return -EBUSY; if (qp->real_qp != qp) return __ib_destroy_shared_qp(qp); sec = qp->qp_sec; if (sec) ib_destroy_qp_security_begin(sec); if (!qp->uobject) rdma_rw_cleanup_mrs(qp); rdma_counter_unbind_qp(qp, true); ret = qp->device->ops.destroy_qp(qp, udata); if (ret) { if (sec) ib_destroy_qp_security_abort(sec); return ret; } if (alt_path_sgid_attr) rdma_put_gid_attr(alt_path_sgid_attr); if (av_sgid_attr) rdma_put_gid_attr(av_sgid_attr); ib_qp_usecnt_dec(qp); if (sec) ib_destroy_qp_security_end(sec); rdma_restrack_del(&qp->res); kfree(qp); return ret; } EXPORT_SYMBOL(ib_destroy_qp_user); /* Completion queues */ struct ib_cq *__ib_create_cq(struct ib_device *device, ib_comp_handler comp_handler, void (*event_handler)(struct ib_event *, void *), void *cq_context, const struct ib_cq_init_attr *cq_attr, const char *caller) { struct ib_cq *cq; int ret; cq = rdma_zalloc_drv_obj(device, ib_cq); if (!cq) return ERR_PTR(-ENOMEM); cq->device = device; cq->uobject = NULL; cq->comp_handler = comp_handler; cq->event_handler = event_handler; cq->cq_context = cq_context; atomic_set(&cq->usecnt, 0); rdma_restrack_new(&cq->res, RDMA_RESTRACK_CQ); rdma_restrack_set_name(&cq->res, caller); ret = device->ops.create_cq(cq, cq_attr, NULL); if (ret) { rdma_restrack_put(&cq->res); kfree(cq); return ERR_PTR(ret); } rdma_restrack_add(&cq->res); return cq; } EXPORT_SYMBOL(__ib_create_cq); int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period) { if (cq->shared) return -EOPNOTSUPP; return cq->device->ops.modify_cq ? cq->device->ops.modify_cq(cq, cq_count, cq_period) : -EOPNOTSUPP; } EXPORT_SYMBOL(rdma_set_cq_moderation); int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata) { int ret; if (WARN_ON_ONCE(cq->shared)) return -EOPNOTSUPP; if (atomic_read(&cq->usecnt)) return -EBUSY; ret = cq->device->ops.destroy_cq(cq, udata); if (ret) return ret; rdma_restrack_del(&cq->res); kfree(cq); return ret; } EXPORT_SYMBOL(ib_destroy_cq_user); int ib_resize_cq(struct ib_cq *cq, int cqe) { if (cq->shared) return -EOPNOTSUPP; return cq->device->ops.resize_cq ? cq->device->ops.resize_cq(cq, cqe, NULL) : -EOPNOTSUPP; } EXPORT_SYMBOL(ib_resize_cq); /* Memory regions */ struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int access_flags) { struct ib_mr *mr; if (access_flags & IB_ACCESS_ON_DEMAND) { if (!(pd->device->attrs.kernel_cap_flags & IBK_ON_DEMAND_PAGING)) { pr_debug("ODP support not available\n"); return ERR_PTR(-EINVAL); } } mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr, access_flags, NULL); if (IS_ERR(mr)) return mr; mr->device = pd->device; mr->type = IB_MR_TYPE_USER; mr->pd = pd; mr->dm = NULL; atomic_inc(&pd->usecnt); mr->iova = virt_addr; mr->length = length; rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); rdma_restrack_parent_name(&mr->res, &pd->res); rdma_restrack_add(&mr->res); return mr; } EXPORT_SYMBOL(ib_reg_user_mr); int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, u32 flags, struct ib_sge *sg_list, u32 num_sge) { if (!pd->device->ops.advise_mr) return -EOPNOTSUPP; if (!num_sge) return 0; return pd->device->ops.advise_mr(pd, advice, flags, sg_list, num_sge, NULL); } EXPORT_SYMBOL(ib_advise_mr); int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata) { struct ib_pd *pd = mr->pd; struct ib_dm *dm = mr->dm; struct ib_sig_attrs *sig_attrs = mr->sig_attrs; int ret; trace_mr_dereg(mr); rdma_restrack_del(&mr->res); ret = mr->device->ops.dereg_mr(mr, udata); if (!ret) { atomic_dec(&pd->usecnt); if (dm) atomic_dec(&dm->usecnt); kfree(sig_attrs); } return ret; } EXPORT_SYMBOL(ib_dereg_mr_user); /** * ib_alloc_mr() - Allocates a memory region * @pd: protection domain associated with the region * @mr_type: memory region type * @max_num_sg: maximum sg entries available for registration. * * Notes: * Memory registeration page/sg lists must not exceed max_num_sg. * For mr_type IB_MR_TYPE_MEM_REG, the total length cannot exceed * max_num_sg * used_page_size. * */ struct ib_mr *ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, u32 max_num_sg) { struct ib_mr *mr; if (!pd->device->ops.alloc_mr) { mr = ERR_PTR(-EOPNOTSUPP); goto out; } if (mr_type == IB_MR_TYPE_INTEGRITY) { WARN_ON_ONCE(1); mr = ERR_PTR(-EINVAL); goto out; } mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg); if (IS_ERR(mr)) goto out; mr->device = pd->device; mr->pd = pd; mr->dm = NULL; mr->uobject = NULL; atomic_inc(&pd->usecnt); mr->need_inval = false; mr->type = mr_type; mr->sig_attrs = NULL; rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); rdma_restrack_parent_name(&mr->res, &pd->res); rdma_restrack_add(&mr->res); out: trace_mr_alloc(pd, mr_type, max_num_sg, mr); return mr; } EXPORT_SYMBOL(ib_alloc_mr); /** * ib_alloc_mr_integrity() - Allocates an integrity memory region * @pd: protection domain associated with the region * @max_num_data_sg: maximum data sg entries available for registration * @max_num_meta_sg: maximum metadata sg entries available for * registration * * Notes: * Memory registration page/sg lists must not exceed max_num_sg, * also the integrity page/sg lists must not exceed max_num_meta_sg. * */ struct ib_mr *ib_alloc_mr_integrity(struct ib_pd *pd, u32 max_num_data_sg, u32 max_num_meta_sg) { struct ib_mr *mr; struct ib_sig_attrs *sig_attrs; if (!pd->device->ops.alloc_mr_integrity || !pd->device->ops.map_mr_sg_pi) { mr = ERR_PTR(-EOPNOTSUPP); goto out; } if (!max_num_meta_sg) { mr = ERR_PTR(-EINVAL); goto out; } sig_attrs = kzalloc(sizeof(struct ib_sig_attrs), GFP_KERNEL); if (!sig_attrs) { mr = ERR_PTR(-ENOMEM); goto out; } mr = pd->device->ops.alloc_mr_integrity(pd, max_num_data_sg, max_num_meta_sg); if (IS_ERR(mr)) { kfree(sig_attrs); goto out; } mr->device = pd->device; mr->pd = pd; mr->dm = NULL; mr->uobject = NULL; atomic_inc(&pd->usecnt); mr->need_inval = false; mr->type = IB_MR_TYPE_INTEGRITY; mr->sig_attrs = sig_attrs; rdma_restrack_new(&mr->res, RDMA_RESTRACK_MR); rdma_restrack_parent_name(&mr->res, &pd->res); rdma_restrack_add(&mr->res); out: trace_mr_integ_alloc(pd, max_num_data_sg, max_num_meta_sg, mr); return mr; } EXPORT_SYMBOL(ib_alloc_mr_integrity); /* Multicast groups */ static bool is_valid_mcast_lid(struct ib_qp *qp, u16 lid) { struct ib_qp_init_attr init_attr = {}; struct ib_qp_attr attr = {}; int num_eth_ports = 0; unsigned int port; /* If QP state >= init, it is assigned to a port and we can check this * port only. */ if (!ib_query_qp(qp, &attr, IB_QP_STATE | IB_QP_PORT, &init_attr)) { if (attr.qp_state >= IB_QPS_INIT) { if (rdma_port_get_link_layer(qp->device, attr.port_num) != IB_LINK_LAYER_INFINIBAND) return true; goto lid_check; } } /* Can't get a quick answer, iterate over all ports */ rdma_for_each_port(qp->device, port) if (rdma_port_get_link_layer(qp->device, port) != IB_LINK_LAYER_INFINIBAND) num_eth_ports++; /* If we have at lease one Ethernet port, RoCE annex declares that * multicast LID should be ignored. We can't tell at this step if the * QP belongs to an IB or Ethernet port. */ if (num_eth_ports) return true; /* If all the ports are IB, we can check according to IB spec. */ lid_check: return !(lid < be16_to_cpu(IB_MULTICAST_LID_BASE) || lid == be16_to_cpu(IB_LID_PERMISSIVE)); } int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) { int ret; if (!qp->device->ops.attach_mcast) return -EOPNOTSUPP; if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) return -EINVAL; ret = qp->device->ops.attach_mcast(qp, gid, lid); if (!ret) atomic_inc(&qp->usecnt); return ret; } EXPORT_SYMBOL(ib_attach_mcast); int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid) { int ret; if (!qp->device->ops.detach_mcast) return -EOPNOTSUPP; if (!rdma_is_multicast_addr((struct in6_addr *)gid->raw) || qp->qp_type != IB_QPT_UD || !is_valid_mcast_lid(qp, lid)) return -EINVAL; ret = qp->device->ops.detach_mcast(qp, gid, lid); if (!ret) atomic_dec(&qp->usecnt); return ret; } EXPORT_SYMBOL(ib_detach_mcast); /** * ib_alloc_xrcd_user - Allocates an XRC domain. * @device: The device on which to allocate the XRC domain. * @inode: inode to connect XRCD * @udata: Valid user data or NULL for kernel object */ struct ib_xrcd *ib_alloc_xrcd_user(struct ib_device *device, struct inode *inode, struct ib_udata *udata) { struct ib_xrcd *xrcd; int ret; if (!device->ops.alloc_xrcd) return ERR_PTR(-EOPNOTSUPP); xrcd = rdma_zalloc_drv_obj(device, ib_xrcd); if (!xrcd) return ERR_PTR(-ENOMEM); xrcd->device = device; xrcd->inode = inode; atomic_set(&xrcd->usecnt, 0); init_rwsem(&xrcd->tgt_qps_rwsem); xa_init(&xrcd->tgt_qps); ret = device->ops.alloc_xrcd(xrcd, udata); if (ret) goto err; return xrcd; err: kfree(xrcd); return ERR_PTR(ret); } EXPORT_SYMBOL(ib_alloc_xrcd_user); /** * ib_dealloc_xrcd_user - Deallocates an XRC domain. * @xrcd: The XRC domain to deallocate. * @udata: Valid user data or NULL for kernel object */ int ib_dealloc_xrcd_user(struct ib_xrcd *xrcd, struct ib_udata *udata) { int ret; if (atomic_read(&xrcd->usecnt)) return -EBUSY; WARN_ON(!xa_empty(&xrcd->tgt_qps)); ret = xrcd->device->ops.dealloc_xrcd(xrcd, udata); if (ret) return ret; kfree(xrcd); return ret; } EXPORT_SYMBOL(ib_dealloc_xrcd_user); /** * ib_create_wq - Creates a WQ associated with the specified protection * domain. * @pd: The protection domain associated with the WQ. * @wq_attr: A list of initial attributes required to create the * WQ. If WQ creation succeeds, then the attributes are updated to * the actual capabilities of the created WQ. * * wq_attr->max_wr and wq_attr->max_sge determine * the requested size of the WQ, and set to the actual values allocated * on return. * If ib_create_wq() succeeds, then max_wr and max_sge will always be * at least as large as the requested values. */ struct ib_wq *ib_create_wq(struct ib_pd *pd, struct ib_wq_init_attr *wq_attr) { struct ib_wq *wq; if (!pd->device->ops.create_wq) return ERR_PTR(-EOPNOTSUPP); wq = pd->device->ops.create_wq(pd, wq_attr, NULL); if (!IS_ERR(wq)) { wq->event_handler = wq_attr->event_handler; wq->wq_context = wq_attr->wq_context; wq->wq_type = wq_attr->wq_type; wq->cq = wq_attr->cq; wq->device = pd->device; wq->pd = pd; wq->uobject = NULL; atomic_inc(&pd->usecnt); atomic_inc(&wq_attr->cq->usecnt); atomic_set(&wq->usecnt, 0); } return wq; } EXPORT_SYMBOL(ib_create_wq); /** * ib_destroy_wq_user - Destroys the specified user WQ. * @wq: The WQ to destroy. * @udata: Valid user data */ int ib_destroy_wq_user(struct ib_wq *wq, struct ib_udata *udata) { struct ib_cq *cq = wq->cq; struct ib_pd *pd = wq->pd; int ret; if (atomic_read(&wq->usecnt)) return -EBUSY; ret = wq->device->ops.destroy_wq(wq, udata); if (ret) return ret; atomic_dec(&pd->usecnt); atomic_dec(&cq->usecnt); return ret; } EXPORT_SYMBOL(ib_destroy_wq_user); int ib_check_mr_status(struct ib_mr *mr, u32 check_mask, struct ib_mr_status *mr_status) { if (!mr->device->ops.check_mr_status) return -EOPNOTSUPP; return mr->device->ops.check_mr_status(mr, check_mask, mr_status); } EXPORT_SYMBOL(ib_check_mr_status); int ib_set_vf_link_state(struct ib_device *device, int vf, u32 port, int state) { if (!device->ops.set_vf_link_state) return -EOPNOTSUPP; return device->ops.set_vf_link_state(device, vf, port, state); } EXPORT_SYMBOL(ib_set_vf_link_state); int ib_get_vf_config(struct ib_device *device, int vf, u32 port, struct ifla_vf_info *info) { if (!device->ops.get_vf_config) return -EOPNOTSUPP; return device->ops.get_vf_config(device, vf, port, info); } EXPORT_SYMBOL(ib_get_vf_config); int ib_get_vf_stats(struct ib_device *device, int vf, u32 port, struct ifla_vf_stats *stats) { if (!device->ops.get_vf_stats) return -EOPNOTSUPP; return device->ops.get_vf_stats(device, vf, port, stats); } EXPORT_SYMBOL(ib_get_vf_stats); int ib_set_vf_guid(struct ib_device *device, int vf, u32 port, u64 guid, int type) { if (!device->ops.set_vf_guid) return -EOPNOTSUPP; return device->ops.set_vf_guid(device, vf, port, guid, type); } EXPORT_SYMBOL(ib_set_vf_guid); int ib_get_vf_guid(struct ib_device *device, int vf, u32 port, struct ifla_vf_guid *node_guid, struct ifla_vf_guid *port_guid) { if (!device->ops.get_vf_guid) return -EOPNOTSUPP; return device->ops.get_vf_guid(device, vf, port, node_guid, port_guid); } EXPORT_SYMBOL(ib_get_vf_guid); /** * ib_map_mr_sg_pi() - Map the dma mapped SG lists for PI (protection * information) and set an appropriate memory region for registration. * @mr: memory region * @data_sg: dma mapped scatterlist for data * @data_sg_nents: number of entries in data_sg * @data_sg_offset: offset in bytes into data_sg * @meta_sg: dma mapped scatterlist for metadata * @meta_sg_nents: number of entries in meta_sg * @meta_sg_offset: offset in bytes into meta_sg * @page_size: page vector desired page size * * Constraints: * - The MR must be allocated with type IB_MR_TYPE_INTEGRITY. * * Return: 0 on success. * * After this completes successfully, the memory region * is ready for registration. */ int ib_map_mr_sg_pi(struct ib_mr *mr, struct scatterlist *data_sg, int data_sg_nents, unsigned int *data_sg_offset, struct scatterlist *meta_sg, int meta_sg_nents, unsigned int *meta_sg_offset, unsigned int page_size) { if (unlikely(!mr->device->ops.map_mr_sg_pi || WARN_ON_ONCE(mr->type != IB_MR_TYPE_INTEGRITY))) return -EOPNOTSUPP; mr->page_size = page_size; return mr->device->ops.map_mr_sg_pi(mr, data_sg, data_sg_nents, data_sg_offset, meta_sg, meta_sg_nents, meta_sg_offset); } EXPORT_SYMBOL(ib_map_mr_sg_pi); /** * ib_map_mr_sg() - Map the largest prefix of a dma mapped SG list * and set it the memory region. * @mr: memory region * @sg: dma mapped scatterlist * @sg_nents: number of entries in sg * @sg_offset: offset in bytes into sg * @page_size: page vector desired page size * * Constraints: * * - The first sg element is allowed to have an offset. * - Each sg element must either be aligned to page_size or virtually * contiguous to the previous element. In case an sg element has a * non-contiguous offset, the mapping prefix will not include it. * - The last sg element is allowed to have length less than page_size. * - If sg_nents total byte length exceeds the mr max_num_sge * page_size * then only max_num_sg entries will be mapped. * - If the MR was allocated with type IB_MR_TYPE_SG_GAPS, none of these * constraints holds and the page_size argument is ignored. * * Returns the number of sg elements that were mapped to the memory region. * * After this completes successfully, the memory region * is ready for registration. */ int ib_map_mr_sg(struct ib_mr *mr, struct scatterlist *sg, int sg_nents, unsigned int *sg_offset, unsigned int page_size) { if (unlikely(!mr->device->ops.map_mr_sg)) return -EOPNOTSUPP; mr->page_size = page_size; return mr->device->ops.map_mr_sg(mr, sg, sg_nents, sg_offset); } EXPORT_SYMBOL(ib_map_mr_sg); /** * ib_sg_to_pages() - Convert the largest prefix of a sg list * to a page vector * @mr: memory region * @sgl: dma mapped scatterlist * @sg_nents: number of entries in sg * @sg_offset_p: ==== ======================================================= * IN start offset in bytes into sg * OUT offset in bytes for element n of the sg of the first * byte that has not been processed where n is the return * value of this function. * ==== ======================================================= * @set_page: driver page assignment function pointer * * Core service helper for drivers to convert the largest * prefix of given sg list to a page vector. The sg list * prefix converted is the prefix that meet the requirements * of ib_map_mr_sg. * * Returns the number of sg elements that were assigned to * a page vector. */ int ib_sg_to_pages(struct ib_mr *mr, struct scatterlist *sgl, int sg_nents, unsigned int *sg_offset_p, int (*set_page)(struct ib_mr *, u64)) { struct scatterlist *sg; u64 last_end_dma_addr = 0; unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0; unsigned int last_page_off = 0; u64 page_mask = ~((u64)mr->page_size - 1); int i, ret; if (unlikely(sg_nents <= 0 || sg_offset > sg_dma_len(&sgl[0]))) return -EINVAL; mr->iova = sg_dma_address(&sgl[0]) + sg_offset; mr->length = 0; for_each_sg(sgl, sg, sg_nents, i) { u64 dma_addr = sg_dma_address(sg) + sg_offset; u64 prev_addr = dma_addr; unsigned int dma_len = sg_dma_len(sg) - sg_offset; u64 end_dma_addr = dma_addr + dma_len; u64 page_addr = dma_addr & page_mask; /* * For the second and later elements, check whether either the * end of element i-1 or the start of element i is not aligned * on a page boundary. */ if (i && (last_page_off != 0 || page_addr != dma_addr)) { /* Stop mapping if there is a gap. */ if (last_end_dma_addr != dma_addr) break; /* * Coalesce this element with the last. If it is small * enough just update mr->length. Otherwise start * mapping from the next page. */ goto next_page; } do { ret = set_page(mr, page_addr); if (unlikely(ret < 0)) { sg_offset = prev_addr - sg_dma_address(sg); mr->length += prev_addr - dma_addr; if (sg_offset_p) *sg_offset_p = sg_offset; return i || sg_offset ? i : ret; } prev_addr = page_addr; next_page: page_addr += mr->page_size; } while (page_addr < end_dma_addr); mr->length += dma_len; last_end_dma_addr = end_dma_addr; last_page_off = end_dma_addr & ~page_mask; sg_offset = 0; } if (sg_offset_p) *sg_offset_p = 0; return i; } EXPORT_SYMBOL(ib_sg_to_pages); struct ib_drain_cqe { struct ib_cqe cqe; struct completion done; }; static void ib_drain_qp_done(struct ib_cq *cq, struct ib_wc *wc) { struct ib_drain_cqe *cqe = container_of(wc->wr_cqe, struct ib_drain_cqe, cqe); complete(&cqe->done); } /* * Post a WR and block until its completion is reaped for the SQ. */ static void __ib_drain_sq(struct ib_qp *qp) { struct ib_cq *cq = qp->send_cq; struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; struct ib_drain_cqe sdrain; struct ib_rdma_wr swr = { .wr = { .next = NULL, { .wr_cqe = &sdrain.cqe, }, .opcode = IB_WR_RDMA_WRITE, }, }; int ret; ret = ib_modify_qp(qp, &attr, IB_QP_STATE); if (ret) { WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); return; } sdrain.cqe.done = ib_drain_qp_done; init_completion(&sdrain.done); ret = ib_post_send(qp, &swr.wr, NULL); if (ret) { WARN_ONCE(ret, "failed to drain send queue: %d\n", ret); return; } if (cq->poll_ctx == IB_POLL_DIRECT) while (wait_for_completion_timeout(&sdrain.done, HZ / 10) <= 0) ib_process_cq_direct(cq, -1); else wait_for_completion(&sdrain.done); } /* * Post a WR and block until its completion is reaped for the RQ. */ static void __ib_drain_rq(struct ib_qp *qp) { struct ib_cq *cq = qp->recv_cq; struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; struct ib_drain_cqe rdrain; struct ib_recv_wr rwr = {}; int ret; ret = ib_modify_qp(qp, &attr, IB_QP_STATE); if (ret) { WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); return; } rwr.wr_cqe = &rdrain.cqe; rdrain.cqe.done = ib_drain_qp_done; init_completion(&rdrain.done); ret = ib_post_recv(qp, &rwr, NULL); if (ret) { WARN_ONCE(ret, "failed to drain recv queue: %d\n", ret); return; } if (cq->poll_ctx == IB_POLL_DIRECT) while (wait_for_completion_timeout(&rdrain.done, HZ / 10) <= 0) ib_process_cq_direct(cq, -1); else wait_for_completion(&rdrain.done); } /* * __ib_drain_srq() - Block until Last WQE Reached event arrives, or timeout * expires. * @qp: queue pair associated with SRQ to drain * * Quoting 10.3.1 Queue Pair and EE Context States: * * Note, for QPs that are associated with an SRQ, the Consumer should take the * QP through the Error State before invoking a Destroy QP or a Modify QP to the * Reset State. The Consumer may invoke the Destroy QP without first performing * a Modify QP to the Error State and waiting for the Affiliated Asynchronous * Last WQE Reached Event. However, if the Consumer does not wait for the * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment * leakage may occur. Therefore, it is good programming practice to tear down a * QP that is associated with an SRQ by using the following process: * * - Put the QP in the Error State * - Wait for the Affiliated Asynchronous Last WQE Reached Event; * - either: * drain the CQ by invoking the Poll CQ verb and either wait for CQ * to be empty or the number of Poll CQ operations has exceeded * CQ capacity size; * - or * post another WR that completes on the same CQ and wait for this * WR to return as a WC; * - and then invoke a Destroy QP or Reset QP. * * We use the first option. */ static void __ib_drain_srq(struct ib_qp *qp) { struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; struct ib_cq *cq; int n, polled = 0; int ret; if (!qp->srq) { WARN_ONCE(1, "QP 0x%p is not associated with SRQ\n", qp); return; } ret = ib_modify_qp(qp, &attr, IB_QP_STATE); if (ret) { WARN_ONCE(ret, "failed to drain shared recv queue: %d\n", ret); return; } if (ib_srq_has_cq(qp->srq->srq_type)) { cq = qp->srq->ext.cq; } else if (qp->recv_cq) { cq = qp->recv_cq; } else { WARN_ONCE(1, "QP 0x%p has no CQ associated with SRQ\n", qp); return; } if (wait_for_completion_timeout(&qp->srq_completion, 60 * HZ) > 0) { while (polled != cq->cqe) { n = ib_process_cq_direct(cq, cq->cqe - polled); if (!n) return; polled += n; } } } /** * ib_drain_sq() - Block until all SQ CQEs have been consumed by the * application. * @qp: queue pair to drain * * If the device has a provider-specific drain function, then * call that. Otherwise call the generic drain function * __ib_drain_sq(). * * The caller must: * * ensure there is room in the CQ and SQ for the drain work request and * completion. * * allocate the CQ using ib_alloc_cq(). * * ensure that there are no other contexts that are posting WRs concurrently. * Otherwise the drain is not guaranteed. */ void ib_drain_sq(struct ib_qp *qp) { if (qp->device->ops.drain_sq) qp->device->ops.drain_sq(qp); else __ib_drain_sq(qp); trace_cq_drain_complete(qp->send_cq); } EXPORT_SYMBOL(ib_drain_sq); /** * ib_drain_rq() - Block until all RQ CQEs have been consumed by the * application. * @qp: queue pair to drain * * If the device has a provider-specific drain function, then * call that. Otherwise call the generic drain function * __ib_drain_rq(). * * The caller must: * * ensure there is room in the CQ and RQ for the drain work request and * completion. * * allocate the CQ using ib_alloc_cq(). * * ensure that there are no other contexts that are posting WRs concurrently. * Otherwise the drain is not guaranteed. */ void ib_drain_rq(struct ib_qp *qp) { if (qp->device->ops.drain_rq) qp->device->ops.drain_rq(qp); else __ib_drain_rq(qp); trace_cq_drain_complete(qp->recv_cq); } EXPORT_SYMBOL(ib_drain_rq); /** * ib_drain_qp() - Block until all CQEs have been consumed by the * application on both the RQ and SQ. * @qp: queue pair to drain * * The caller must: * * ensure there is room in the CQ(s), SQ, and RQ for drain work requests * and completions. * * allocate the CQs using ib_alloc_cq(). * * ensure that there are no other contexts that are posting WRs concurrently. * Otherwise the drain is not guaranteed. */ void ib_drain_qp(struct ib_qp *qp) { ib_drain_sq(qp); if (!qp->srq) ib_drain_rq(qp); else __ib_drain_srq(qp); } EXPORT_SYMBOL(ib_drain_qp); struct net_device *rdma_alloc_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *)) { struct rdma_netdev_alloc_params params; struct net_device *netdev; int rc; if (!device->ops.rdma_netdev_get_params) return ERR_PTR(-EOPNOTSUPP); rc = device->ops.rdma_netdev_get_params(device, port_num, type, ¶ms); if (rc) return ERR_PTR(rc); netdev = alloc_netdev_mqs(params.sizeof_priv, name, name_assign_type, setup, params.txqs, params.rxqs); if (!netdev) return ERR_PTR(-ENOMEM); return netdev; } EXPORT_SYMBOL(rdma_alloc_netdev); int rdma_init_netdev(struct ib_device *device, u32 port_num, enum rdma_netdev_t type, const char *name, unsigned char name_assign_type, void (*setup)(struct net_device *), struct net_device *netdev) { struct rdma_netdev_alloc_params params; int rc; if (!device->ops.rdma_netdev_get_params) return -EOPNOTSUPP; rc = device->ops.rdma_netdev_get_params(device, port_num, type, ¶ms); if (rc) return rc; return params.initialize_rdma_netdev(device, port_num, netdev, params.param); } EXPORT_SYMBOL(rdma_init_netdev); void __rdma_block_iter_start(struct ib_block_iter *biter, struct scatterlist *sglist, unsigned int nents, unsigned long pgsz) { memset(biter, 0, sizeof(struct ib_block_iter)); biter->__sg = sglist; biter->__sg_nents = nents; /* Driver provides best block size to use */ biter->__pg_bit = __fls(pgsz); } EXPORT_SYMBOL(__rdma_block_iter_start); bool __rdma_block_iter_next(struct ib_block_iter *biter) { unsigned int block_offset; unsigned int sg_delta; if (!biter->__sg_nents || !biter->__sg) return false; biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance; block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1); sg_delta = BIT_ULL(biter->__pg_bit) - block_offset; if (sg_dma_len(biter->__sg) - biter->__sg_advance > sg_delta) { biter->__sg_advance += sg_delta; } else { biter->__sg_advance = 0; biter->__sg = sg_next(biter->__sg); biter->__sg_nents--; } return true; } EXPORT_SYMBOL(__rdma_block_iter_next); /** * rdma_alloc_hw_stats_struct - Helper function to allocate dynamic struct * for the drivers. * @descs: array of static descriptors * @num_counters: number of elements in array * @lifespan: milliseconds between updates */ struct rdma_hw_stats *rdma_alloc_hw_stats_struct( const struct rdma_stat_desc *descs, int num_counters, unsigned long lifespan) { struct rdma_hw_stats *stats; stats = kzalloc(struct_size(stats, value, num_counters), GFP_KERNEL); if (!stats) return NULL; stats->is_disabled = kcalloc(BITS_TO_LONGS(num_counters), sizeof(*stats->is_disabled), GFP_KERNEL); if (!stats->is_disabled) goto err; stats->descs = descs; stats->num_counters = num_counters; stats->lifespan = msecs_to_jiffies(lifespan); mutex_init(&stats->lock); return stats; err: kfree(stats); return NULL; } EXPORT_SYMBOL(rdma_alloc_hw_stats_struct); /** * rdma_free_hw_stats_struct - Helper function to release rdma_hw_stats * @stats: statistics to release */ void rdma_free_hw_stats_struct(struct rdma_hw_stats *stats) { if (!stats) return; kfree(stats->is_disabled); kfree(stats); } EXPORT_SYMBOL(rdma_free_hw_stats_struct); |
1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 | // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/kernel.h> #include <linux/interrupt.h> #include <linux/fs.h> #include <linux/types.h> #include <linux/sysctl.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/errno.h> #include <linux/fcntl.h> #include <linux/in.h> #include <linux/if_ether.h> #include <linux/slab.h> #include <asm/io.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/ip.h> #include <net/arp.h> #include <net/ax25.h> #include <net/rose.h> static int rose_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned int len) { unsigned char *buff = skb_push(skb, ROSE_MIN_LEN + 2); if (daddr) memcpy(buff + 7, daddr, dev->addr_len); *buff++ = ROSE_GFI | ROSE_Q_BIT; *buff++ = 0x00; *buff++ = ROSE_DATA; *buff++ = 0x7F; *buff++ = AX25_P_IP; if (daddr != NULL) return 37; return -37; } static int rose_set_mac_address(struct net_device *dev, void *addr) { struct sockaddr *sa = addr; int err; if (!memcmp(dev->dev_addr, sa->sa_data, dev->addr_len)) return 0; if (dev->flags & IFF_UP) { err = rose_add_loopback_node((rose_address *)sa->sa_data); if (err) return err; rose_del_loopback_node((const rose_address *)dev->dev_addr); } dev_addr_set(dev, sa->sa_data); return 0; } static int rose_open(struct net_device *dev) { int err; err = rose_add_loopback_node((const rose_address *)dev->dev_addr); if (err) return err; netif_start_queue(dev); return 0; } static int rose_close(struct net_device *dev) { netif_stop_queue(dev); rose_del_loopback_node((const rose_address *)dev->dev_addr); return 0; } static netdev_tx_t rose_xmit(struct sk_buff *skb, struct net_device *dev) { struct net_device_stats *stats = &dev->stats; unsigned int len = skb->len; if (!netif_running(dev)) { printk(KERN_ERR "ROSE: rose_xmit - called when iface is down\n"); return NETDEV_TX_BUSY; } if (!rose_route_frame(skb, NULL)) { dev_kfree_skb(skb); stats->tx_errors++; return NETDEV_TX_OK; } stats->tx_packets++; stats->tx_bytes += len; return NETDEV_TX_OK; } static const struct header_ops rose_header_ops = { .create = rose_header, }; static const struct net_device_ops rose_netdev_ops = { .ndo_open = rose_open, .ndo_stop = rose_close, .ndo_start_xmit = rose_xmit, .ndo_set_mac_address = rose_set_mac_address, }; void rose_setup(struct net_device *dev) { dev->mtu = ROSE_MAX_PACKET_SIZE - 2; dev->netdev_ops = &rose_netdev_ops; dev->header_ops = &rose_header_ops; dev->hard_header_len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN; dev->addr_len = ROSE_ADDR_LEN; dev->type = ARPHRD_ROSE; /* New-style flags. */ dev->flags = IFF_NOARP; } |
338 359 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_NSPROXY_H #define _LINUX_NSPROXY_H #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/sched.h> struct mnt_namespace; struct uts_namespace; struct ipc_namespace; struct pid_namespace; struct cgroup_namespace; struct fs_struct; /* * A structure to contain pointers to all per-process * namespaces - fs (mount), uts, network, sysvipc, etc. * * The pid namespace is an exception -- it's accessed using * task_active_pid_ns. The pid namespace here is the * namespace that children will use. * * 'count' is the number of tasks holding a reference. * The count for each namespace, then, will be the number * of nsproxies pointing to it, not the number of tasks. * * The nsproxy is shared by tasks which share all namespaces. * As soon as a single namespace is cloned or unshared, the * nsproxy is copied. */ struct nsproxy { refcount_t count; struct uts_namespace *uts_ns; struct ipc_namespace *ipc_ns; struct mnt_namespace *mnt_ns; struct pid_namespace *pid_ns_for_children; struct net *net_ns; struct time_namespace *time_ns; struct time_namespace *time_ns_for_children; struct cgroup_namespace *cgroup_ns; }; extern struct nsproxy init_nsproxy; #define to_ns_common(__ns) \ _Generic((__ns), \ struct cgroup_namespace *: &(__ns->ns), \ struct ipc_namespace *: &(__ns->ns), \ struct net *: &(__ns->ns), \ struct pid_namespace *: &(__ns->ns), \ struct mnt_namespace *: &(__ns->ns), \ struct time_namespace *: &(__ns->ns), \ struct user_namespace *: &(__ns->ns), \ struct uts_namespace *: &(__ns->ns)) /* * A structure to encompass all bits needed to install * a partial or complete new set of namespaces. * * If a new user namespace is requested cred will * point to a modifiable set of credentials. If a pointer * to a modifiable set is needed nsset_cred() must be * used and tested. */ struct nsset { unsigned flags; struct nsproxy *nsproxy; struct fs_struct *fs; const struct cred *cred; }; static inline struct cred *nsset_cred(struct nsset *set) { if (set->flags & CLONE_NEWUSER) return (struct cred *)set->cred; return NULL; } /* * the namespaces access rules are: * * 1. only current task is allowed to change tsk->nsproxy pointer or * any pointer on the nsproxy itself. Current must hold the task_lock * when changing tsk->nsproxy. * * 2. when accessing (i.e. reading) current task's namespaces - no * precautions should be taken - just dereference the pointers * * 3. the access to other task namespaces is performed like this * task_lock(task); * nsproxy = task->nsproxy; * if (nsproxy != NULL) { * / * * * work with the namespaces here * * e.g. get the reference on one of them * * / * } / * * * NULL task->nsproxy means that this task is * * almost dead (zombie) * * / * task_unlock(task); * */ int copy_namespaces(unsigned long flags, struct task_struct *tsk); void exit_task_namespaces(struct task_struct *tsk); void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new); int exec_task_namespaces(void); void free_nsproxy(struct nsproxy *ns); int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **, struct cred *, struct fs_struct *); int __init nsproxy_cache_init(void); static inline void put_nsproxy(struct nsproxy *ns) { if (refcount_dec_and_test(&ns->count)) free_nsproxy(ns); } static inline void get_nsproxy(struct nsproxy *ns) { refcount_inc(&ns->count); } DEFINE_FREE(put_nsproxy, struct nsproxy *, if (_T) put_nsproxy(_T)) #endif |
1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2016-present, Facebook, Inc. * All rights reserved. * */ #include <linux/bio.h> #include <linux/bitmap.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/pagemap.h> #include <linux/refcount.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/zstd.h> #include "misc.h" #include "fs.h" #include "btrfs_inode.h" #include "compression.h" #include "super.h" #define ZSTD_BTRFS_MAX_WINDOWLOG 17 #define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) #define ZSTD_BTRFS_DEFAULT_LEVEL 3 #define ZSTD_BTRFS_MAX_LEVEL 15 /* 307s to avoid pathologically clashing with transaction commit */ #define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ) static zstd_parameters zstd_get_btrfs_parameters(unsigned int level, size_t src_len) { zstd_parameters params = zstd_get_params(level, src_len); if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG) params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG; WARN_ON(src_len > ZSTD_BTRFS_MAX_INPUT); return params; } struct workspace { void *mem; size_t size; char *buf; unsigned int level; unsigned int req_level; unsigned long last_used; /* jiffies */ struct list_head list; struct list_head lru_list; zstd_in_buffer in_buf; zstd_out_buffer out_buf; }; /* * Zstd Workspace Management * * Zstd workspaces have different memory requirements depending on the level. * The zstd workspaces are managed by having individual lists for each level * and a global lru. Forward progress is maintained by protecting a max level * workspace. * * Getting a workspace is done by using the bitmap to identify the levels that * have available workspaces and scans up. This lets us recycle higher level * workspaces because of the monotonic memory guarantee. A workspace's * last_used is only updated if it is being used by the corresponding memory * level. Putting a workspace involves adding it back to the appropriate places * and adding it back to the lru if necessary. * * A timer is used to reclaim workspaces if they have not been used for * ZSTD_BTRFS_RECLAIM_JIFFIES. This helps keep only active workspaces around. * The upper bound is provided by the workqueue limit which is 2 (percpu limit). */ struct zstd_workspace_manager { const struct btrfs_compress_op *ops; spinlock_t lock; struct list_head lru_list; struct list_head idle_ws[ZSTD_BTRFS_MAX_LEVEL]; unsigned long active_map; wait_queue_head_t wait; struct timer_list timer; }; static struct zstd_workspace_manager wsm; static size_t zstd_ws_mem_sizes[ZSTD_BTRFS_MAX_LEVEL]; static inline struct workspace *list_to_workspace(struct list_head *list) { return container_of(list, struct workspace, list); } void zstd_free_workspace(struct list_head *ws); struct list_head *zstd_alloc_workspace(unsigned int level); /* * Timer callback to free unused workspaces. * * @t: timer * * This scans the lru_list and attempts to reclaim any workspace that hasn't * been used for ZSTD_BTRFS_RECLAIM_JIFFIES. * * The context is softirq and does not need the _bh locking primitives. */ static void zstd_reclaim_timer_fn(struct timer_list *timer) { unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; struct list_head *pos, *next; spin_lock(&wsm.lock); if (list_empty(&wsm.lru_list)) { spin_unlock(&wsm.lock); return; } list_for_each_prev_safe(pos, next, &wsm.lru_list) { struct workspace *victim = container_of(pos, struct workspace, lru_list); unsigned int level; if (time_after(victim->last_used, reclaim_threshold)) break; /* workspace is in use */ if (victim->req_level) continue; level = victim->level; list_del(&victim->lru_list); list_del(&victim->list); zstd_free_workspace(&victim->list); if (list_empty(&wsm.idle_ws[level - 1])) clear_bit(level - 1, &wsm.active_map); } if (!list_empty(&wsm.lru_list)) mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); spin_unlock(&wsm.lock); } /* * Calculate monotonic memory bounds. * * It is possible based on the level configurations that a higher level * workspace uses less memory than a lower level workspace. In order to reuse * workspaces, this must be made a monotonic relationship. This precomputes * the required memory for each level and enforces the monotonicity between * level and memory required. */ static void zstd_calc_ws_mem_sizes(void) { size_t max_size = 0; unsigned int level; for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) { zstd_parameters params = zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT); size_t level_size = max_t(size_t, zstd_cstream_workspace_bound(¶ms.cParams), zstd_dstream_workspace_bound(ZSTD_BTRFS_MAX_INPUT)); max_size = max_t(size_t, max_size, level_size); zstd_ws_mem_sizes[level - 1] = max_size; } } void zstd_init_workspace_manager(void) { struct list_head *ws; int i; zstd_calc_ws_mem_sizes(); wsm.ops = &btrfs_zstd_compress; spin_lock_init(&wsm.lock); init_waitqueue_head(&wsm.wait); timer_setup(&wsm.timer, zstd_reclaim_timer_fn, 0); INIT_LIST_HEAD(&wsm.lru_list); for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) INIT_LIST_HEAD(&wsm.idle_ws[i]); ws = zstd_alloc_workspace(ZSTD_BTRFS_MAX_LEVEL); if (IS_ERR(ws)) { pr_warn( "BTRFS: cannot preallocate zstd compression workspace\n"); } else { set_bit(ZSTD_BTRFS_MAX_LEVEL - 1, &wsm.active_map); list_add(ws, &wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1]); } } void zstd_cleanup_workspace_manager(void) { struct workspace *workspace; int i; spin_lock_bh(&wsm.lock); for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) { while (!list_empty(&wsm.idle_ws[i])) { workspace = container_of(wsm.idle_ws[i].next, struct workspace, list); list_del(&workspace->list); list_del(&workspace->lru_list); zstd_free_workspace(&workspace->list); } } spin_unlock_bh(&wsm.lock); del_timer_sync(&wsm.timer); } /* * Find workspace for given level. * * @level: compression level * * This iterates over the set bits in the active_map beginning at the requested * compression level. This lets us utilize already allocated workspaces before * allocating a new one. If the workspace is of a larger size, it is used, but * the place in the lru_list and last_used times are not updated. This is to * offer the opportunity to reclaim the workspace in favor of allocating an * appropriately sized one in the future. */ static struct list_head *zstd_find_workspace(unsigned int level) { struct list_head *ws; struct workspace *workspace; int i = level - 1; spin_lock_bh(&wsm.lock); for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) { if (!list_empty(&wsm.idle_ws[i])) { ws = wsm.idle_ws[i].next; workspace = list_to_workspace(ws); list_del_init(ws); /* keep its place if it's a lower level using this */ workspace->req_level = level; if (level == workspace->level) list_del(&workspace->lru_list); if (list_empty(&wsm.idle_ws[i])) clear_bit(i, &wsm.active_map); spin_unlock_bh(&wsm.lock); return ws; } } spin_unlock_bh(&wsm.lock); return NULL; } /* * Zstd get_workspace for level. * * @level: compression level * * If @level is 0, then any compression level can be used. Therefore, we begin * scanning from 1. We first scan through possible workspaces and then after * attempt to allocate a new workspace. If we fail to allocate one due to * memory pressure, go to sleep waiting for the max level workspace to free up. */ struct list_head *zstd_get_workspace(unsigned int level) { struct list_head *ws; unsigned int nofs_flag; /* level == 0 means we can use any workspace */ if (!level) level = 1; again: ws = zstd_find_workspace(level); if (ws) return ws; nofs_flag = memalloc_nofs_save(); ws = zstd_alloc_workspace(level); memalloc_nofs_restore(nofs_flag); if (IS_ERR(ws)) { DEFINE_WAIT(wait); prepare_to_wait(&wsm.wait, &wait, TASK_UNINTERRUPTIBLE); schedule(); finish_wait(&wsm.wait, &wait); goto again; } return ws; } /* * Zstd put_workspace. * * @ws: list_head for the workspace * * When putting back a workspace, we only need to update the LRU if we are of * the requested compression level. Here is where we continue to protect the * max level workspace or update last_used accordingly. If the reclaim timer * isn't set, it is also set here. Only the max level workspace tries and wakes * up waiting workspaces. */ void zstd_put_workspace(struct list_head *ws) { struct workspace *workspace = list_to_workspace(ws); spin_lock_bh(&wsm.lock); /* A node is only taken off the lru if we are the corresponding level */ if (workspace->req_level == workspace->level) { /* Hide a max level workspace from reclaim */ if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) { INIT_LIST_HEAD(&workspace->lru_list); } else { workspace->last_used = jiffies; list_add(&workspace->lru_list, &wsm.lru_list); if (!timer_pending(&wsm.timer)) mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); } } set_bit(workspace->level - 1, &wsm.active_map); list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]); workspace->req_level = 0; spin_unlock_bh(&wsm.lock); if (workspace->level == ZSTD_BTRFS_MAX_LEVEL) cond_wake_up(&wsm.wait); } void zstd_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); kvfree(workspace->mem); kfree(workspace->buf); kfree(workspace); } struct list_head *zstd_alloc_workspace(unsigned int level) { struct workspace *workspace; workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); if (!workspace) return ERR_PTR(-ENOMEM); workspace->size = zstd_ws_mem_sizes[level - 1]; workspace->level = level; workspace->req_level = level; workspace->last_used = jiffies; workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN); workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!workspace->mem || !workspace->buf) goto fail; INIT_LIST_HEAD(&workspace->list); INIT_LIST_HEAD(&workspace->lru_list); return &workspace->list; fail: zstd_free_workspace(&workspace->list); return ERR_PTR(-ENOMEM); } int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); zstd_cstream *stream; int ret = 0; int nr_folios = 0; struct folio *in_folio = NULL; /* The current folio to read. */ struct folio *out_folio = NULL; /* The current folio to write to. */ unsigned long tot_in = 0; unsigned long tot_out = 0; unsigned long len = *total_out; const unsigned long nr_dest_folios = *out_folios; unsigned long max_out = nr_dest_folios * PAGE_SIZE; zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level, len); *out_folios = 0; *total_out = 0; *total_in = 0; /* Initialize the stream */ stream = zstd_init_cstream(¶ms, len, workspace->mem, workspace->size); if (unlikely(!stream)) { struct btrfs_inode *inode = BTRFS_I(mapping->host); btrfs_err(inode->root->fs_info, "zstd compression init level %d failed, root %llu inode %llu offset %llu", workspace->req_level, btrfs_root_id(inode->root), btrfs_ino(inode), start); ret = -EIO; goto out; } /* map in the first page of input data */ ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; workspace->in_buf.src = kmap_local_folio(in_folio, 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); /* Allocate and map in the output buffer */ out_folio = btrfs_alloc_compr_folio(); if (out_folio == NULL) { ret = -ENOMEM; goto out; } folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); while (1) { size_t ret2; ret2 = zstd_compress_stream(stream, &workspace->out_buf, &workspace->in_buf); if (unlikely(zstd_is_error(ret2))) { struct btrfs_inode *inode = BTRFS_I(mapping->host); btrfs_warn(inode->root->fs_info, "zstd compression level %d failed, error %d root %llu inode %llu offset %llu", workspace->req_level, zstd_get_error_code(ret2), btrfs_root_id(inode->root), btrfs_ino(inode), start); ret = -EIO; goto out; } /* Check to see if we are making it bigger */ if (tot_in + workspace->in_buf.pos > 8192 && tot_in + workspace->in_buf.pos < tot_out + workspace->out_buf.pos) { ret = -E2BIG; goto out; } /* We've reached the end of our output range */ if (workspace->out_buf.pos >= max_out) { tot_out += workspace->out_buf.pos; ret = -E2BIG; goto out; } /* Check if we need more output space */ if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } out_folio = btrfs_alloc_compr_folio(); if (out_folio == NULL) { ret = -ENOMEM; goto out; } folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } /* We've reached the end of the input */ if (workspace->in_buf.pos >= len) { tot_in += workspace->in_buf.pos; break; } /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { tot_in += PAGE_SIZE; kunmap_local(workspace->in_buf.src); workspace->in_buf.src = NULL; folio_put(in_folio); start += PAGE_SIZE; len -= PAGE_SIZE; ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; workspace->in_buf.src = kmap_local_folio(in_folio, 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); } } while (1) { size_t ret2; ret2 = zstd_end_stream(stream, &workspace->out_buf); if (unlikely(zstd_is_error(ret2))) { struct btrfs_inode *inode = BTRFS_I(mapping->host); btrfs_err(inode->root->fs_info, "zstd compression end level %d failed, error %d root %llu inode %llu offset %llu", workspace->req_level, zstd_get_error_code(ret2), btrfs_root_id(inode->root), btrfs_ino(inode), start); ret = -EIO; goto out; } if (ret2 == 0) { tot_out += workspace->out_buf.pos; break; } if (workspace->out_buf.pos >= max_out) { tot_out += workspace->out_buf.pos; ret = -E2BIG; goto out; } tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; if (nr_folios == nr_dest_folios) { ret = -E2BIG; goto out; } out_folio = btrfs_alloc_compr_folio(); if (out_folio == NULL) { ret = -ENOMEM; goto out; } folios[nr_folios++] = out_folio; workspace->out_buf.dst = folio_address(out_folio); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } if (tot_out >= tot_in) { ret = -E2BIG; goto out; } ret = 0; *total_in = tot_in; *total_out = tot_out; out: *out_folios = nr_folios; if (workspace->in_buf.src) { kunmap_local(workspace->in_buf.src); folio_put(in_folio); } return ret; } int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); struct folio **folios_in = cb->compressed_folios; size_t srclen = cb->compressed_len; zstd_dstream *stream; int ret = 0; unsigned long folio_in_index = 0; unsigned long total_folios_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long total_out = 0; stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (unlikely(!stream)) { struct btrfs_inode *inode = cb->bbio.inode; btrfs_err(inode->root->fs_info, "zstd decompression init failed, root %llu inode %llu offset %llu", btrfs_root_id(inode->root), btrfs_ino(inode), cb->start); ret = -EIO; goto done; } workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; workspace->out_buf.size = PAGE_SIZE; while (1) { size_t ret2; ret2 = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); if (unlikely(zstd_is_error(ret2))) { struct btrfs_inode *inode = cb->bbio.inode; btrfs_err(inode->root->fs_info, "zstd decompression failed, error %d root %llu inode %llu offset %llu", zstd_get_error_code(ret2), btrfs_root_id(inode->root), btrfs_ino(inode), cb->start); ret = -EIO; goto done; } buf_start = total_out; total_out += workspace->out_buf.pos; workspace->out_buf.pos = 0; ret = btrfs_decompress_buf2page(workspace->out_buf.dst, total_out - buf_start, cb, buf_start); if (ret == 0) break; if (workspace->in_buf.pos >= srclen) break; /* Check if we've hit the end of a frame */ if (ret2 == 0) break; if (workspace->in_buf.pos == workspace->in_buf.size) { kunmap_local(workspace->in_buf.src); folio_in_index++; if (folio_in_index >= total_folios_in) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= PAGE_SIZE; workspace->in_buf.src = kmap_local_folio(folios_in[folio_in_index], 0); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); } } ret = 0; done: if (workspace->in_buf.src) kunmap_local(workspace->in_buf.src); return ret; } int zstd_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long dest_pgoff, size_t srclen, size_t destlen) { struct workspace *workspace = list_entry(ws, struct workspace, list); struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb); const u32 sectorsize = fs_info->sectorsize; zstd_dstream *stream; int ret = 0; unsigned long to_copy = 0; stream = zstd_init_dstream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); if (unlikely(!stream)) { struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); btrfs_err(inode->root->fs_info, "zstd decompression init failed, root %llu inode %llu offset %llu", btrfs_root_id(inode->root), btrfs_ino(inode), page_offset(dest_page)); ret = -EIO; goto finish; } workspace->in_buf.src = data_in; workspace->in_buf.pos = 0; workspace->in_buf.size = srclen; workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; workspace->out_buf.size = sectorsize; /* * Since both input and output buffers should not exceed one sector, * one call should end the decompression. */ ret = zstd_decompress_stream(stream, &workspace->out_buf, &workspace->in_buf); if (unlikely(zstd_is_error(ret))) { struct btrfs_inode *inode = BTRFS_I(dest_page->mapping->host); btrfs_err(inode->root->fs_info, "zstd decompression failed, error %d root %llu inode %llu offset %llu", zstd_get_error_code(ret), btrfs_root_id(inode->root), btrfs_ino(inode), page_offset(dest_page)); goto finish; } to_copy = workspace->out_buf.pos; memcpy_to_page(dest_page, dest_pgoff, workspace->out_buf.dst, to_copy); finish: /* Error or early end. */ if (unlikely(to_copy < destlen)) { ret = -EIO; memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy); } return ret; } const struct btrfs_compress_op btrfs_zstd_compress = { /* ZSTD uses own workspace manager */ .workspace_manager = NULL, .max_level = ZSTD_BTRFS_MAX_LEVEL, .default_level = ZSTD_BTRFS_DEFAULT_LEVEL, }; |
57 57 56 701 1260 729 726 374 49 64 15 613 610 611 39 71 4 1125 1130 953 363 936 22 1123 1126 932 21 3 2 113 113 113 1799 913 1181 798 795 11 39 858 858 1128 458 932 16 1142 457 944 17 2 1 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 | // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/types.h> #include <linux/export.h> #include <linux/init.h> #include <linux/udp.h> #include <linux/tcp.h> #include <linux/icmp.h> #include <linux/icmpv6.h> #include <linux/dccp.h> #include <linux/sctp.h> #include <net/sctp/checksum.h> #include <linux/netfilter.h> #include <net/netfilter/nf_nat.h> #include <linux/ipv6.h> #include <linux/netfilter_ipv6.h> #include <net/checksum.h> #include <net/ip6_checksum.h> #include <net/ip6_route.h> #include <net/xfrm.h> #include <net/ipv6.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack.h> #include <linux/netfilter/nfnetlink_conntrack.h> static void nf_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype); static void __udp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, struct udphdr *hdr, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, bool do_csum) { __be16 *portptr, newport; if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ newport = tuple->src.u.udp.port; portptr = &hdr->source; } else { /* Get rid of dst port */ newport = tuple->dst.u.udp.port; portptr = &hdr->dest; } if (do_csum) { nf_csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, false); if (!hdr->check) hdr->check = CSUM_MANGLED_0; } *portptr = newport; } static bool udp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct udphdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff); __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, !!hdr->check); return true; } static bool udplite_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #ifdef CONFIG_NF_CT_PROTO_UDPLITE struct udphdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff); __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, true); #endif return true; } static bool sctp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #ifdef CONFIG_NF_CT_PROTO_SCTP struct sctphdr *hdr; int hdrsize = 8; /* This could be an inner header returned in imcp packet; in such * cases we cannot update the checksum field since it is outside * of the 8 bytes of transport layer headers we are guaranteed. */ if (skb->len >= hdroff + sizeof(*hdr)) hdrsize = sizeof(*hdr); if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct sctphdr *)(skb->data + hdroff); if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ hdr->source = tuple->src.u.sctp.port; } else { /* Get rid of dst port */ hdr->dest = tuple->dst.u.sctp.port; } if (hdrsize < sizeof(*hdr)) return true; if (skb->ip_summed != CHECKSUM_PARTIAL) { hdr->checksum = sctp_compute_cksum(skb, hdroff); skb->ip_summed = CHECKSUM_NONE; } #endif return true; } static bool tcp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct tcphdr *hdr; __be16 *portptr, newport, oldport; int hdrsize = 8; /* TCP connection tracking guarantees this much */ /* this could be a inner header returned in icmp packet; in such cases we cannot update the checksum field since it is outside of the 8 bytes of transport layer headers we are guaranteed */ if (skb->len >= hdroff + sizeof(struct tcphdr)) hdrsize = sizeof(struct tcphdr); if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct tcphdr *)(skb->data + hdroff); if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ newport = tuple->src.u.tcp.port; portptr = &hdr->source; } else { /* Get rid of dst port */ newport = tuple->dst.u.tcp.port; portptr = &hdr->dest; } oldport = *portptr; *portptr = newport; if (hdrsize < sizeof(*hdr)) return true; nf_csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, false); return true; } static bool dccp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #ifdef CONFIG_NF_CT_PROTO_DCCP struct dccp_hdr *hdr; __be16 *portptr, oldport, newport; int hdrsize = 8; /* DCCP connection tracking guarantees this much */ if (skb->len >= hdroff + sizeof(struct dccp_hdr)) hdrsize = sizeof(struct dccp_hdr); if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct dccp_hdr *)(skb->data + hdroff); if (maniptype == NF_NAT_MANIP_SRC) { newport = tuple->src.u.dccp.port; portptr = &hdr->dccph_sport; } else { newport = tuple->dst.u.dccp.port; portptr = &hdr->dccph_dport; } oldport = *portptr; *portptr = newport; if (hdrsize < sizeof(*hdr)) return true; nf_csum_update(skb, iphdroff, &hdr->dccph_checksum, tuple, maniptype); inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, false); #endif return true; } static bool icmp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct icmphdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmphdr *)(skb->data + hdroff); switch (hdr->type) { case ICMP_ECHO: case ICMP_ECHOREPLY: case ICMP_TIMESTAMP: case ICMP_TIMESTAMPREPLY: case ICMP_INFO_REQUEST: case ICMP_INFO_REPLY: case ICMP_ADDRESS: case ICMP_ADDRESSREPLY: break; default: return true; } inet_proto_csum_replace2(&hdr->checksum, skb, hdr->un.echo.id, tuple->src.u.icmp.id, false); hdr->un.echo.id = tuple->src.u.icmp.id; return true; } static bool icmpv6_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct icmp6hdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmp6hdr *)(skb->data + hdroff); nf_csum_update(skb, iphdroff, &hdr->icmp6_cksum, tuple, maniptype); if (hdr->icmp6_type == ICMPV6_ECHO_REQUEST || hdr->icmp6_type == ICMPV6_ECHO_REPLY) { inet_proto_csum_replace2(&hdr->icmp6_cksum, skb, hdr->icmp6_identifier, tuple->src.u.icmp.id, false); hdr->icmp6_identifier = tuple->src.u.icmp.id; } return true; } /* manipulate a GRE packet according to maniptype */ static bool gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) const struct gre_base_hdr *greh; struct pptp_gre_header *pgreh; /* pgreh includes two optional 32bit fields which are not required * to be there. That's where the magic '8' comes from */ if (skb_ensure_writable(skb, hdroff + sizeof(*pgreh) - 8)) return false; greh = (void *)skb->data + hdroff; pgreh = (struct pptp_gre_header *)greh; /* we only have destination manip of a packet, since 'source key' * is not present in the packet itself */ if (maniptype != NF_NAT_MANIP_DST) return true; switch (greh->flags & GRE_VERSION) { case GRE_VERSION_0: /* We do not currently NAT any GREv0 packets. * Try to behave like "nf_nat_proto_unknown" */ break; case GRE_VERSION_1: pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key)); pgreh->call_id = tuple->dst.u.gre.key; break; default: pr_debug("can't nat unknown GRE version\n"); return false; } #endif return true; } static bool l4proto_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { switch (tuple->dst.protonum) { case IPPROTO_TCP: return tcp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_UDP: return udp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_UDPLITE: return udplite_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_SCTP: return sctp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_ICMP: return icmp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_ICMPV6: return icmpv6_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_DCCP: return dccp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_GRE: return gre_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); } /* If we don't know protocol -- no error, pass it unmodified. */ return true; } static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_conntrack_tuple *target, enum nf_nat_manip_type maniptype) { struct iphdr *iph; unsigned int hdroff; if (skb_ensure_writable(skb, iphdroff + sizeof(*iph))) return false; iph = (void *)skb->data + iphdroff; hdroff = iphdroff + iph->ihl * 4; if (!l4proto_manip_pkt(skb, iphdroff, hdroff, target, maniptype)) return false; iph = (void *)skb->data + iphdroff; if (maniptype == NF_NAT_MANIP_SRC) { csum_replace4(&iph->check, iph->saddr, target->src.u3.ip); iph->saddr = target->src.u3.ip; } else { csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip); iph->daddr = target->dst.u3.ip; } return true; } static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_conntrack_tuple *target, enum nf_nat_manip_type maniptype) { #if IS_ENABLED(CONFIG_IPV6) struct ipv6hdr *ipv6h; __be16 frag_off; int hdroff; u8 nexthdr; if (skb_ensure_writable(skb, iphdroff + sizeof(*ipv6h))) return false; ipv6h = (void *)skb->data + iphdroff; nexthdr = ipv6h->nexthdr; hdroff = ipv6_skip_exthdr(skb, iphdroff + sizeof(*ipv6h), &nexthdr, &frag_off); if (hdroff < 0) goto manip_addr; if ((frag_off & htons(~0x7)) == 0 && !l4proto_manip_pkt(skb, iphdroff, hdroff, target, maniptype)) return false; /* must reload, offset might have changed */ ipv6h = (void *)skb->data + iphdroff; manip_addr: if (maniptype == NF_NAT_MANIP_SRC) ipv6h->saddr = target->src.u3.in6; else ipv6h->daddr = target->dst.u3.in6; #endif return true; } unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct, enum nf_nat_manip_type mtype, enum ip_conntrack_dir dir) { struct nf_conntrack_tuple target; /* We are aiming to look like inverse of other direction. */ nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); switch (target.src.l3num) { case NFPROTO_IPV6: if (nf_nat_ipv6_manip_pkt(skb, 0, &target, mtype)) return NF_ACCEPT; break; case NFPROTO_IPV4: if (nf_nat_ipv4_manip_pkt(skb, 0, &target, mtype)) return NF_ACCEPT; break; default: WARN_ON_ONCE(1); break; } return NF_DROP; } static void nf_nat_ipv4_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype) { struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); __be32 oldip, newip; if (maniptype == NF_NAT_MANIP_SRC) { oldip = iph->saddr; newip = t->src.u3.ip; } else { oldip = iph->daddr; newip = t->dst.u3.ip; } inet_proto_csum_replace4(check, skb, oldip, newip, true); } static void nf_nat_ipv6_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype) { #if IS_ENABLED(CONFIG_IPV6) const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + iphdroff); const struct in6_addr *oldip, *newip; if (maniptype == NF_NAT_MANIP_SRC) { oldip = &ipv6h->saddr; newip = &t->src.u3.in6; } else { oldip = &ipv6h->daddr; newip = &t->dst.u3.in6; } inet_proto_csum_replace16(check, skb, oldip->s6_addr32, newip->s6_addr32, true); #endif } static void nf_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype) { switch (t->src.l3num) { case NFPROTO_IPV4: nf_nat_ipv4_csum_update(skb, iphdroff, check, t, maniptype); return; case NFPROTO_IPV6: nf_nat_ipv6_csum_update(skb, iphdroff, check, t, maniptype); return; } } static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { if (skb->ip_summed != CHECKSUM_PARTIAL) { const struct iphdr *iph = ip_hdr(skb); skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + ip_hdrlen(skb); skb->csum_offset = (void *)check - data; *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen, proto, 0); } else { inet_proto_csum_replace2(check, skb, htons(oldlen), htons(datalen), true); } } #if IS_ENABLED(CONFIG_IPV6) static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { if (skb->ip_summed != CHECKSUM_PARTIAL) { const struct ipv6hdr *ipv6h = ipv6_hdr(skb); skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + (data - (void *)skb->data); skb->csum_offset = (void *)check - data; *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, datalen, proto, 0); } else { inet_proto_csum_replace2(check, skb, htons(oldlen), htons(datalen), true); } } #endif void nf_nat_csum_recalc(struct sk_buff *skb, u8 nfproto, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { switch (nfproto) { case NFPROTO_IPV4: nf_nat_ipv4_csum_recalc(skb, proto, data, check, datalen, oldlen); return; #if IS_ENABLED(CONFIG_IPV6) case NFPROTO_IPV6: nf_nat_ipv6_csum_recalc(skb, proto, data, check, datalen, oldlen); return; #endif } WARN_ON_ONCE(1); } int nf_nat_icmp_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum) { struct { struct icmphdr icmp; struct iphdr ip; } *inside; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); unsigned int hdrlen = ip_hdrlen(skb); struct nf_conntrack_tuple target; unsigned long statusbit; WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip_checksum(skb, hooknum, hdrlen, IPPROTO_ICMP)) return 0; inside = (void *)skb->data + hdrlen; if (inside->icmp.type == ICMP_REDIRECT) { if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) return 0; if (ct->status & IPS_NAT_MASK) return 0; } if (manip == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply direction */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; if (!(ct->status & statusbit)) return 1; if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp), &ct->tuplehash[!dir].tuple, !manip)) return 0; if (skb->ip_summed != CHECKSUM_PARTIAL) { /* Reloading "inside" here since manip_pkt may reallocate */ inside = (void *)skb->data + hdrlen; inside->icmp.checksum = 0; inside->icmp.checksum = csum_fold(skb_checksum(skb, hdrlen, skb->len - hdrlen, 0)); } /* Change outer to look like the reply to an incoming packet */ nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); target.dst.protonum = IPPROTO_ICMP; if (!nf_nat_ipv4_manip_pkt(skb, 0, &target, manip)) return 0; return 1; } EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); static unsigned int nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; ct = nf_ct_get(skb, &ctinfo); if (!ct) return NF_ACCEPT; if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) { if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, state->hook)) return NF_DROP; else return NF_ACCEPT; } } return nf_nat_inet_fn(priv, skb, state); } static unsigned int nf_nat_ipv4_pre_routing(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { unsigned int ret; __be32 daddr = ip_hdr(skb)->daddr; ret = nf_nat_ipv4_fn(priv, skb, state); if (ret == NF_ACCEPT && daddr != ip_hdr(skb)->daddr) skb_dst_drop(skb); return ret; } #ifdef CONFIG_XFRM static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) { struct sock *sk = skb->sk; struct dst_entry *dst; unsigned int hh_len; struct flowi fl; int err; err = xfrm_decode_session(net, skb, &fl, family); if (err < 0) return err; dst = skb_dst(skb); if (dst->xfrm) dst = ((struct xfrm_dst *)dst)->route; if (!dst_hold_safe(dst)) return -EHOSTUNREACH; if (sk && !net_eq(net, sock_net(sk))) sk = NULL; dst = xfrm_lookup(net, dst, &fl, sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); skb_dst_drop(skb); skb_dst_set(skb, dst); /* Change in oif may mean change in hh_len. */ hh_len = skb_dst(skb)->dev->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) return -ENOMEM; return 0; } #endif static bool nf_nat_inet_port_was_mangled(const struct sk_buff *skb, __be16 sport) { enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; const struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (!ct) return false; switch (nf_ct_protonum(ct)) { case IPPROTO_TCP: case IPPROTO_UDP: break; default: return false; } dir = CTINFO2DIR(ctinfo); if (dir != IP_CT_DIR_ORIGINAL) return false; return ct->tuplehash[!dir].tuple.dst.u.all != sport; } static unsigned int nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { __be32 saddr = ip_hdr(skb)->saddr; struct sock *sk = skb->sk; unsigned int ret; ret = nf_nat_ipv4_fn(priv, skb, state); if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk)) return ret; /* skb has a socket assigned via tcp edemux. We need to check * if nf_nat_ipv4_fn() has mangled the packet in a way that * edemux would not have found this socket. * * This includes both changes to the source address and changes * to the source port, which are both handled by the * nf_nat_ipv4_fn() call above -- long after tcp/udp early demux * might have found a socket for the old (pre-snat) address. */ if (saddr != ip_hdr(skb)->saddr || nf_nat_inet_port_was_mangled(skb, sk->sk_dport)) skb_orphan(skb); /* TCP edemux obtained wrong socket */ return ret; } static unsigned int nf_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { #ifdef CONFIG_XFRM const struct nf_conn *ct; enum ip_conntrack_info ctinfo; int err; #endif unsigned int ret; ret = nf_nat_ipv4_fn(priv, skb, state); #ifdef CONFIG_XFRM if (ret != NF_ACCEPT) return ret; if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (ct->tuplehash[dir].tuple.src.u3.ip != ct->tuplehash[!dir].tuple.dst.u3.ip || (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all)) { err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = NF_DROP_ERR(err); } } #endif return ret; } static unsigned int nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int ret; int err; ret = nf_nat_ipv4_fn(priv, skb, state); if (ret != NF_ACCEPT) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (ct->tuplehash[dir].tuple.dst.u3.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } #ifdef CONFIG_XFRM else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) { err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = NF_DROP_ERR(err); } #endif } return ret; } static const struct nf_hook_ops nf_nat_ipv4_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv4_pre_routing, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv4_out, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, }, /* Before packet filtering, change destination */ { .hook = nf_nat_ipv4_local_fn, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv4_local_in, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, }, }; int nf_nat_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops) { return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv4_register_fn); void nf_nat_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv4_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv4_unregister_fn); #if IS_ENABLED(CONFIG_IPV6) int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, unsigned int hdrlen) { struct { struct icmp6hdr icmp6; struct ipv6hdr ip6; } *inside; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); struct nf_conntrack_tuple target; unsigned long statusbit; WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip6_checksum(skb, hooknum, hdrlen, IPPROTO_ICMPV6)) return 0; inside = (void *)skb->data + hdrlen; if (inside->icmp6.icmp6_type == NDISC_REDIRECT) { if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) return 0; if (ct->status & IPS_NAT_MASK) return 0; } if (manip == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply direction */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; if (!(ct->status & statusbit)) return 1; if (!nf_nat_ipv6_manip_pkt(skb, hdrlen + sizeof(inside->icmp6), &ct->tuplehash[!dir].tuple, !manip)) return 0; if (skb->ip_summed != CHECKSUM_PARTIAL) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); inside = (void *)skb->data + hdrlen; inside->icmp6.icmp6_cksum = 0; inside->icmp6.icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len - hdrlen, IPPROTO_ICMPV6, skb_checksum(skb, hdrlen, skb->len - hdrlen, 0)); } nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); target.dst.protonum = IPPROTO_ICMPV6; if (!nf_nat_ipv6_manip_pkt(skb, 0, &target, manip)) return 0; return 1; } EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation); static unsigned int nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be16 frag_off; int hdrlen; u8 nexthdr; ct = nf_ct_get(skb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would * have dropped it. Hence it's the user's responsibilty to * packet filter it out, or implement conntrack/NAT for that * protocol. 8) --RR */ if (!ct) return NF_ACCEPT; if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) { nexthdr = ipv6_hdr(skb)->nexthdr; hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, state->hook, hdrlen)) return NF_DROP; else return NF_ACCEPT; } } return nf_nat_inet_fn(priv, skb, state); } static unsigned int nf_nat_ipv6_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct in6_addr saddr = ipv6_hdr(skb)->saddr; struct sock *sk = skb->sk; unsigned int ret; ret = nf_nat_ipv6_fn(priv, skb, state); if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk)) return ret; /* see nf_nat_ipv4_local_in */ if (ipv6_addr_cmp(&saddr, &ipv6_hdr(skb)->saddr) || nf_nat_inet_port_was_mangled(skb, sk->sk_dport)) skb_orphan(skb); return ret; } static unsigned int nf_nat_ipv6_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { unsigned int ret, verdict; struct in6_addr daddr = ipv6_hdr(skb)->daddr; ret = nf_nat_ipv6_fn(priv, skb, state); verdict = ret & NF_VERDICT_MASK; if (verdict != NF_DROP && verdict != NF_STOLEN && ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) skb_dst_drop(skb); return ret; } static unsigned int nf_nat_ipv6_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { #ifdef CONFIG_XFRM const struct nf_conn *ct; enum ip_conntrack_info ctinfo; int err; #endif unsigned int ret; ret = nf_nat_ipv6_fn(priv, skb, state); #ifdef CONFIG_XFRM if (ret != NF_ACCEPT) return ret; if (IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3) || (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all)) { err = nf_xfrm_me_harder(state->net, skb, AF_INET6); if (err < 0) ret = NF_DROP_ERR(err); } } #endif return ret; } static unsigned int nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int ret; int err; ret = nf_nat_ipv6_fn(priv, skb, state); if (ret != NF_ACCEPT) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &ct->tuplehash[!dir].tuple.src.u3)) { err = nf_ip6_route_me_harder(state->net, state->sk, skb); if (err < 0) ret = NF_DROP_ERR(err); } #ifdef CONFIG_XFRM else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) { err = nf_xfrm_me_harder(state->net, skb, AF_INET6); if (err < 0) ret = NF_DROP_ERR(err); } #endif } return ret; } static const struct nf_hook_ops nf_nat_ipv6_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv6_in, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv6_out, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_NAT_SRC, }, /* Before packet filtering, change destination */ { .hook = nf_nat_ipv6_local_fn, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv6_local_in, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC, }, }; int nf_nat_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops) { return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv6_register_fn); void nf_nat_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv6_unregister_fn); #endif /* CONFIG_IPV6 */ #if defined(CONFIG_NF_TABLES_INET) && IS_ENABLED(CONFIG_NFT_NAT) int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops) { int ret; if (WARN_ON_ONCE(ops->pf != NFPROTO_INET)) return -EINVAL; ret = nf_nat_register_fn(net, NFPROTO_IPV6, ops, nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); if (ret) return ret; ret = nf_nat_register_fn(net, NFPROTO_IPV4, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); if (ret) nf_nat_unregister_fn(net, NFPROTO_IPV6, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); return ret; } EXPORT_SYMBOL_GPL(nf_nat_inet_register_fn); void nf_nat_inet_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { nf_nat_unregister_fn(net, NFPROTO_IPV4, ops, ARRAY_SIZE(nf_nat_ipv4_ops)); nf_nat_unregister_fn(net, NFPROTO_IPV6, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); } EXPORT_SYMBOL_GPL(nf_nat_inet_unregister_fn); #endif /* NFT INET NAT */ |
90 59 11 50 50 55 55 8 51 8 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 | /* * net/tipc/name_distr.c: TIPC name distribution code * * Copyright (c) 2000-2006, 2014-2019, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "link.h" #include "name_distr.h" int sysctl_tipc_named_timeout __read_mostly = 2000; /** * publ_to_item - add publication info to a publication message * @p: publication info * @i: location of item in the message */ static void publ_to_item(struct distr_item *i, struct publication *p) { i->type = htonl(p->sr.type); i->lower = htonl(p->sr.lower); i->upper = htonl(p->sr.upper); i->port = htonl(p->sk.ref); i->key = htonl(p->key); } /** * named_prepare_buf - allocate & initialize a publication message * @net: the associated network namespace * @type: message type * @size: payload size * @dest: destination node * * The buffer returned is of size INT_H_SIZE + payload size */ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size, u32 dest) { struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE + size, GFP_ATOMIC); u32 self = tipc_own_addr(net); struct tipc_msg *msg; if (buf != NULL) { msg = buf_msg(buf); tipc_msg_init(self, msg, NAME_DISTRIBUTOR, type, INT_H_SIZE, dest); msg_set_size(msg, INT_H_SIZE + size); } return buf; } /** * tipc_named_publish - tell other nodes about a new publication by this node * @net: the associated network namespace * @p: the new publication */ struct sk_buff *tipc_named_publish(struct net *net, struct publication *p) { struct name_table *nt = tipc_name_table(net); struct distr_item *item; struct sk_buff *skb; if (p->scope == TIPC_NODE_SCOPE) { list_add_tail_rcu(&p->binding_node, &nt->node_scope); return NULL; } write_lock_bh(&nt->cluster_scope_lock); list_add_tail(&p->binding_node, &nt->cluster_scope); write_unlock_bh(&nt->cluster_scope_lock); skb = named_prepare_buf(net, PUBLICATION, ITEM_SIZE, 0); if (!skb) { pr_warn("Publication distribution failure\n"); return NULL; } msg_set_named_seqno(buf_msg(skb), nt->snd_nxt++); msg_set_non_legacy(buf_msg(skb)); item = (struct distr_item *)msg_data(buf_msg(skb)); publ_to_item(item, p); return skb; } /** * tipc_named_withdraw - tell other nodes about a withdrawn publication by this node * @net: the associated network namespace * @p: the withdrawn publication */ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *p) { struct name_table *nt = tipc_name_table(net); struct distr_item *item; struct sk_buff *skb; write_lock_bh(&nt->cluster_scope_lock); list_del(&p->binding_node); write_unlock_bh(&nt->cluster_scope_lock); if (p->scope == TIPC_NODE_SCOPE) return NULL; skb = named_prepare_buf(net, WITHDRAWAL, ITEM_SIZE, 0); if (!skb) { pr_warn("Withdrawal distribution failure\n"); return NULL; } msg_set_named_seqno(buf_msg(skb), nt->snd_nxt++); msg_set_non_legacy(buf_msg(skb)); item = (struct distr_item *)msg_data(buf_msg(skb)); publ_to_item(item, p); return skb; } /** * named_distribute - prepare name info for bulk distribution to another node * @net: the associated network namespace * @list: list of messages (buffers) to be returned from this function * @dnode: node to be updated * @pls: linked list of publication items to be packed into buffer chain * @seqno: sequence number for this message */ static void named_distribute(struct net *net, struct sk_buff_head *list, u32 dnode, struct list_head *pls, u16 seqno) { struct publication *publ; struct sk_buff *skb = NULL; struct distr_item *item = NULL; u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0, false) - INT_H_SIZE) / ITEM_SIZE) * ITEM_SIZE; u32 msg_rem = msg_dsz; struct tipc_msg *hdr; list_for_each_entry(publ, pls, binding_node) { /* Prepare next buffer: */ if (!skb) { skb = named_prepare_buf(net, PUBLICATION, msg_rem, dnode); if (!skb) { pr_warn("Bulk publication failure\n"); return; } hdr = buf_msg(skb); msg_set_bc_ack_invalid(hdr, true); msg_set_bulk(hdr); msg_set_non_legacy(hdr); item = (struct distr_item *)msg_data(hdr); } /* Pack publication into message: */ publ_to_item(item, publ); item++; msg_rem -= ITEM_SIZE; /* Append full buffer to list: */ if (!msg_rem) { __skb_queue_tail(list, skb); skb = NULL; msg_rem = msg_dsz; } } if (skb) { hdr = buf_msg(skb); msg_set_size(hdr, INT_H_SIZE + (msg_dsz - msg_rem)); skb_trim(skb, INT_H_SIZE + (msg_dsz - msg_rem)); __skb_queue_tail(list, skb); } hdr = buf_msg(skb_peek_tail(list)); msg_set_last_bulk(hdr); msg_set_named_seqno(hdr, seqno); } /** * tipc_named_node_up - tell specified node about all publications by this node * @net: the associated network namespace * @dnode: destination node * @capabilities: peer node's capabilities */ void tipc_named_node_up(struct net *net, u32 dnode, u16 capabilities) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct sk_buff_head head; u16 seqno; __skb_queue_head_init(&head); spin_lock_bh(&tn->nametbl_lock); if (!(capabilities & TIPC_NAMED_BCAST)) nt->rc_dests++; seqno = nt->snd_nxt; spin_unlock_bh(&tn->nametbl_lock); read_lock_bh(&nt->cluster_scope_lock); named_distribute(net, &head, dnode, &nt->cluster_scope, seqno); tipc_node_xmit(net, &head, dnode, 0); read_unlock_bh(&nt->cluster_scope_lock); } /** * tipc_publ_purge - remove publication associated with a failed node * @net: the associated network namespace * @p: the publication to remove * @addr: failed node's address * * Invoked for each publication issued by a newly failed node. * Removes publication structure from name table & deletes it. */ static void tipc_publ_purge(struct net *net, struct publication *p, u32 addr) { struct tipc_net *tn = tipc_net(net); struct publication *_p; struct tipc_uaddr ua; tipc_uaddr(&ua, TIPC_SERVICE_RANGE, p->scope, p->sr.type, p->sr.lower, p->sr.upper); spin_lock_bh(&tn->nametbl_lock); _p = tipc_nametbl_remove_publ(net, &ua, &p->sk, p->key); if (_p) tipc_node_unsubscribe(net, &_p->binding_node, addr); spin_unlock_bh(&tn->nametbl_lock); if (_p) kfree_rcu(_p, rcu); } void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr, u16 capabilities) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct publication *publ, *tmp; list_for_each_entry_safe(publ, tmp, nsub_list, binding_node) tipc_publ_purge(net, publ, addr); spin_lock_bh(&tn->nametbl_lock); if (!(capabilities & TIPC_NAMED_BCAST)) nt->rc_dests--; spin_unlock_bh(&tn->nametbl_lock); } /** * tipc_update_nametbl - try to process a nametable update and notify * subscribers * @net: the associated network namespace * @i: location of item in the message * @node: node address * @dtype: name distributor message type * * tipc_nametbl_lock must be held. * Return: the publication item if successful, otherwise NULL. */ static bool tipc_update_nametbl(struct net *net, struct distr_item *i, u32 node, u32 dtype) { struct publication *p = NULL; struct tipc_socket_addr sk; struct tipc_uaddr ua; u32 key = ntohl(i->key); tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_CLUSTER_SCOPE, ntohl(i->type), ntohl(i->lower), ntohl(i->upper)); sk.ref = ntohl(i->port); sk.node = node; if (dtype == PUBLICATION) { p = tipc_nametbl_insert_publ(net, &ua, &sk, key); if (p) { tipc_node_subscribe(net, &p->binding_node, node); return true; } } else if (dtype == WITHDRAWAL) { p = tipc_nametbl_remove_publ(net, &ua, &sk, key); if (p) { tipc_node_unsubscribe(net, &p->binding_node, node); kfree_rcu(p, rcu); return true; } pr_warn_ratelimited("Failed to remove binding %u,%u from %u\n", ua.sr.type, ua.sr.lower, node); } else { pr_warn_ratelimited("Unknown name table message received\n"); } return false; } static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq, u16 *rcv_nxt, bool *open) { struct sk_buff *skb, *tmp; struct tipc_msg *hdr; u16 seqno; spin_lock_bh(&namedq->lock); skb_queue_walk_safe(namedq, skb, tmp) { if (unlikely(skb_linearize(skb))) { __skb_unlink(skb, namedq); kfree_skb(skb); continue; } hdr = buf_msg(skb); seqno = msg_named_seqno(hdr); if (msg_is_last_bulk(hdr)) { *rcv_nxt = seqno; *open = true; } if (msg_is_bulk(hdr) || msg_is_legacy(hdr)) { __skb_unlink(skb, namedq); spin_unlock_bh(&namedq->lock); return skb; } if (*open && (*rcv_nxt == seqno)) { (*rcv_nxt)++; __skb_unlink(skb, namedq); spin_unlock_bh(&namedq->lock); return skb; } if (less(seqno, *rcv_nxt)) { __skb_unlink(skb, namedq); kfree_skb(skb); continue; } } spin_unlock_bh(&namedq->lock); return NULL; } /** * tipc_named_rcv - process name table update messages sent by another node * @net: the associated network namespace * @namedq: queue to receive from * @rcv_nxt: store last received seqno here * @open: last bulk msg was received (FIXME) */ void tipc_named_rcv(struct net *net, struct sk_buff_head *namedq, u16 *rcv_nxt, bool *open) { struct tipc_net *tn = tipc_net(net); struct distr_item *item; struct tipc_msg *hdr; struct sk_buff *skb; u32 count, node; spin_lock_bh(&tn->nametbl_lock); while ((skb = tipc_named_dequeue(namedq, rcv_nxt, open))) { hdr = buf_msg(skb); node = msg_orignode(hdr); item = (struct distr_item *)msg_data(hdr); count = msg_data_sz(hdr) / ITEM_SIZE; while (count--) { tipc_update_nametbl(net, item, node, msg_type(hdr)); item++; } kfree_skb(skb); } spin_unlock_bh(&tn->nametbl_lock); } /** * tipc_named_reinit - re-initialize local publications * @net: the associated network namespace * * This routine is called whenever TIPC networking is enabled. * All name table entries published by this node are updated to reflect * the node's new network address. */ void tipc_named_reinit(struct net *net) { struct name_table *nt = tipc_name_table(net); struct tipc_net *tn = tipc_net(net); struct publication *p; u32 self = tipc_own_addr(net); spin_lock_bh(&tn->nametbl_lock); list_for_each_entry_rcu(p, &nt->node_scope, binding_node) p->sk.node = self; list_for_each_entry_rcu(p, &nt->cluster_scope, binding_node) p->sk.node = self; nt->rc_dests = 0; spin_unlock_bh(&tn->nametbl_lock); } |
16 1 116 246 46 87 48 203 48 4660 866 1084 2 851 10 888 3 1 1097 45 1137 4 53 417 1065 3 12 1068 8 1074 620 16 9 23 417 423 407 1323 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_SIGNAL_H #define _LINUX_SCHED_SIGNAL_H #include <linux/rculist.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/sched/jobctl.h> #include <linux/sched/task.h> #include <linux/cred.h> #include <linux/refcount.h> #include <linux/pid.h> #include <linux/posix-timers.h> #include <linux/mm_types.h> #include <asm/ptrace.h> /* * Types defining task->signal and task->sighand and APIs using them: */ struct sighand_struct { spinlock_t siglock; refcount_t count; wait_queue_head_t signalfd_wqh; struct k_sigaction action[_NSIG]; }; /* * Per-process accounting stats: */ struct pacct_struct { int ac_flag; long ac_exitcode; unsigned long ac_mem; u64 ac_utime, ac_stime; unsigned long ac_minflt, ac_majflt; }; struct cpu_itimer { u64 expires; u64 incr; }; /* * This is the atomic variant of task_cputime, which can be used for * storing and updating task_cputime statistics without locking. */ struct task_cputime_atomic { atomic64_t utime; atomic64_t stime; atomic64_t sum_exec_runtime; }; #define INIT_CPUTIME_ATOMIC \ (struct task_cputime_atomic) { \ .utime = ATOMIC64_INIT(0), \ .stime = ATOMIC64_INIT(0), \ .sum_exec_runtime = ATOMIC64_INIT(0), \ } /** * struct thread_group_cputimer - thread group interval timer counts * @cputime_atomic: atomic thread group interval timers. * * This structure contains the version of task_cputime, above, that is * used for thread group CPU timer calculations. */ struct thread_group_cputimer { struct task_cputime_atomic cputime_atomic; }; struct multiprocess_signals { sigset_t signal; struct hlist_node node; }; struct core_thread { struct task_struct *task; struct core_thread *next; }; struct core_state { atomic_t nr_threads; struct core_thread dumper; struct completion startup; }; /* * NOTE! "signal_struct" does not have its own * locking, because a shared signal_struct always * implies a shared sighand_struct, so locking * sighand_struct is always a proper superset of * the locking of signal_struct. */ struct signal_struct { refcount_t sigcnt; atomic_t live; int nr_threads; int quick_threads; struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ /* current thread group signal load-balancing target: */ struct task_struct *curr_target; /* shared signal handling: */ struct sigpending shared_pending; /* For collecting multiprocess signals during fork */ struct hlist_head multiprocess; /* thread group exit support */ int group_exit_code; /* notify group_exec_task when notify_count is less or equal to 0 */ int notify_count; struct task_struct *group_exec_task; /* thread group stop support, overloads group_exit_code too */ int group_stop_count; unsigned int flags; /* see SIGNAL_* flags below */ struct core_state *core_state; /* coredumping support */ /* * PR_SET_CHILD_SUBREAPER marks a process, like a service * manager, to re-parent orphan (double-forking) child processes * to this process instead of 'init'. The service manager is * able to receive SIGCHLD signals and is able to investigate * the process until it calls wait(). All children of this * process will inherit a flag if they should look for a * child_subreaper process at exit. */ unsigned int is_child_subreaper:1; unsigned int has_child_subreaper:1; #ifdef CONFIG_POSIX_TIMERS /* POSIX.1b Interval Timers */ unsigned int next_posix_timer_id; struct list_head posix_timers; /* ITIMER_REAL timer for the process */ struct hrtimer real_timer; ktime_t it_real_incr; /* * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these * values are defined to 0 and 1 respectively */ struct cpu_itimer it[2]; /* * Thread group totals for process CPU timers. * See thread_group_cputimer(), et al, for details. */ struct thread_group_cputimer cputimer; #endif /* Empty if CONFIG_POSIX_TIMERS=n */ struct posix_cputimers posix_cputimers; /* PID/PID hash table linkage. */ struct pid *pids[PIDTYPE_MAX]; #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif struct pid *tty_old_pgrp; /* boolean value for session group leader */ int leader; struct tty_struct *tty; /* NULL if no tty */ #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup *autogroup; #endif /* * Cumulative resource counters for dead threads in the group, * and for reaped dead child processes forked by this group. * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ seqlock_t stats_lock; u64 utime, stime, cutime, cstime; u64 gtime; u64 cgtime; struct prev_cputime prev_cputime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; unsigned long maxrss, cmaxrss; struct task_io_accounting ioac; /* * Cumulative ns of schedule CPU time fo dead threads in the * group, not including a zombie group leader, (This only differs * from jiffies_to_ns(utime + stime) if sched_clock uses something * other than jiffies.) */ unsigned long long sum_sched_runtime; /* * We don't bother to synchronize most readers of this at all, * because there is no reader checking a limit that actually needs * to get both rlim_cur and rlim_max atomically, and either one * alone is a single word that can safely be read normally. * getrlimit/setrlimit use task_lock(current->group_leader) to * protect this instead of the siglock, because they really * have no need to disable irqs. */ struct rlimit rlim[RLIM_NLIMITS]; #ifdef CONFIG_BSD_PROCESS_ACCT struct pacct_struct pacct; /* per-process accounting information */ #endif #ifdef CONFIG_TASKSTATS struct taskstats *stats; #endif #ifdef CONFIG_AUDIT unsigned audit_tty; struct tty_audit_buf *tty_audit_buf; #endif /* * Thread is the potential origin of an oom condition; kill first on * oom */ bool oom_flag_origin; short oom_score_adj; /* OOM kill score adjustment */ short oom_score_adj_min; /* OOM kill score adjustment min value. * Only settable by CAP_SYS_RESOURCE. */ struct mm_struct *oom_mm; /* recorded mm when the thread group got * killed by the oom killer */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) * Deprecated do not use in new code. * Use exec_update_lock instead. */ struct rw_semaphore exec_update_lock; /* Held while task_struct is * being updated during exec, * and may have inconsistent * permissions. */ } __randomize_layout; /* * Bits in flags field of signal_struct. */ #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ /* * Pending notifications to parent. */ #define SIGNAL_CLD_STOPPED 0x00000010 #define SIGNAL_CLD_CONTINUED 0x00000020 #define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ #define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \ SIGNAL_STOP_CONTINUED) static inline void signal_set_stop_flags(struct signal_struct *sig, unsigned int flags) { WARN_ON(sig->flags & SIGNAL_GROUP_EXIT); sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; } extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); extern int dequeue_signal(struct task_struct *task, sigset_t *mask, kernel_siginfo_t *info, enum pid_type *type); static inline int kernel_dequeue_signal(void) { struct task_struct *task = current; kernel_siginfo_t __info; enum pid_type __type; int ret; spin_lock_irq(&task->sighand->siglock); ret = dequeue_signal(task, &task->blocked, &__info, &__type); spin_unlock_irq(&task->sighand->siglock); return ret; } static inline void kernel_signal_stop(void) { spin_lock_irq(¤t->sighand->siglock); if (current->jobctl & JOBCTL_STOP_DEQUEUED) { current->jobctl |= JOBCTL_STOPPED; set_special_state(TASK_STOPPED); } spin_unlock_irq(¤t->sighand->siglock); schedule(); } int force_sig_fault_to_task(int sig, int code, void __user *addr, struct task_struct *t); int force_sig_fault(int sig, int code, void __user *addr); int send_sig_fault(int sig, int code, void __user *addr, struct task_struct *t); int force_sig_mceerr(int code, void __user *, short); int send_sig_mceerr(int code, void __user *, short, struct task_struct *); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper); int force_sig_pkuerr(void __user *addr, u32 pkey); int send_sig_perf(void __user *addr, u32 type, u64 sig_data); int force_sig_ptrace_errno_trap(int errno, void __user *addr); int force_sig_fault_trapno(int sig, int code, void __user *addr, int trapno); int send_sig_fault_trapno(int sig, int code, void __user *addr, int trapno, struct task_struct *t); int force_sig_seccomp(int syscall, int reason, bool force_coredump); extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *); extern void force_sigsegv(int sig); extern int force_sig_info(struct kernel_siginfo *); extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp); extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid); extern int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *, const struct cred *); extern int kill_pgrp(struct pid *pid, int sig, int priv); extern int kill_pid(struct pid *pid, int sig, int priv); extern __must_check bool do_notify_parent(struct task_struct *, int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int); extern void force_fatal_sig(int); extern void force_exit_sig(int); extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern struct sigqueue *sigqueue_alloc(void); extern void sigqueue_free(struct sigqueue *); extern int send_sigqueue(struct sigqueue *, struct pid *, enum pid_type); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); static inline void clear_notify_signal(void) { clear_thread_flag(TIF_NOTIFY_SIGNAL); smp_mb__after_atomic(); } /* * Returns 'true' if kick_process() is needed to force a transition from * user -> kernel to guarantee expedient run of TWA_SIGNAL based task_work. */ static inline bool __set_notify_signal(struct task_struct *task) { return !test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && !wake_up_state(task, TASK_INTERRUPTIBLE); } /* * Called to break out of interruptible wait loops, and enter the * exit_to_user_mode_loop(). */ static inline void set_notify_signal(struct task_struct *task) { if (__set_notify_signal(task)) kick_process(task); } static inline int restart_syscall(void) { set_tsk_thread_flag(current, TIF_SIGPENDING); return -ERESTARTNOINTR; } static inline int task_sigpending(struct task_struct *p) { return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); } static inline int signal_pending(struct task_struct *p) { /* * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same * behavior in terms of ensuring that we break out of wait loops * so that notify signal callbacks can be processed. */ if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL))) return 1; return task_sigpending(p); } static inline int __fatal_signal_pending(struct task_struct *p) { return unlikely(sigismember(&p->pending.signal, SIGKILL)); } static inline int fatal_signal_pending(struct task_struct *p) { return task_sigpending(p) && __fatal_signal_pending(p); } static inline int signal_pending_state(unsigned int state, struct task_struct *p) { if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) return 0; if (!signal_pending(p)) return 0; return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); } /* * This should only be used in fault handlers to decide whether we * should stop the current fault routine to handle the signals * instead, especially with the case where we've got interrupted with * a VM_FAULT_RETRY. */ static inline bool fault_signal_pending(vm_fault_t fault_flags, struct pt_regs *regs) { return unlikely((fault_flags & VM_FAULT_RETRY) && (fatal_signal_pending(current) || (user_mode(regs) && signal_pending(current)))); } /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. * This is required every time the blocked sigset_t changes. * callers must hold sighand->siglock. */ extern void recalc_sigpending(void); extern void calculate_sigpending(void); extern void signal_wake_up_state(struct task_struct *t, unsigned int state); static inline void signal_wake_up(struct task_struct *t, bool fatal) { unsigned int state = 0; if (fatal && !(t->jobctl & JOBCTL_PTRACE_FROZEN)) { t->jobctl &= ~(JOBCTL_STOPPED | JOBCTL_TRACED); state = TASK_WAKEKILL | __TASK_TRACED; } signal_wake_up_state(t, state); } static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) { unsigned int state = 0; if (resume) { t->jobctl &= ~JOBCTL_TRACED; state = __TASK_TRACED; } signal_wake_up_state(t, state); } void task_join_group_stop(struct task_struct *task); #ifdef TIF_RESTORE_SIGMASK /* * Legacy restore_sigmask accessors. These are inefficient on * SMP architectures because they require atomic operations. */ /** * set_restore_sigmask() - make sure saved_sigmask processing gets done * * This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code * will run before returning to user mode, to process the flag. For * all callers, TIF_SIGPENDING is already set or it's no harm to set * it. TIF_RESTORE_SIGMASK need not be in the set of bits that the * arch code will notice on return to user mode, in case those bits * are scarce. We set TIF_SIGPENDING here to ensure that the arch * signal code always gets run when TIF_RESTORE_SIGMASK is set. */ static inline void set_restore_sigmask(void) { set_thread_flag(TIF_RESTORE_SIGMASK); } static inline void clear_tsk_restore_sigmask(struct task_struct *task) { clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline void clear_restore_sigmask(void) { clear_thread_flag(TIF_RESTORE_SIGMASK); } static inline bool test_tsk_restore_sigmask(struct task_struct *task) { return test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline bool test_restore_sigmask(void) { return test_thread_flag(TIF_RESTORE_SIGMASK); } static inline bool test_and_clear_restore_sigmask(void) { return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK); } #else /* TIF_RESTORE_SIGMASK */ /* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. */ static inline void set_restore_sigmask(void) { current->restore_sigmask = true; } static inline void clear_tsk_restore_sigmask(struct task_struct *task) { task->restore_sigmask = false; } static inline void clear_restore_sigmask(void) { current->restore_sigmask = false; } static inline bool test_restore_sigmask(void) { return current->restore_sigmask; } static inline bool test_tsk_restore_sigmask(struct task_struct *task) { return task->restore_sigmask; } static inline bool test_and_clear_restore_sigmask(void) { if (!current->restore_sigmask) return false; current->restore_sigmask = false; return true; } #endif static inline void restore_saved_sigmask(void) { if (test_and_clear_restore_sigmask()) __set_current_blocked(¤t->saved_sigmask); } extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize); static inline void restore_saved_sigmask_unless(bool interrupted) { if (interrupted) WARN_ON(!signal_pending(current)); else restore_saved_sigmask(); } static inline sigset_t *sigmask_to_save(void) { sigset_t *res = ¤t->blocked; if (unlikely(test_restore_sigmask())) res = ¤t->saved_sigmask; return res; } static inline int kill_cad_pid(int sig, int priv) { return kill_pid(cad_pid, sig, priv); } /* These can be the second arg to send_sig_info/send_group_sig_info. */ #define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0) #define SEND_SIG_PRIV ((struct kernel_siginfo *) 1) static inline int __on_sig_stack(unsigned long sp) { #ifdef CONFIG_STACK_GROWSUP return sp >= current->sas_ss_sp && sp - current->sas_ss_sp < current->sas_ss_size; #else return sp > current->sas_ss_sp && sp - current->sas_ss_sp <= current->sas_ss_size; #endif } /* * True if we are on the alternate signal stack. */ static inline int on_sig_stack(unsigned long sp) { /* * If the signal stack is SS_AUTODISARM then, by construction, we * can't be on the signal stack unless user code deliberately set * SS_AUTODISARM when we were already on it. * * This improves reliability: if user state gets corrupted such that * the stack pointer points very close to the end of the signal stack, * then this check will enable the signal to be handled anyway. */ if (current->sas_ss_flags & SS_AUTODISARM) return 0; return __on_sig_stack(sp); } static inline int sas_ss_flags(unsigned long sp) { if (!current->sas_ss_size) return SS_DISABLE; return on_sig_stack(sp) ? SS_ONSTACK : 0; } static inline void sas_ss_reset(struct task_struct *p) { p->sas_ss_sp = 0; p->sas_ss_size = 0; p->sas_ss_flags = SS_DISABLE; } static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) { if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp)) #ifdef CONFIG_STACK_GROWSUP return current->sas_ss_sp; #else return current->sas_ss_sp + current->sas_ss_size; #endif return sp; } extern void __cleanup_sighand(struct sighand_struct *); extern void flush_itimer_signals(void); #define tasklist_empty() \ list_empty(&init_task.tasks) #define next_task(p) \ list_entry_rcu((p)->tasks.next, struct task_struct, tasks) #define for_each_process(p) \ for (p = &init_task ; (p = next_task(p)) != &init_task ; ) extern bool current_is_single_threaded(void); /* * Without tasklist/siglock it is only rcu-safe if g can't exit/exec, * otherwise next_thread(t) will never reach g after list_del_rcu(g). */ #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) #define for_other_threads(p, t) \ for (t = p; (t = next_thread(t)) != p; ) #define __for_each_thread(signal, t) \ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \ lockdep_is_held(&tasklist_lock)) #define for_each_thread(p, t) \ __for_each_thread((p)->signal, t) /* Careful: this is a double loop, 'break' won't work as expected. */ #define for_each_process_thread(p, t) \ for_each_process(p) for_each_thread(p, t) typedef int (*proc_visitor)(struct task_struct *p, void *data); void walk_process_tree(struct task_struct *top, proc_visitor, void *); static inline struct pid *task_pid_type(struct task_struct *task, enum pid_type type) { struct pid *pid; if (type == PIDTYPE_PID) pid = task_pid(task); else pid = task->signal->pids[type]; return pid; } static inline struct pid *task_tgid(struct task_struct *task) { return task->signal->pids[PIDTYPE_TGID]; } /* * Without tasklist or RCU lock it is not safe to dereference * the result of task_pgrp/task_session even if task == current, * we can race with another thread doing sys_setsid/sys_setpgid. */ static inline struct pid *task_pgrp(struct task_struct *task) { return task->signal->pids[PIDTYPE_PGID]; } static inline struct pid *task_session(struct task_struct *task) { return task->signal->pids[PIDTYPE_SID]; } static inline int get_nr_threads(struct task_struct *task) { return task->signal->nr_threads; } static inline bool thread_group_leader(struct task_struct *p) { return p->exit_signal >= 0; } static inline bool same_thread_group(struct task_struct *p1, struct task_struct *p2) { return p1->signal == p2->signal; } /* * returns NULL if p is the last thread in the thread group */ static inline struct task_struct *__next_thread(struct task_struct *p) { return list_next_or_null_rcu(&p->signal->thread_head, &p->thread_node, struct task_struct, thread_node); } static inline struct task_struct *next_thread(struct task_struct *p) { return __next_thread(p) ?: p->group_leader; } static inline int thread_group_empty(struct task_struct *p) { return thread_group_leader(p) && list_is_last(&p->thread_node, &p->signal->thread_head); } #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) extern struct sighand_struct *__lock_task_sighand(struct task_struct *task, unsigned long *flags); static inline struct sighand_struct *lock_task_sighand(struct task_struct *task, unsigned long *flags) { struct sighand_struct *ret; ret = __lock_task_sighand(task, flags); (void)__cond_lock(&task->sighand->siglock, ret); return ret; } static inline void unlock_task_sighand(struct task_struct *task, unsigned long *flags) { spin_unlock_irqrestore(&task->sighand->siglock, *flags); } #ifdef CONFIG_LOCKDEP extern void lockdep_assert_task_sighand_held(struct task_struct *task); #else static inline void lockdep_assert_task_sighand_held(struct task_struct *task) { } #endif static inline unsigned long task_rlimit(const struct task_struct *task, unsigned int limit) { return READ_ONCE(task->signal->rlim[limit].rlim_cur); } static inline unsigned long task_rlimit_max(const struct task_struct *task, unsigned int limit) { return READ_ONCE(task->signal->rlim[limit].rlim_max); } static inline unsigned long rlimit(unsigned int limit) { return task_rlimit(current, limit); } static inline unsigned long rlimit_max(unsigned int limit) { return task_rlimit_max(current, limit); } #endif /* _LINUX_SCHED_SIGNAL_H */ |
24 23 1 14 1 8 5 13 1 4 4 32 1 5 1 1 1 22 3 1 2 22 1 14 2 2 4 2 43 1 6 6 2 2 34 34 12 4 32 1 29 2 2 28 4 2 1 1 6 1 2 2 1 6 5 1 1 2 1 4 1 2 2 9 1 3 6 6 6 6 5 5 5 8 1 2 1 1 1 1 1 2 46 46 1 46 4 4 12 12 42 4 3 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 | /* * Copyright (c) 2016 Intel Corporation * * Permission to use, copy, modify, distribute, and sell this software and its * documentation for any purpose is hereby granted without fee, provided that * the above copyright notice appear in all copies and that both that copyright * notice and this permission notice appear in supporting documentation, and * that the name of the copyright holders not be used in advertising or * publicity pertaining to distribution of the software without specific, * written prior permission. The copyright holders make no representations * about the suitability of this software for any purpose. It is provided "as * is" without express or implied warranty. * * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE * OF THIS SOFTWARE. */ #include <linux/export.h> #include <linux/uaccess.h> #include <drm/drm_atomic.h> #include <drm/drm_atomic_uapi.h> #include <drm/drm_auth.h> #include <drm/drm_debugfs.h> #include <drm/drm_drv.h> #include <drm/drm_file.h> #include <drm/drm_fourcc.h> #include <drm/drm_framebuffer.h> #include <drm/drm_gem.h> #include <drm/drm_print.h> #include <drm/drm_util.h> #include "drm_crtc_internal.h" #include "drm_internal.h" /** * DOC: overview * * Frame buffers are abstract memory objects that provide a source of pixels to * scanout to a CRTC. Applications explicitly request the creation of frame * buffers through the DRM_IOCTL_MODE_ADDFB(2) ioctls and receive an opaque * handle that can be passed to the KMS CRTC control, plane configuration and * page flip functions. * * Frame buffers rely on the underlying memory manager for allocating backing * storage. When creating a frame buffer applications pass a memory handle * (or a list of memory handles for multi-planar formats) through the * &struct drm_mode_fb_cmd2 argument. For drivers using GEM as their userspace * buffer management interface this would be a GEM handle. Drivers are however * free to use their own backing storage object handles, e.g. vmwgfx directly * exposes special TTM handles to userspace and so expects TTM handles in the * create ioctl and not GEM handles. * * Framebuffers are tracked with &struct drm_framebuffer. They are published * using drm_framebuffer_init() - after calling that function userspace can use * and access the framebuffer object. The helper function * drm_helper_mode_fill_fb_struct() can be used to pre-fill the required * metadata fields. * * The lifetime of a drm framebuffer is controlled with a reference count, * drivers can grab additional references with drm_framebuffer_get() and drop * them again with drm_framebuffer_put(). For driver-private framebuffers for * which the last reference is never dropped (e.g. for the fbdev framebuffer * when the struct &struct drm_framebuffer is embedded into the fbdev helper * struct) drivers can manually clean up a framebuffer at module unload time * with drm_framebuffer_unregister_private(). But doing this is not * recommended, and it's better to have a normal free-standing &struct * drm_framebuffer. */ int drm_framebuffer_check_src_coords(uint32_t src_x, uint32_t src_y, uint32_t src_w, uint32_t src_h, const struct drm_framebuffer *fb) { unsigned int fb_width, fb_height; fb_width = fb->width << 16; fb_height = fb->height << 16; /* Make sure source coordinates are inside the fb. */ if (src_w > fb_width || src_x > fb_width - src_w || src_h > fb_height || src_y > fb_height - src_h) { drm_dbg_kms(fb->dev, "Invalid source coordinates " "%u.%06ux%u.%06u+%u.%06u+%u.%06u (fb %ux%u)\n", src_w >> 16, ((src_w & 0xffff) * 15625) >> 10, src_h >> 16, ((src_h & 0xffff) * 15625) >> 10, src_x >> 16, ((src_x & 0xffff) * 15625) >> 10, src_y >> 16, ((src_y & 0xffff) * 15625) >> 10, fb->width, fb->height); return -ENOSPC; } return 0; } /** * drm_mode_addfb - add an FB to the graphics configuration * @dev: drm device for the ioctl * @or: pointer to request structure * @file_priv: drm file * * Add a new FB to the specified CRTC, given a user request. This is the * original addfb ioctl which only supported RGB formats. * * Called by the user via ioctl, or by an in-kernel client. * * Returns: * Zero on success, negative errno on failure. */ int drm_mode_addfb(struct drm_device *dev, struct drm_mode_fb_cmd *or, struct drm_file *file_priv) { struct drm_mode_fb_cmd2 r = {}; int ret; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EOPNOTSUPP; r.pixel_format = drm_driver_legacy_fb_format(dev, or->bpp, or->depth); if (r.pixel_format == DRM_FORMAT_INVALID) { drm_dbg_kms(dev, "bad {bpp:%d, depth:%d}\n", or->bpp, or->depth); return -EINVAL; } /* convert to new format and call new ioctl */ r.fb_id = or->fb_id; r.width = or->width; r.height = or->height; r.pitches[0] = or->pitch; r.handles[0] = or->handle; ret = drm_mode_addfb2(dev, &r, file_priv); if (ret) return ret; or->fb_id = r.fb_id; return 0; } int drm_mode_addfb_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { return drm_mode_addfb(dev, data, file_priv); } static int framebuffer_check(struct drm_device *dev, const struct drm_mode_fb_cmd2 *r) { const struct drm_format_info *info; int i; /* check if the format is supported at all */ if (!__drm_format_info(r->pixel_format)) { drm_dbg_kms(dev, "bad framebuffer format %p4cc\n", &r->pixel_format); return -EINVAL; } if (r->width == 0) { drm_dbg_kms(dev, "bad framebuffer width %u\n", r->width); return -EINVAL; } if (r->height == 0) { drm_dbg_kms(dev, "bad framebuffer height %u\n", r->height); return -EINVAL; } /* now let the driver pick its own format info */ info = drm_get_format_info(dev, r); for (i = 0; i < info->num_planes; i++) { unsigned int width = drm_format_info_plane_width(info, r->width, i); unsigned int height = drm_format_info_plane_height(info, r->height, i); unsigned int block_size = info->char_per_block[i]; u64 min_pitch = drm_format_info_min_pitch(info, i, width); if (!block_size && (r->modifier[i] == DRM_FORMAT_MOD_LINEAR)) { drm_dbg_kms(dev, "Format requires non-linear modifier for plane %d\n", i); return -EINVAL; } if (!r->handles[i]) { drm_dbg_kms(dev, "no buffer object handle for plane %d\n", i); return -EINVAL; } if (min_pitch > UINT_MAX) return -ERANGE; if ((uint64_t) height * r->pitches[i] + r->offsets[i] > UINT_MAX) return -ERANGE; if (block_size && r->pitches[i] < min_pitch) { drm_dbg_kms(dev, "bad pitch %u for plane %d\n", r->pitches[i], i); return -EINVAL; } if (r->modifier[i] && !(r->flags & DRM_MODE_FB_MODIFIERS)) { drm_dbg_kms(dev, "bad fb modifier %llu for plane %d\n", r->modifier[i], i); return -EINVAL; } if (r->flags & DRM_MODE_FB_MODIFIERS && r->modifier[i] != r->modifier[0]) { drm_dbg_kms(dev, "bad fb modifier %llu for plane %d\n", r->modifier[i], i); return -EINVAL; } /* modifier specific checks: */ switch (r->modifier[i]) { case DRM_FORMAT_MOD_SAMSUNG_64_32_TILE: /* NOTE: the pitch restriction may be lifted later if it turns * out that no hw has this restriction: */ if (r->pixel_format != DRM_FORMAT_NV12 || width % 128 || height % 32 || r->pitches[i] % 128) { drm_dbg_kms(dev, "bad modifier data for plane %d\n", i); return -EINVAL; } break; default: break; } } for (i = info->num_planes; i < 4; i++) { if (r->modifier[i]) { drm_dbg_kms(dev, "non-zero modifier for unused plane %d\n", i); return -EINVAL; } /* Pre-FB_MODIFIERS userspace didn't clear the structs properly. */ if (!(r->flags & DRM_MODE_FB_MODIFIERS)) continue; if (r->handles[i]) { drm_dbg_kms(dev, "buffer object handle for unused plane %d\n", i); return -EINVAL; } if (r->pitches[i]) { drm_dbg_kms(dev, "non-zero pitch for unused plane %d\n", i); return -EINVAL; } if (r->offsets[i]) { drm_dbg_kms(dev, "non-zero offset for unused plane %d\n", i); return -EINVAL; } } return 0; } struct drm_framebuffer * drm_internal_framebuffer_create(struct drm_device *dev, const struct drm_mode_fb_cmd2 *r, struct drm_file *file_priv) { struct drm_mode_config *config = &dev->mode_config; struct drm_framebuffer *fb; int ret; if (r->flags & ~(DRM_MODE_FB_INTERLACED | DRM_MODE_FB_MODIFIERS)) { drm_dbg_kms(dev, "bad framebuffer flags 0x%08x\n", r->flags); return ERR_PTR(-EINVAL); } if ((config->min_width > r->width) || (r->width > config->max_width)) { drm_dbg_kms(dev, "bad framebuffer width %d, should be >= %d && <= %d\n", r->width, config->min_width, config->max_width); return ERR_PTR(-EINVAL); } if ((config->min_height > r->height) || (r->height > config->max_height)) { drm_dbg_kms(dev, "bad framebuffer height %d, should be >= %d && <= %d\n", r->height, config->min_height, config->max_height); return ERR_PTR(-EINVAL); } if (r->flags & DRM_MODE_FB_MODIFIERS && dev->mode_config.fb_modifiers_not_supported) { drm_dbg_kms(dev, "driver does not support fb modifiers\n"); return ERR_PTR(-EINVAL); } ret = framebuffer_check(dev, r); if (ret) return ERR_PTR(ret); fb = dev->mode_config.funcs->fb_create(dev, file_priv, r); if (IS_ERR(fb)) { drm_dbg_kms(dev, "could not create framebuffer\n"); return fb; } return fb; } EXPORT_SYMBOL_FOR_TESTS_ONLY(drm_internal_framebuffer_create); /** * drm_mode_addfb2 - add an FB to the graphics configuration * @dev: drm device for the ioctl * @data: data pointer for the ioctl * @file_priv: drm file for the ioctl call * * Add a new FB to the specified CRTC, given a user request with format. This is * the 2nd version of the addfb ioctl, which supports multi-planar framebuffers * and uses fourcc codes as pixel format specifiers. * * Called by the user via ioctl. * * Returns: * Zero on success, negative errno on failure. */ int drm_mode_addfb2(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_mode_fb_cmd2 *r = data; struct drm_framebuffer *fb; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EOPNOTSUPP; fb = drm_internal_framebuffer_create(dev, r, file_priv); if (IS_ERR(fb)) return PTR_ERR(fb); drm_dbg_kms(dev, "[FB:%d]\n", fb->base.id); r->fb_id = fb->base.id; /* Transfer ownership to the filp for reaping on close */ mutex_lock(&file_priv->fbs_lock); list_add(&fb->filp_head, &file_priv->fbs); mutex_unlock(&file_priv->fbs_lock); return 0; } int drm_mode_addfb2_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { #ifdef __BIG_ENDIAN if (!dev->mode_config.quirk_addfb_prefer_host_byte_order) { /* * Drivers must set the * quirk_addfb_prefer_host_byte_order quirk to make * the drm_mode_addfb() compat code work correctly on * bigendian machines. * * If they don't they interpret pixel_format values * incorrectly for bug compatibility, which in turn * implies the ADDFB2 ioctl does not work correctly * then. So block it to make userspace fallback to * ADDFB. */ drm_dbg_kms(dev, "addfb2 broken on bigendian"); return -EOPNOTSUPP; } #endif return drm_mode_addfb2(dev, data, file_priv); } struct drm_mode_rmfb_work { struct work_struct work; struct list_head fbs; }; static void drm_mode_rmfb_work_fn(struct work_struct *w) { struct drm_mode_rmfb_work *arg = container_of(w, typeof(*arg), work); while (!list_empty(&arg->fbs)) { struct drm_framebuffer *fb = list_first_entry(&arg->fbs, typeof(*fb), filp_head); drm_dbg_kms(fb->dev, "Removing [FB:%d] from all active usage due to RMFB ioctl\n", fb->base.id); list_del_init(&fb->filp_head); drm_framebuffer_remove(fb); } } static int drm_mode_closefb(struct drm_framebuffer *fb, struct drm_file *file_priv) { struct drm_framebuffer *fbl; bool found = false; mutex_lock(&file_priv->fbs_lock); list_for_each_entry(fbl, &file_priv->fbs, filp_head) if (fb == fbl) found = true; if (!found) { mutex_unlock(&file_priv->fbs_lock); return -ENOENT; } list_del_init(&fb->filp_head); mutex_unlock(&file_priv->fbs_lock); /* Drop the reference that was stored in the fbs list */ drm_framebuffer_put(fb); return 0; } /** * drm_mode_rmfb - remove an FB from the configuration * @dev: drm device * @fb_id: id of framebuffer to remove * @file_priv: drm file * * Remove the specified FB. * * Called by the user via ioctl, or by an in-kernel client. * * Returns: * Zero on success, negative errno on failure. */ int drm_mode_rmfb(struct drm_device *dev, u32 fb_id, struct drm_file *file_priv) { struct drm_framebuffer *fb; int ret; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EOPNOTSUPP; fb = drm_framebuffer_lookup(dev, file_priv, fb_id); if (!fb) return -ENOENT; ret = drm_mode_closefb(fb, file_priv); if (ret != 0) { drm_framebuffer_put(fb); return ret; } /* * drm_framebuffer_remove may fail with -EINTR on pending signals, * so run this in a separate stack as there's no way to correctly * handle this after the fb is already removed from the lookup table. */ if (drm_framebuffer_read_refcount(fb) > 1) { struct drm_mode_rmfb_work arg; INIT_WORK_ONSTACK(&arg.work, drm_mode_rmfb_work_fn); INIT_LIST_HEAD(&arg.fbs); drm_WARN_ON(dev, !list_empty(&fb->filp_head)); list_add_tail(&fb->filp_head, &arg.fbs); schedule_work(&arg.work); flush_work(&arg.work); destroy_work_on_stack(&arg.work); } else drm_framebuffer_put(fb); return 0; } int drm_mode_rmfb_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { uint32_t *fb_id = data; return drm_mode_rmfb(dev, *fb_id, file_priv); } int drm_mode_closefb_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_mode_closefb *r = data; struct drm_framebuffer *fb; int ret; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EOPNOTSUPP; if (r->pad) return -EINVAL; fb = drm_framebuffer_lookup(dev, file_priv, r->fb_id); if (!fb) return -ENOENT; ret = drm_mode_closefb(fb, file_priv); drm_framebuffer_put(fb); return ret; } /** * drm_mode_getfb - get FB info * @dev: drm device for the ioctl * @data: data pointer for the ioctl * @file_priv: drm file for the ioctl call * * Lookup the FB given its ID and return info about it. * * Called by the user via ioctl. * * Returns: * Zero on success, negative errno on failure. */ int drm_mode_getfb(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_mode_fb_cmd *r = data; struct drm_framebuffer *fb; int ret; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EOPNOTSUPP; fb = drm_framebuffer_lookup(dev, file_priv, r->fb_id); if (!fb) return -ENOENT; /* Multi-planar framebuffers need getfb2. */ if (fb->format->num_planes > 1) { ret = -EINVAL; goto out; } if (!fb->funcs->create_handle) { ret = -ENODEV; goto out; } r->height = fb->height; r->width = fb->width; r->depth = fb->format->depth; r->bpp = drm_format_info_bpp(fb->format, 0); r->pitch = fb->pitches[0]; /* GET_FB() is an unprivileged ioctl so we must not return a * buffer-handle to non-master processes! For * backwards-compatibility reasons, we cannot make GET_FB() privileged, * so just return an invalid handle for non-masters. */ if (!drm_is_current_master(file_priv) && !capable(CAP_SYS_ADMIN)) { r->handle = 0; ret = 0; goto out; } ret = fb->funcs->create_handle(fb, file_priv, &r->handle); out: drm_framebuffer_put(fb); return ret; } /** * drm_mode_getfb2_ioctl - get extended FB info * @dev: drm device for the ioctl * @data: data pointer for the ioctl * @file_priv: drm file for the ioctl call * * Lookup the FB given its ID and return info about it. * * Called by the user via ioctl. * * Returns: * Zero on success, negative errno on failure. */ int drm_mode_getfb2_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_mode_fb_cmd2 *r = data; struct drm_framebuffer *fb; unsigned int i; int ret = 0; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EINVAL; fb = drm_framebuffer_lookup(dev, file_priv, r->fb_id); if (!fb) return -ENOENT; /* For multi-plane framebuffers, we require the driver to place the * GEM objects directly in the drm_framebuffer. For single-plane * framebuffers, we can fall back to create_handle. */ if (!fb->obj[0] && (fb->format->num_planes > 1 || !fb->funcs->create_handle)) { ret = -ENODEV; goto out; } r->height = fb->height; r->width = fb->width; r->pixel_format = fb->format->format; r->flags = 0; if (!dev->mode_config.fb_modifiers_not_supported) r->flags |= DRM_MODE_FB_MODIFIERS; for (i = 0; i < ARRAY_SIZE(r->handles); i++) { r->handles[i] = 0; r->pitches[i] = 0; r->offsets[i] = 0; r->modifier[i] = 0; } for (i = 0; i < fb->format->num_planes; i++) { r->pitches[i] = fb->pitches[i]; r->offsets[i] = fb->offsets[i]; if (!dev->mode_config.fb_modifiers_not_supported) r->modifier[i] = fb->modifier; } /* GET_FB2() is an unprivileged ioctl so we must not return a * buffer-handle to non master/root processes! To match GET_FB() * just return invalid handles (0) for non masters/root * rather than making GET_FB2() privileged. */ if (!drm_is_current_master(file_priv) && !capable(CAP_SYS_ADMIN)) { ret = 0; goto out; } for (i = 0; i < fb->format->num_planes; i++) { int j; /* If we reuse the same object for multiple planes, also * return the same handle. */ for (j = 0; j < i; j++) { if (fb->obj[i] == fb->obj[j]) { r->handles[i] = r->handles[j]; break; } } if (r->handles[i]) continue; if (fb->obj[i]) { ret = drm_gem_handle_create(file_priv, fb->obj[i], &r->handles[i]); } else { WARN_ON(i > 0); ret = fb->funcs->create_handle(fb, file_priv, &r->handles[i]); } if (ret != 0) goto out; } out: if (ret != 0) { /* Delete any previously-created handles on failure. */ for (i = 0; i < ARRAY_SIZE(r->handles); i++) { int j; if (r->handles[i]) drm_gem_handle_delete(file_priv, r->handles[i]); /* Zero out any handles identical to the one we just * deleted. */ for (j = i + 1; j < ARRAY_SIZE(r->handles); j++) { if (r->handles[j] == r->handles[i]) r->handles[j] = 0; } } } drm_framebuffer_put(fb); return ret; } /** * drm_mode_dirtyfb_ioctl - flush frontbuffer rendering on an FB * @dev: drm device for the ioctl * @data: data pointer for the ioctl * @file_priv: drm file for the ioctl call * * Lookup the FB and flush out the damaged area supplied by userspace as a clip * rectangle list. Generic userspace which does frontbuffer rendering must call * this ioctl to flush out the changes on manual-update display outputs, e.g. * usb display-link, mipi manual update panels or edp panel self refresh modes. * * Modesetting drivers which always update the frontbuffer do not need to * implement the corresponding &drm_framebuffer_funcs.dirty callback. * * Called by the user via ioctl. * * Returns: * Zero on success, negative errno on failure. */ int drm_mode_dirtyfb_ioctl(struct drm_device *dev, void *data, struct drm_file *file_priv) { struct drm_clip_rect __user *clips_ptr; struct drm_clip_rect *clips = NULL; struct drm_mode_fb_dirty_cmd *r = data; struct drm_framebuffer *fb; unsigned flags; int num_clips; int ret; if (!drm_core_check_feature(dev, DRIVER_MODESET)) return -EOPNOTSUPP; fb = drm_framebuffer_lookup(dev, file_priv, r->fb_id); if (!fb) return -ENOENT; num_clips = r->num_clips; clips_ptr = (struct drm_clip_rect __user *)(unsigned long)r->clips_ptr; if (!num_clips != !clips_ptr) { ret = -EINVAL; goto out_err1; } flags = DRM_MODE_FB_DIRTY_FLAGS & r->flags; /* If userspace annotates copy, clips must come in pairs */ if (flags & DRM_MODE_FB_DIRTY_ANNOTATE_COPY && (num_clips % 2)) { ret = -EINVAL; goto out_err1; } if (num_clips && clips_ptr) { if (num_clips < 0 || num_clips > DRM_MODE_FB_DIRTY_MAX_CLIPS) { ret = -EINVAL; goto out_err1; } clips = kcalloc(num_clips, sizeof(*clips), GFP_KERNEL); if (!clips) { ret = -ENOMEM; goto out_err1; } ret = copy_from_user(clips, clips_ptr, num_clips * sizeof(*clips)); if (ret) { ret = -EFAULT; goto out_err2; } } if (fb->funcs->dirty) { ret = fb->funcs->dirty(fb, file_priv, flags, r->color, clips, num_clips); } else { ret = -ENOSYS; } out_err2: kfree(clips); out_err1: drm_framebuffer_put(fb); return ret; } /** * drm_fb_release - remove and free the FBs on this file * @priv: drm file for the ioctl * * Destroy all the FBs associated with @filp. * * Called by the user via ioctl. * * Returns: * Zero on success, negative errno on failure. */ void drm_fb_release(struct drm_file *priv) { struct drm_framebuffer *fb, *tfb; struct drm_mode_rmfb_work arg; INIT_LIST_HEAD(&arg.fbs); /* * When the file gets released that means no one else can access the fb * list any more, so no need to grab fpriv->fbs_lock. And we need to * avoid upsetting lockdep since the universal cursor code adds a * framebuffer while holding mutex locks. * * Note that a real deadlock between fpriv->fbs_lock and the modeset * locks is impossible here since no one else but this function can get * at it any more. */ list_for_each_entry_safe(fb, tfb, &priv->fbs, filp_head) { if (drm_framebuffer_read_refcount(fb) > 1) { list_move_tail(&fb->filp_head, &arg.fbs); } else { list_del_init(&fb->filp_head); /* This drops the fpriv->fbs reference. */ drm_framebuffer_put(fb); } } if (!list_empty(&arg.fbs)) { INIT_WORK_ONSTACK(&arg.work, drm_mode_rmfb_work_fn); schedule_work(&arg.work); flush_work(&arg.work); destroy_work_on_stack(&arg.work); } } void drm_framebuffer_free(struct kref *kref) { struct drm_framebuffer *fb = container_of(kref, struct drm_framebuffer, base.refcount); struct drm_device *dev = fb->dev; drm_WARN_ON(dev, !list_empty(&fb->filp_head)); /* * The lookup idr holds a weak reference, which has not necessarily been * removed at this point. Check for that. */ drm_mode_object_unregister(dev, &fb->base); fb->funcs->destroy(fb); } /** * drm_framebuffer_init - initialize a framebuffer * @dev: DRM device * @fb: framebuffer to be initialized * @funcs: ... with these functions * * Allocates an ID for the framebuffer's parent mode object, sets its mode * functions & device file and adds it to the master fd list. * * IMPORTANT: * This functions publishes the fb and makes it available for concurrent access * by other users. Which means by this point the fb _must_ be fully set up - * since all the fb attributes are invariant over its lifetime, no further * locking but only correct reference counting is required. * * Returns: * Zero on success, error code on failure. */ int drm_framebuffer_init(struct drm_device *dev, struct drm_framebuffer *fb, const struct drm_framebuffer_funcs *funcs) { int ret; if (WARN_ON_ONCE(fb->dev != dev || !fb->format)) return -EINVAL; INIT_LIST_HEAD(&fb->filp_head); fb->funcs = funcs; strcpy(fb->comm, current->comm); ret = __drm_mode_object_add(dev, &fb->base, DRM_MODE_OBJECT_FB, false, drm_framebuffer_free); if (ret) goto out; mutex_lock(&dev->mode_config.fb_lock); dev->mode_config.num_fb++; list_add(&fb->head, &dev->mode_config.fb_list); mutex_unlock(&dev->mode_config.fb_lock); drm_mode_object_register(dev, &fb->base); out: return ret; } EXPORT_SYMBOL(drm_framebuffer_init); /** * drm_framebuffer_lookup - look up a drm framebuffer and grab a reference * @dev: drm device * @file_priv: drm file to check for lease against. * @id: id of the fb object * * If successful, this grabs an additional reference to the framebuffer - * callers need to make sure to eventually unreference the returned framebuffer * again, using drm_framebuffer_put(). */ struct drm_framebuffer *drm_framebuffer_lookup(struct drm_device *dev, struct drm_file *file_priv, uint32_t id) { struct drm_mode_object *obj; struct drm_framebuffer *fb = NULL; obj = __drm_mode_object_find(dev, file_priv, id, DRM_MODE_OBJECT_FB); if (obj) fb = obj_to_fb(obj); return fb; } EXPORT_SYMBOL(drm_framebuffer_lookup); /** * drm_framebuffer_unregister_private - unregister a private fb from the lookup idr * @fb: fb to unregister * * Drivers need to call this when cleaning up driver-private framebuffers, e.g. * those used for fbdev. Note that the caller must hold a reference of its own, * i.e. the object may not be destroyed through this call (since it'll lead to a * locking inversion). * * NOTE: This function is deprecated. For driver-private framebuffers it is not * recommended to embed a framebuffer struct info fbdev struct, instead, a * framebuffer pointer is preferred and drm_framebuffer_put() should be called * when the framebuffer is to be cleaned up. */ void drm_framebuffer_unregister_private(struct drm_framebuffer *fb) { struct drm_device *dev; if (!fb) return; dev = fb->dev; /* Mark fb as reaped and drop idr ref. */ drm_mode_object_unregister(dev, &fb->base); } EXPORT_SYMBOL(drm_framebuffer_unregister_private); /** * drm_framebuffer_cleanup - remove a framebuffer object * @fb: framebuffer to remove * * Cleanup framebuffer. This function is intended to be used from the drivers * &drm_framebuffer_funcs.destroy callback. It can also be used to clean up * driver private framebuffers embedded into a larger structure. * * Note that this function does not remove the fb from active usage - if it is * still used anywhere, hilarity can ensue since userspace could call getfb on * the id and get back -EINVAL. Obviously no concern at driver unload time. * * Also, the framebuffer will not be removed from the lookup idr - for * user-created framebuffers this will happen in the rmfb ioctl. For * driver-private objects (e.g. for fbdev) drivers need to explicitly call * drm_framebuffer_unregister_private. */ void drm_framebuffer_cleanup(struct drm_framebuffer *fb) { struct drm_device *dev = fb->dev; mutex_lock(&dev->mode_config.fb_lock); list_del(&fb->head); dev->mode_config.num_fb--; mutex_unlock(&dev->mode_config.fb_lock); } EXPORT_SYMBOL(drm_framebuffer_cleanup); static int atomic_remove_fb(struct drm_framebuffer *fb) { struct drm_modeset_acquire_ctx ctx; struct drm_device *dev = fb->dev; struct drm_atomic_state *state; struct drm_plane *plane; struct drm_connector *conn __maybe_unused; struct drm_connector_state *conn_state; int i, ret; unsigned plane_mask; bool disable_crtcs = false; retry_disable: drm_modeset_acquire_init(&ctx, 0); state = drm_atomic_state_alloc(dev); if (!state) { ret = -ENOMEM; goto out; } state->acquire_ctx = &ctx; retry: plane_mask = 0; ret = drm_modeset_lock_all_ctx(dev, &ctx); if (ret) goto unlock; drm_for_each_plane(plane, dev) { struct drm_plane_state *plane_state; if (plane->state->fb != fb) continue; drm_dbg_kms(dev, "Disabling [PLANE:%d:%s] because [FB:%d] is removed\n", plane->base.id, plane->name, fb->base.id); plane_state = drm_atomic_get_plane_state(state, plane); if (IS_ERR(plane_state)) { ret = PTR_ERR(plane_state); goto unlock; } if (disable_crtcs && plane_state->crtc->primary == plane) { struct drm_crtc_state *crtc_state; drm_dbg_kms(dev, "Disabling [CRTC:%d:%s] because [FB:%d] is removed\n", plane_state->crtc->base.id, plane_state->crtc->name, fb->base.id); crtc_state = drm_atomic_get_existing_crtc_state(state, plane_state->crtc); ret = drm_atomic_add_affected_connectors(state, plane_state->crtc); if (ret) goto unlock; crtc_state->active = false; ret = drm_atomic_set_mode_for_crtc(crtc_state, NULL); if (ret) goto unlock; } drm_atomic_set_fb_for_plane(plane_state, NULL); ret = drm_atomic_set_crtc_for_plane(plane_state, NULL); if (ret) goto unlock; plane_mask |= drm_plane_mask(plane); } /* This list is only filled when disable_crtcs is set. */ for_each_new_connector_in_state(state, conn, conn_state, i) { ret = drm_atomic_set_crtc_for_connector(conn_state, NULL); if (ret) goto unlock; } if (plane_mask) ret = drm_atomic_commit(state); unlock: if (ret == -EDEADLK) { drm_atomic_state_clear(state); drm_modeset_backoff(&ctx); goto retry; } drm_atomic_state_put(state); out: drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); if (ret == -EINVAL && !disable_crtcs) { disable_crtcs = true; goto retry_disable; } return ret; } static void legacy_remove_fb(struct drm_framebuffer *fb) { struct drm_device *dev = fb->dev; struct drm_crtc *crtc; struct drm_plane *plane; drm_modeset_lock_all(dev); /* remove from any CRTC */ drm_for_each_crtc(crtc, dev) { if (crtc->primary->fb == fb) { drm_dbg_kms(dev, "Disabling [CRTC:%d:%s] because [FB:%d] is removed\n", crtc->base.id, crtc->name, fb->base.id); /* should turn off the crtc */ if (drm_crtc_force_disable(crtc)) DRM_ERROR("failed to reset crtc %p when fb was deleted\n", crtc); } } drm_for_each_plane(plane, dev) { if (plane->fb == fb) { drm_dbg_kms(dev, "Disabling [PLANE:%d:%s] because [FB:%d] is removed\n", plane->base.id, plane->name, fb->base.id); drm_plane_force_disable(plane); } } drm_modeset_unlock_all(dev); } /** * drm_framebuffer_remove - remove and unreference a framebuffer object * @fb: framebuffer to remove * * Scans all the CRTCs and planes in @dev's mode_config. If they're * using @fb, removes it, setting it to NULL. Then drops the reference to the * passed-in framebuffer. Might take the modeset locks. * * Note that this function optimizes the cleanup away if the caller holds the * last reference to the framebuffer. It is also guaranteed to not take the * modeset locks in this case. */ void drm_framebuffer_remove(struct drm_framebuffer *fb) { struct drm_device *dev; if (!fb) return; dev = fb->dev; drm_WARN_ON(dev, !list_empty(&fb->filp_head)); /* * drm ABI mandates that we remove any deleted framebuffers from active * usage. But since most sane clients only remove framebuffers they no * longer need, try to optimize this away. * * Since we're holding a reference ourselves, observing a refcount of 1 * means that we're the last holder and can skip it. Also, the refcount * can never increase from 1 again, so we don't need any barriers or * locks. * * Note that userspace could try to race with use and instate a new * usage _after_ we've cleared all current ones. End result will be an * in-use fb with fb-id == 0. Userspace is allowed to shoot its own foot * in this manner. */ if (drm_framebuffer_read_refcount(fb) > 1) { if (drm_drv_uses_atomic_modeset(dev)) { int ret = atomic_remove_fb(fb); WARN(ret, "atomic remove_fb failed with %i\n", ret); } else legacy_remove_fb(fb); } drm_framebuffer_put(fb); } EXPORT_SYMBOL(drm_framebuffer_remove); void drm_framebuffer_print_info(struct drm_printer *p, unsigned int indent, const struct drm_framebuffer *fb) { unsigned int i; drm_printf_indent(p, indent, "allocated by = %s\n", fb->comm); drm_printf_indent(p, indent, "refcount=%u\n", drm_framebuffer_read_refcount(fb)); drm_printf_indent(p, indent, "format=%p4cc\n", &fb->format->format); drm_printf_indent(p, indent, "modifier=0x%llx\n", fb->modifier); drm_printf_indent(p, indent, "size=%ux%u\n", fb->width, fb->height); drm_printf_indent(p, indent, "layers:\n"); for (i = 0; i < fb->format->num_planes; i++) { drm_printf_indent(p, indent + 1, "size[%u]=%dx%d\n", i, drm_format_info_plane_width(fb->format, fb->width, i), drm_format_info_plane_height(fb->format, fb->height, i)); drm_printf_indent(p, indent + 1, "pitch[%u]=%u\n", i, fb->pitches[i]); drm_printf_indent(p, indent + 1, "offset[%u]=%u\n", i, fb->offsets[i]); drm_printf_indent(p, indent + 1, "obj[%u]:%s\n", i, fb->obj[i] ? "" : "(null)"); if (fb->obj[i]) drm_gem_print_info(p, indent + 2, fb->obj[i]); } } #ifdef CONFIG_DEBUG_FS static int drm_framebuffer_info(struct seq_file *m, void *data) { struct drm_debugfs_entry *entry = m->private; struct drm_device *dev = entry->dev; struct drm_printer p = drm_seq_file_printer(m); struct drm_framebuffer *fb; mutex_lock(&dev->mode_config.fb_lock); drm_for_each_fb(fb, dev) { drm_printf(&p, "framebuffer[%u]:\n", fb->base.id); drm_framebuffer_print_info(&p, 1, fb); } mutex_unlock(&dev->mode_config.fb_lock); return 0; } static const struct drm_debugfs_info drm_framebuffer_debugfs_list[] = { { "framebuffer", drm_framebuffer_info, 0 }, }; void drm_framebuffer_debugfs_init(struct drm_device *dev) { drm_debugfs_add_files(dev, drm_framebuffer_debugfs_list, ARRAY_SIZE(drm_framebuffer_debugfs_list)); } #endif |
133 135 139 140 139 8 8 8 7 3 139 139 139 140 139 140 139 36 119 140 140 138 1 184 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 | // SPDX-License-Identifier: GPL-2.0-only /* * Kernel-based Virtual Machine driver for Linux * * Macros and functions to access KVM PTEs (also known as SPTEs) * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2020 Red Hat, Inc. and/or its affiliates. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include "mmu.h" #include "mmu_internal.h" #include "x86.h" #include "spte.h" #include <asm/e820/api.h> #include <asm/memtype.h> #include <asm/vmx.h> bool __read_mostly enable_mmio_caching = true; static bool __ro_after_init allow_mmio_caching; module_param_named(mmio_caching, enable_mmio_caching, bool, 0444); EXPORT_SYMBOL_GPL(enable_mmio_caching); u64 __read_mostly shadow_host_writable_mask; u64 __read_mostly shadow_mmu_writable_mask; u64 __read_mostly shadow_nx_mask; u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ u64 __read_mostly shadow_user_mask; u64 __read_mostly shadow_accessed_mask; u64 __read_mostly shadow_dirty_mask; u64 __read_mostly shadow_mmio_value; u64 __read_mostly shadow_mmio_mask; u64 __read_mostly shadow_mmio_access_mask; u64 __read_mostly shadow_present_mask; u64 __read_mostly shadow_memtype_mask; u64 __read_mostly shadow_me_value; u64 __read_mostly shadow_me_mask; u64 __read_mostly shadow_acc_track_mask; u64 __read_mostly shadow_nonpresent_or_rsvd_mask; u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask; static u8 __init kvm_get_host_maxphyaddr(void) { /* * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected * in CPU detection code, but the processor treats those reduced bits as * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at * the physical address bits reported by CPUID, i.e. the raw MAXPHYADDR, * when reasoning about CPU behavior with respect to MAXPHYADDR. */ if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008)) return cpuid_eax(0x80000008) & 0xff; /* * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with * custom CPUID. Proceed with whatever the kernel found since these features * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008). */ return boot_cpu_data.x86_phys_bits; } void __init kvm_mmu_spte_module_init(void) { /* * Snapshot userspace's desire to allow MMIO caching. Whether or not * KVM can actually enable MMIO caching depends on vendor-specific * hardware capabilities and other module params that can't be resolved * until the vendor module is loaded, i.e. enable_mmio_caching can and * will change when the vendor module is (re)loaded. */ allow_mmio_caching = enable_mmio_caching; kvm_host.maxphyaddr = kvm_get_host_maxphyaddr(); } static u64 generation_mmio_spte_mask(u64 gen) { u64 mask; WARN_ON_ONCE(gen & ~MMIO_SPTE_GEN_MASK); mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK; mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK; return mask; } u64 make_mmio_spte(struct kvm_vcpu *vcpu, u64 gfn, unsigned int access) { u64 gen = kvm_vcpu_memslots(vcpu)->generation & MMIO_SPTE_GEN_MASK; u64 spte = generation_mmio_spte_mask(gen); u64 gpa = gfn << PAGE_SHIFT; WARN_ON_ONCE(!vcpu->kvm->arch.shadow_mmio_value); access &= shadow_mmio_access_mask; spte |= vcpu->kvm->arch.shadow_mmio_value | access; spte |= gpa | shadow_nonpresent_or_rsvd_mask; spte |= (gpa & shadow_nonpresent_or_rsvd_mask) << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN; return spte; } static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) { if (pfn_valid(pfn)) return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) && /* * Some reserved pages, such as those from NVDIMM * DAX devices, are not for MMIO, and can be mapped * with cached memory type for better performance. * However, the above check misconceives those pages * as MMIO, and results in KVM mapping them with UC * memory type, which would hurt the performance. * Therefore, we check the host memory type in addition * and only treat UC/UC-/WC pages as MMIO. */ (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn)); return !e820__mapped_raw_any(pfn_to_hpa(pfn), pfn_to_hpa(pfn + 1) - 1, E820_TYPE_RAM); } /* * Returns true if the SPTE has bits that may be set without holding mmu_lock. * The caller is responsible for checking if the SPTE is shadow-present, and * for determining whether or not the caller cares about non-leaf SPTEs. */ bool spte_has_volatile_bits(u64 spte) { /* * Always atomically update spte if it can be updated * out of mmu-lock, it can ensure dirty bit is not lost, * also, it can help us to get a stable is_writable_pte() * to ensure tlb flush is not missed. */ if (!is_writable_pte(spte) && is_mmu_writable_spte(spte)) return true; if (is_access_track_spte(spte)) return true; if (spte_ad_enabled(spte)) { if (!(spte & shadow_accessed_mask) || (is_writable_pte(spte) && !(spte & shadow_dirty_mask))) return true; } return false; } bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, const struct kvm_memory_slot *slot, unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn, u64 old_spte, bool prefetch, bool can_unsync, bool host_writable, u64 *new_spte) { int level = sp->role.level; u64 spte = SPTE_MMU_PRESENT_MASK; bool wrprot = false; /* * For the EPT case, shadow_present_mask has no RWX bits set if * exec-only page table entries are supported. In that case, * ACC_USER_MASK and shadow_user_mask are used to represent * read access. See FNAME(gpte_access) in paging_tmpl.h. */ WARN_ON_ONCE((pte_access | shadow_present_mask) == SHADOW_NONPRESENT_VALUE); if (sp->role.ad_disabled) spte |= SPTE_TDP_AD_DISABLED; else if (kvm_mmu_page_ad_need_write_protect(sp)) spte |= SPTE_TDP_AD_WRPROT_ONLY; spte |= shadow_present_mask; if (!prefetch) spte |= spte_shadow_accessed_mask(spte); /* * For simplicity, enforce the NX huge page mitigation even if not * strictly necessary. KVM could ignore the mitigation if paging is * disabled in the guest, as the guest doesn't have any page tables to * abuse. But to safely ignore the mitigation, KVM would have to * ensure a new MMU is loaded (or all shadow pages zapped) when CR0.PG * is toggled on, and that's a net negative for performance when TDP is * enabled. When TDP is disabled, KVM will always switch to a new MMU * when CR0.PG is toggled, but leveraging that to ignore the mitigation * would tie make_spte() further to vCPU/MMU state, and add complexity * just to optimize a mode that is anything but performance critical. */ if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(vcpu->kvm)) { pte_access &= ~ACC_EXEC_MASK; } if (pte_access & ACC_EXEC_MASK) spte |= shadow_x_mask; else spte |= shadow_nx_mask; if (pte_access & ACC_USER_MASK) spte |= shadow_user_mask; if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; if (shadow_memtype_mask) spte |= kvm_x86_call(get_mt_mask)(vcpu, gfn, kvm_is_mmio_pfn(pfn)); if (host_writable) spte |= shadow_host_writable_mask; else pte_access &= ~ACC_WRITE_MASK; if (shadow_me_value && !kvm_is_mmio_pfn(pfn)) spte |= shadow_me_value; spte |= (u64)pfn << PAGE_SHIFT; if (pte_access & ACC_WRITE_MASK) { spte |= PT_WRITABLE_MASK | shadow_mmu_writable_mask; /* * Optimization: for pte sync, if spte was writable the hash * lookup is unnecessary (and expensive). Write protection * is responsibility of kvm_mmu_get_page / kvm_mmu_sync_roots. * Same reasoning can be applied to dirty page accounting. */ if (is_writable_pte(old_spte)) goto out; /* * Unsync shadow pages that are reachable by the new, writable * SPTE. Write-protect the SPTE if the page can't be unsync'd, * e.g. it's write-tracked (upper-level SPs) or has one or more * shadow pages and unsync'ing pages is not allowed. */ if (mmu_try_to_unsync_pages(vcpu->kvm, slot, gfn, can_unsync, prefetch)) { wrprot = true; pte_access &= ~ACC_WRITE_MASK; spte &= ~(PT_WRITABLE_MASK | shadow_mmu_writable_mask); } } if (pte_access & ACC_WRITE_MASK) spte |= spte_shadow_dirty_mask(spte); out: if (prefetch) spte = mark_spte_for_access_track(spte); WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level), "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level, get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level)); if ((spte & PT_WRITABLE_MASK) && kvm_slot_dirty_track_enabled(slot)) { /* Enforced by kvm_mmu_hugepage_adjust. */ WARN_ON_ONCE(level > PG_LEVEL_4K); mark_page_dirty_in_slot(vcpu->kvm, slot, gfn); } *new_spte = spte; return wrprot; } static u64 make_spte_executable(u64 spte) { bool is_access_track = is_access_track_spte(spte); if (is_access_track) spte = restore_acc_track_spte(spte); spte &= ~shadow_nx_mask; spte |= shadow_x_mask; if (is_access_track) spte = mark_spte_for_access_track(spte); return spte; } /* * Construct an SPTE that maps a sub-page of the given huge page SPTE where * `index` identifies which sub-page. * * This is used during huge page splitting to build the SPTEs that make up the * new page table. */ u64 make_huge_page_split_spte(struct kvm *kvm, u64 huge_spte, union kvm_mmu_page_role role, int index) { u64 child_spte = huge_spte; KVM_BUG_ON(!is_shadow_present_pte(huge_spte) || !is_large_pte(huge_spte), kvm); /* * The child_spte already has the base address of the huge page being * split. So we just have to OR in the offset to the page at the next * lower level for the given index. */ child_spte |= (index * KVM_PAGES_PER_HPAGE(role.level)) << PAGE_SHIFT; if (role.level == PG_LEVEL_4K) { child_spte &= ~PT_PAGE_SIZE_MASK; /* * When splitting to a 4K page where execution is allowed, mark * the page executable as the NX hugepage mitigation no longer * applies. */ if ((role.access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(kvm)) child_spte = make_spte_executable(child_spte); } return child_spte; } u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled) { u64 spte = SPTE_MMU_PRESENT_MASK; spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK | shadow_user_mask | shadow_x_mask | shadow_me_value; if (ad_disabled) spte |= SPTE_TDP_AD_DISABLED; else spte |= shadow_accessed_mask; return spte; } u64 mark_spte_for_access_track(u64 spte) { if (spte_ad_enabled(spte)) return spte & ~shadow_accessed_mask; if (is_access_track_spte(spte)) return spte; check_spte_writable_invariants(spte); WARN_ONCE(spte & (SHADOW_ACC_TRACK_SAVED_BITS_MASK << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT), "Access Tracking saved bit locations are not zero\n"); spte |= (spte & SHADOW_ACC_TRACK_SAVED_BITS_MASK) << SHADOW_ACC_TRACK_SAVED_BITS_SHIFT; spte &= ~shadow_acc_track_mask; return spte; } void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask) { BUG_ON((u64)(unsigned)access_mask != access_mask); WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask); /* * Reset to the original module param value to honor userspace's desire * to (dis)allow MMIO caching. Update the param itself so that * userspace can see whether or not KVM is actually using MMIO caching. */ enable_mmio_caching = allow_mmio_caching; if (!enable_mmio_caching) mmio_value = 0; /* * The mask must contain only bits that are carved out specifically for * the MMIO SPTE mask, e.g. to ensure there's no overlap with the MMIO * generation. */ if (WARN_ON(mmio_mask & ~SPTE_MMIO_ALLOWED_MASK)) mmio_value = 0; /* * Disable MMIO caching if the MMIO value collides with the bits that * are used to hold the relocated GFN when the L1TF mitigation is * enabled. This should never fire as there is no known hardware that * can trigger this condition, e.g. SME/SEV CPUs that require a custom * MMIO value are not susceptible to L1TF. */ if (WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << SHADOW_NONPRESENT_OR_RSVD_MASK_LEN))) mmio_value = 0; /* * The masked MMIO value must obviously match itself and a removed SPTE * must not get a false positive. Removed SPTEs and MMIO SPTEs should * never collide as MMIO must set some RWX bits, and removed SPTEs must * not set any RWX bits. */ if (WARN_ON((mmio_value & mmio_mask) != mmio_value) || WARN_ON(mmio_value && (FROZEN_SPTE & mmio_mask) == mmio_value)) mmio_value = 0; if (!mmio_value) enable_mmio_caching = false; shadow_mmio_value = mmio_value; shadow_mmio_mask = mmio_mask; shadow_mmio_access_mask = access_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask); void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask) { /* shadow_me_value must be a subset of shadow_me_mask */ if (WARN_ON(me_value & ~me_mask)) me_value = me_mask = 0; shadow_me_value = me_value; shadow_me_mask = me_mask; } EXPORT_SYMBOL_GPL(kvm_mmu_set_me_spte_mask); void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only) { shadow_user_mask = VMX_EPT_READABLE_MASK; shadow_accessed_mask = has_ad_bits ? VMX_EPT_ACCESS_BIT : 0ull; shadow_dirty_mask = has_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull; shadow_nx_mask = 0ull; shadow_x_mask = VMX_EPT_EXECUTABLE_MASK; /* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */ shadow_present_mask = (has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT; /* * EPT overrides the host MTRRs, and so KVM must program the desired * memtype directly into the SPTEs. Note, this mask is just the mask * of all bits that factor into the memtype, the actual memtype must be * dynamically calculated, e.g. to ensure host MMIO is mapped UC. */ shadow_memtype_mask = VMX_EPT_MT_MASK | VMX_EPT_IPAT_BIT; shadow_acc_track_mask = VMX_EPT_RWX_MASK; shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE; shadow_mmu_writable_mask = EPT_SPTE_MMU_WRITABLE; /* * EPT Misconfigurations are generated if the value of bits 2:0 * of an EPT paging-structure entry is 110b (write/execute). */ kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT, 0); } EXPORT_SYMBOL_GPL(kvm_mmu_set_ept_masks); void kvm_mmu_reset_all_pte_masks(void) { u8 low_phys_bits; u64 mask; /* * If the CPU has 46 or less physical address bits, then set an * appropriate mask to guard against L1TF attacks. Otherwise, it is * assumed that the CPU is not vulnerable to L1TF. * * Some Intel CPUs address the L1 cache using more PA bits than are * reported by CPUID. Use the PA width of the L1 cache when possible * to achieve more effective mitigation, e.g. if system RAM overlaps * the most significant bits of legal physical address space. */ shadow_nonpresent_or_rsvd_mask = 0; low_phys_bits = boot_cpu_data.x86_phys_bits; if (boot_cpu_has_bug(X86_BUG_L1TF) && !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >= 52 - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)) { low_phys_bits = boot_cpu_data.x86_cache_bits - SHADOW_NONPRESENT_OR_RSVD_MASK_LEN; shadow_nonpresent_or_rsvd_mask = rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1); } shadow_nonpresent_or_rsvd_lower_gfn_mask = GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT); shadow_user_mask = PT_USER_MASK; shadow_accessed_mask = PT_ACCESSED_MASK; shadow_dirty_mask = PT_DIRTY_MASK; shadow_nx_mask = PT64_NX_MASK; shadow_x_mask = 0; shadow_present_mask = PT_PRESENT_MASK; /* * For shadow paging and NPT, KVM uses PAT entry '0' to encode WB * memtype in the SPTEs, i.e. relies on host MTRRs to provide the * correct memtype (WB is the "weakest" memtype). */ shadow_memtype_mask = 0; shadow_acc_track_mask = 0; shadow_me_mask = 0; shadow_me_value = 0; shadow_host_writable_mask = DEFAULT_SPTE_HOST_WRITABLE; shadow_mmu_writable_mask = DEFAULT_SPTE_MMU_WRITABLE; /* * Set a reserved PA bit in MMIO SPTEs to generate page faults with * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports * 52-bit physical addresses then there are no reserved PA bits in the * PTEs and so the reserved PA approach must be disabled. */ if (kvm_host.maxphyaddr < 52) mask = BIT_ULL(51) | PT_PRESENT_MASK; else mask = 0; kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK); } |
19 1 288 13 280 280 288 288 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 | // SPDX-License-Identifier: GPL-2.0 #define CREATE_TRACE_POINTS #include <trace/events/mmap_lock.h> #include <linux/mm.h> #include <linux/cgroup.h> #include <linux/memcontrol.h> #include <linux/mmap_lock.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/rcupdate.h> #include <linux/smp.h> #include <linux/trace_events.h> #include <linux/local_lock.h> EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking); EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned); EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released); #ifdef CONFIG_MEMCG static atomic_t reg_refcount; /* * Size of the buffer for memcg path names. Ignoring stack trace support, * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it. */ #define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL int trace_mmap_lock_reg(void) { atomic_inc(®_refcount); return 0; } void trace_mmap_lock_unreg(void) { atomic_dec(®_refcount); } #define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ do { \ char buf[MEMCG_PATH_BUF_SIZE]; \ get_mm_memcg_path(mm, buf, sizeof(buf)); \ trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \ } while (0) #else /* !CONFIG_MEMCG */ int trace_mmap_lock_reg(void) { return 0; } void trace_mmap_lock_unreg(void) { } #define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ trace_mmap_lock_##type(mm, "", ##__VA_ARGS__) #endif /* CONFIG_MEMCG */ #ifdef CONFIG_TRACING #ifdef CONFIG_MEMCG /* * Write the given mm_struct's memcg path to a buffer. If the path cannot be * determined or the trace event is being unregistered, empty string is written. */ static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen) { struct mem_cgroup *memcg; buf[0] = '\0'; /* No need to get path if no trace event is registered. */ if (!atomic_read(®_refcount)) return; memcg = get_mem_cgroup_from_mm(mm); if (memcg == NULL) return; if (memcg->css.cgroup) cgroup_path(memcg->css.cgroup, buf, buflen); css_put(&memcg->css); } #endif /* CONFIG_MEMCG */ /* * Trace calls must be in a separate file, as otherwise there's a circular * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h. */ void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write) { TRACE_MMAP_LOCK_EVENT(start_locking, mm, write); } EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking); void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, bool success) { TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success); } EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned); void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write) { TRACE_MMAP_LOCK_EVENT(released, mm, write); } EXPORT_SYMBOL(__mmap_lock_do_trace_released); #endif /* CONFIG_TRACING */ |
1 1 2 3 4 4 3 4 5 5 5 5 4 2 2 2 1 7 7 7 2 2 2 2 2 2 2 5 5 5 5 23 23 21 6 23 21 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 | // SPDX-License-Identifier: GPL-2.0-only /* * dma-fence-array: aggregate fences to be waited together * * Copyright (C) 2016 Collabora Ltd * Copyright (C) 2016 Advanced Micro Devices, Inc. * Authors: * Gustavo Padovan <gustavo@padovan.org> * Christian König <christian.koenig@amd.com> */ #include <linux/export.h> #include <linux/slab.h> #include <linux/dma-fence-array.h> #define PENDING_ERROR 1 static const char *dma_fence_array_get_driver_name(struct dma_fence *fence) { return "dma_fence_array"; } static const char *dma_fence_array_get_timeline_name(struct dma_fence *fence) { return "unbound"; } static void dma_fence_array_set_pending_error(struct dma_fence_array *array, int error) { /* * Propagate the first error reported by any of our fences, but only * before we ourselves are signaled. */ if (error) cmpxchg(&array->base.error, PENDING_ERROR, error); } static void dma_fence_array_clear_pending_error(struct dma_fence_array *array) { /* Clear the error flag if not actually set. */ cmpxchg(&array->base.error, PENDING_ERROR, 0); } static void irq_dma_fence_array_work(struct irq_work *wrk) { struct dma_fence_array *array = container_of(wrk, typeof(*array), work); dma_fence_array_clear_pending_error(array); dma_fence_signal(&array->base); dma_fence_put(&array->base); } static void dma_fence_array_cb_func(struct dma_fence *f, struct dma_fence_cb *cb) { struct dma_fence_array_cb *array_cb = container_of(cb, struct dma_fence_array_cb, cb); struct dma_fence_array *array = array_cb->array; dma_fence_array_set_pending_error(array, f->error); if (atomic_dec_and_test(&array->num_pending)) irq_work_queue(&array->work); else dma_fence_put(&array->base); } static bool dma_fence_array_enable_signaling(struct dma_fence *fence) { struct dma_fence_array *array = to_dma_fence_array(fence); struct dma_fence_array_cb *cb = array->callbacks; unsigned i; for (i = 0; i < array->num_fences; ++i) { cb[i].array = array; /* * As we may report that the fence is signaled before all * callbacks are complete, we need to take an additional * reference count on the array so that we do not free it too * early. The core fence handling will only hold the reference * until we signal the array as complete (but that is now * insufficient). */ dma_fence_get(&array->base); if (dma_fence_add_callback(array->fences[i], &cb[i].cb, dma_fence_array_cb_func)) { int error = array->fences[i]->error; dma_fence_array_set_pending_error(array, error); dma_fence_put(&array->base); if (atomic_dec_and_test(&array->num_pending)) { dma_fence_array_clear_pending_error(array); return false; } } } return true; } static bool dma_fence_array_signaled(struct dma_fence *fence) { struct dma_fence_array *array = to_dma_fence_array(fence); if (atomic_read(&array->num_pending) > 0) return false; dma_fence_array_clear_pending_error(array); return true; } static void dma_fence_array_release(struct dma_fence *fence) { struct dma_fence_array *array = to_dma_fence_array(fence); unsigned i; for (i = 0; i < array->num_fences; ++i) dma_fence_put(array->fences[i]); kfree(array->fences); dma_fence_free(fence); } static void dma_fence_array_set_deadline(struct dma_fence *fence, ktime_t deadline) { struct dma_fence_array *array = to_dma_fence_array(fence); unsigned i; for (i = 0; i < array->num_fences; ++i) dma_fence_set_deadline(array->fences[i], deadline); } const struct dma_fence_ops dma_fence_array_ops = { .get_driver_name = dma_fence_array_get_driver_name, .get_timeline_name = dma_fence_array_get_timeline_name, .enable_signaling = dma_fence_array_enable_signaling, .signaled = dma_fence_array_signaled, .release = dma_fence_array_release, .set_deadline = dma_fence_array_set_deadline, }; EXPORT_SYMBOL(dma_fence_array_ops); /** * dma_fence_array_create - Create a custom fence array * @num_fences: [in] number of fences to add in the array * @fences: [in] array containing the fences * @context: [in] fence context to use * @seqno: [in] sequence number to use * @signal_on_any: [in] signal on any fence in the array * * Allocate a dma_fence_array object and initialize the base fence with * dma_fence_init(). * In case of error it returns NULL. * * The caller should allocate the fences array with num_fences size * and fill it with the fences it wants to add to the object. Ownership of this * array is taken and dma_fence_put() is used on each fence on release. * * If @signal_on_any is true the fence array signals if any fence in the array * signals, otherwise it signals when all fences in the array signal. */ struct dma_fence_array *dma_fence_array_create(int num_fences, struct dma_fence **fences, u64 context, unsigned seqno, bool signal_on_any) { struct dma_fence_array *array; WARN_ON(!num_fences || !fences); array = kzalloc(struct_size(array, callbacks, num_fences), GFP_KERNEL); if (!array) return NULL; array->num_fences = num_fences; spin_lock_init(&array->lock); dma_fence_init(&array->base, &dma_fence_array_ops, &array->lock, context, seqno); init_irq_work(&array->work, irq_dma_fence_array_work); atomic_set(&array->num_pending, signal_on_any ? 1 : num_fences); array->fences = fences; array->base.error = PENDING_ERROR; /* * dma_fence_array objects should never contain any other fence * containers or otherwise we run into recursion and potential kernel * stack overflow on operations on the dma_fence_array. * * The correct way of handling this is to flatten out the array by the * caller instead. * * Enforce this here by checking that we don't create a dma_fence_array * with any container inside. */ while (num_fences--) WARN_ON(dma_fence_is_container(fences[num_fences])); return array; } EXPORT_SYMBOL(dma_fence_array_create); /** * dma_fence_match_context - Check if all fences are from the given context * @fence: [in] fence or fence array * @context: [in] fence context to check all fences against * * Checks the provided fence or, for a fence array, all fences in the array * against the given context. Returns false if any fence is from a different * context. */ bool dma_fence_match_context(struct dma_fence *fence, u64 context) { struct dma_fence_array *array = to_dma_fence_array(fence); unsigned i; if (!dma_fence_is_array(fence)) return fence->context == context; for (i = 0; i < array->num_fences; i++) { if (array->fences[i]->context != context) return false; } return true; } EXPORT_SYMBOL(dma_fence_match_context); struct dma_fence *dma_fence_array_first(struct dma_fence *head) { struct dma_fence_array *array; if (!head) return NULL; array = to_dma_fence_array(head); if (!array) return head; if (!array->num_fences) return NULL; return array->fences[0]; } EXPORT_SYMBOL(dma_fence_array_first); struct dma_fence *dma_fence_array_next(struct dma_fence *head, unsigned int index) { struct dma_fence_array *array = to_dma_fence_array(head); if (!array || index >= array->num_fences) return NULL; return array->fences[index]; } EXPORT_SYMBOL(dma_fence_array_next); |
8 9 2 2 2 2 2 2 2 2 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2010-2013, The Linux Foundation. All rights reserved. */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/usb.h> #include <linux/usb/ch11.h> #define TEST_SE0_NAK_PID 0x0101 #define TEST_J_PID 0x0102 #define TEST_K_PID 0x0103 #define TEST_PACKET_PID 0x0104 #define TEST_HS_HOST_PORT_SUSPEND_RESUME 0x0106 #define TEST_SINGLE_STEP_GET_DEV_DESC 0x0107 #define TEST_SINGLE_STEP_SET_FEATURE 0x0108 extern const struct usb_device_id *usb_device_match_id(struct usb_device *udev, const struct usb_device_id *id); /* * A list of USB hubs which requires to disable the power * to the port before starting the testing procedures. */ static const struct usb_device_id ehset_hub_list[] = { { USB_DEVICE(0x0424, 0x4502) }, { USB_DEVICE(0x0424, 0x4913) }, { USB_DEVICE(0x0451, 0x8027) }, { } }; static int ehset_prepare_port_for_testing(struct usb_device *hub_udev, u16 portnum) { int ret = 0; /* * The USB2.0 spec chapter 11.24.2.13 says that the USB port which is * going under test needs to be put in suspend before sending the * test command. Most hubs don't enforce this precondition, but there * are some hubs which needs to disable the power to the port before * starting the test. */ if (usb_device_match_id(hub_udev, ehset_hub_list)) { ret = usb_control_msg_send(hub_udev, 0, USB_REQ_CLEAR_FEATURE, USB_RT_PORT, USB_PORT_FEAT_ENABLE, portnum, NULL, 0, 1000, GFP_KERNEL); /* * Wait for the port to be disabled. It's an arbitrary value * which worked every time. */ msleep(100); } else { /* * For the hubs which are compliant with the spec, * put the port in SUSPEND. */ ret = usb_control_msg_send(hub_udev, 0, USB_REQ_SET_FEATURE, USB_RT_PORT, USB_PORT_FEAT_SUSPEND, portnum, NULL, 0, 1000, GFP_KERNEL); } return ret; } static int ehset_probe(struct usb_interface *intf, const struct usb_device_id *id) { int ret = -EINVAL; struct usb_device *dev = interface_to_usbdev(intf); struct usb_device *hub_udev = dev->parent; struct usb_device_descriptor buf; u8 portnum = dev->portnum; u16 test_pid = le16_to_cpu(dev->descriptor.idProduct); switch (test_pid) { case TEST_SE0_NAK_PID: ret = ehset_prepare_port_for_testing(hub_udev, portnum); if (ret < 0) break; ret = usb_control_msg_send(hub_udev, 0, USB_REQ_SET_FEATURE, USB_RT_PORT, USB_PORT_FEAT_TEST, (USB_TEST_SE0_NAK << 8) | portnum, NULL, 0, 1000, GFP_KERNEL); break; case TEST_J_PID: ret = ehset_prepare_port_for_testing(hub_udev, portnum); if (ret < 0) break; ret = usb_control_msg_send(hub_udev, 0, USB_REQ_SET_FEATURE, USB_RT_PORT, USB_PORT_FEAT_TEST, (USB_TEST_J << 8) | portnum, NULL, 0, 1000, GFP_KERNEL); break; case TEST_K_PID: ret = ehset_prepare_port_for_testing(hub_udev, portnum); if (ret < 0) break; ret = usb_control_msg_send(hub_udev, 0, USB_REQ_SET_FEATURE, USB_RT_PORT, USB_PORT_FEAT_TEST, (USB_TEST_K << 8) | portnum, NULL, 0, 1000, GFP_KERNEL); break; case TEST_PACKET_PID: ret = ehset_prepare_port_for_testing(hub_udev, portnum); if (ret < 0) break; ret = usb_control_msg_send(hub_udev, 0, USB_REQ_SET_FEATURE, USB_RT_PORT, USB_PORT_FEAT_TEST, (USB_TEST_PACKET << 8) | portnum, NULL, 0, 1000, GFP_KERNEL); break; case TEST_HS_HOST_PORT_SUSPEND_RESUME: /* Test: wait for 15secs -> suspend -> 15secs delay -> resume */ msleep(15 * 1000); ret = usb_control_msg_send(hub_udev, 0, USB_REQ_SET_FEATURE, USB_RT_PORT, USB_PORT_FEAT_SUSPEND, portnum, NULL, 0, 1000, GFP_KERNEL); if (ret < 0) break; msleep(15 * 1000); ret = usb_control_msg_send(hub_udev, 0, USB_REQ_CLEAR_FEATURE, USB_RT_PORT, USB_PORT_FEAT_SUSPEND, portnum, NULL, 0, 1000, GFP_KERNEL); break; case TEST_SINGLE_STEP_GET_DEV_DESC: /* Test: wait for 15secs -> GetDescriptor request */ msleep(15 * 1000); ret = usb_control_msg_recv(dev, 0, USB_REQ_GET_DESCRIPTOR, USB_DIR_IN, USB_DT_DEVICE << 8, 0, &buf, USB_DT_DEVICE_SIZE, USB_CTRL_GET_TIMEOUT, GFP_KERNEL); break; case TEST_SINGLE_STEP_SET_FEATURE: /* * GetDescriptor SETUP request -> 15secs delay -> IN & STATUS * * Note, this test is only supported on root hubs since the * SetPortFeature handling can only be done inside the HCD's * hub_control callback function. */ if (hub_udev != dev->bus->root_hub) { dev_err(&intf->dev, "SINGLE_STEP_SET_FEATURE test only supported on root hub\n"); break; } ret = usb_control_msg_send(hub_udev, 0, USB_REQ_SET_FEATURE, USB_RT_PORT, USB_PORT_FEAT_TEST, (6 << 8) | portnum, NULL, 0, 60 * 1000, GFP_KERNEL); break; default: dev_err(&intf->dev, "%s: unsupported PID: 0x%x\n", __func__, test_pid); } return ret; } static void ehset_disconnect(struct usb_interface *intf) { } static const struct usb_device_id ehset_id_table[] = { { USB_DEVICE(0x1a0a, TEST_SE0_NAK_PID) }, { USB_DEVICE(0x1a0a, TEST_J_PID) }, { USB_DEVICE(0x1a0a, TEST_K_PID) }, { USB_DEVICE(0x1a0a, TEST_PACKET_PID) }, { USB_DEVICE(0x1a0a, TEST_HS_HOST_PORT_SUSPEND_RESUME) }, { USB_DEVICE(0x1a0a, TEST_SINGLE_STEP_GET_DEV_DESC) }, { USB_DEVICE(0x1a0a, TEST_SINGLE_STEP_SET_FEATURE) }, { } /* Terminating entry */ }; MODULE_DEVICE_TABLE(usb, ehset_id_table); static struct usb_driver ehset_driver = { .name = "usb_ehset_test", .probe = ehset_probe, .disconnect = ehset_disconnect, .id_table = ehset_id_table, }; module_usb_driver(ehset_driver); MODULE_DESCRIPTION("USB Driver for EHSET Test Fixture"); MODULE_LICENSE("GPL v2"); |
2 1 1 2 1 1 3 2 2 1 1 2 1 1 3 1 1 21 21 3 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 | /* HIDP implementation for Linux Bluetooth stack (BlueZ). Copyright (C) 2003-2004 Marcel Holtmann <marcel@holtmann.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ #include <linux/compat.h> #include <linux/export.h> #include <linux/file.h> #include "hidp.h" static struct bt_sock_list hidp_sk_list = { .lock = __RW_LOCK_UNLOCKED(hidp_sk_list.lock) }; static int hidp_sock_release(struct socket *sock) { struct sock *sk = sock->sk; BT_DBG("sock %p sk %p", sock, sk); if (!sk) return 0; bt_sock_unlink(&hidp_sk_list, sk); sock_orphan(sk); sock_put(sk); return 0; } static int do_hidp_sock_ioctl(struct socket *sock, unsigned int cmd, void __user *argp) { struct hidp_connadd_req ca; struct hidp_conndel_req cd; struct hidp_connlist_req cl; struct hidp_conninfo ci; struct socket *csock; struct socket *isock; int err; BT_DBG("cmd %x arg %p", cmd, argp); switch (cmd) { case HIDPCONNADD: if (!capable(CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&ca, argp, sizeof(ca))) return -EFAULT; csock = sockfd_lookup(ca.ctrl_sock, &err); if (!csock) return err; isock = sockfd_lookup(ca.intr_sock, &err); if (!isock) { sockfd_put(csock); return err; } ca.name[sizeof(ca.name)-1] = 0; err = hidp_connection_add(&ca, csock, isock); if (!err && copy_to_user(argp, &ca, sizeof(ca))) err = -EFAULT; sockfd_put(csock); sockfd_put(isock); return err; case HIDPCONNDEL: if (!capable(CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&cd, argp, sizeof(cd))) return -EFAULT; return hidp_connection_del(&cd); case HIDPGETCONNLIST: if (copy_from_user(&cl, argp, sizeof(cl))) return -EFAULT; if (cl.cnum <= 0) return -EINVAL; err = hidp_get_connlist(&cl); if (!err && copy_to_user(argp, &cl, sizeof(cl))) return -EFAULT; return err; case HIDPGETCONNINFO: if (copy_from_user(&ci, argp, sizeof(ci))) return -EFAULT; err = hidp_get_conninfo(&ci); if (!err && copy_to_user(argp, &ci, sizeof(ci))) return -EFAULT; return err; } return -EINVAL; } static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return do_hidp_sock_ioctl(sock, cmd, (void __user *)arg); } #ifdef CONFIG_COMPAT struct compat_hidp_connadd_req { int ctrl_sock; /* Connected control socket */ int intr_sock; /* Connected interrupt socket */ __u16 parser; __u16 rd_size; compat_uptr_t rd_data; __u8 country; __u8 subclass; __u16 vendor; __u16 product; __u16 version; __u32 flags; __u32 idle_to; char name[128]; }; static int hidp_sock_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); int err; if (cmd == HIDPGETCONNLIST) { struct hidp_connlist_req cl; u32 __user *p = argp; u32 uci; if (get_user(cl.cnum, p) || get_user(uci, p + 1)) return -EFAULT; cl.ci = compat_ptr(uci); if (cl.cnum <= 0) return -EINVAL; err = hidp_get_connlist(&cl); if (!err && put_user(cl.cnum, p)) err = -EFAULT; return err; } else if (cmd == HIDPCONNADD) { struct compat_hidp_connadd_req ca32; struct hidp_connadd_req ca; struct socket *csock; struct socket *isock; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&ca32, (void __user *) arg, sizeof(ca32))) return -EFAULT; ca.ctrl_sock = ca32.ctrl_sock; ca.intr_sock = ca32.intr_sock; ca.parser = ca32.parser; ca.rd_size = ca32.rd_size; ca.rd_data = compat_ptr(ca32.rd_data); ca.country = ca32.country; ca.subclass = ca32.subclass; ca.vendor = ca32.vendor; ca.product = ca32.product; ca.version = ca32.version; ca.flags = ca32.flags; ca.idle_to = ca32.idle_to; ca32.name[sizeof(ca32.name) - 1] = '\0'; memcpy(ca.name, ca32.name, 128); csock = sockfd_lookup(ca.ctrl_sock, &err); if (!csock) return err; isock = sockfd_lookup(ca.intr_sock, &err); if (!isock) { sockfd_put(csock); return err; } err = hidp_connection_add(&ca, csock, isock); if (!err && copy_to_user(argp, &ca32, sizeof(ca32))) err = -EFAULT; sockfd_put(csock); sockfd_put(isock); return err; } return hidp_sock_ioctl(sock, cmd, arg); } #endif static const struct proto_ops hidp_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = hidp_sock_release, .ioctl = hidp_sock_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = hidp_sock_compat_ioctl, #endif .bind = sock_no_bind, .getname = sock_no_getname, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .mmap = sock_no_mmap }; static struct proto hidp_proto = { .name = "HIDP", .owner = THIS_MODULE, .obj_size = sizeof(struct bt_sock) }; static int hidp_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; BT_DBG("sock %p", sock); if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; sk = bt_sock_alloc(net, sock, &hidp_proto, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; sock->ops = &hidp_sock_ops; sock->state = SS_UNCONNECTED; bt_sock_link(&hidp_sk_list, sk); return 0; } static const struct net_proto_family hidp_sock_family_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .create = hidp_sock_create }; int __init hidp_init_sockets(void) { int err; err = proto_register(&hidp_proto, 0); if (err < 0) return err; err = bt_sock_register(BTPROTO_HIDP, &hidp_sock_family_ops); if (err < 0) { BT_ERR("Can't register HIDP socket"); goto error; } err = bt_procfs_init(&init_net, "hidp", &hidp_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create HIDP proc file"); bt_sock_unregister(BTPROTO_HIDP); goto error; } BT_INFO("HIDP socket layer initialized"); return 0; error: proto_unregister(&hidp_proto); return err; } void __exit hidp_cleanup_sockets(void) { bt_procfs_cleanup(&init_net, "hidp"); bt_sock_unregister(BTPROTO_HIDP); proto_unregister(&hidp_proto); } |
59 59 59 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 | // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2002, 2004 * Copyright (c) 2002 Intel Corp. * * This file is part of the SCTP kernel implementation * * Sysctl related interfaces for SCTP. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Mingqin Liu <liuming@us.ibm.com> * Jon Grimm <jgrimm@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> * Ryan Layer <rmlayer@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <net/sctp/structs.h> #include <net/sctp/sctp.h> #include <linux/sysctl.h> static int timer_max = 86400000; /* ms in one day */ static int sack_timer_min = 1; static int sack_timer_max = 500; static int addr_scope_max = SCTP_SCOPE_POLICY_MAX; static int rwnd_scale_max = 16; static int rto_alpha_min = 0; static int rto_beta_min = 0; static int rto_alpha_max = 1000; static int rto_beta_max = 1000; static int pf_expose_max = SCTP_PF_EXPOSE_MAX; static int ps_retrans_max = SCTP_PS_RETRANS_MAX; static int udp_port_max = 65535; static unsigned long max_autoclose_min = 0; static unsigned long max_autoclose_max = (MAX_SCHEDULE_TIMEOUT / HZ > UINT_MAX) ? UINT_MAX : MAX_SCHEDULE_TIMEOUT / HZ; static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_rto_max(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_alpha_beta(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_auth(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_probe_interval(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); static struct ctl_table sctp_table[] = { { .procname = "sctp_mem", .data = &sysctl_sctp_mem, .maxlen = sizeof(sysctl_sctp_mem), .mode = 0644, .proc_handler = proc_doulongvec_minmax }, { .procname = "sctp_rmem", .data = &sysctl_sctp_rmem, .maxlen = sizeof(sysctl_sctp_rmem), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "sctp_wmem", .data = &sysctl_sctp_wmem, .maxlen = sizeof(sysctl_sctp_wmem), .mode = 0644, .proc_handler = proc_dointvec, }, }; /* The following index defines are used in sctp_sysctl_net_register(). * If you add new items to the sctp_net_table, please ensure that * the index values of these defines hold the same meaning indicated by * their macro names when they appear in sctp_net_table. */ #define SCTP_RTO_MIN_IDX 0 #define SCTP_RTO_MAX_IDX 1 #define SCTP_PF_RETRANS_IDX 2 #define SCTP_PS_RETRANS_IDX 3 static struct ctl_table sctp_net_table[] = { [SCTP_RTO_MIN_IDX] = { .procname = "rto_min", .data = &init_net.sctp.rto_min, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_sctp_do_rto_min, .extra1 = SYSCTL_ONE, .extra2 = &init_net.sctp.rto_max }, [SCTP_RTO_MAX_IDX] = { .procname = "rto_max", .data = &init_net.sctp.rto_max, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_sctp_do_rto_max, .extra1 = &init_net.sctp.rto_min, .extra2 = &timer_max }, [SCTP_PF_RETRANS_IDX] = { .procname = "pf_retrans", .data = &init_net.sctp.pf_retrans, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &init_net.sctp.ps_retrans, }, [SCTP_PS_RETRANS_IDX] = { .procname = "ps_retrans", .data = &init_net.sctp.ps_retrans, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &init_net.sctp.pf_retrans, .extra2 = &ps_retrans_max, }, { .procname = "rto_initial", .data = &init_net.sctp.rto_initial, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = &timer_max }, { .procname = "rto_alpha_exp_divisor", .data = &init_net.sctp.rto_alpha, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_sctp_do_alpha_beta, .extra1 = &rto_alpha_min, .extra2 = &rto_alpha_max, }, { .procname = "rto_beta_exp_divisor", .data = &init_net.sctp.rto_beta, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_sctp_do_alpha_beta, .extra1 = &rto_beta_min, .extra2 = &rto_beta_max, }, { .procname = "max_burst", .data = &init_net.sctp.max_burst, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, { .procname = "cookie_preserve_enable", .data = &init_net.sctp.cookie_preserve_enable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "cookie_hmac_alg", .data = &init_net.sctp.sctp_hmac_alg, .maxlen = 8, .mode = 0644, .proc_handler = proc_sctp_do_hmac_alg, }, { .procname = "valid_cookie_life", .data = &init_net.sctp.valid_cookie_life, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = &timer_max }, { .procname = "sack_timeout", .data = &init_net.sctp.sack_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &sack_timer_min, .extra2 = &sack_timer_max, }, { .procname = "hb_interval", .data = &init_net.sctp.hb_interval, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = &timer_max }, { .procname = "association_max_retrans", .data = &init_net.sctp.max_retrans_association, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_INT_MAX, }, { .procname = "path_max_retrans", .data = &init_net.sctp.max_retrans_path, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_INT_MAX, }, { .procname = "max_init_retransmits", .data = &init_net.sctp.max_retrans_init, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_INT_MAX, }, { .procname = "sndbuf_policy", .data = &init_net.sctp.sndbuf_policy, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "rcvbuf_policy", .data = &init_net.sctp.rcvbuf_policy, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "default_auto_asconf", .data = &init_net.sctp.default_auto_asconf, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "addip_enable", .data = &init_net.sctp.addip_enable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "addip_noauth_enable", .data = &init_net.sctp.addip_noauth, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "prsctp_enable", .data = &init_net.sctp.prsctp_enable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "reconf_enable", .data = &init_net.sctp.reconf_enable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "auth_enable", .data = &init_net.sctp.auth_enable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_sctp_do_auth, }, { .procname = "intl_enable", .data = &init_net.sctp.intl_enable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "ecn_enable", .data = &init_net.sctp.ecn_enable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "plpmtud_probe_interval", .data = &init_net.sctp.probe_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_sctp_do_probe_interval, }, { .procname = "udp_port", .data = &init_net.sctp.udp_port, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_sctp_do_udp_port, .extra1 = SYSCTL_ZERO, .extra2 = &udp_port_max, }, { .procname = "encap_port", .data = &init_net.sctp.encap_port, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &udp_port_max, }, { .procname = "addr_scope_policy", .data = &init_net.sctp.scope_policy, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &addr_scope_max, }, { .procname = "rwnd_update_shift", .data = &init_net.sctp.rwnd_upd_shift, .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, .extra1 = SYSCTL_ONE, .extra2 = &rwnd_scale_max, }, { .procname = "max_autoclose", .data = &init_net.sctp.max_autoclose, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &proc_doulongvec_minmax, .extra1 = &max_autoclose_min, .extra2 = &max_autoclose_max, }, #ifdef CONFIG_NET_L3_MASTER_DEV { .procname = "l3mdev_accept", .data = &init_net.sctp.l3mdev_accept, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif { .procname = "pf_enable", .data = &init_net.sctp.pf_enable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "pf_expose", .data = &init_net.sctp.pf_expose, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &pf_expose_max, }, }; static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; struct ctl_table tbl; bool changed = false; char *none = "none"; char tmp[8] = {0}; int ret; memset(&tbl, 0, sizeof(struct ctl_table)); if (write) { tbl.data = tmp; tbl.maxlen = sizeof(tmp); } else { tbl.data = net->sctp.sctp_hmac_alg ? : none; tbl.maxlen = strlen(tbl.data); } ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) { #ifdef CONFIG_CRYPTO_MD5 if (!strncmp(tmp, "md5", 3)) { net->sctp.sctp_hmac_alg = "md5"; changed = true; } #endif #ifdef CONFIG_CRYPTO_SHA1 if (!strncmp(tmp, "sha1", 4)) { net->sctp.sctp_hmac_alg = "sha1"; changed = true; } #endif if (!strncmp(tmp, "none", 4)) { net->sctp.sctp_hmac_alg = NULL; changed = true; } if (!changed) ret = -EINVAL; } return ret; } static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; unsigned int min = *(unsigned int *) ctl->extra1; unsigned int max = *(unsigned int *) ctl->extra2; struct ctl_table tbl; int ret, new_value; memset(&tbl, 0, sizeof(struct ctl_table)); tbl.maxlen = sizeof(unsigned int); if (write) tbl.data = &new_value; else tbl.data = &net->sctp.rto_min; ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) { if (new_value > max || new_value < min) return -EINVAL; net->sctp.rto_min = new_value; } return ret; } static int proc_sctp_do_rto_max(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; unsigned int min = *(unsigned int *) ctl->extra1; unsigned int max = *(unsigned int *) ctl->extra2; struct ctl_table tbl; int ret, new_value; memset(&tbl, 0, sizeof(struct ctl_table)); tbl.maxlen = sizeof(unsigned int); if (write) tbl.data = &new_value; else tbl.data = &net->sctp.rto_max; ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) { if (new_value > max || new_value < min) return -EINVAL; net->sctp.rto_max = new_value; } return ret; } static int proc_sctp_do_alpha_beta(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (write) pr_warn_once("Changing rto_alpha or rto_beta may lead to " "suboptimal rtt/srtt estimations!\n"); return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); } static int proc_sctp_do_auth(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; struct ctl_table tbl; int new_value, ret; memset(&tbl, 0, sizeof(struct ctl_table)); tbl.maxlen = sizeof(unsigned int); if (write) tbl.data = &new_value; else tbl.data = &net->sctp.auth_enable; ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) { struct sock *sk = net->sctp.ctl_sock; net->sctp.auth_enable = new_value; /* Update the value in the control socket */ lock_sock(sk); sctp_sk(sk)->ep->auth_enable = new_value; release_sock(sk); } return ret; } static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; unsigned int min = *(unsigned int *)ctl->extra1; unsigned int max = *(unsigned int *)ctl->extra2; struct ctl_table tbl; int ret, new_value; memset(&tbl, 0, sizeof(struct ctl_table)); tbl.maxlen = sizeof(unsigned int); if (write) tbl.data = &new_value; else tbl.data = &net->sctp.udp_port; ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) { struct sock *sk = net->sctp.ctl_sock; if (new_value > max || new_value < min) return -EINVAL; net->sctp.udp_port = new_value; sctp_udp_sock_stop(net); if (new_value) { ret = sctp_udp_sock_start(net); if (ret) net->sctp.udp_port = 0; } /* Update the value in the control socket */ lock_sock(sk); sctp_sk(sk)->udp_port = htons(net->sctp.udp_port); release_sock(sk); } return ret; } static int proc_sctp_do_probe_interval(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; struct ctl_table tbl; int ret, new_value; memset(&tbl, 0, sizeof(struct ctl_table)); tbl.maxlen = sizeof(unsigned int); if (write) tbl.data = &new_value; else tbl.data = &net->sctp.probe_interval; ret = proc_dointvec(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) { if (new_value && new_value < SCTP_PROBE_TIMER_MIN) return -EINVAL; net->sctp.probe_interval = new_value; } return ret; } int sctp_sysctl_net_register(struct net *net) { size_t table_size = ARRAY_SIZE(sctp_net_table); struct ctl_table *table; int i; table = kmemdup(sctp_net_table, sizeof(sctp_net_table), GFP_KERNEL); if (!table) return -ENOMEM; for (i = 0; i < table_size; i++) table[i].data += (char *)(&net->sctp) - (char *)&init_net.sctp; table[SCTP_RTO_MIN_IDX].extra2 = &net->sctp.rto_max; table[SCTP_RTO_MAX_IDX].extra1 = &net->sctp.rto_min; table[SCTP_PF_RETRANS_IDX].extra2 = &net->sctp.ps_retrans; table[SCTP_PS_RETRANS_IDX].extra1 = &net->sctp.pf_retrans; net->sctp.sysctl_header = register_net_sysctl_sz(net, "net/sctp", table, table_size); if (net->sctp.sysctl_header == NULL) { kfree(table); return -ENOMEM; } return 0; } void sctp_sysctl_net_unregister(struct net *net) { const struct ctl_table *table; table = net->sctp.sysctl_header->ctl_table_arg; unregister_net_sysctl_table(net->sctp.sysctl_header); kfree(table); } static struct ctl_table_header *sctp_sysctl_header; /* Sysctl registration. */ void sctp_sysctl_register(void) { sctp_sysctl_header = register_net_sysctl(&init_net, "net/sctp", sctp_table); } /* Sysctl deregistration. */ void sctp_sysctl_unregister(void) { unregister_net_sysctl_table(sctp_sysctl_header); } |
11 1 1 11 34 34 13 7 14 2 2 2 4 2 7 7 1 1 6 6 1 2 1 16 1 2 1 1 4 1 1 9 7 2 3 7 2 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 | // SPDX-License-Identifier: GPL-2.0-only #include <linux/slab.h> #include <linux/stat.h> #include <linux/sched/xacct.h> #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> #include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/fs.h> #include <linux/dax.h> #include <linux/overflow.h> #include "internal.h" #include <linux/uaccess.h> #include <asm/unistd.h> /* * Performs necessary checks before doing a clone. * * Can adjust amount of bytes to clone via @req_count argument. * Returns appropriate error code that caller should return or * zero in case the clone should be allowed. */ static int generic_remap_checks(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *req_count, unsigned int remap_flags) { struct inode *inode_in = file_in->f_mapping->host; struct inode *inode_out = file_out->f_mapping->host; uint64_t count = *req_count; uint64_t bcount; loff_t size_in, size_out; loff_t bs = inode_out->i_sb->s_blocksize; int ret; /* The start of both ranges must be aligned to an fs block. */ if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs)) return -EINVAL; /* Ensure offsets don't wrap. */ if (pos_in + count < pos_in || pos_out + count < pos_out) return -EINVAL; size_in = i_size_read(inode_in); size_out = i_size_read(inode_out); /* Dedupe requires both ranges to be within EOF. */ if ((remap_flags & REMAP_FILE_DEDUP) && (pos_in >= size_in || pos_in + count > size_in || pos_out >= size_out || pos_out + count > size_out)) return -EINVAL; /* Ensure the infile range is within the infile. */ if (pos_in >= size_in) return -EINVAL; count = min(count, size_in - (uint64_t)pos_in); ret = generic_write_check_limits(file_out, pos_out, &count); if (ret) return ret; /* * If the user wanted us to link to the infile's EOF, round up to the * next block boundary for this check. * * Otherwise, make sure the count is also block-aligned, having * already confirmed the starting offsets' block alignment. */ if (pos_in + count == size_in && (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) { bcount = ALIGN(size_in, bs) - pos_in; } else { if (!IS_ALIGNED(count, bs)) count = ALIGN_DOWN(count, bs); bcount = count; } /* Don't allow overlapped cloning within the same file. */ if (inode_in == inode_out && pos_out + bcount > pos_in && pos_out < pos_in + bcount) return -EINVAL; /* * We shortened the request but the caller can't deal with that, so * bounce the request back to userspace. */ if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN)) return -EINVAL; *req_count = count; return 0; } int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write) { int mask = write ? MAY_WRITE : MAY_READ; loff_t tmp; int ret; if (unlikely(pos < 0 || len < 0)) return -EINVAL; if (unlikely(check_add_overflow(pos, len, &tmp))) return -EINVAL; ret = security_file_permission(file, mask); if (ret) return ret; return fsnotify_file_area_perm(file, mask, &pos, len); } EXPORT_SYMBOL_GPL(remap_verify_area); /* * Ensure that we don't remap a partial EOF block in the middle of something * else. Assume that the offsets have already been checked for block * alignment. * * For clone we only link a partial EOF block above or at the destination file's * EOF. For deduplication we accept a partial EOF block only if it ends at the * destination file's EOF (can not link it into the middle of a file). * * Shorten the request if possible. */ static int generic_remap_check_len(struct inode *inode_in, struct inode *inode_out, loff_t pos_out, loff_t *len, unsigned int remap_flags) { u64 blkmask = i_blocksize(inode_in) - 1; loff_t new_len = *len; if ((*len & blkmask) == 0) return 0; if (pos_out + *len < i_size_read(inode_out)) new_len &= ~blkmask; if (new_len == *len) return 0; if (remap_flags & REMAP_FILE_CAN_SHORTEN) { *len = new_len; return 0; } return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL; } /* Read a page's worth of file data into the page cache. */ static struct folio *vfs_dedupe_get_folio(struct file *file, loff_t pos) { return read_mapping_folio(file->f_mapping, pos >> PAGE_SHIFT, file); } /* * Lock two folios, ensuring that we lock in offset order if the folios * are from the same file. */ static void vfs_lock_two_folios(struct folio *folio1, struct folio *folio2) { /* Always lock in order of increasing index. */ if (folio1->index > folio2->index) swap(folio1, folio2); folio_lock(folio1); if (folio1 != folio2) folio_lock(folio2); } /* Unlock two folios, being careful not to unlock the same folio twice. */ static void vfs_unlock_two_folios(struct folio *folio1, struct folio *folio2) { folio_unlock(folio1); if (folio1 != folio2) folio_unlock(folio2); } /* * Compare extents of two files to see if they are the same. * Caller must have locked both inodes to prevent write races. */ static int vfs_dedupe_file_range_compare(struct file *src, loff_t srcoff, struct file *dest, loff_t dstoff, loff_t len, bool *is_same) { bool same = true; int error = -EINVAL; while (len) { struct folio *src_folio, *dst_folio; void *src_addr, *dst_addr; loff_t cmp_len = min(PAGE_SIZE - offset_in_page(srcoff), PAGE_SIZE - offset_in_page(dstoff)); cmp_len = min(cmp_len, len); if (cmp_len <= 0) goto out_error; src_folio = vfs_dedupe_get_folio(src, srcoff); if (IS_ERR(src_folio)) { error = PTR_ERR(src_folio); goto out_error; } dst_folio = vfs_dedupe_get_folio(dest, dstoff); if (IS_ERR(dst_folio)) { error = PTR_ERR(dst_folio); folio_put(src_folio); goto out_error; } vfs_lock_two_folios(src_folio, dst_folio); /* * Now that we've locked both folios, make sure they're still * mapped to the file data we're interested in. If not, * someone is invalidating pages on us and we lose. */ if (!folio_test_uptodate(src_folio) || !folio_test_uptodate(dst_folio) || src_folio->mapping != src->f_mapping || dst_folio->mapping != dest->f_mapping) { same = false; goto unlock; } src_addr = kmap_local_folio(src_folio, offset_in_folio(src_folio, srcoff)); dst_addr = kmap_local_folio(dst_folio, offset_in_folio(dst_folio, dstoff)); flush_dcache_folio(src_folio); flush_dcache_folio(dst_folio); if (memcmp(src_addr, dst_addr, cmp_len)) same = false; kunmap_local(dst_addr); kunmap_local(src_addr); unlock: vfs_unlock_two_folios(src_folio, dst_folio); folio_put(dst_folio); folio_put(src_folio); if (!same) break; srcoff += cmp_len; dstoff += cmp_len; len -= cmp_len; } *is_same = same; return 0; out_error: return error; } /* * Check that the two inodes are eligible for cloning, the ranges make * sense, and then flush all dirty data. Caller must ensure that the * inodes have been locked against any other modifications. * * If there's an error, then the usual negative error code is returned. * Otherwise returns 0 with *len set to the request length. */ int __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags, const struct iomap_ops *dax_read_ops) { struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); bool same_inode = (inode_in == inode_out); int ret; /* Don't touch certain kinds of inodes */ if (IS_IMMUTABLE(inode_out)) return -EPERM; if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) return -ETXTBSY; /* Don't reflink dirs, pipes, sockets... */ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; /* Zero length dedupe exits immediately; reflink goes to EOF. */ if (*len == 0) { loff_t isize = i_size_read(inode_in); if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize) return 0; if (pos_in > isize) return -EINVAL; *len = isize - pos_in; if (*len == 0) return 0; } /* Check that we don't violate system file offset limits. */ ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len, remap_flags); if (ret || *len == 0) return ret; /* Wait for the completion of any pending IOs on both files */ inode_dio_wait(inode_in); if (!same_inode) inode_dio_wait(inode_out); ret = filemap_write_and_wait_range(inode_in->i_mapping, pos_in, pos_in + *len - 1); if (ret) return ret; ret = filemap_write_and_wait_range(inode_out->i_mapping, pos_out, pos_out + *len - 1); if (ret) return ret; /* * Check that the extents are the same. */ if (remap_flags & REMAP_FILE_DEDUP) { bool is_same = false; if (!IS_DAX(inode_in)) ret = vfs_dedupe_file_range_compare(file_in, pos_in, file_out, pos_out, *len, &is_same); else if (dax_read_ops) ret = dax_dedupe_file_range_compare(inode_in, pos_in, inode_out, pos_out, *len, &is_same, dax_read_ops); else return -EINVAL; if (ret) return ret; if (!is_same) return -EBADE; } ret = generic_remap_check_len(inode_in, inode_out, pos_out, len, remap_flags); if (ret || *len == 0) return ret; /* If can't alter the file contents, we're done. */ if (!(remap_flags & REMAP_FILE_DEDUP)) ret = file_modified(file_out); return ret; } int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags) { return __generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out, len, remap_flags, NULL); } EXPORT_SYMBOL(generic_remap_file_range_prep); loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, unsigned int remap_flags) { loff_t ret; WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP); if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb) return -EXDEV; ret = generic_file_rw_checks(file_in, file_out); if (ret < 0) return ret; if (!file_in->f_op->remap_file_range) return -EOPNOTSUPP; ret = remap_verify_area(file_in, pos_in, len, false); if (ret) return ret; ret = remap_verify_area(file_out, pos_out, len, true); if (ret) return ret; file_start_write(file_out); ret = file_in->f_op->remap_file_range(file_in, pos_in, file_out, pos_out, len, remap_flags); file_end_write(file_out); if (ret < 0) return ret; fsnotify_access(file_in); fsnotify_modify(file_out); return ret; } EXPORT_SYMBOL(vfs_clone_file_range); /* Check whether we are allowed to dedupe the destination file */ static bool may_dedupe_file(struct file *file) { struct mnt_idmap *idmap = file_mnt_idmap(file); struct inode *inode = file_inode(file); if (capable(CAP_SYS_ADMIN)) return true; if (file->f_mode & FMODE_WRITE) return true; if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), current_fsuid())) return true; if (!inode_permission(idmap, inode, MAY_WRITE)) return true; return false; } loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos, struct file *dst_file, loff_t dst_pos, loff_t len, unsigned int remap_flags) { loff_t ret; WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN)); /* * This is redundant if called from vfs_dedupe_file_range(), but other * callers need it and it's not performance sesitive... */ ret = remap_verify_area(src_file, src_pos, len, false); if (ret) return ret; ret = remap_verify_area(dst_file, dst_pos, len, true); if (ret) return ret; /* * This needs to be called after remap_verify_area() because of * sb_start_write() and before may_dedupe_file() because the mount's * MAY_WRITE need to be checked with mnt_get_write_access_file() held. */ ret = mnt_want_write_file(dst_file); if (ret) return ret; ret = -EPERM; if (!may_dedupe_file(dst_file)) goto out_drop_write; ret = -EXDEV; if (file_inode(src_file)->i_sb != file_inode(dst_file)->i_sb) goto out_drop_write; ret = -EISDIR; if (S_ISDIR(file_inode(dst_file)->i_mode)) goto out_drop_write; ret = -EINVAL; if (!dst_file->f_op->remap_file_range) goto out_drop_write; if (len == 0) { ret = 0; goto out_drop_write; } ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file, dst_pos, len, remap_flags | REMAP_FILE_DEDUP); out_drop_write: mnt_drop_write_file(dst_file); return ret; } EXPORT_SYMBOL(vfs_dedupe_file_range_one); int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) { struct file_dedupe_range_info *info; struct inode *src = file_inode(file); u64 off; u64 len; int i; int ret; u16 count = same->dest_count; loff_t deduped; if (!(file->f_mode & FMODE_READ)) return -EINVAL; if (same->reserved1 || same->reserved2) return -EINVAL; off = same->src_offset; len = same->src_length; if (S_ISDIR(src->i_mode)) return -EISDIR; if (!S_ISREG(src->i_mode)) return -EINVAL; if (!file->f_op->remap_file_range) return -EOPNOTSUPP; ret = remap_verify_area(file, off, len, false); if (ret < 0) return ret; ret = 0; if (off + len > i_size_read(src)) return -EINVAL; /* Arbitrary 1G limit on a single dedupe request, can be raised. */ len = min_t(u64, len, 1 << 30); /* pre-format output fields to sane values */ for (i = 0; i < count; i++) { same->info[i].bytes_deduped = 0ULL; same->info[i].status = FILE_DEDUPE_RANGE_SAME; } for (i = 0, info = same->info; i < count; i++, info++) { struct fd dst_fd = fdget(info->dest_fd); struct file *dst_file = dst_fd.file; if (!dst_file) { info->status = -EBADF; goto next_loop; } if (info->reserved) { info->status = -EINVAL; goto next_fdput; } deduped = vfs_dedupe_file_range_one(file, off, dst_file, info->dest_offset, len, REMAP_FILE_CAN_SHORTEN); if (deduped == -EBADE) info->status = FILE_DEDUPE_RANGE_DIFFERS; else if (deduped < 0) info->status = deduped; else info->bytes_deduped = len; next_fdput: fdput(dst_fd); next_loop: if (fatal_signal_pending(current)) break; } return ret; } EXPORT_SYMBOL(vfs_dedupe_file_range); |
13 13 21 6 11 6 13 13 20 21 13 13 12 13 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 | // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ #include <linux/dma-mapping.h> #include <net/addrconf.h> #include <rdma/uverbs_ioctl.h> #include "rxe.h" #include "rxe_queue.h" #include "rxe_hw_counters.h" static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr); /* dev */ static int rxe_query_device(struct ib_device *ibdev, struct ib_device_attr *attr, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibdev); int err; if (udata->inlen || udata->outlen) { rxe_dbg_dev(rxe, "malformed udata\n"); err = -EINVAL; goto err_out; } memcpy(attr, &rxe->attr, sizeof(*attr)); return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_query_port(struct ib_device *ibdev, u32 port_num, struct ib_port_attr *attr) { struct rxe_dev *rxe = to_rdev(ibdev); int err, ret; if (port_num != 1) { err = -EINVAL; rxe_dbg_dev(rxe, "bad port_num = %d\n", port_num); goto err_out; } memcpy(attr, &rxe->port.attr, sizeof(*attr)); mutex_lock(&rxe->usdev_lock); ret = ib_get_eth_speed(ibdev, port_num, &attr->active_speed, &attr->active_width); if (attr->state == IB_PORT_ACTIVE) attr->phys_state = IB_PORT_PHYS_STATE_LINK_UP; else if (dev_get_flags(rxe->ndev) & IFF_UP) attr->phys_state = IB_PORT_PHYS_STATE_POLLING; else attr->phys_state = IB_PORT_PHYS_STATE_DISABLED; mutex_unlock(&rxe->usdev_lock); return ret; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_query_pkey(struct ib_device *ibdev, u32 port_num, u16 index, u16 *pkey) { struct rxe_dev *rxe = to_rdev(ibdev); int err; if (index != 0) { err = -EINVAL; rxe_dbg_dev(rxe, "bad pkey index = %d\n", index); goto err_out; } *pkey = IB_DEFAULT_PKEY_FULL; return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_modify_device(struct ib_device *ibdev, int mask, struct ib_device_modify *attr) { struct rxe_dev *rxe = to_rdev(ibdev); int err; if (mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID | IB_DEVICE_MODIFY_NODE_DESC)) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "unsupported mask = 0x%x\n", mask); goto err_out; } if (mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) rxe->attr.sys_image_guid = cpu_to_be64(attr->sys_image_guid); if (mask & IB_DEVICE_MODIFY_NODE_DESC) { memcpy(rxe->ib_dev.node_desc, attr->node_desc, sizeof(rxe->ib_dev.node_desc)); } return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_modify_port(struct ib_device *ibdev, u32 port_num, int mask, struct ib_port_modify *attr) { struct rxe_dev *rxe = to_rdev(ibdev); struct rxe_port *port; int err; if (port_num != 1) { err = -EINVAL; rxe_dbg_dev(rxe, "bad port_num = %d\n", port_num); goto err_out; } //TODO is shutdown useful if (mask & ~(IB_PORT_RESET_QKEY_CNTR)) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "unsupported mask = 0x%x\n", mask); goto err_out; } port = &rxe->port; port->attr.port_cap_flags |= attr->set_port_cap_mask; port->attr.port_cap_flags &= ~attr->clr_port_cap_mask; if (mask & IB_PORT_RESET_QKEY_CNTR) port->attr.qkey_viol_cntr = 0; return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static enum rdma_link_layer rxe_get_link_layer(struct ib_device *ibdev, u32 port_num) { struct rxe_dev *rxe = to_rdev(ibdev); int err; if (port_num != 1) { err = -EINVAL; rxe_dbg_dev(rxe, "bad port_num = %d\n", port_num); goto err_out; } return IB_LINK_LAYER_ETHERNET; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_port_immutable(struct ib_device *ibdev, u32 port_num, struct ib_port_immutable *immutable) { struct rxe_dev *rxe = to_rdev(ibdev); struct ib_port_attr attr = {}; int err; if (port_num != 1) { err = -EINVAL; rxe_dbg_dev(rxe, "bad port_num = %d\n", port_num); goto err_out; } err = ib_query_port(ibdev, port_num, &attr); if (err) goto err_out; immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; immutable->max_mad_size = IB_MGMT_MAD_SIZE; return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } /* uc */ static int rxe_alloc_ucontext(struct ib_ucontext *ibuc, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibuc->device); struct rxe_ucontext *uc = to_ruc(ibuc); int err; err = rxe_add_to_pool(&rxe->uc_pool, uc); if (err) rxe_err_dev(rxe, "unable to create uc\n"); return err; } static void rxe_dealloc_ucontext(struct ib_ucontext *ibuc) { struct rxe_ucontext *uc = to_ruc(ibuc); int err; err = rxe_cleanup(uc); if (err) rxe_err_uc(uc, "cleanup failed, err = %d\n", err); } /* pd */ static int rxe_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); int err; err = rxe_add_to_pool(&rxe->pd_pool, pd); if (err) { rxe_dbg_dev(rxe, "unable to alloc pd\n"); goto err_out; } return 0; err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) { struct rxe_pd *pd = to_rpd(ibpd); int err; err = rxe_cleanup(pd); if (err) rxe_err_pd(pd, "cleanup failed, err = %d\n", err); return 0; } /* ah */ static int rxe_create_ah(struct ib_ah *ibah, struct rdma_ah_init_attr *init_attr, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibah->device); struct rxe_ah *ah = to_rah(ibah); struct rxe_create_ah_resp __user *uresp = NULL; int err, cleanup_err; if (udata) { /* test if new user provider */ if (udata->outlen >= sizeof(*uresp)) uresp = udata->outbuf; ah->is_user = true; } else { ah->is_user = false; } err = rxe_add_to_pool_ah(&rxe->ah_pool, ah, init_attr->flags & RDMA_CREATE_AH_SLEEPABLE); if (err) { rxe_dbg_dev(rxe, "unable to create ah\n"); goto err_out; } /* create index > 0 */ ah->ah_num = ah->elem.index; err = rxe_ah_chk_attr(ah, init_attr->ah_attr); if (err) { rxe_dbg_ah(ah, "bad attr\n"); goto err_cleanup; } if (uresp) { /* only if new user provider */ err = copy_to_user(&uresp->ah_num, &ah->ah_num, sizeof(uresp->ah_num)); if (err) { err = -EFAULT; rxe_dbg_ah(ah, "unable to copy to user\n"); goto err_cleanup; } } else if (ah->is_user) { /* only if old user provider */ ah->ah_num = 0; } rxe_init_av(init_attr->ah_attr, &ah->av); rxe_finalize(ah); return 0; err_cleanup: cleanup_err = rxe_cleanup(ah); if (cleanup_err) rxe_err_ah(ah, "cleanup failed, err = %d\n", cleanup_err); err_out: rxe_err_ah(ah, "returned err = %d\n", err); return err; } static int rxe_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) { struct rxe_ah *ah = to_rah(ibah); int err; err = rxe_ah_chk_attr(ah, attr); if (err) { rxe_dbg_ah(ah, "bad attr\n"); goto err_out; } rxe_init_av(attr, &ah->av); return 0; err_out: rxe_err_ah(ah, "returned err = %d\n", err); return err; } static int rxe_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr) { struct rxe_ah *ah = to_rah(ibah); memset(attr, 0, sizeof(*attr)); attr->type = ibah->type; rxe_av_to_attr(&ah->av, attr); return 0; } static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags) { struct rxe_ah *ah = to_rah(ibah); int err; err = rxe_cleanup_ah(ah, flags & RDMA_DESTROY_AH_SLEEPABLE); if (err) rxe_err_ah(ah, "cleanup failed, err = %d\n", err); return 0; } /* srq */ static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibsrq->device); struct rxe_pd *pd = to_rpd(ibsrq->pd); struct rxe_srq *srq = to_rsrq(ibsrq); struct rxe_create_srq_resp __user *uresp = NULL; int err, cleanup_err; if (udata) { if (udata->outlen < sizeof(*uresp)) { err = -EINVAL; rxe_err_dev(rxe, "malformed udata\n"); goto err_out; } uresp = udata->outbuf; } if (init->srq_type != IB_SRQT_BASIC) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "srq type = %d, not supported\n", init->srq_type); goto err_out; } err = rxe_srq_chk_init(rxe, init); if (err) { rxe_dbg_dev(rxe, "invalid init attributes\n"); goto err_out; } err = rxe_add_to_pool(&rxe->srq_pool, srq); if (err) { rxe_dbg_dev(rxe, "unable to create srq, err = %d\n", err); goto err_out; } rxe_get(pd); srq->pd = pd; err = rxe_srq_from_init(rxe, srq, init, udata, uresp); if (err) { rxe_dbg_srq(srq, "create srq failed, err = %d\n", err); goto err_cleanup; } return 0; err_cleanup: cleanup_err = rxe_cleanup(srq); if (cleanup_err) rxe_err_srq(srq, "cleanup failed, err = %d\n", cleanup_err); err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr, enum ib_srq_attr_mask mask, struct ib_udata *udata) { struct rxe_srq *srq = to_rsrq(ibsrq); struct rxe_dev *rxe = to_rdev(ibsrq->device); struct rxe_modify_srq_cmd cmd = {}; int err; if (udata) { if (udata->inlen < sizeof(cmd)) { err = -EINVAL; rxe_dbg_srq(srq, "malformed udata\n"); goto err_out; } err = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); if (err) { err = -EFAULT; rxe_dbg_srq(srq, "unable to read udata\n"); goto err_out; } } err = rxe_srq_chk_attr(rxe, srq, attr, mask); if (err) { rxe_dbg_srq(srq, "bad init attributes\n"); goto err_out; } err = rxe_srq_from_attr(rxe, srq, attr, mask, &cmd, udata); if (err) { rxe_dbg_srq(srq, "bad attr\n"); goto err_out; } return 0; err_out: rxe_err_srq(srq, "returned err = %d\n", err); return err; } static int rxe_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr) { struct rxe_srq *srq = to_rsrq(ibsrq); int err; if (srq->error) { err = -EINVAL; rxe_dbg_srq(srq, "srq in error state\n"); goto err_out; } attr->max_wr = srq->rq.queue->buf->index_mask; attr->max_sge = srq->rq.max_sge; attr->srq_limit = srq->limit; return 0; err_out: rxe_err_srq(srq, "returned err = %d\n", err); return err; } static int rxe_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) { int err = 0; struct rxe_srq *srq = to_rsrq(ibsrq); unsigned long flags; spin_lock_irqsave(&srq->rq.producer_lock, flags); while (wr) { err = post_one_recv(&srq->rq, wr); if (unlikely(err)) break; wr = wr->next; } spin_unlock_irqrestore(&srq->rq.producer_lock, flags); if (err) { *bad_wr = wr; rxe_err_srq(srq, "returned err = %d\n", err); } return err; } static int rxe_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata) { struct rxe_srq *srq = to_rsrq(ibsrq); int err; err = rxe_cleanup(srq); if (err) rxe_err_srq(srq, "cleanup failed, err = %d\n", err); return 0; } /* qp */ static int rxe_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibqp->device); struct rxe_pd *pd = to_rpd(ibqp->pd); struct rxe_qp *qp = to_rqp(ibqp); struct rxe_create_qp_resp __user *uresp = NULL; int err, cleanup_err; if (udata) { if (udata->inlen) { err = -EINVAL; rxe_dbg_dev(rxe, "malformed udata, err = %d\n", err); goto err_out; } if (udata->outlen < sizeof(*uresp)) { err = -EINVAL; rxe_dbg_dev(rxe, "malformed udata, err = %d\n", err); goto err_out; } qp->is_user = true; uresp = udata->outbuf; } else { qp->is_user = false; } if (init->create_flags) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "unsupported create_flags, err = %d\n", err); goto err_out; } err = rxe_qp_chk_init(rxe, init); if (err) { rxe_dbg_dev(rxe, "bad init attr, err = %d\n", err); goto err_out; } err = rxe_add_to_pool(&rxe->qp_pool, qp); if (err) { rxe_dbg_dev(rxe, "unable to create qp, err = %d\n", err); goto err_out; } err = rxe_qp_from_init(rxe, qp, pd, init, uresp, ibqp->pd, udata); if (err) { rxe_dbg_qp(qp, "create qp failed, err = %d\n", err); goto err_cleanup; } rxe_finalize(qp); return 0; err_cleanup: cleanup_err = rxe_cleanup(qp); if (cleanup_err) rxe_err_qp(qp, "cleanup failed, err = %d\n", cleanup_err); err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibqp->device); struct rxe_qp *qp = to_rqp(ibqp); int err; if (mask & ~IB_QP_ATTR_STANDARD_BITS) { err = -EOPNOTSUPP; rxe_dbg_qp(qp, "unsupported mask = 0x%x, err = %d\n", mask, err); goto err_out; } err = rxe_qp_chk_attr(rxe, qp, attr, mask); if (err) { rxe_dbg_qp(qp, "bad mask/attr, err = %d\n", err); goto err_out; } err = rxe_qp_from_attr(qp, attr, mask, udata); if (err) { rxe_dbg_qp(qp, "modify qp failed, err = %d\n", err); goto err_out; } if ((mask & IB_QP_AV) && (attr->ah_attr.ah_flags & IB_AH_GRH)) qp->src_port = rdma_get_udp_sport(attr->ah_attr.grh.flow_label, qp->ibqp.qp_num, qp->attr.dest_qp_num); return 0; err_out: rxe_err_qp(qp, "returned err = %d\n", err); return err; } static int rxe_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int mask, struct ib_qp_init_attr *init) { struct rxe_qp *qp = to_rqp(ibqp); rxe_qp_to_init(qp, init); rxe_qp_to_attr(qp, attr, mask); return 0; } static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) { struct rxe_qp *qp = to_rqp(ibqp); int err; err = rxe_qp_chk_destroy(qp); if (err) { rxe_dbg_qp(qp, "unable to destroy qp, err = %d\n", err); goto err_out; } err = rxe_cleanup(qp); if (err) rxe_err_qp(qp, "cleanup failed, err = %d\n", err); return 0; err_out: rxe_err_qp(qp, "returned err = %d\n", err); return err; } /* send wr */ /* sanity check incoming send work request */ static int validate_send_wr(struct rxe_qp *qp, const struct ib_send_wr *ibwr, unsigned int *maskp, unsigned int *lengthp) { int num_sge = ibwr->num_sge; struct rxe_sq *sq = &qp->sq; unsigned int mask = 0; unsigned long length = 0; int err = -EINVAL; int i; do { mask = wr_opcode_mask(ibwr->opcode, qp); if (!mask) { rxe_err_qp(qp, "bad wr opcode for qp type\n"); break; } if (num_sge > sq->max_sge) { rxe_err_qp(qp, "num_sge > max_sge\n"); break; } length = 0; for (i = 0; i < ibwr->num_sge; i++) length += ibwr->sg_list[i].length; if (length > (1UL << 31)) { rxe_err_qp(qp, "message length too long\n"); break; } if (mask & WR_ATOMIC_MASK) { if (length != 8) { rxe_err_qp(qp, "atomic length != 8\n"); break; } if (atomic_wr(ibwr)->remote_addr & 0x7) { rxe_err_qp(qp, "misaligned atomic address\n"); break; } } if (ibwr->send_flags & IB_SEND_INLINE) { if (!(mask & WR_INLINE_MASK)) { rxe_err_qp(qp, "opcode doesn't support inline data\n"); break; } if (length > sq->max_inline) { rxe_err_qp(qp, "inline length too big\n"); break; } } err = 0; } while (0); *maskp = mask; *lengthp = (int)length; return err; } static int init_send_wr(struct rxe_qp *qp, struct rxe_send_wr *wr, const struct ib_send_wr *ibwr) { wr->wr_id = ibwr->wr_id; wr->opcode = ibwr->opcode; wr->send_flags = ibwr->send_flags; if (qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_GSI) { struct ib_ah *ibah = ud_wr(ibwr)->ah; wr->wr.ud.remote_qpn = ud_wr(ibwr)->remote_qpn; wr->wr.ud.remote_qkey = ud_wr(ibwr)->remote_qkey; wr->wr.ud.ah_num = to_rah(ibah)->ah_num; if (qp_type(qp) == IB_QPT_GSI) wr->wr.ud.pkey_index = ud_wr(ibwr)->pkey_index; switch (wr->opcode) { case IB_WR_SEND_WITH_IMM: wr->ex.imm_data = ibwr->ex.imm_data; break; case IB_WR_SEND: break; default: rxe_err_qp(qp, "bad wr opcode %d for UD/GSI QP\n", wr->opcode); return -EINVAL; } } else { switch (wr->opcode) { case IB_WR_RDMA_WRITE_WITH_IMM: wr->ex.imm_data = ibwr->ex.imm_data; fallthrough; case IB_WR_RDMA_READ: case IB_WR_RDMA_WRITE: wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr; wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey; break; case IB_WR_SEND_WITH_IMM: wr->ex.imm_data = ibwr->ex.imm_data; break; case IB_WR_SEND_WITH_INV: wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; break; case IB_WR_RDMA_READ_WITH_INV: wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr; wr->wr.rdma.rkey = rdma_wr(ibwr)->rkey; break; case IB_WR_ATOMIC_CMP_AND_SWP: case IB_WR_ATOMIC_FETCH_AND_ADD: wr->wr.atomic.remote_addr = atomic_wr(ibwr)->remote_addr; wr->wr.atomic.compare_add = atomic_wr(ibwr)->compare_add; wr->wr.atomic.swap = atomic_wr(ibwr)->swap; wr->wr.atomic.rkey = atomic_wr(ibwr)->rkey; break; case IB_WR_LOCAL_INV: wr->ex.invalidate_rkey = ibwr->ex.invalidate_rkey; break; case IB_WR_REG_MR: wr->wr.reg.mr = reg_wr(ibwr)->mr; wr->wr.reg.key = reg_wr(ibwr)->key; wr->wr.reg.access = reg_wr(ibwr)->access; break; case IB_WR_SEND: case IB_WR_BIND_MW: case IB_WR_FLUSH: case IB_WR_ATOMIC_WRITE: break; default: rxe_err_qp(qp, "unsupported wr opcode %d\n", wr->opcode); return -EINVAL; } } return 0; } static void copy_inline_data_to_wqe(struct rxe_send_wqe *wqe, const struct ib_send_wr *ibwr) { struct ib_sge *sge = ibwr->sg_list; u8 *p = wqe->dma.inline_data; int i; for (i = 0; i < ibwr->num_sge; i++, sge++) { memcpy(p, ib_virt_dma_to_ptr(sge->addr), sge->length); p += sge->length; } } static int init_send_wqe(struct rxe_qp *qp, const struct ib_send_wr *ibwr, unsigned int mask, unsigned int length, struct rxe_send_wqe *wqe) { int num_sge = ibwr->num_sge; int err; err = init_send_wr(qp, &wqe->wr, ibwr); if (err) return err; /* local operation */ if (unlikely(mask & WR_LOCAL_OP_MASK)) { wqe->mask = mask; wqe->state = wqe_state_posted; return 0; } if (unlikely(ibwr->send_flags & IB_SEND_INLINE)) copy_inline_data_to_wqe(wqe, ibwr); else memcpy(wqe->dma.sge, ibwr->sg_list, num_sge * sizeof(struct ib_sge)); wqe->iova = mask & WR_ATOMIC_MASK ? atomic_wr(ibwr)->remote_addr : mask & WR_READ_OR_WRITE_MASK ? rdma_wr(ibwr)->remote_addr : 0; wqe->mask = mask; wqe->dma.length = length; wqe->dma.resid = length; wqe->dma.num_sge = num_sge; wqe->dma.cur_sge = 0; wqe->dma.sge_offset = 0; wqe->state = wqe_state_posted; wqe->ssn = atomic_add_return(1, &qp->ssn); return 0; } static int post_one_send(struct rxe_qp *qp, const struct ib_send_wr *ibwr) { int err; struct rxe_sq *sq = &qp->sq; struct rxe_send_wqe *send_wqe; unsigned int mask; unsigned int length; int full; err = validate_send_wr(qp, ibwr, &mask, &length); if (err) return err; full = queue_full(sq->queue, QUEUE_TYPE_FROM_ULP); if (unlikely(full)) { rxe_err_qp(qp, "send queue full\n"); return -ENOMEM; } send_wqe = queue_producer_addr(sq->queue, QUEUE_TYPE_FROM_ULP); err = init_send_wqe(qp, ibwr, mask, length, send_wqe); if (!err) queue_advance_producer(sq->queue, QUEUE_TYPE_FROM_ULP); return err; } static int rxe_post_send_kernel(struct rxe_qp *qp, const struct ib_send_wr *ibwr, const struct ib_send_wr **bad_wr) { int err = 0; unsigned long flags; int good = 0; spin_lock_irqsave(&qp->sq.sq_lock, flags); while (ibwr) { err = post_one_send(qp, ibwr); if (err) { *bad_wr = ibwr; break; } else { good++; } ibwr = ibwr->next; } spin_unlock_irqrestore(&qp->sq.sq_lock, flags); /* kickoff processing of any posted wqes */ if (good) rxe_sched_task(&qp->send_task); return err; } static int rxe_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr) { struct rxe_qp *qp = to_rqp(ibqp); int err; unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); /* caller has already called destroy_qp */ if (WARN_ON_ONCE(!qp->valid)) { spin_unlock_irqrestore(&qp->state_lock, flags); rxe_err_qp(qp, "qp has been destroyed\n"); return -EINVAL; } if (unlikely(qp_state(qp) < IB_QPS_RTS)) { spin_unlock_irqrestore(&qp->state_lock, flags); *bad_wr = wr; rxe_err_qp(qp, "qp not ready to send\n"); return -EINVAL; } spin_unlock_irqrestore(&qp->state_lock, flags); if (qp->is_user) { /* Utilize process context to do protocol processing */ rxe_sched_task(&qp->send_task); } else { err = rxe_post_send_kernel(qp, wr, bad_wr); if (err) return err; } return 0; } /* recv wr */ static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr) { int i; unsigned long length; struct rxe_recv_wqe *recv_wqe; int num_sge = ibwr->num_sge; int full; int err; full = queue_full(rq->queue, QUEUE_TYPE_FROM_ULP); if (unlikely(full)) { err = -ENOMEM; rxe_dbg("queue full\n"); goto err_out; } if (unlikely(num_sge > rq->max_sge)) { err = -EINVAL; rxe_dbg("bad num_sge > max_sge\n"); goto err_out; } length = 0; for (i = 0; i < num_sge; i++) length += ibwr->sg_list[i].length; /* IBA max message size is 2^31 */ if (length >= (1UL<<31)) { err = -EINVAL; rxe_dbg("message length too long\n"); goto err_out; } recv_wqe = queue_producer_addr(rq->queue, QUEUE_TYPE_FROM_ULP); recv_wqe->wr_id = ibwr->wr_id; recv_wqe->dma.length = length; recv_wqe->dma.resid = length; recv_wqe->dma.num_sge = num_sge; recv_wqe->dma.cur_sge = 0; recv_wqe->dma.sge_offset = 0; memcpy(recv_wqe->dma.sge, ibwr->sg_list, num_sge * sizeof(struct ib_sge)); queue_advance_producer(rq->queue, QUEUE_TYPE_FROM_ULP); return 0; err_out: rxe_dbg("returned err = %d\n", err); return err; } static int rxe_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr) { int err = 0; struct rxe_qp *qp = to_rqp(ibqp); struct rxe_rq *rq = &qp->rq; unsigned long flags; spin_lock_irqsave(&qp->state_lock, flags); /* caller has already called destroy_qp */ if (WARN_ON_ONCE(!qp->valid)) { spin_unlock_irqrestore(&qp->state_lock, flags); rxe_err_qp(qp, "qp has been destroyed\n"); return -EINVAL; } /* see C10-97.2.1 */ if (unlikely((qp_state(qp) < IB_QPS_INIT))) { spin_unlock_irqrestore(&qp->state_lock, flags); *bad_wr = wr; rxe_dbg_qp(qp, "qp not ready to post recv\n"); return -EINVAL; } spin_unlock_irqrestore(&qp->state_lock, flags); if (unlikely(qp->srq)) { *bad_wr = wr; rxe_dbg_qp(qp, "qp has srq, use post_srq_recv instead\n"); return -EINVAL; } spin_lock_irqsave(&rq->producer_lock, flags); while (wr) { err = post_one_recv(rq, wr); if (unlikely(err)) { *bad_wr = wr; break; } wr = wr->next; } spin_unlock_irqrestore(&rq->producer_lock, flags); spin_lock_irqsave(&qp->state_lock, flags); if (qp_state(qp) == IB_QPS_ERR) rxe_sched_task(&qp->recv_task); spin_unlock_irqrestore(&qp->state_lock, flags); return err; } /* cq */ static int rxe_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, struct uverbs_attr_bundle *attrs) { struct ib_udata *udata = &attrs->driver_udata; struct ib_device *dev = ibcq->device; struct rxe_dev *rxe = to_rdev(dev); struct rxe_cq *cq = to_rcq(ibcq); struct rxe_create_cq_resp __user *uresp = NULL; int err, cleanup_err; if (udata) { if (udata->outlen < sizeof(*uresp)) { err = -EINVAL; rxe_dbg_dev(rxe, "malformed udata, err = %d\n", err); goto err_out; } uresp = udata->outbuf; } if (attr->flags) { err = -EOPNOTSUPP; rxe_dbg_dev(rxe, "bad attr->flags, err = %d\n", err); goto err_out; } err = rxe_cq_chk_attr(rxe, NULL, attr->cqe, attr->comp_vector); if (err) { rxe_dbg_dev(rxe, "bad init attributes, err = %d\n", err); goto err_out; } err = rxe_add_to_pool(&rxe->cq_pool, cq); if (err) { rxe_dbg_dev(rxe, "unable to create cq, err = %d\n", err); goto err_out; } err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, udata, uresp); if (err) { rxe_dbg_cq(cq, "create cq failed, err = %d\n", err); goto err_cleanup; } return 0; err_cleanup: cleanup_err = rxe_cleanup(cq); if (cleanup_err) rxe_err_cq(cq, "cleanup failed, err = %d\n", cleanup_err); err_out: rxe_err_dev(rxe, "returned err = %d\n", err); return err; } static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata) { struct rxe_cq *cq = to_rcq(ibcq); struct rxe_dev *rxe = to_rdev(ibcq->device); struct rxe_resize_cq_resp __user *uresp = NULL; int err; if (udata) { if (udata->outlen < sizeof(*uresp)) { err = -EINVAL; rxe_dbg_cq(cq, "malformed udata\n"); goto err_out; } uresp = udata->outbuf; } err = rxe_cq_chk_attr(rxe, cq, cqe, 0); if (err) { rxe_dbg_cq(cq, "bad attr, err = %d\n", err); goto err_out; } err = rxe_cq_resize_queue(cq, cqe, uresp, udata); if (err) { rxe_dbg_cq(cq, "resize cq failed, err = %d\n", err); goto err_out; } return 0; err_out: rxe_err_cq(cq, "returned err = %d\n", err); return err; } static int rxe_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc) { int i; struct rxe_cq *cq = to_rcq(ibcq); struct rxe_cqe *cqe; unsigned long flags; spin_lock_irqsave(&cq->cq_lock, flags); for (i = 0; i < num_entries; i++) { cqe = queue_head(cq->queue, QUEUE_TYPE_TO_ULP); if (!cqe) break; /* queue empty */ memcpy(wc++, &cqe->ibwc, sizeof(*wc)); queue_advance_consumer(cq->queue, QUEUE_TYPE_TO_ULP); } spin_unlock_irqrestore(&cq->cq_lock, flags); return i; } static int rxe_peek_cq(struct ib_cq *ibcq, int wc_cnt) { struct rxe_cq *cq = to_rcq(ibcq); int count; count = queue_count(cq->queue, QUEUE_TYPE_TO_ULP); return (count > wc_cnt) ? wc_cnt : count; } static int rxe_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags) { struct rxe_cq *cq = to_rcq(ibcq); int ret = 0; int empty; unsigned long irq_flags; spin_lock_irqsave(&cq->cq_lock, irq_flags); cq->notify |= flags & IB_CQ_SOLICITED_MASK; empty = queue_empty(cq->queue, QUEUE_TYPE_TO_ULP); if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !empty) ret = 1; spin_unlock_irqrestore(&cq->cq_lock, irq_flags); return ret; } static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) { struct rxe_cq *cq = to_rcq(ibcq); int err; /* See IBA C11-17: The CI shall return an error if this Verb is * invoked while a Work Queue is still associated with the CQ. */ if (atomic_read(&cq->num_wq)) { err = -EINVAL; rxe_dbg_cq(cq, "still in use\n"); goto err_out; } err = rxe_cleanup(cq); if (err) rxe_err_cq(cq, "cleanup failed, err = %d\n", err); return 0; err_out: rxe_err_cq(cq, "returned err = %d\n", err); return err; } /* mr */ static struct ib_mr *rxe_get_dma_mr(struct ib_pd *ibpd, int access) { struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); struct rxe_mr *mr; int err; mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); err = rxe_add_to_pool(&rxe->mr_pool, mr); if (err) { rxe_dbg_dev(rxe, "unable to create mr\n"); goto err_free; } rxe_get(pd); mr->ibmr.pd = ibpd; mr->ibmr.device = ibpd->device; rxe_mr_init_dma(access, mr); rxe_finalize(mr); return &mr->ibmr; err_free: kfree(mr); rxe_err_pd(pd, "returned err = %d\n", err); return ERR_PTR(err); } static struct ib_mr *rxe_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, u64 iova, int access, struct ib_udata *udata) { struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); struct rxe_mr *mr; int err, cleanup_err; if (access & ~RXE_ACCESS_SUPPORTED_MR) { rxe_err_pd(pd, "access = %#x not supported (%#x)\n", access, RXE_ACCESS_SUPPORTED_MR); return ERR_PTR(-EOPNOTSUPP); } mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); err = rxe_add_to_pool(&rxe->mr_pool, mr); if (err) { rxe_dbg_pd(pd, "unable to create mr\n"); goto err_free; } rxe_get(pd); mr->ibmr.pd = ibpd; mr->ibmr.device = ibpd->device; err = rxe_mr_init_user(rxe, start, length, access, mr); if (err) { rxe_dbg_mr(mr, "reg_user_mr failed, err = %d\n", err); goto err_cleanup; } rxe_finalize(mr); return &mr->ibmr; err_cleanup: cleanup_err = rxe_cleanup(mr); if (cleanup_err) rxe_err_mr(mr, "cleanup failed, err = %d\n", cleanup_err); err_free: kfree(mr); rxe_err_pd(pd, "returned err = %d\n", err); return ERR_PTR(err); } static struct ib_mr *rxe_rereg_user_mr(struct ib_mr *ibmr, int flags, u64 start, u64 length, u64 iova, int access, struct ib_pd *ibpd, struct ib_udata *udata) { struct rxe_mr *mr = to_rmr(ibmr); struct rxe_pd *old_pd = to_rpd(ibmr->pd); struct rxe_pd *pd = to_rpd(ibpd); /* for now only support the two easy cases: * rereg_pd and rereg_access */ if (flags & ~RXE_MR_REREG_SUPPORTED) { rxe_err_mr(mr, "flags = %#x not supported\n", flags); return ERR_PTR(-EOPNOTSUPP); } if (flags & IB_MR_REREG_PD) { rxe_put(old_pd); rxe_get(pd); mr->ibmr.pd = ibpd; } if (flags & IB_MR_REREG_ACCESS) { if (access & ~RXE_ACCESS_SUPPORTED_MR) { rxe_err_mr(mr, "access = %#x not supported\n", access); return ERR_PTR(-EOPNOTSUPP); } mr->access = access; } return NULL; } static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type, u32 max_num_sg) { struct rxe_dev *rxe = to_rdev(ibpd->device); struct rxe_pd *pd = to_rpd(ibpd); struct rxe_mr *mr; int err, cleanup_err; if (mr_type != IB_MR_TYPE_MEM_REG) { err = -EINVAL; rxe_dbg_pd(pd, "mr type %d not supported, err = %d\n", mr_type, err); goto err_out; } mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) return ERR_PTR(-ENOMEM); err = rxe_add_to_pool(&rxe->mr_pool, mr); if (err) goto err_free; rxe_get(pd); mr->ibmr.pd = ibpd; mr->ibmr.device = ibpd->device; err = rxe_mr_init_fast(max_num_sg, mr); if (err) { rxe_dbg_mr(mr, "alloc_mr failed, err = %d\n", err); goto err_cleanup; } rxe_finalize(mr); return &mr->ibmr; err_cleanup: cleanup_err = rxe_cleanup(mr); if (cleanup_err) rxe_err_mr(mr, "cleanup failed, err = %d\n", err); err_free: kfree(mr); err_out: rxe_err_pd(pd, "returned err = %d\n", err); return ERR_PTR(err); } static int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) { struct rxe_mr *mr = to_rmr(ibmr); int err, cleanup_err; /* See IBA 10.6.7.2.6 */ if (atomic_read(&mr->num_mw) > 0) { err = -EINVAL; rxe_dbg_mr(mr, "mr has mw's bound\n"); goto err_out; } cleanup_err = rxe_cleanup(mr); if (cleanup_err) rxe_err_mr(mr, "cleanup failed, err = %d\n", cleanup_err); kfree_rcu_mightsleep(mr); return 0; err_out: rxe_err_mr(mr, "returned err = %d\n", err); return err; } static ssize_t parent_show(struct device *device, struct device_attribute *attr, char *buf) { struct rxe_dev *rxe = rdma_device_to_drv_device(device, struct rxe_dev, ib_dev); return sysfs_emit(buf, "%s\n", rxe_parent_name(rxe, 1)); } static DEVICE_ATTR_RO(parent); static struct attribute *rxe_dev_attributes[] = { &dev_attr_parent.attr, NULL }; static const struct attribute_group rxe_attr_group = { .attrs = rxe_dev_attributes, }; static int rxe_enable_driver(struct ib_device *ib_dev) { struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev); rxe_set_port_state(rxe); dev_info(&rxe->ib_dev.dev, "added %s\n", netdev_name(rxe->ndev)); return 0; } static const struct ib_device_ops rxe_dev_ops = { .owner = THIS_MODULE, .driver_id = RDMA_DRIVER_RXE, .uverbs_abi_ver = RXE_UVERBS_ABI_VERSION, .alloc_hw_port_stats = rxe_ib_alloc_hw_port_stats, .alloc_mr = rxe_alloc_mr, .alloc_mw = rxe_alloc_mw, .alloc_pd = rxe_alloc_pd, .alloc_ucontext = rxe_alloc_ucontext, .attach_mcast = rxe_attach_mcast, .create_ah = rxe_create_ah, .create_cq = rxe_create_cq, .create_qp = rxe_create_qp, .create_srq = rxe_create_srq, .create_user_ah = rxe_create_ah, .dealloc_driver = rxe_dealloc, .dealloc_mw = rxe_dealloc_mw, .dealloc_pd = rxe_dealloc_pd, .dealloc_ucontext = rxe_dealloc_ucontext, .dereg_mr = rxe_dereg_mr, .destroy_ah = rxe_destroy_ah, .destroy_cq = rxe_destroy_cq, .destroy_qp = rxe_destroy_qp, .destroy_srq = rxe_destroy_srq, .detach_mcast = rxe_detach_mcast, .device_group = &rxe_attr_group, .enable_driver = rxe_enable_driver, .get_dma_mr = rxe_get_dma_mr, .get_hw_stats = rxe_ib_get_hw_stats, .get_link_layer = rxe_get_link_layer, .get_port_immutable = rxe_port_immutable, .map_mr_sg = rxe_map_mr_sg, .mmap = rxe_mmap, .modify_ah = rxe_modify_ah, .modify_device = rxe_modify_device, .modify_port = rxe_modify_port, .modify_qp = rxe_modify_qp, .modify_srq = rxe_modify_srq, .peek_cq = rxe_peek_cq, .poll_cq = rxe_poll_cq, .post_recv = rxe_post_recv, .post_send = rxe_post_send, .post_srq_recv = rxe_post_srq_recv, .query_ah = rxe_query_ah, .query_device = rxe_query_device, .query_pkey = rxe_query_pkey, .query_port = rxe_query_port, .query_qp = rxe_query_qp, .query_srq = rxe_query_srq, .reg_user_mr = rxe_reg_user_mr, .req_notify_cq = rxe_req_notify_cq, .rereg_user_mr = rxe_rereg_user_mr, .resize_cq = rxe_resize_cq, INIT_RDMA_OBJ_SIZE(ib_ah, rxe_ah, ibah), INIT_RDMA_OBJ_SIZE(ib_cq, rxe_cq, ibcq), INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd), INIT_RDMA_OBJ_SIZE(ib_qp, rxe_qp, ibqp), INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq), INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc), INIT_RDMA_OBJ_SIZE(ib_mw, rxe_mw, ibmw), }; int rxe_register_device(struct rxe_dev *rxe, const char *ibdev_name) { int err; struct ib_device *dev = &rxe->ib_dev; strscpy(dev->node_desc, "rxe", sizeof(dev->node_desc)); dev->node_type = RDMA_NODE_IB_CA; dev->phys_port_cnt = 1; dev->num_comp_vectors = num_possible_cpus(); dev->local_dma_lkey = 0; addrconf_addr_eui48((unsigned char *)&dev->node_guid, rxe->ndev->dev_addr); dev->uverbs_cmd_mask |= BIT_ULL(IB_USER_VERBS_CMD_POST_SEND) | BIT_ULL(IB_USER_VERBS_CMD_REQ_NOTIFY_CQ); ib_set_device_ops(dev, &rxe_dev_ops); err = ib_device_set_netdev(&rxe->ib_dev, rxe->ndev, 1); if (err) return err; err = rxe_icrc_init(rxe); if (err) return err; err = ib_register_device(dev, ibdev_name, NULL); if (err) rxe_dbg_dev(rxe, "failed with error %d\n", err); /* * Note that rxe may be invalid at this point if another thread * unregistered it. */ return err; } |
229 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 | // SPDX-License-Identifier: GPL-2.0 /* User-mappable watch queue * * Copyright (C) 2020 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * See Documentation/core-api/watch_queue.rst */ #ifndef _LINUX_WATCH_QUEUE_H #define _LINUX_WATCH_QUEUE_H #include <uapi/linux/watch_queue.h> #include <linux/kref.h> #include <linux/rcupdate.h> #ifdef CONFIG_WATCH_QUEUE struct cred; struct watch_type_filter { enum watch_notification_type type; __u32 subtype_filter[1]; /* Bitmask of subtypes to filter on */ __u32 info_filter; /* Filter on watch_notification::info */ __u32 info_mask; /* Mask of relevant bits in info_filter */ }; struct watch_filter { union { struct rcu_head rcu; /* Bitmask of accepted types */ DECLARE_BITMAP(type_filter, WATCH_TYPE__NR); }; u32 nr_filters; /* Number of filters */ struct watch_type_filter filters[] __counted_by(nr_filters); }; struct watch_queue { struct rcu_head rcu; struct watch_filter __rcu *filter; struct pipe_inode_info *pipe; /* Pipe we use as a buffer, NULL if queue closed */ struct hlist_head watches; /* Contributory watches */ struct page **notes; /* Preallocated notifications */ unsigned long *notes_bitmap; /* Allocation bitmap for notes */ struct kref usage; /* Object usage count */ spinlock_t lock; unsigned int nr_notes; /* Number of notes */ unsigned int nr_pages; /* Number of pages in notes[] */ }; /* * Representation of a watch on an object. */ struct watch { union { struct rcu_head rcu; u32 info_id; /* ID to be OR'd in to info field */ }; struct watch_queue __rcu *queue; /* Queue to post events to */ struct hlist_node queue_node; /* Link in queue->watches */ struct watch_list __rcu *watch_list; struct hlist_node list_node; /* Link in watch_list->watchers */ const struct cred *cred; /* Creds of the owner of the watch */ void *private; /* Private data for the watched object */ u64 id; /* Internal identifier */ struct kref usage; /* Object usage count */ }; /* * List of watches on an object. */ struct watch_list { struct rcu_head rcu; struct hlist_head watchers; void (*release_watch)(struct watch *); spinlock_t lock; }; extern void __post_watch_notification(struct watch_list *, struct watch_notification *, const struct cred *, u64); extern struct watch_queue *get_watch_queue(int); extern void put_watch_queue(struct watch_queue *); extern void init_watch(struct watch *, struct watch_queue *); extern int add_watch_to_object(struct watch *, struct watch_list *); extern int remove_watch_from_object(struct watch_list *, struct watch_queue *, u64, bool); extern long watch_queue_set_size(struct pipe_inode_info *, unsigned int); extern long watch_queue_set_filter(struct pipe_inode_info *, struct watch_notification_filter __user *); extern int watch_queue_init(struct pipe_inode_info *); extern void watch_queue_clear(struct watch_queue *); static inline void init_watch_list(struct watch_list *wlist, void (*release_watch)(struct watch *)) { INIT_HLIST_HEAD(&wlist->watchers); spin_lock_init(&wlist->lock); wlist->release_watch = release_watch; } static inline void post_watch_notification(struct watch_list *wlist, struct watch_notification *n, const struct cred *cred, u64 id) { if (unlikely(wlist)) __post_watch_notification(wlist, n, cred, id); } static inline void remove_watch_list(struct watch_list *wlist, u64 id) { if (wlist) { remove_watch_from_object(wlist, NULL, id, true); kfree_rcu(wlist, rcu); } } /** * watch_sizeof - Calculate the information part of the size of a watch record, * given the structure size. */ #define watch_sizeof(STRUCT) (sizeof(STRUCT) << WATCH_INFO_LENGTH__SHIFT) #else static inline int watch_queue_init(struct pipe_inode_info *pipe) { return -ENOPKG; } #endif #endif /* _LINUX_WATCH_QUEUE_H */ |
4 7 2 2 15 14 2 1 1 4 2 4 1 1 1 1 1 1 4 4 4 17 8 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 | /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (C) 2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ #ifndef __IP_SET_BITMAP_IP_GEN_H #define __IP_SET_BITMAP_IP_GEN_H #include <linux/rcupdate_wait.h> #define mtype_do_test IPSET_TOKEN(MTYPE, _do_test) #define mtype_gc_test IPSET_TOKEN(MTYPE, _gc_test) #define mtype_is_filled IPSET_TOKEN(MTYPE, _is_filled) #define mtype_do_add IPSET_TOKEN(MTYPE, _do_add) #define mtype_ext_cleanup IPSET_TOKEN(MTYPE, _ext_cleanup) #define mtype_do_del IPSET_TOKEN(MTYPE, _do_del) #define mtype_do_list IPSET_TOKEN(MTYPE, _do_list) #define mtype_do_head IPSET_TOKEN(MTYPE, _do_head) #define mtype_adt_elem IPSET_TOKEN(MTYPE, _adt_elem) #define mtype_add_timeout IPSET_TOKEN(MTYPE, _add_timeout) #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) #define mtype_kadt IPSET_TOKEN(MTYPE, _kadt) #define mtype_uadt IPSET_TOKEN(MTYPE, _uadt) #define mtype_destroy IPSET_TOKEN(MTYPE, _destroy) #define mtype_memsize IPSET_TOKEN(MTYPE, _memsize) #define mtype_flush IPSET_TOKEN(MTYPE, _flush) #define mtype_head IPSET_TOKEN(MTYPE, _head) #define mtype_same_set IPSET_TOKEN(MTYPE, _same_set) #define mtype_elem IPSET_TOKEN(MTYPE, _elem) #define mtype_test IPSET_TOKEN(MTYPE, _test) #define mtype_add IPSET_TOKEN(MTYPE, _add) #define mtype_del IPSET_TOKEN(MTYPE, _del) #define mtype_list IPSET_TOKEN(MTYPE, _list) #define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc) #define mtype MTYPE #define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id))) static void mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t)) { struct mtype *map = set->data; timer_setup(&map->gc, gc, 0); mod_timer(&map->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ); } static void mtype_ext_cleanup(struct ip_set *set) { struct mtype *map = set->data; u32 id; for (id = 0; id < map->elements; id++) if (test_bit(id, map->members)) ip_set_ext_destroy(set, get_ext(set, map, id)); } static void mtype_destroy(struct ip_set *set) { struct mtype *map = set->data; if (set->dsize && set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); ip_set_free(map->members); ip_set_free(map); set->data = NULL; } static void mtype_flush(struct ip_set *set) { struct mtype *map = set->data; if (set->extensions & IPSET_EXT_DESTROY) mtype_ext_cleanup(set); bitmap_zero(map->members, map->elements); set->elements = 0; set->ext_size = 0; } /* Calculate the actual memory size of the set data */ static size_t mtype_memsize(const struct mtype *map, size_t dsize) { return sizeof(*map) + map->memsize + map->elements * dsize; } static int mtype_head(struct ip_set *set, struct sk_buff *skb) { const struct mtype *map = set->data; struct nlattr *nested; size_t memsize = mtype_memsize(map, set->dsize) + set->ext_size; nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) goto nla_put_failure; if (mtype_do_head(skb, map) || nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements))) goto nla_put_failure; if (unlikely(ip_set_put_flags(skb, set))) goto nla_put_failure; nla_nest_end(skb, nested); return 0; nla_put_failure: return -EMSGSIZE; } static int mtype_test(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct mtype *map = set->data; const struct mtype_adt_elem *e = value; void *x = get_ext(set, map, e->id); int ret = mtype_do_test(e, map, set->dsize); if (ret <= 0) return ret; return ip_set_match_extensions(set, ext, mext, flags, x); } static int mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct mtype *map = set->data; const struct mtype_adt_elem *e = value; void *x = get_ext(set, map, e->id); int ret = mtype_do_add(e, map, flags, set->dsize); if (ret == IPSET_ADD_FAILED) { if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(x, set))) { set->elements--; ret = 0; } else if (!(flags & IPSET_FLAG_EXIST)) { set_bit(e->id, map->members); return -IPSET_ERR_EXIST; } /* Element is re-added, cleanup extensions */ ip_set_ext_destroy(set, x); } if (ret > 0) set->elements--; if (SET_WITH_TIMEOUT(set)) #ifdef IP_SET_BITMAP_STORED_TIMEOUT mtype_add_timeout(ext_timeout(x, set), e, ext, set, map, ret); #else ip_set_timeout_set(ext_timeout(x, set), ext->timeout); #endif if (SET_WITH_COUNTER(set)) ip_set_init_counter(ext_counter(x, set), ext); if (SET_WITH_COMMENT(set)) ip_set_init_comment(set, ext_comment(x, set), ext); if (SET_WITH_SKBINFO(set)) ip_set_init_skbinfo(ext_skbinfo(x, set), ext); /* Activate element */ set_bit(e->id, map->members); set->elements++; return 0; } static int mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, struct ip_set_ext *mext, u32 flags) { struct mtype *map = set->data; const struct mtype_adt_elem *e = value; void *x = get_ext(set, map, e->id); if (mtype_do_del(e, map)) return -IPSET_ERR_EXIST; ip_set_ext_destroy(set, x); set->elements--; if (SET_WITH_TIMEOUT(set) && ip_set_timeout_expired(ext_timeout(x, set))) return -IPSET_ERR_EXIST; return 0; } #ifndef IP_SET_BITMAP_STORED_TIMEOUT static bool mtype_is_filled(const struct mtype_elem *x) { return true; } #endif static int mtype_list(const struct ip_set *set, struct sk_buff *skb, struct netlink_callback *cb) { struct mtype *map = set->data; struct nlattr *adt, *nested; void *x; u32 id, first = cb->args[IPSET_CB_ARG0]; int ret = 0; adt = nla_nest_start(skb, IPSET_ATTR_ADT); if (!adt) return -EMSGSIZE; /* Extensions may be replaced */ rcu_read_lock(); for (; cb->args[IPSET_CB_ARG0] < map->elements; cb->args[IPSET_CB_ARG0]++) { cond_resched_rcu(); id = cb->args[IPSET_CB_ARG0]; x = get_ext(set, map, id); if (!test_bit(id, map->members) || (SET_WITH_TIMEOUT(set) && #ifdef IP_SET_BITMAP_STORED_TIMEOUT mtype_is_filled(x) && #endif ip_set_timeout_expired(ext_timeout(x, set)))) continue; nested = nla_nest_start(skb, IPSET_ATTR_DATA); if (!nested) { if (id == first) { nla_nest_cancel(skb, adt); ret = -EMSGSIZE; goto out; } goto nla_put_failure; } if (mtype_do_list(skb, map, id, set->dsize)) goto nla_put_failure; if (ip_set_put_extensions(skb, set, x, mtype_is_filled(x))) goto nla_put_failure; nla_nest_end(skb, nested); } nla_nest_end(skb, adt); /* Set listing finished */ cb->args[IPSET_CB_ARG0] = 0; goto out; nla_put_failure: nla_nest_cancel(skb, nested); if (unlikely(id == first)) { cb->args[IPSET_CB_ARG0] = 0; ret = -EMSGSIZE; } nla_nest_end(skb, adt); out: rcu_read_unlock(); return ret; } static void mtype_gc(struct timer_list *t) { struct mtype *map = from_timer(map, t, gc); struct ip_set *set = map->set; void *x; u32 id; /* We run parallel with other readers (test element) * but adding/deleting new entries is locked out */ spin_lock_bh(&set->lock); for (id = 0; id < map->elements; id++) if (mtype_gc_test(id, map, set->dsize)) { x = get_ext(set, map, id); if (ip_set_timeout_expired(ext_timeout(x, set))) { clear_bit(id, map->members); ip_set_ext_destroy(set, x); set->elements--; } } spin_unlock_bh(&set->lock); map->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; add_timer(&map->gc); } static void mtype_cancel_gc(struct ip_set *set) { struct mtype *map = set->data; if (SET_WITH_TIMEOUT(set)) del_timer_sync(&map->gc); } static const struct ip_set_type_variant mtype = { .kadt = mtype_kadt, .uadt = mtype_uadt, .adt = { [IPSET_ADD] = mtype_add, [IPSET_DEL] = mtype_del, [IPSET_TEST] = mtype_test, }, .destroy = mtype_destroy, .flush = mtype_flush, .head = mtype_head, .list = mtype_list, .same_set = mtype_same_set, .cancel_gc = mtype_cancel_gc, }; #endif /* __IP_SET_BITMAP_IP_GEN_H */ |
3 1 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 | // SPDX-License-Identifier: GPL-2.0-only /* Kernel module to match Hop-by-Hop and Destination parameters. */ /* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <linux/types.h> #include <net/checksum.h> #include <net/ipv6.h> #include <asm/byteorder.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_ipv6/ip6t_opts.h> MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: IPv6 Hop-By-Hop and Destination Header match"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); MODULE_ALIAS("ip6t_dst"); /* * (Type & 0xC0) >> 6 * 0 -> ignorable * 1 -> must drop the packet * 2 -> send ICMP PARM PROB regardless and drop packet * 3 -> Send ICMP if not a multicast address and drop packet * (Type & 0x20) >> 5 * 0 -> invariant * 1 -> can change the routing * (Type & 0x1F) Type * 0 -> Pad1 (only 1 byte!) * 1 -> PadN LENGTH info (total length = length + 2) * C0 | 2 -> JUMBO 4 x x x x ( xxxx > 64k ) * 5 -> RTALERT 2 x x */ static struct xt_match hbh_mt6_reg[] __read_mostly; static bool hbh_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct ipv6_opt_hdr _optsh; const struct ipv6_opt_hdr *oh; const struct ip6t_opts *optinfo = par->matchinfo; unsigned int temp; unsigned int ptr = 0; unsigned int hdrlen = 0; bool ret = false; u8 _opttype; u8 _optlen; const u_int8_t *tp = NULL; const u_int8_t *lp = NULL; unsigned int optlen; int err; err = ipv6_find_hdr(skb, &ptr, (par->match == &hbh_mt6_reg[0]) ? NEXTHDR_HOP : NEXTHDR_DEST, NULL, NULL); if (err < 0) { if (err != -ENOENT) par->hotdrop = true; return false; } oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh); if (oh == NULL) { par->hotdrop = true; return false; } hdrlen = ipv6_optlen(oh); if (skb->len - ptr < hdrlen) { /* Packet smaller than it's length field */ return false; } pr_debug("IPv6 OPTS LEN %u %u ", hdrlen, oh->hdrlen); pr_debug("len %02X %04X %02X ", optinfo->hdrlen, hdrlen, (!(optinfo->flags & IP6T_OPTS_LEN) || ((optinfo->hdrlen == hdrlen) ^ !!(optinfo->invflags & IP6T_OPTS_INV_LEN)))); ret = (!(optinfo->flags & IP6T_OPTS_LEN) || ((optinfo->hdrlen == hdrlen) ^ !!(optinfo->invflags & IP6T_OPTS_INV_LEN))); ptr += 2; hdrlen -= 2; if (!(optinfo->flags & IP6T_OPTS_OPTS)) { return ret; } else { pr_debug("Strict "); pr_debug("#%d ", optinfo->optsnr); for (temp = 0; temp < optinfo->optsnr; temp++) { /* type field exists ? */ if (hdrlen < 1) break; tp = skb_header_pointer(skb, ptr, sizeof(_opttype), &_opttype); if (tp == NULL) break; /* Type check */ if (*tp != (optinfo->opts[temp] & 0xFF00) >> 8) { pr_debug("Tbad %02X %02X\n", *tp, (optinfo->opts[temp] & 0xFF00) >> 8); return false; } else { pr_debug("Tok "); } /* Length check */ if (*tp) { u16 spec_len; /* length field exists ? */ if (hdrlen < 2) break; lp = skb_header_pointer(skb, ptr + 1, sizeof(_optlen), &_optlen); if (lp == NULL) break; spec_len = optinfo->opts[temp] & 0x00FF; if (spec_len != 0x00FF && spec_len != *lp) { pr_debug("Lbad %02X %04X\n", *lp, spec_len); return false; } pr_debug("Lok "); optlen = *lp + 2; } else { pr_debug("Pad1\n"); optlen = 1; } /* Step to the next */ pr_debug("len%04X\n", optlen); if ((ptr > skb->len - optlen || hdrlen < optlen) && temp < optinfo->optsnr - 1) { pr_debug("new pointer is too large!\n"); break; } ptr += optlen; hdrlen -= optlen; } if (temp == optinfo->optsnr) return ret; else return false; } return false; } static int hbh_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_opts *optsinfo = par->matchinfo; if (optsinfo->invflags & ~IP6T_OPTS_INV_MASK) { pr_debug("unknown flags %X\n", optsinfo->invflags); return -EINVAL; } if (optsinfo->flags & IP6T_OPTS_NSTRICT) { pr_debug("Not strict - not implemented"); return -EINVAL; } return 0; } static struct xt_match hbh_mt6_reg[] __read_mostly = { { /* Note, hbh_mt6 relies on the order of hbh_mt6_reg */ .name = "hbh", .family = NFPROTO_IPV6, .match = hbh_mt6, .matchsize = sizeof(struct ip6t_opts), .checkentry = hbh_mt6_check, .me = THIS_MODULE, }, { .name = "dst", .family = NFPROTO_IPV6, .match = hbh_mt6, .matchsize = sizeof(struct ip6t_opts), .checkentry = hbh_mt6_check, .me = THIS_MODULE, }, }; static int __init hbh_mt6_init(void) { return xt_register_matches(hbh_mt6_reg, ARRAY_SIZE(hbh_mt6_reg)); } static void __exit hbh_mt6_exit(void) { xt_unregister_matches(hbh_mt6_reg, ARRAY_SIZE(hbh_mt6_reg)); } module_init(hbh_mt6_init); module_exit(hbh_mt6_exit); |
50 12 1 8 2 50 46 7 64 65 1 64 3537 65 3540 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM cgroup #if !defined(_TRACE_CGROUP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_CGROUP_H #include <linux/cgroup.h> #include <linux/tracepoint.h> DECLARE_EVENT_CLASS(cgroup_root, TP_PROTO(struct cgroup_root *root), TP_ARGS(root), TP_STRUCT__entry( __field( int, root ) __field( u16, ss_mask ) __string( name, root->name ) ), TP_fast_assign( __entry->root = root->hierarchy_id; __entry->ss_mask = root->subsys_mask; __assign_str(name); ), TP_printk("root=%d ss_mask=%#x name=%s", __entry->root, __entry->ss_mask, __get_str(name)) ); DEFINE_EVENT(cgroup_root, cgroup_setup_root, TP_PROTO(struct cgroup_root *root), TP_ARGS(root) ); DEFINE_EVENT(cgroup_root, cgroup_destroy_root, TP_PROTO(struct cgroup_root *root), TP_ARGS(root) ); DEFINE_EVENT(cgroup_root, cgroup_remount, TP_PROTO(struct cgroup_root *root), TP_ARGS(root) ); DECLARE_EVENT_CLASS(cgroup, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path), TP_STRUCT__entry( __field( int, root ) __field( int, level ) __field( u64, id ) __string( path, path ) ), TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; __assign_str(path); ), TP_printk("root=%d id=%llu level=%d path=%s", __entry->root, __entry->id, __entry->level, __get_str(path)) ); DEFINE_EVENT(cgroup, cgroup_mkdir, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_rmdir, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_release, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_rename, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_freeze, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_unfreeze, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DECLARE_EVENT_CLASS(cgroup_migrate, TP_PROTO(struct cgroup *dst_cgrp, const char *path, struct task_struct *task, bool threadgroup), TP_ARGS(dst_cgrp, path, task, threadgroup), TP_STRUCT__entry( __field( int, dst_root ) __field( int, dst_level ) __field( u64, dst_id ) __field( int, pid ) __string( dst_path, path ) __string( comm, task->comm ) ), TP_fast_assign( __entry->dst_root = dst_cgrp->root->hierarchy_id; __entry->dst_id = cgroup_id(dst_cgrp); __entry->dst_level = dst_cgrp->level; __assign_str(dst_path); __entry->pid = task->pid; __assign_str(comm); ), TP_printk("dst_root=%d dst_id=%llu dst_level=%d dst_path=%s pid=%d comm=%s", __entry->dst_root, __entry->dst_id, __entry->dst_level, __get_str(dst_path), __entry->pid, __get_str(comm)) ); DEFINE_EVENT(cgroup_migrate, cgroup_attach_task, TP_PROTO(struct cgroup *dst_cgrp, const char *path, struct task_struct *task, bool threadgroup), TP_ARGS(dst_cgrp, path, task, threadgroup) ); DEFINE_EVENT(cgroup_migrate, cgroup_transfer_tasks, TP_PROTO(struct cgroup *dst_cgrp, const char *path, struct task_struct *task, bool threadgroup), TP_ARGS(dst_cgrp, path, task, threadgroup) ); DECLARE_EVENT_CLASS(cgroup_event, TP_PROTO(struct cgroup *cgrp, const char *path, int val), TP_ARGS(cgrp, path, val), TP_STRUCT__entry( __field( int, root ) __field( int, level ) __field( u64, id ) __string( path, path ) __field( int, val ) ), TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; __assign_str(path); __entry->val = val; ), TP_printk("root=%d id=%llu level=%d path=%s val=%d", __entry->root, __entry->id, __entry->level, __get_str(path), __entry->val) ); DEFINE_EVENT(cgroup_event, cgroup_notify_populated, TP_PROTO(struct cgroup *cgrp, const char *path, int val), TP_ARGS(cgrp, path, val) ); DEFINE_EVENT(cgroup_event, cgroup_notify_frozen, TP_PROTO(struct cgroup *cgrp, const char *path, int val), TP_ARGS(cgrp, path, val) ); DECLARE_EVENT_CLASS(cgroup_rstat, TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), TP_ARGS(cgrp, cpu, contended), TP_STRUCT__entry( __field( int, root ) __field( int, level ) __field( u64, id ) __field( int, cpu ) __field( bool, contended ) ), TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; __entry->cpu = cpu; __entry->contended = contended; ), TP_printk("root=%d id=%llu level=%d cpu=%d lock contended:%d", __entry->root, __entry->id, __entry->level, __entry->cpu, __entry->contended) ); /* Related to global: cgroup_rstat_lock */ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended, TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), TP_ARGS(cgrp, cpu, contended) ); DEFINE_EVENT(cgroup_rstat, cgroup_rstat_locked, TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), TP_ARGS(cgrp, cpu, contended) ); DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock, TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), TP_ARGS(cgrp, cpu, contended) ); /* Related to per CPU: cgroup_rstat_cpu_lock */ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended, TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), TP_ARGS(cgrp, cpu, contended) ); DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended_fastpath, TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), TP_ARGS(cgrp, cpu, contended) ); DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked, TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), TP_ARGS(cgrp, cpu, contended) ); DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_locked_fastpath, TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), TP_ARGS(cgrp, cpu, contended) ); DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock, TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), TP_ARGS(cgrp, cpu, contended) ); DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_unlock_fastpath, TP_PROTO(struct cgroup *cgrp, int cpu, bool contended), TP_ARGS(cgrp, cpu, contended) ); #endif /* _TRACE_CGROUP_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
54 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * NET An implementation of the SOCKET network access protocol. * This is the master header file for the Linux NET layer, * or, in plain English: the networking handling part of the * kernel. * * Version: @(#)net.h 1.0.3 05/25/93 * * Authors: Orest Zborowski, <obz@Kodak.COM> * Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_NET_H #define _LINUX_NET_H #include <linux/stringify.h> #include <linux/random.h> #include <linux/wait.h> #include <linux/fcntl.h> /* For O_CLOEXEC and O_NONBLOCK */ #include <linux/rcupdate.h> #include <linux/once.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/sockptr.h> #include <uapi/linux/net.h> struct poll_table_struct; struct pipe_inode_info; struct inode; struct file; struct net; /* Historically, SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA were located * in sock->flags, but moved into sk->sk_wq->flags to be RCU protected. * Eventually all flags will be in sk->sk_wq->flags. */ #define SOCKWQ_ASYNC_NOSPACE 0 #define SOCKWQ_ASYNC_WAITDATA 1 #define SOCK_NOSPACE 2 #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 #define SOCK_SUPPORT_ZC 5 #define SOCK_CUSTOM_SOCKOPT 6 #define SOCK_PASSPIDFD 7 #ifndef ARCH_HAS_SOCKET_TYPES /** * enum sock_type - Socket types * @SOCK_STREAM: stream (connection) socket * @SOCK_DGRAM: datagram (conn.less) socket * @SOCK_RAW: raw socket * @SOCK_RDM: reliably-delivered message * @SOCK_SEQPACKET: sequential packet socket * @SOCK_DCCP: Datagram Congestion Control Protocol socket * @SOCK_PACKET: linux specific way of getting packets at the dev level. * For writing rarp and other similar things on the user level. * * When adding some new socket type please * grep ARCH_HAS_SOCKET_TYPE include/asm-* /socket.h, at least MIPS * overrides this enum for binary compat reasons. */ enum sock_type { SOCK_STREAM = 1, SOCK_DGRAM = 2, SOCK_RAW = 3, SOCK_RDM = 4, SOCK_SEQPACKET = 5, SOCK_DCCP = 6, SOCK_PACKET = 10, }; #define SOCK_MAX (SOCK_PACKET + 1) /* Mask which covers at least up to SOCK_MASK-1. The * remaining bits are used as flags. */ #define SOCK_TYPE_MASK 0xf /* Flags for socket, socketpair, accept4 */ #define SOCK_CLOEXEC O_CLOEXEC #ifndef SOCK_NONBLOCK #define SOCK_NONBLOCK O_NONBLOCK #endif #endif /* ARCH_HAS_SOCKET_TYPES */ /** * enum sock_shutdown_cmd - Shutdown types * @SHUT_RD: shutdown receptions * @SHUT_WR: shutdown transmissions * @SHUT_RDWR: shutdown receptions/transmissions */ enum sock_shutdown_cmd { SHUT_RD, SHUT_WR, SHUT_RDWR, }; struct socket_wq { /* Note: wait MUST be first field of socket_wq */ wait_queue_head_t wait; struct fasync_struct *fasync_list; unsigned long flags; /* %SOCKWQ_ASYNC_NOSPACE, etc */ struct rcu_head rcu; } ____cacheline_aligned_in_smp; /** * struct socket - general BSD socket * @state: socket state (%SS_CONNECTED, etc) * @type: socket type (%SOCK_STREAM, etc) * @flags: socket flags (%SOCK_NOSPACE, etc) * @ops: protocol specific socket operations * @file: File back pointer for gc * @sk: internal networking protocol agnostic socket representation * @wq: wait queue for several uses */ struct socket { socket_state state; short type; unsigned long flags; struct file *file; struct sock *sk; const struct proto_ops *ops; /* Might change with IPV6_ADDRFORM or MPTCP. */ struct socket_wq wq; }; /* * "descriptor" for what we're up to with a read. * This allows us to use the same read code yet * have multiple different users of the data that * we read from a file. * * The simplest case just copies the data to user * mode. */ typedef struct { size_t written; size_t count; union { char __user *buf; void *data; } arg; int error; } read_descriptor_t; struct vm_area_struct; struct page; struct sockaddr; struct msghdr; struct module; struct sk_buff; struct proto_accept_arg; typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, unsigned int, size_t); typedef int (*skb_read_actor_t)(struct sock *, struct sk_buff *); struct proto_ops { int family; struct module *owner; int (*release) (struct socket *sock); int (*bind) (struct socket *sock, struct sockaddr *myaddr, int sockaddr_len); int (*connect) (struct socket *sock, struct sockaddr *vaddr, int sockaddr_len, int flags); int (*socketpair)(struct socket *sock1, struct socket *sock2); int (*accept) (struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg); int (*getname) (struct socket *sock, struct sockaddr *addr, int peer); __poll_t (*poll) (struct file *file, struct socket *sock, struct poll_table_struct *wait); int (*ioctl) (struct socket *sock, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT int (*compat_ioctl) (struct socket *sock, unsigned int cmd, unsigned long arg); #endif int (*gettstamp) (struct socket *sock, void __user *userstamp, bool timeval, bool time32); int (*listen) (struct socket *sock, int len); int (*shutdown) (struct socket *sock, int flags); int (*setsockopt)(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt)(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); void (*show_fdinfo)(struct seq_file *m, struct socket *sock); int (*sendmsg) (struct socket *sock, struct msghdr *m, size_t total_len); /* Notes for implementing recvmsg: * =============================== * msg->msg_namelen should get updated by the recvmsg handlers * iff msg_name != NULL. It is by default 0 to prevent * returning uninitialized memory to user space. The recvfrom * handlers can assume that msg.msg_name is either NULL or has * a minimum size of sizeof(struct sockaddr_storage). */ int (*recvmsg) (struct socket *sock, struct msghdr *m, size_t total_len, int flags); int (*mmap) (struct file *file, struct socket *sock, struct vm_area_struct * vma); ssize_t (*splice_read)(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); void (*splice_eof)(struct socket *sock); int (*set_peek_off)(struct sock *sk, int val); int (*peek_len)(struct socket *sock); /* The following functions are called internally by kernel with * sock lock already held. */ int (*read_sock)(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); /* This is different from read_sock(), it reads an entire skb at a time. */ int (*read_skb)(struct sock *sk, skb_read_actor_t recv_actor); int (*sendmsg_locked)(struct sock *sk, struct msghdr *msg, size_t size); int (*set_rcvlowat)(struct sock *sk, int val); }; #define DECLARE_SOCKADDR(type, dst, src) \ type dst = ({ __sockaddr_check_size(sizeof(*dst)); (type) src; }) struct net_proto_family { int family; int (*create)(struct net *net, struct socket *sock, int protocol, int kern); struct module *owner; }; struct iovec; struct kvec; enum { SOCK_WAKE_IO, SOCK_WAKE_WAITD, SOCK_WAKE_SPACE, SOCK_WAKE_URG, }; int sock_wake_async(struct socket_wq *sk_wq, int how, int band); int sock_register(const struct net_proto_family *fam); void sock_unregister(int family); bool sock_is_registered(int family); int __sock_create(struct net *net, int family, int type, int proto, struct socket **res, int kern); int sock_create(int family, int type, int proto, struct socket **res); int sock_create_kern(struct net *net, int family, int type, int proto, struct socket **res); int sock_create_lite(int family, int type, int proto, struct socket **res); struct socket *sock_alloc(void); void sock_release(struct socket *sock); int sock_sendmsg(struct socket *sock, struct msghdr *msg); int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags); struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname); struct socket *sockfd_lookup(int fd, int *err); struct socket *sock_from_file(struct file *file); #define sockfd_put(sock) fput(sock->file) int net_ratelimit(void); #define net_ratelimited_function(function, ...) \ do { \ if (net_ratelimit()) \ function(__VA_ARGS__); \ } while (0) #define net_emerg_ratelimited(fmt, ...) \ net_ratelimited_function(pr_emerg, fmt, ##__VA_ARGS__) #define net_alert_ratelimited(fmt, ...) \ net_ratelimited_function(pr_alert, fmt, ##__VA_ARGS__) #define net_crit_ratelimited(fmt, ...) \ net_ratelimited_function(pr_crit, fmt, ##__VA_ARGS__) #define net_err_ratelimited(fmt, ...) \ net_ratelimited_function(pr_err, fmt, ##__VA_ARGS__) #define net_notice_ratelimited(fmt, ...) \ net_ratelimited_function(pr_notice, fmt, ##__VA_ARGS__) #define net_warn_ratelimited(fmt, ...) \ net_ratelimited_function(pr_warn, fmt, ##__VA_ARGS__) #define net_info_ratelimited(fmt, ...) \ net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__) #if defined(CONFIG_DYNAMIC_DEBUG) || \ (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define net_dbg_ratelimited(fmt, ...) \ do { \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ if (DYNAMIC_DEBUG_BRANCH(descriptor) && \ net_ratelimit()) \ __dynamic_pr_debug(&descriptor, pr_fmt(fmt), \ ##__VA_ARGS__); \ } while (0) #elif defined(DEBUG) #define net_dbg_ratelimited(fmt, ...) \ net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__) #else #define net_dbg_ratelimited(fmt, ...) \ no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif #define net_get_random_once(buf, nbytes) \ get_random_once((buf), (nbytes)) /* * E.g. XFS meta- & log-data is in slab pages, or bcache meta * data pages, or other high order pages allocated by * __get_free_pages() without __GFP_COMP, which have a page_count * of 0 and/or have PageSlab() set. We cannot use send_page for * those, as that does get_page(); put_page(); and would cause * either a VM_BUG directly, or __page_cache_release a page that * would actually still be referenced by someone, leading to some * obscure delayed Oops somewhere else. */ static inline bool sendpage_ok(struct page *page) { return !PageSlab(page) && page_count(page) >= 1; } int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len); int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, struct kvec *vec, size_t num, size_t len); int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t len, int flags); int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen); int kernel_listen(struct socket *sock, int backlog); int kernel_accept(struct socket *sock, struct socket **newsock, int flags); int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags); int kernel_getsockname(struct socket *sock, struct sockaddr *addr); int kernel_getpeername(struct socket *sock, struct sockaddr *addr); int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how); /* Routine returns the IP overhead imposed by a (caller-protected) socket. */ u32 kernel_sock_ip_overhead(struct sock *sk); #define MODULE_ALIAS_NETPROTO(proto) \ MODULE_ALIAS("net-pf-" __stringify(proto)) #define MODULE_ALIAS_NET_PF_PROTO(pf, proto) \ MODULE_ALIAS("net-pf-" __stringify(pf) "-proto-" __stringify(proto)) #define MODULE_ALIAS_NET_PF_PROTO_TYPE(pf, proto, type) \ MODULE_ALIAS("net-pf-" __stringify(pf) "-proto-" __stringify(proto) \ "-type-" __stringify(type)) #define MODULE_ALIAS_NET_PF_PROTO_NAME(pf, proto, name) \ MODULE_ALIAS("net-pf-" __stringify(pf) "-proto-" __stringify(proto) \ name) #endif /* _LINUX_NET_H */ |
14 11 1 1 14 1 1 1 27 27 7 7 1 10 9 2 5 8 3 13 11 2 10 8 5 2 8 8 3 13 11 2 13 13 21 21 1 20 14 14 14 14 16 13 3 16 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 | // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/string.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/siphash.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/red.h> /* Stochastic Fairness Queuing algorithm. ======================================= Source: Paul E. McKenney "Stochastic Fairness Queuing", IEEE INFOCOMM'90 Proceedings, San Francisco, 1990. Paul E. McKenney "Stochastic Fairness Queuing", "Interworking: Research and Experience", v.2, 1991, p.113-131. See also: M. Shreedhar and George Varghese "Efficient Fair Queuing using Deficit Round Robin", Proc. SIGCOMM 95. This is not the thing that is usually called (W)FQ nowadays. It does not use any timestamp mechanism, but instead processes queues in round-robin order. ADVANTAGE: - It is very cheap. Both CPU and memory requirements are minimal. DRAWBACKS: - "Stochastic" -> It is not 100% fair. When hash collisions occur, several flows are considered as one. - "Round-robin" -> It introduces larger delays than virtual clock based schemes, and should not be used for isolating interactive traffic from non-interactive. It means, that this scheduler should be used as leaf of CBQ or P3, which put interactive traffic to higher priority band. We still need true WFQ for top level CSZ, but using WFQ for the best effort traffic is absolutely pointless: SFQ is superior for this purpose. IMPLEMENTATION: This implementation limits : - maximal queue length per flow to 127 packets. - max mtu to 2^18-1; - max 65408 flows, - number of hash buckets to 65536. It is easy to increase these values, but not in flight. */ #define SFQ_MAX_DEPTH 127 /* max number of packets per flow */ #define SFQ_DEFAULT_FLOWS 128 #define SFQ_MAX_FLOWS (0x10000 - SFQ_MAX_DEPTH - 1) /* max number of flows */ #define SFQ_EMPTY_SLOT 0xffff #define SFQ_DEFAULT_HASH_DIVISOR 1024 /* We use 16 bits to store allot, and want to handle packets up to 64K * Scale allot by 8 (1<<3) so that no overflow occurs. */ #define SFQ_ALLOT_SHIFT 3 #define SFQ_ALLOT_SIZE(X) DIV_ROUND_UP(X, 1 << SFQ_ALLOT_SHIFT) /* This type should contain at least SFQ_MAX_DEPTH + 1 + SFQ_MAX_FLOWS values */ typedef u16 sfq_index; /* * We dont use pointers to save space. * Small indexes [0 ... SFQ_MAX_FLOWS - 1] are 'pointers' to slots[] array * while following values [SFQ_MAX_FLOWS ... SFQ_MAX_FLOWS + SFQ_MAX_DEPTH] * are 'pointers' to dep[] array */ struct sfq_head { sfq_index next; sfq_index prev; }; struct sfq_slot { struct sk_buff *skblist_next; struct sk_buff *skblist_prev; sfq_index qlen; /* number of skbs in skblist */ sfq_index next; /* next slot in sfq RR chain */ struct sfq_head dep; /* anchor in dep[] chains */ unsigned short hash; /* hash value (index in ht[]) */ short allot; /* credit for this slot */ unsigned int backlog; struct red_vars vars; }; struct sfq_sched_data { /* frequently used fields */ int limit; /* limit of total number of packets in this qdisc */ unsigned int divisor; /* number of slots in hash table */ u8 headdrop; u8 maxdepth; /* limit of packets per flow */ siphash_key_t perturbation; u8 cur_depth; /* depth of longest slot */ u8 flags; unsigned short scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */ struct tcf_proto __rcu *filter_list; struct tcf_block *block; sfq_index *ht; /* Hash table ('divisor' slots) */ struct sfq_slot *slots; /* Flows table ('maxflows' entries) */ struct red_parms *red_parms; struct tc_sfqred_stats stats; struct sfq_slot *tail; /* current slot in round */ struct sfq_head dep[SFQ_MAX_DEPTH + 1]; /* Linked lists of slots, indexed by depth * dep[0] : list of unused flows * dep[1] : list of flows with 1 packet * dep[X] : list of flows with X packets */ unsigned int maxflows; /* number of flows in flows array */ int perturb_period; unsigned int quantum; /* Allotment per round: MUST BE >= MTU */ struct timer_list perturb_timer; struct Qdisc *sch; }; /* * sfq_head are either in a sfq_slot or in dep[] array */ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index val) { if (val < SFQ_MAX_FLOWS) return &q->slots[val].dep; return &q->dep[val - SFQ_MAX_FLOWS]; } static unsigned int sfq_hash(const struct sfq_sched_data *q, const struct sk_buff *skb) { return skb_get_hash_perturb(skb, &q->perturbation) & (q->divisor - 1); } static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) { struct sfq_sched_data *q = qdisc_priv(sch); struct tcf_result res; struct tcf_proto *fl; int result; if (TC_H_MAJ(skb->priority) == sch->handle && TC_H_MIN(skb->priority) > 0 && TC_H_MIN(skb->priority) <= q->divisor) return TC_H_MIN(skb->priority); fl = rcu_dereference_bh(q->filter_list); if (!fl) return sfq_hash(q, skb) + 1; *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; result = tcf_classify(skb, NULL, fl, &res, false); if (result >= 0) { #ifdef CONFIG_NET_CLS_ACT switch (result) { case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; fallthrough; case TC_ACT_SHOT: return 0; } #endif if (TC_H_MIN(res.classid) <= q->divisor) return TC_H_MIN(res.classid); } return 0; } /* * x : slot number [0 .. SFQ_MAX_FLOWS - 1] */ static inline void sfq_link(struct sfq_sched_data *q, sfq_index x) { sfq_index p, n; struct sfq_slot *slot = &q->slots[x]; int qlen = slot->qlen; p = qlen + SFQ_MAX_FLOWS; n = q->dep[qlen].next; slot->dep.next = n; slot->dep.prev = p; q->dep[qlen].next = x; /* sfq_dep_head(q, p)->next = x */ sfq_dep_head(q, n)->prev = x; } #define sfq_unlink(q, x, n, p) \ do { \ n = q->slots[x].dep.next; \ p = q->slots[x].dep.prev; \ sfq_dep_head(q, p)->next = n; \ sfq_dep_head(q, n)->prev = p; \ } while (0) static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x) { sfq_index p, n; int d; sfq_unlink(q, x, n, p); d = q->slots[x].qlen--; if (n == p && q->cur_depth == d) q->cur_depth--; sfq_link(q, x); } static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x) { sfq_index p, n; int d; sfq_unlink(q, x, n, p); d = ++q->slots[x].qlen; if (q->cur_depth < d) q->cur_depth = d; sfq_link(q, x); } /* helper functions : might be changed when/if skb use a standard list_head */ /* remove one skb from tail of slot queue */ static inline struct sk_buff *slot_dequeue_tail(struct sfq_slot *slot) { struct sk_buff *skb = slot->skblist_prev; slot->skblist_prev = skb->prev; skb->prev->next = (struct sk_buff *)slot; skb->next = skb->prev = NULL; return skb; } /* remove one skb from head of slot queue */ static inline struct sk_buff *slot_dequeue_head(struct sfq_slot *slot) { struct sk_buff *skb = slot->skblist_next; slot->skblist_next = skb->next; skb->next->prev = (struct sk_buff *)slot; skb->next = skb->prev = NULL; return skb; } static inline void slot_queue_init(struct sfq_slot *slot) { memset(slot, 0, sizeof(*slot)); slot->skblist_prev = slot->skblist_next = (struct sk_buff *)slot; } /* add skb to slot queue (tail add) */ static inline void slot_queue_add(struct sfq_slot *slot, struct sk_buff *skb) { skb->prev = slot->skblist_prev; skb->next = (struct sk_buff *)slot; slot->skblist_prev->next = skb; slot->skblist_prev = skb; } static unsigned int sfq_drop(struct Qdisc *sch, struct sk_buff **to_free) { struct sfq_sched_data *q = qdisc_priv(sch); sfq_index x, d = q->cur_depth; struct sk_buff *skb; unsigned int len; struct sfq_slot *slot; /* Queue is full! Find the longest slot and drop tail packet from it */ if (d > 1) { x = q->dep[d].next; slot = &q->slots[x]; drop: skb = q->headdrop ? slot_dequeue_head(slot) : slot_dequeue_tail(slot); len = qdisc_pkt_len(skb); slot->backlog -= len; sfq_dec(q, x); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch, to_free); return len; } if (d == 1) { /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */ x = q->tail->next; slot = &q->slots[x]; q->tail->next = slot->next; q->ht[slot->hash] = SFQ_EMPTY_SLOT; goto drop; } return 0; } /* Is ECN parameter configured */ static int sfq_prob_mark(const struct sfq_sched_data *q) { return q->flags & TC_RED_ECN; } /* Should packets over max threshold just be marked */ static int sfq_hard_mark(const struct sfq_sched_data *q) { return (q->flags & (TC_RED_ECN | TC_RED_HARDDROP)) == TC_RED_ECN; } static int sfq_headdrop(const struct sfq_sched_data *q) { return q->headdrop; } static int sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct sfq_sched_data *q = qdisc_priv(sch); unsigned int hash, dropped; sfq_index x, qlen; struct sfq_slot *slot; int ret; struct sk_buff *head; int delta; hash = sfq_classify(skb, sch, &ret); if (hash == 0) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return ret; } hash--; x = q->ht[hash]; slot = &q->slots[x]; if (x == SFQ_EMPTY_SLOT) { x = q->dep[0].next; /* get a free slot */ if (x >= SFQ_MAX_FLOWS) return qdisc_drop(skb, sch, to_free); q->ht[hash] = x; slot = &q->slots[x]; slot->hash = hash; slot->backlog = 0; /* should already be 0 anyway... */ red_set_vars(&slot->vars); goto enqueue; } if (q->red_parms) { slot->vars.qavg = red_calc_qavg_no_idle_time(q->red_parms, &slot->vars, slot->backlog); switch (red_action(q->red_parms, &slot->vars, slot->vars.qavg)) { case RED_DONT_MARK: break; case RED_PROB_MARK: qdisc_qstats_overlimit(sch); if (sfq_prob_mark(q)) { /* We know we have at least one packet in queue */ if (sfq_headdrop(q) && INET_ECN_set_ce(slot->skblist_next)) { q->stats.prob_mark_head++; break; } if (INET_ECN_set_ce(skb)) { q->stats.prob_mark++; break; } } q->stats.prob_drop++; goto congestion_drop; case RED_HARD_MARK: qdisc_qstats_overlimit(sch); if (sfq_hard_mark(q)) { /* We know we have at least one packet in queue */ if (sfq_headdrop(q) && INET_ECN_set_ce(slot->skblist_next)) { q->stats.forced_mark_head++; break; } if (INET_ECN_set_ce(skb)) { q->stats.forced_mark++; break; } } q->stats.forced_drop++; goto congestion_drop; } } if (slot->qlen >= q->maxdepth) { congestion_drop: if (!sfq_headdrop(q)) return qdisc_drop(skb, sch, to_free); /* We know we have at least one packet in queue */ head = slot_dequeue_head(slot); delta = qdisc_pkt_len(head) - qdisc_pkt_len(skb); sch->qstats.backlog -= delta; slot->backlog -= delta; qdisc_drop(head, sch, to_free); slot_queue_add(slot, skb); qdisc_tree_reduce_backlog(sch, 0, delta); return NET_XMIT_CN; } enqueue: qdisc_qstats_backlog_inc(sch, skb); slot->backlog += qdisc_pkt_len(skb); slot_queue_add(slot, skb); sfq_inc(q, x); if (slot->qlen == 1) { /* The flow is new */ if (q->tail == NULL) { /* It is the first flow */ slot->next = x; } else { slot->next = q->tail->next; q->tail->next = x; } /* We put this flow at the end of our flow list. * This might sound unfair for a new flow to wait after old ones, * but we could endup servicing new flows only, and freeze old ones. */ q->tail = slot; /* We could use a bigger initial quantum for new flows */ slot->allot = q->scaled_quantum; } if (++sch->q.qlen <= q->limit) return NET_XMIT_SUCCESS; qlen = slot->qlen; dropped = sfq_drop(sch, to_free); /* Return Congestion Notification only if we dropped a packet * from this flow. */ if (qlen != slot->qlen) { qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb)); return NET_XMIT_CN; } /* As we dropped a packet, better let upper stack know this */ qdisc_tree_reduce_backlog(sch, 1, dropped); return NET_XMIT_SUCCESS; } static struct sk_buff * sfq_dequeue(struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; sfq_index a, next_a; struct sfq_slot *slot; /* No active slots */ if (q->tail == NULL) return NULL; next_slot: a = q->tail->next; slot = &q->slots[a]; if (slot->allot <= 0) { q->tail = slot; slot->allot += q->scaled_quantum; goto next_slot; } skb = slot_dequeue_head(slot); sfq_dec(q, a); qdisc_bstats_update(sch, skb); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); slot->backlog -= qdisc_pkt_len(skb); /* Is the slot empty? */ if (slot->qlen == 0) { q->ht[slot->hash] = SFQ_EMPTY_SLOT; next_a = slot->next; if (a == next_a) { q->tail = NULL; /* no more active slots */ return skb; } q->tail->next = next_a; } else { slot->allot -= SFQ_ALLOT_SIZE(qdisc_pkt_len(skb)); } return skb; } static void sfq_reset(struct Qdisc *sch) { struct sk_buff *skb; while ((skb = sfq_dequeue(sch)) != NULL) rtnl_kfree_skbs(skb, skb); } /* * When q->perturbation is changed, we rehash all queued skbs * to avoid OOO (Out Of Order) effects. * We dont use sfq_dequeue()/sfq_enqueue() because we dont want to change * counters. */ static void sfq_rehash(struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; int i; struct sfq_slot *slot; struct sk_buff_head list; int dropped = 0; unsigned int drop_len = 0; __skb_queue_head_init(&list); for (i = 0; i < q->maxflows; i++) { slot = &q->slots[i]; if (!slot->qlen) continue; while (slot->qlen) { skb = slot_dequeue_head(slot); sfq_dec(q, i); __skb_queue_tail(&list, skb); } slot->backlog = 0; red_set_vars(&slot->vars); q->ht[slot->hash] = SFQ_EMPTY_SLOT; } q->tail = NULL; while ((skb = __skb_dequeue(&list)) != NULL) { unsigned int hash = sfq_hash(q, skb); sfq_index x = q->ht[hash]; slot = &q->slots[x]; if (x == SFQ_EMPTY_SLOT) { x = q->dep[0].next; /* get a free slot */ if (x >= SFQ_MAX_FLOWS) { drop: qdisc_qstats_backlog_dec(sch, skb); drop_len += qdisc_pkt_len(skb); kfree_skb(skb); dropped++; continue; } q->ht[hash] = x; slot = &q->slots[x]; slot->hash = hash; } if (slot->qlen >= q->maxdepth) goto drop; slot_queue_add(slot, skb); if (q->red_parms) slot->vars.qavg = red_calc_qavg(q->red_parms, &slot->vars, slot->backlog); slot->backlog += qdisc_pkt_len(skb); sfq_inc(q, x); if (slot->qlen == 1) { /* The flow is new */ if (q->tail == NULL) { /* It is the first flow */ slot->next = x; } else { slot->next = q->tail->next; q->tail->next = x; } q->tail = slot; slot->allot = q->scaled_quantum; } } sch->q.qlen -= dropped; qdisc_tree_reduce_backlog(sch, dropped, drop_len); } static void sfq_perturbation(struct timer_list *t) { struct sfq_sched_data *q = from_timer(q, t, perturb_timer); struct Qdisc *sch = q->sch; spinlock_t *root_lock; siphash_key_t nkey; int period; get_random_bytes(&nkey, sizeof(nkey)); rcu_read_lock(); root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); q->perturbation = nkey; if (!q->filter_list && q->tail) sfq_rehash(sch); spin_unlock(root_lock); /* q->perturb_period can change under us from * sfq_change() and sfq_destroy(). */ period = READ_ONCE(q->perturb_period); if (period) mod_timer(&q->perturb_timer, jiffies + period); rcu_read_unlock(); } static int sfq_change(struct Qdisc *sch, struct nlattr *opt) { struct sfq_sched_data *q = qdisc_priv(sch); struct tc_sfq_qopt *ctl = nla_data(opt); struct tc_sfq_qopt_v1 *ctl_v1 = NULL; unsigned int qlen, dropped = 0; struct red_parms *p = NULL; struct sk_buff *to_free = NULL; struct sk_buff *tail = NULL; if (opt->nla_len < nla_attr_size(sizeof(*ctl))) return -EINVAL; if (opt->nla_len >= nla_attr_size(sizeof(*ctl_v1))) ctl_v1 = nla_data(opt); if (ctl->divisor && (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)) return -EINVAL; /* slot->allot is a short, make sure quantum is not too big. */ if (ctl->quantum) { unsigned int scaled = SFQ_ALLOT_SIZE(ctl->quantum); if (scaled <= 0 || scaled > SHRT_MAX) return -EINVAL; } if (ctl_v1 && !red_check_params(ctl_v1->qth_min, ctl_v1->qth_max, ctl_v1->Wlog, ctl_v1->Scell_log, NULL)) return -EINVAL; if (ctl_v1 && ctl_v1->qth_min) { p = kmalloc(sizeof(*p), GFP_KERNEL); if (!p) return -ENOMEM; } sch_tree_lock(sch); if (ctl->quantum) { q->quantum = ctl->quantum; q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); } WRITE_ONCE(q->perturb_period, ctl->perturb_period * HZ); if (ctl->flows) q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS); if (ctl->divisor) { q->divisor = ctl->divisor; q->maxflows = min_t(u32, q->maxflows, q->divisor); } if (ctl_v1) { if (ctl_v1->depth) q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH); if (p) { swap(q->red_parms, p); red_set_parms(q->red_parms, ctl_v1->qth_min, ctl_v1->qth_max, ctl_v1->Wlog, ctl_v1->Plog, ctl_v1->Scell_log, NULL, ctl_v1->max_P); } q->flags = ctl_v1->flags; q->headdrop = ctl_v1->headdrop; } if (ctl->limit) { q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows); q->maxflows = min_t(u32, q->maxflows, q->limit); } qlen = sch->q.qlen; while (sch->q.qlen > q->limit) { dropped += sfq_drop(sch, &to_free); if (!tail) tail = to_free; } rtnl_kfree_skbs(to_free, tail); qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); del_timer(&q->perturb_timer); if (q->perturb_period) { mod_timer(&q->perturb_timer, jiffies + q->perturb_period); get_random_bytes(&q->perturbation, sizeof(q->perturbation)); } sch_tree_unlock(sch); kfree(p); return 0; } static void *sfq_alloc(size_t sz) { return kvmalloc(sz, GFP_KERNEL); } static void sfq_free(void *addr) { kvfree(addr); } static void sfq_destroy(struct Qdisc *sch) { struct sfq_sched_data *q = qdisc_priv(sch); tcf_block_put(q->block); WRITE_ONCE(q->perturb_period, 0); del_timer_sync(&q->perturb_timer); sfq_free(q->ht); sfq_free(q->slots); kfree(q->red_parms); } static int sfq_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct sfq_sched_data *q = qdisc_priv(sch); int i; int err; q->sch = sch; timer_setup(&q->perturb_timer, sfq_perturbation, TIMER_DEFERRABLE); err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) return err; for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) { q->dep[i].next = i + SFQ_MAX_FLOWS; q->dep[i].prev = i + SFQ_MAX_FLOWS; } q->limit = SFQ_MAX_DEPTH; q->maxdepth = SFQ_MAX_DEPTH; q->cur_depth = 0; q->tail = NULL; q->divisor = SFQ_DEFAULT_HASH_DIVISOR; q->maxflows = SFQ_DEFAULT_FLOWS; q->quantum = psched_mtu(qdisc_dev(sch)); q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum); q->perturb_period = 0; get_random_bytes(&q->perturbation, sizeof(q->perturbation)); if (opt) { int err = sfq_change(sch, opt); if (err) return err; } q->ht = sfq_alloc(sizeof(q->ht[0]) * q->divisor); q->slots = sfq_alloc(sizeof(q->slots[0]) * q->maxflows); if (!q->ht || !q->slots) { /* Note: sfq_destroy() will be called by our caller */ return -ENOMEM; } for (i = 0; i < q->divisor; i++) q->ht[i] = SFQ_EMPTY_SLOT; for (i = 0; i < q->maxflows; i++) { slot_queue_init(&q->slots[i]); sfq_link(q, i); } if (q->limit >= 1) sch->flags |= TCQ_F_CAN_BYPASS; else sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; } static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct sfq_sched_data *q = qdisc_priv(sch); unsigned char *b = skb_tail_pointer(skb); struct tc_sfq_qopt_v1 opt; struct red_parms *p = q->red_parms; memset(&opt, 0, sizeof(opt)); opt.v0.quantum = q->quantum; opt.v0.perturb_period = q->perturb_period / HZ; opt.v0.limit = q->limit; opt.v0.divisor = q->divisor; opt.v0.flows = q->maxflows; opt.depth = q->maxdepth; opt.headdrop = q->headdrop; if (p) { opt.qth_min = p->qth_min >> p->Wlog; opt.qth_max = p->qth_max >> p->Wlog; opt.Wlog = p->Wlog; opt.Plog = p->Plog; opt.Scell_log = p->Scell_log; opt.max_P = p->max_P; } memcpy(&opt.stats, &q->stats, sizeof(opt.stats)); opt.flags = q->flags; if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt)) goto nla_put_failure; return skb->len; nla_put_failure: nlmsg_trim(skb, b); return -1; } static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg) { return NULL; } static unsigned long sfq_find(struct Qdisc *sch, u32 classid) { return 0; } static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent, u32 classid) { return 0; } static void sfq_unbind(struct Qdisc *q, unsigned long cl) { } static struct tcf_block *sfq_tcf_block(struct Qdisc *sch, unsigned long cl, struct netlink_ext_ack *extack) { struct sfq_sched_data *q = qdisc_priv(sch); if (cl) return NULL; return q->block; } static int sfq_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { tcm->tcm_handle |= TC_H_MIN(cl); return 0; } static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct sfq_sched_data *q = qdisc_priv(sch); sfq_index idx = q->ht[cl - 1]; struct gnet_stats_queue qs = { 0 }; struct tc_sfq_xstats xstats = { 0 }; if (idx != SFQ_EMPTY_SLOT) { const struct sfq_slot *slot = &q->slots[idx]; xstats.allot = slot->allot << SFQ_ALLOT_SHIFT; qs.qlen = slot->qlen; qs.backlog = slot->backlog; } if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0) return -1; return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); } static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct sfq_sched_data *q = qdisc_priv(sch); unsigned int i; if (arg->stop) return; for (i = 0; i < q->divisor; i++) { if (q->ht[i] == SFQ_EMPTY_SLOT) { arg->count++; continue; } if (!tc_qdisc_stats_dump(sch, i + 1, arg)) break; } } static const struct Qdisc_class_ops sfq_class_ops = { .leaf = sfq_leaf, .find = sfq_find, .tcf_block = sfq_tcf_block, .bind_tcf = sfq_bind, .unbind_tcf = sfq_unbind, .dump = sfq_dump_class, .dump_stats = sfq_dump_class_stats, .walk = sfq_walk, }; static struct Qdisc_ops sfq_qdisc_ops __read_mostly = { .cl_ops = &sfq_class_ops, .id = "sfq", .priv_size = sizeof(struct sfq_sched_data), .enqueue = sfq_enqueue, .dequeue = sfq_dequeue, .peek = qdisc_peek_dequeued, .init = sfq_init, .reset = sfq_reset, .destroy = sfq_destroy, .change = NULL, .dump = sfq_dump, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("sfq"); static int __init sfq_module_init(void) { return register_qdisc(&sfq_qdisc_ops); } static void __exit sfq_module_exit(void) { unregister_qdisc(&sfq_qdisc_ops); } module_init(sfq_module_init) module_exit(sfq_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Stochastic Fairness qdisc"); |
1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 | // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) 1996 Mike Shaver (shaver@zeroknowledge.com) */ #include <linux/mm.h> #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/spinlock.h> #include <net/ax25.h> static int min_ipdefmode[1], max_ipdefmode[] = {1}; static int min_axdefmode[1], max_axdefmode[] = {1}; static int min_backoff[1], max_backoff[] = {2}; static int min_conmode[1], max_conmode[] = {2}; static int min_window[] = {1}, max_window[] = {7}; static int min_ewindow[] = {1}, max_ewindow[] = {63}; static int min_t1[] = {1}, max_t1[] = {30000}; static int min_t2[] = {1}, max_t2[] = {20000}; static int min_t3[1], max_t3[] = {3600000}; static int min_idle[1], max_idle[] = {65535000}; static int min_n2[] = {1}, max_n2[] = {31}; static int min_paclen[] = {1}, max_paclen[] = {512}; static int min_proto[1], max_proto[] = { AX25_PROTO_MAX }; #ifdef CONFIG_AX25_DAMA_SLAVE static int min_ds_timeout[1], max_ds_timeout[] = {65535000}; #endif static const struct ctl_table ax25_param_table[] = { { .procname = "ip_default_mode", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_ipdefmode, .extra2 = &max_ipdefmode }, { .procname = "ax25_default_mode", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_axdefmode, .extra2 = &max_axdefmode }, { .procname = "backoff_type", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_backoff, .extra2 = &max_backoff }, { .procname = "connect_mode", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_conmode, .extra2 = &max_conmode }, { .procname = "standard_window_size", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_window, .extra2 = &max_window }, { .procname = "extended_window_size", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_ewindow, .extra2 = &max_ewindow }, { .procname = "t1_timeout", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_t1, .extra2 = &max_t1 }, { .procname = "t2_timeout", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_t2, .extra2 = &max_t2 }, { .procname = "t3_timeout", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_t3, .extra2 = &max_t3 }, { .procname = "idle_timeout", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_idle, .extra2 = &max_idle }, { .procname = "maximum_retry_count", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_n2, .extra2 = &max_n2 }, { .procname = "maximum_packet_length", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_paclen, .extra2 = &max_paclen }, { .procname = "protocol", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_proto, .extra2 = &max_proto }, #ifdef CONFIG_AX25_DAMA_SLAVE { .procname = "dama_slave_timeout", .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_ds_timeout, .extra2 = &max_ds_timeout }, #endif }; int ax25_register_dev_sysctl(ax25_dev *ax25_dev) { char path[sizeof("net/ax25/") + IFNAMSIZ]; int k; struct ctl_table *table; table = kmemdup(ax25_param_table, sizeof(ax25_param_table), GFP_KERNEL); if (!table) return -ENOMEM; BUILD_BUG_ON(ARRAY_SIZE(ax25_param_table) != AX25_MAX_VALUES); for (k = 0; k < AX25_MAX_VALUES; k++) table[k].data = &ax25_dev->values[k]; snprintf(path, sizeof(path), "net/ax25/%s", ax25_dev->dev->name); ax25_dev->sysheader = register_net_sysctl_sz(&init_net, path, table, ARRAY_SIZE(ax25_param_table)); if (!ax25_dev->sysheader) { kfree(table); return -ENOMEM; } return 0; } void ax25_unregister_dev_sysctl(ax25_dev *ax25_dev) { struct ctl_table_header *header = ax25_dev->sysheader; const struct ctl_table *table; if (header) { ax25_dev->sysheader = NULL; table = header->ctl_table_arg; unregister_net_sysctl_table(header); kfree(table); } } |
13 12 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 | // SPDX-License-Identifier: GPL-2.0 /* * Verification of builtin signatures * * Copyright 2019 Google LLC */ /* * This file implements verification of fs-verity builtin signatures. Please * take great care before using this feature. It is not the only way to do * signatures with fs-verity, and the alternatives (such as userspace signature * verification, and IMA appraisal) can be much better. For details about the * limitations of this feature, see Documentation/filesystems/fsverity.rst. */ #include "fsverity_private.h" #include <linux/cred.h> #include <linux/key.h> #include <linux/slab.h> #include <linux/verification.h> /* * /proc/sys/fs/verity/require_signatures * If 1, all verity files must have a valid builtin signature. */ int fsverity_require_signatures; /* * Keyring that contains the trusted X.509 certificates. * * Only root (kuid=0) can modify this. Also, root may use * keyctl_restrict_keyring() to prevent any more additions. */ static struct key *fsverity_keyring; /** * fsverity_verify_signature() - check a verity file's signature * @vi: the file's fsverity_info * @signature: the file's built-in signature * @sig_size: size of signature in bytes, or 0 if no signature * * If the file includes a signature of its fs-verity file digest, verify it * against the certificates in the fs-verity keyring. * * Return: 0 on success (signature valid or not required); -errno on failure */ int fsverity_verify_signature(const struct fsverity_info *vi, const u8 *signature, size_t sig_size) { const struct inode *inode = vi->inode; const struct fsverity_hash_alg *hash_alg = vi->tree_params.hash_alg; struct fsverity_formatted_digest *d; int err; if (sig_size == 0) { if (fsverity_require_signatures) { fsverity_err(inode, "require_signatures=1, rejecting unsigned file!"); return -EPERM; } return 0; } if (fsverity_keyring->keys.nr_leaves_on_tree == 0) { /* * The ".fs-verity" keyring is empty, due to builtin signatures * being supported by the kernel but not actually being used. * In this case, verify_pkcs7_signature() would always return an * error, usually ENOKEY. It could also be EBADMSG if the * PKCS#7 is malformed, but that isn't very important to * distinguish. So, just skip to ENOKEY to avoid the attack * surface of the PKCS#7 parser, which would otherwise be * reachable by any task able to execute FS_IOC_ENABLE_VERITY. */ fsverity_err(inode, "fs-verity keyring is empty, rejecting signed file!"); return -ENOKEY; } d = kzalloc(sizeof(*d) + hash_alg->digest_size, GFP_KERNEL); if (!d) return -ENOMEM; memcpy(d->magic, "FSVerity", 8); d->digest_algorithm = cpu_to_le16(hash_alg - fsverity_hash_algs); d->digest_size = cpu_to_le16(hash_alg->digest_size); memcpy(d->digest, vi->file_digest, hash_alg->digest_size); err = verify_pkcs7_signature(d, sizeof(*d) + hash_alg->digest_size, signature, sig_size, fsverity_keyring, VERIFYING_UNSPECIFIED_SIGNATURE, NULL, NULL); kfree(d); if (err) { if (err == -ENOKEY) fsverity_err(inode, "File's signing cert isn't in the fs-verity keyring"); else if (err == -EKEYREJECTED) fsverity_err(inode, "Incorrect file signature"); else if (err == -EBADMSG) fsverity_err(inode, "Malformed file signature"); else fsverity_err(inode, "Error %d verifying file signature", err); return err; } return 0; } void __init fsverity_init_signature(void) { fsverity_keyring = keyring_alloc(".fs-verity", KUIDT_INIT(0), KGIDT_INIT(0), current_cred(), KEY_POS_SEARCH | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_WRITE | KEY_USR_SEARCH | KEY_USR_SETATTR, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(fsverity_keyring)) panic("failed to allocate \".fs-verity\" keyring"); } |
2 2 2 2 2 2 107 45 45 45 2 106 130 104 55 104 36 108 109 18 11 102 32 1 1 1 1 108 3 45 45 95 95 29 72 23 45 16 45 4 42 29 23 41 1 41 118 19 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/readpage.c * * Copyright (C) 2002, Linus Torvalds. * Copyright (C) 2015, Google, Inc. * * This was originally taken from fs/mpage.c * * The ext4_mpage_readpages() function here is intended to * replace mpage_readahead() in the general case, not just for * encrypted files. It has some limitations (see below), where it * will fall back to read_block_full_page(), but these limitations * should only be hit when page_size != block_size. * * This will allow us to attach a callback function to support ext4 * encryption. * * If anything unusual happens, such as: * * - encountering a page which has buffers * - encountering a page which has a non-hole after a hole * - encountering a page with non-contiguous blocks * * then this code just gives up and calls the buffer_head-based read function. * It does handle a page which has holes at the end - that is a common case: * the end-of-file on blocksize < PAGE_SIZE setups. * */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/mm.h> #include <linux/kdev_t.h> #include <linux/gfp.h> #include <linux/bio.h> #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/highmem.h> #include <linux/prefetch.h> #include <linux/mpage.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/pagevec.h> #include "ext4.h" #define NUM_PREALLOC_POST_READ_CTXS 128 static struct kmem_cache *bio_post_read_ctx_cache; static mempool_t *bio_post_read_ctx_pool; /* postprocessing steps for read bios */ enum bio_post_read_step { STEP_INITIAL = 0, STEP_DECRYPT, STEP_VERITY, STEP_MAX, }; struct bio_post_read_ctx { struct bio *bio; struct work_struct work; unsigned int cur_step; unsigned int enabled_steps; }; static void __read_end_io(struct bio *bio) { struct folio_iter fi; bio_for_each_folio_all(fi, bio) folio_end_read(fi.folio, bio->bi_status == 0); if (bio->bi_private) mempool_free(bio->bi_private, bio_post_read_ctx_pool); bio_put(bio); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx); static void decrypt_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; if (fscrypt_decrypt_bio(bio)) bio_post_read_processing(ctx); else __read_end_io(bio); } static void verity_work(struct work_struct *work) { struct bio_post_read_ctx *ctx = container_of(work, struct bio_post_read_ctx, work); struct bio *bio = ctx->bio; /* * fsverity_verify_bio() may call readahead() again, and although verity * will be disabled for that, decryption may still be needed, causing * another bio_post_read_ctx to be allocated. So to guarantee that * mempool_alloc() never deadlocks we must free the current ctx first. * This is safe because verity is the last post-read step. */ BUILD_BUG_ON(STEP_VERITY + 1 != STEP_MAX); mempool_free(ctx, bio_post_read_ctx_pool); bio->bi_private = NULL; fsverity_verify_bio(bio); __read_end_io(bio); } static void bio_post_read_processing(struct bio_post_read_ctx *ctx) { /* * We use different work queues for decryption and for verity because * verity may require reading metadata pages that need decryption, and * we shouldn't recurse to the same workqueue. */ switch (++ctx->cur_step) { case STEP_DECRYPT: if (ctx->enabled_steps & (1 << STEP_DECRYPT)) { INIT_WORK(&ctx->work, decrypt_work); fscrypt_enqueue_decrypt_work(&ctx->work); return; } ctx->cur_step++; fallthrough; case STEP_VERITY: if (ctx->enabled_steps & (1 << STEP_VERITY)) { INIT_WORK(&ctx->work, verity_work); fsverity_enqueue_verify_work(&ctx->work); return; } ctx->cur_step++; fallthrough; default: __read_end_io(ctx->bio); } } static bool bio_post_read_required(struct bio *bio) { return bio->bi_private && !bio->bi_status; } /* * I/O completion handler for multipage BIOs. * * The mpage code never puts partial pages into a BIO (except for end-of-file). * If a page does not map to a contiguous run of blocks then it simply falls * back to block_read_full_folio(). * * Why is this? If a page's completion depends on a number of different BIOs * which can complete in any order (or at the same time) then determining the * status of that page is hard. See end_buffer_async_read() for the details. * There is no point in duplicating all that complexity. */ static void mpage_end_io(struct bio *bio) { if (bio_post_read_required(bio)) { struct bio_post_read_ctx *ctx = bio->bi_private; ctx->cur_step = STEP_INITIAL; bio_post_read_processing(ctx); return; } __read_end_io(bio); } static inline bool ext4_need_verity(const struct inode *inode, pgoff_t idx) { return fsverity_active(inode) && idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE); } static void ext4_set_bio_post_read_ctx(struct bio *bio, const struct inode *inode, pgoff_t first_idx) { unsigned int post_read_steps = 0; if (fscrypt_inode_uses_fs_layer_crypto(inode)) post_read_steps |= 1 << STEP_DECRYPT; if (ext4_need_verity(inode, first_idx)) post_read_steps |= 1 << STEP_VERITY; if (post_read_steps) { /* Due to the mempool, this never fails. */ struct bio_post_read_ctx *ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS); ctx->bio = bio; ctx->enabled_steps = post_read_steps; bio->bi_private = ctx; } } static inline loff_t ext4_readpage_limit(struct inode *inode) { if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode)) return inode->i_sb->s_maxbytes; return i_size_read(inode); } int ext4_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct folio *folio) { struct bio *bio = NULL; sector_t last_block_in_bio = 0; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; sector_t next_block; sector_t block_in_file; sector_t last_block; sector_t last_block_in_file; sector_t blocks[MAX_BUF_PER_PAGE]; unsigned page_block; struct block_device *bdev = inode->i_sb->s_bdev; int length; unsigned relative_block = 0; struct ext4_map_blocks map; unsigned int nr_pages = rac ? readahead_count(rac) : 1; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; for (; nr_pages; nr_pages--) { int fully_mapped = 1; unsigned first_hole = blocks_per_page; if (rac) folio = readahead_folio(rac); prefetchw(&folio->flags); if (folio_buffers(folio)) goto confused; block_in_file = next_block = (sector_t)folio->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; last_block_in_file = (ext4_readpage_limit(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; page_block = 0; /* * Map blocks using the previous result first. */ if ((map.m_flags & EXT4_MAP_MAPPED) && block_in_file > map.m_lblk && block_in_file < (map.m_lblk + map.m_len)) { unsigned map_offset = block_in_file - map.m_lblk; unsigned last = map.m_len - map_offset; for (relative_block = 0; ; relative_block++) { if (relative_block == last) { /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; } if (page_block == blocks_per_page) break; blocks[page_block] = map.m_pblk + map_offset + relative_block; page_block++; block_in_file++; } } /* * Then do more ext4_map_blocks() calls until we are * done with this folio. */ while (page_block < blocks_per_page) { if (block_in_file < last_block) { map.m_lblk = block_in_file; map.m_len = last_block - block_in_file; if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { set_error_page: folio_zero_segment(folio, 0, folio_size(folio)); folio_unlock(folio); goto next_page; } } if ((map.m_flags & EXT4_MAP_MAPPED) == 0) { fully_mapped = 0; if (first_hole == blocks_per_page) first_hole = page_block; page_block++; block_in_file++; continue; } if (first_hole != blocks_per_page) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ if (page_block && blocks[page_block-1] != map.m_pblk-1) goto confused; for (relative_block = 0; ; relative_block++) { if (relative_block == map.m_len) { /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; } else if (page_block == blocks_per_page) break; blocks[page_block] = map.m_pblk+relative_block; page_block++; block_in_file++; } } if (first_hole != blocks_per_page) { folio_zero_segment(folio, first_hole << blkbits, folio_size(folio)); if (first_hole == 0) { if (ext4_need_verity(inode, folio->index) && !fsverity_verify_folio(folio)) goto set_error_page; folio_end_read(folio, true); continue; } } else if (fully_mapped) { folio_set_mappedtodisk(folio); } /* * This folio will go to BIO. Do we need to send this * BIO off first? */ if (bio && (last_block_in_bio != blocks[0] - 1 || !fscrypt_mergeable_bio(bio, inode, next_block))) { submit_and_realloc: submit_bio(bio); bio = NULL; } if (bio == NULL) { /* * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). */ bio = bio_alloc(bdev, bio_max_segs(nr_pages), REQ_OP_READ, GFP_KERNEL); fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); ext4_set_bio_post_read_ctx(bio, inode, folio->index); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; if (rac) bio->bi_opf |= REQ_RAHEAD; } length = first_hole << blkbits; if (!bio_add_folio(bio, folio, length, 0)) goto submit_and_realloc; if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || (first_hole != blocks_per_page)) { submit_bio(bio); bio = NULL; } else last_block_in_bio = blocks[blocks_per_page - 1]; continue; confused: if (bio) { submit_bio(bio); bio = NULL; } if (!folio_test_uptodate(folio)) block_read_full_folio(folio, ext4_get_block); else folio_unlock(folio); next_page: ; /* A label shall be followed by a statement until C23 */ } if (bio) submit_bio(bio); return 0; } int __init ext4_init_post_read_processing(void) { bio_post_read_ctx_cache = KMEM_CACHE(bio_post_read_ctx, SLAB_RECLAIM_ACCOUNT); if (!bio_post_read_ctx_cache) goto fail; bio_post_read_ctx_pool = mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS, bio_post_read_ctx_cache); if (!bio_post_read_ctx_pool) goto fail_free_cache; return 0; fail_free_cache: kmem_cache_destroy(bio_post_read_ctx_cache); fail: return -ENOMEM; } void ext4_exit_post_read_processing(void) { mempool_destroy(bio_post_read_ctx_pool); kmem_cache_destroy(bio_post_read_ctx_cache); } |
11 11 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Driver for Tascam US-X2Y USB soundcards * * FPGA Loader + ALSA Startup * * Copyright (c) 2003 by Karsten Wiese <annabellesgarden@yahoo.de> */ #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/usb.h> #include <sound/core.h> #include <sound/memalloc.h> #include <sound/pcm.h> #include <sound/hwdep.h> #include "usx2y.h" #include "usbusx2y.h" #include "usX2Yhwdep.h" static vm_fault_t snd_us428ctls_vm_fault(struct vm_fault *vmf) { unsigned long offset; struct page *page; void *vaddr; snd_printdd("ENTER, start %lXh, pgoff %ld\n", vmf->vma->vm_start, vmf->pgoff); offset = vmf->pgoff << PAGE_SHIFT; vaddr = (char *)((struct usx2ydev *)vmf->vma->vm_private_data)->us428ctls_sharedmem + offset; page = virt_to_page(vaddr); get_page(page); vmf->page = page; snd_printdd("vaddr=%p made us428ctls_vm_fault() page %p\n", vaddr, page); return 0; } static const struct vm_operations_struct us428ctls_vm_ops = { .fault = snd_us428ctls_vm_fault, }; static int snd_us428ctls_mmap(struct snd_hwdep *hw, struct file *filp, struct vm_area_struct *area) { unsigned long size = (unsigned long)(area->vm_end - area->vm_start); struct usx2ydev *us428 = hw->private_data; // FIXME this hwdep interface is used twice: fpga download and mmap for controlling Lights etc. Maybe better using 2 hwdep devs? // so as long as the device isn't fully initialised yet we return -EBUSY here. if (!(us428->chip_status & USX2Y_STAT_CHIP_INIT)) return -EBUSY; /* if userspace tries to mmap beyond end of our buffer, fail */ if (size > US428_SHAREDMEM_PAGES) { snd_printd("%lu > %lu\n", size, (unsigned long)US428_SHAREDMEM_PAGES); return -EINVAL; } area->vm_ops = &us428ctls_vm_ops; vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP); area->vm_private_data = hw->private_data; return 0; } static __poll_t snd_us428ctls_poll(struct snd_hwdep *hw, struct file *file, poll_table *wait) { __poll_t mask = 0; struct usx2ydev *us428 = hw->private_data; struct us428ctls_sharedmem *shm = us428->us428ctls_sharedmem; if (us428->chip_status & USX2Y_STAT_CHIP_HUP) return EPOLLHUP; poll_wait(file, &us428->us428ctls_wait_queue_head, wait); if (shm && shm->ctl_snapshot_last != shm->ctl_snapshot_red) mask |= EPOLLIN; return mask; } static int snd_usx2y_hwdep_dsp_status(struct snd_hwdep *hw, struct snd_hwdep_dsp_status *info) { static const char * const type_ids[USX2Y_TYPE_NUMS] = { [USX2Y_TYPE_122] = "us122", [USX2Y_TYPE_224] = "us224", [USX2Y_TYPE_428] = "us428", }; struct usx2ydev *us428 = hw->private_data; int id = -1; switch (le16_to_cpu(us428->dev->descriptor.idProduct)) { case USB_ID_US122: id = USX2Y_TYPE_122; break; case USB_ID_US224: id = USX2Y_TYPE_224; break; case USB_ID_US428: id = USX2Y_TYPE_428; break; } if (id < 0) return -ENODEV; strcpy(info->id, type_ids[id]); info->num_dsps = 2; // 0: Prepad Data, 1: FPGA Code if (us428->chip_status & USX2Y_STAT_CHIP_INIT) info->chip_ready = 1; info->version = USX2Y_DRIVER_VERSION; return 0; } static int usx2y_create_usbmidi(struct snd_card *card) { static const struct snd_usb_midi_endpoint_info quirk_data_1 = { .out_ep = 0x06, .in_ep = 0x06, .out_cables = 0x001, .in_cables = 0x001 }; static const struct snd_usb_audio_quirk quirk_1 = { .vendor_name = "TASCAM", .product_name = NAME_ALLCAPS, .ifnum = 0, |