Total coverage: 178059 (15%) of 1222112 lines
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2017 Pablo Neira Ayuso <pablo@netfilter.org>
 */

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>

struct nft_bitmap_elem {
        struct nft_elem_priv priv;
        struct list_head head;
        struct nft_set_ext ext;
};

/* This bitmap uses two bits to represent one element. These two bits determine
 * the element state in the current and the future generation.
 *
 * An element can be in three states. The generation cursor is represented using
 * the ^ character; note that this cursor shifts on every successful transaction.
 * If no transaction is going on, we observe that all elements are in the
 * following state:
 *
 * 11 = this element is active in the current generation. In case of no updates,
 * ^    it stays active in the next generation.
 * 00 = this element is inactive in the current generation. In case of no
 * ^    updates, it stays inactive in the next generation.
 *
 * On transaction handling, we observe these two temporary states:
 *
 * 01 = this element is inactive in the current generation and it becomes active
 * ^    in the next one. This happens when the element is inserted but the commit
 *      path has not been executed yet, so activation is still pending. On
 *      transaction abortion, the element is removed.
 * 10 = this element is active in the current generation and it becomes inactive
 * ^    in the next one. This happens when the element is deactivated but the
 *      commit path has not been executed yet, so removal is still pending. On
 *      transaction abortion, the next generation bit is reset to restore its
 *      previous state.
 */
struct nft_bitmap {
        struct list_head list;
        u16 bitmap_size;
        u8 bitmap[];
};

static inline void nft_bitmap_location(const struct nft_set *set,
                                       const void *key,
                                       u32 *idx, u32 *off)
{
        u32 k;

        if (set->klen == 2)
                k = *(u16 *)key;
        else
                k = *(u8 *)key;
        k <<= 1;

        *idx = k / BITS_PER_BYTE;
        *off = k % BITS_PER_BYTE;
}

/* Fetch the two bits that represent the element and check if it is active based
 * on the generation mask.
 */
static inline bool
nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask)
{
        return (bitmap[idx] & (0x3 << off)) & (genmask << off);
}

INDIRECT_CALLABLE_SCOPE
const struct nft_set_ext *
nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
                  const u32 *key)
{
        const struct nft_bitmap *priv = nft_set_priv(set);
        static const struct nft_set_ext found;
        u8 genmask = nft_genmask_cur(net);
        u32 idx, off;

        nft_bitmap_location(set, key, &idx, &off);

        if (nft_bitmap_active(priv->bitmap, idx, off, genmask))
                return &found;

        return NULL;
}

static struct nft_bitmap_elem *
nft_bitmap_elem_find(const struct net *net,
                     const struct nft_set *set, struct nft_bitmap_elem *this,
                     u8 genmask)
{
        const struct nft_bitmap *priv = nft_set_priv(set);
        struct nft_bitmap_elem *be;

        list_for_each_entry_rcu(be, &priv->list, head,
                                lockdep_is_held(&nft_pernet(net)->commit_mutex)) {
                if (memcmp(nft_set_ext_key(&be->ext),
                           nft_set_ext_key(&this->ext), set->klen) ||
                    !nft_set_elem_active(&be->ext, genmask))
                        continue;

                return be;
        }
        return NULL;
}

static struct nft_elem_priv *
nft_bitmap_get(const struct net *net, const struct nft_set *set,
               const struct nft_set_elem *elem, unsigned int flags)
{
        const struct nft_bitmap *priv = nft_set_priv(set);
        u8 genmask = nft_genmask_cur(net);
        struct nft_bitmap_elem *be;

        list_for_each_entry_rcu(be, &priv->list, head) {
                if (memcmp(nft_set_ext_key(&be->ext), elem->key.val.data, set->klen) ||
                    !nft_set_elem_active(&be->ext, genmask))
                        continue;

                return &be->priv;
        }
        return ERR_PTR(-ENOENT);
}

static int nft_bitmap_insert(const struct net *net, const struct nft_set *set,
                             const struct nft_set_elem *elem,
                             struct nft_elem_priv **elem_priv)
{
        struct nft_bitmap_elem *new = nft_elem_priv_cast(elem->priv), *be;
        struct nft_bitmap *priv = nft_set_priv(set);
        u8 genmask = nft_genmask_next(net);
        u32 idx, off;

        be = nft_bitmap_elem_find(net, set, new, genmask);
        if (be) {
                *elem_priv = &be->priv;
                return -EEXIST;
        }

        nft_bitmap_location(set, nft_set_ext_key(&new->ext), &idx, &off);
        /* Enter 01 state. */
        priv->bitmap[idx] |= (genmask << off);
        list_add_tail_rcu(&new->head, &priv->list);

        return 0;
}

static void nft_bitmap_remove(const struct net *net,
                              const struct nft_set *set,
                              struct nft_elem_priv *elem_priv)
{
        struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv);
        struct nft_bitmap *priv = nft_set_priv(set);
        u8 genmask = nft_genmask_next(net);
        u32 idx, off;

        nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
        /* Enter 00 state. */
        priv->bitmap[idx] &= ~(genmask << off);
        list_del_rcu(&be->head);
}

static void nft_bitmap_activate(const struct net *net,
                                const struct nft_set *set,
                                struct nft_elem_priv *elem_priv)
{
        struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv);
        struct nft_bitmap *priv = nft_set_priv(set);
        u8 genmask = nft_genmask_next(net);
        u32 idx, off;

        nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
        /* Enter 11 state. */
        priv->bitmap[idx] |= (genmask << off);
        nft_clear(net, &be->ext);
}

static void nft_bitmap_flush(const struct net *net,
                             const struct nft_set *set,
                             struct nft_elem_priv *elem_priv)
{
        struct nft_bitmap_elem *be = nft_elem_priv_cast(elem_priv);
        struct nft_bitmap *priv = nft_set_priv(set);
        u8 genmask = nft_genmask_next(net);
        u32 idx, off;

        nft_bitmap_location(set, nft_set_ext_key(&be->ext), &idx, &off);
        /* Enter 10 state, similar to deactivation. */
        priv->bitmap[idx] &= ~(genmask << off);
        nft_set_elem_change_active(net, set, &be->ext);
}

static struct nft_elem_priv *
nft_bitmap_deactivate(const struct net *net, const struct nft_set *set,
                      const struct nft_set_elem *elem)
{
        struct nft_bitmap_elem *this = nft_elem_priv_cast(elem->priv), *be;
        struct nft_bitmap *priv = nft_set_priv(set);
        u8 genmask = nft_genmask_next(net);
        u32 idx, off;

        nft_bitmap_location(set, elem->key.val.data, &idx, &off);

        be = nft_bitmap_elem_find(net, set, this, genmask);
        if (!be)
                return NULL;

        /* Enter 10 state. */
        priv->bitmap[idx] &= ~(genmask << off);
        nft_set_elem_change_active(net, set, &be->ext);

        return &be->priv;
}

static void nft_bitmap_walk(const struct nft_ctx *ctx,
                            struct nft_set *set,
                            struct nft_set_iter *iter)
{
        const struct nft_bitmap *priv = nft_set_priv(set);
        struct nft_bitmap_elem *be;

        list_for_each_entry_rcu(be, &priv->list, head,
                                lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) {
                if (iter->count < iter->skip)
                        goto cont;

                iter->err = iter->fn(ctx, set, iter, &be->priv);
                if (iter->err < 0)
                        return;
cont:
                iter->count++;
        }
}

/* The bitmap size is pow(2, key length in bits) / bits per byte. This is
 * multiplied by two since each element takes two bits. For 8 bit keys, the
 * bitmap consumes 66 bytes. For 16 bit keys, 16388 bytes.
 */
static inline u32 nft_bitmap_size(u32 klen)
{
        return ((2 << ((klen * BITS_PER_BYTE) - 1)) / BITS_PER_BYTE) << 1;
}

static inline u64 nft_bitmap_total_size(u32 klen)
{
        return sizeof(struct nft_bitmap) + nft_bitmap_size(klen);
}

static u64 nft_bitmap_privsize(const struct nlattr * const nla[],
                               const struct nft_set_desc *desc)
{
        u32 klen = ntohl(nla_get_be32(nla[NFTA_SET_KEY_LEN]));

        return nft_bitmap_total_size(klen);
}

static int nft_bitmap_init(const struct nft_set *set,
                           const struct nft_set_desc *desc,
                           const struct nlattr * const nla[])
{
        struct nft_bitmap *priv = nft_set_priv(set);

        BUILD_BUG_ON(offsetof(struct nft_bitmap_elem, priv) != 0);

        INIT_LIST_HEAD(&priv->list);
        priv->bitmap_size = nft_bitmap_size(set->klen);

        return 0;
}

static void nft_bitmap_destroy(const struct nft_ctx *ctx,
                               const struct nft_set *set)
{
        struct nft_bitmap *priv = nft_set_priv(set);
        struct nft_bitmap_elem *be, *n;

        list_for_each_entry_safe(be, n, &priv->list, head)
                nf_tables_set_elem_destroy(ctx, set, &be->priv);
}

static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
                                struct nft_set_estimate *est)
{
        /* Make sure we don't get bitmaps larger than 16 Kbytes. */
        if (desc->klen > 2)
                return false;
        else if (desc->expr)
                return false;

        est->size   = nft_bitmap_total_size(desc->klen);
        est->lookup = NFT_SET_CLASS_O_1;
        est->space  = NFT_SET_CLASS_O_1;

        return true;
}

const struct nft_set_type nft_set_bitmap_type = {
        .ops = {
                .privsize       = nft_bitmap_privsize,
                .elemsize       = offsetof(struct nft_bitmap_elem, ext),
                .estimate       = nft_bitmap_estimate,
                .init           = nft_bitmap_init,
                .destroy        = nft_bitmap_destroy,
                .insert         = nft_bitmap_insert,
                .remove         = nft_bitmap_remove,
                .deactivate     = nft_bitmap_deactivate,
                .flush          = nft_bitmap_flush,
                .activate       = nft_bitmap_activate,
                .lookup         = nft_bitmap_lookup,
                .walk           = nft_bitmap_walk,
                .get            = nft_bitmap_get,
        },
};
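The following is a minimal, self-contained userspace sketch (not kernel code) of the two-bits-per-element scheme described in the header comment above: a key maps to a byte index and bit offset, and a per-generation mask selects which of the element's two bits is tested. The demo_* names are illustrative; the real kernel flips the generation cursor at commit time, while the demo simply sets the now-current bit to reach the 11 state.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define DEMO_BITS_PER_BYTE 8

static void demo_bitmap_location(uint8_t key, uint32_t *idx, uint32_t *off)
{
        uint32_t k = (uint32_t)key << 1;        /* two bits per element */

        *idx = k / DEMO_BITS_PER_BYTE;
        *off = k % DEMO_BITS_PER_BYTE;
}

static int demo_bitmap_active(const uint8_t *bitmap, uint32_t idx, uint32_t off,
                              uint8_t genmask)
{
        /* Fetch both bits of the element, then test them against the mask. */
        return (bitmap[idx] & (0x3 << off)) & (genmask << off);
}

int main(void)
{
        uint8_t bitmap[64];             /* 256 keys * 2 bits / 8 bits per byte */
        uint8_t cur_genmask = 0x1;      /* pretend generation 0 is current */
        uint8_t next_genmask = 0x2;     /* and generation 1 is next */
        uint32_t idx, off;

        memset(bitmap, 0, sizeof(bitmap));

        /* "Insert" key 42: set only the next-generation bit (01 state). */
        demo_bitmap_location(42, &idx, &off);
        bitmap[idx] |= next_genmask << off;
        printf("key 42 active before commit? %s\n",
               demo_bitmap_active(bitmap, idx, off, cur_genmask) ? "yes" : "no");

        /* Reach the 11 state, so the element is visible in both generations. */
        bitmap[idx] |= cur_genmask << off;
        printf("key 42 active after commit?  %s\n",
               demo_bitmap_active(bitmap, idx, off, cur_genmask) ? "yes" : "no");
        return 0;
}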
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BLK_MQ_H
#define BLK_MQ_H

#include <linux/blkdev.h>
#include <linux/sbitmap.h>
#include <linux/lockdep.h>
#include <linux/scatterlist.h>
#include <linux/prefetch.h>
#include <linux/srcu.h>
#include <linux/rw_hint.h>
#include <linux/rwsem.h>

struct blk_mq_tags;
struct blk_flush_queue;

#define BLKDEV_MIN_RQ 4
#define BLKDEV_DEFAULT_RQ 128

enum rq_end_io_ret {
        RQ_END_IO_NONE,
        RQ_END_IO_FREE,
};

typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t);

/*
 * request flags
 */
typedef __u32 __bitwise req_flags_t;

/* Keep rqf_name[] in sync with the definitions below */
enum rqf_flags {
        /* drive already may have started this one */
        __RQF_STARTED,
        /* request for flush sequence */
        __RQF_FLUSH_SEQ,
        /* merge of different types, fail separately */
        __RQF_MIXED_MERGE,
        /* don't call prep for this one */
        __RQF_DONTPREP,
        /* use hctx->sched_tags */
        __RQF_SCHED_TAGS,
        /* use an I/O scheduler for this request */
        __RQF_USE_SCHED,
        /* vaguely specified driver internal error. Ignored by block layer */
        __RQF_FAILED,
        /* don't warn about errors */
        __RQF_QUIET,
        /* account into disk and partition IO statistics */
        __RQF_IO_STAT,
        /* runtime pm request */
        __RQF_PM,
        /* on IO scheduler merge hash */
        __RQF_HASHED,
        /* track IO completion time */
        __RQF_STATS,
        /* Look at ->special_vec for the actual data payload instead of the
         * bio chain. */
        __RQF_SPECIAL_PAYLOAD,
        /* request completion needs to be signaled to zone write plugging.
*/ __RQF_ZONE_WRITE_PLUGGING, /* ->timeout has been called, don't expire again */ __RQF_TIMED_OUT, __RQF_RESV, __RQF_BITS }; #define RQF_STARTED ((__force req_flags_t)(1 << __RQF_STARTED)) #define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << __RQF_FLUSH_SEQ)) #define RQF_MIXED_MERGE ((__force req_flags_t)(1 << __RQF_MIXED_MERGE)) #define RQF_DONTPREP ((__force req_flags_t)(1 << __RQF_DONTPREP)) #define RQF_SCHED_TAGS ((__force req_flags_t)(1 << __RQF_SCHED_TAGS)) #define RQF_USE_SCHED ((__force req_flags_t)(1 << __RQF_USE_SCHED)) #define RQF_FAILED ((__force req_flags_t)(1 << __RQF_FAILED)) #define RQF_QUIET ((__force req_flags_t)(1 << __RQF_QUIET)) #define RQF_IO_STAT ((__force req_flags_t)(1 << __RQF_IO_STAT)) #define RQF_PM ((__force req_flags_t)(1 << __RQF_PM)) #define RQF_HASHED ((__force req_flags_t)(1 << __RQF_HASHED)) #define RQF_STATS ((__force req_flags_t)(1 << __RQF_STATS)) #define RQF_SPECIAL_PAYLOAD \ ((__force req_flags_t)(1 << __RQF_SPECIAL_PAYLOAD)) #define RQF_ZONE_WRITE_PLUGGING \ ((__force req_flags_t)(1 << __RQF_ZONE_WRITE_PLUGGING)) #define RQF_TIMED_OUT ((__force req_flags_t)(1 << __RQF_TIMED_OUT)) #define RQF_RESV ((__force req_flags_t)(1 << __RQF_RESV)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ (RQF_STARTED | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD) enum mq_rq_state { MQ_RQ_IDLE = 0, MQ_RQ_IN_FLIGHT = 1, MQ_RQ_COMPLETE = 2, }; /* * Try to put the fields that are referenced together in the same cacheline. * * If you modify this structure, make sure to update blk_rq_init() and * especially blk_mq_rq_ctx_init() to take care of the added fields. */ struct request { struct request_queue *q; struct blk_mq_ctx *mq_ctx; struct blk_mq_hw_ctx *mq_hctx; blk_opf_t cmd_flags; /* op and common flags */ req_flags_t rq_flags; int tag; int internal_tag; unsigned int timeout; /* the following two fields are internal, NEVER access directly */ unsigned int __data_len; /* total data len */ sector_t __sector; /* sector cursor */ struct bio *bio; struct bio *biotail; union { struct list_head queuelist; struct request *rq_next; }; struct block_device *part; #ifdef CONFIG_BLK_RQ_ALLOC_TIME /* Time that the first bio started allocating this request. */ u64 alloc_time_ns; #endif /* Time that this request was allocated for this IO. */ u64 start_time_ns; /* Time that I/O was submitted to the device. */ u64 io_start_time_ns; #ifdef CONFIG_BLK_WBT unsigned short wbt_flags; #endif /* * rq sectors used for blk stats. It has the same value * with blk_rq_sectors(rq), except that it never be zeroed * by completion. */ unsigned short stats_sectors; /* * Number of scatter-gather DMA addr+len pairs after * physical address coalescing is performed. */ unsigned short nr_phys_segments; unsigned short nr_integrity_segments; #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct bio_crypt_ctx *crypt_ctx; struct blk_crypto_keyslot *crypt_keyslot; #endif enum mq_rq_state state; atomic_t ref; unsigned long deadline; /* * The hash is used inside the scheduler, and killed once the * request reaches the dispatch list. The ipi_list is only used * to queue the request for softirq completion, which is long * after the request has been unhashed (and even removed from * the dispatch list). */ union { struct hlist_node hash; /* merge hash */ struct llist_node ipi_list; }; /* * The rb_node is only used inside the io scheduler, requests * are pruned when moved to the dispatch queue. special_vec must * only be used if RQF_SPECIAL_PAYLOAD is set, and those cannot be * insert into an IO scheduler. 
*/ union { struct rb_node rb_node; /* sort/lookup */ struct bio_vec special_vec; }; /* * Three pointers are available for the IO schedulers, if they need * more they have to dynamically allocate it. */ struct { struct io_cq *icq; void *priv[2]; } elv; struct { unsigned int seq; rq_end_io_fn *saved_end_io; } flush; u64 fifo_time; /* * completion callback. */ rq_end_io_fn *end_io; void *end_io_data; }; static inline enum req_op req_op(const struct request *req) { return req->cmd_flags & REQ_OP_MASK; } static inline bool blk_rq_is_passthrough(struct request *rq) { return blk_op_is_passthrough(rq->cmd_flags); } static inline unsigned short req_get_ioprio(struct request *req) { if (req->bio) return req->bio->bi_ioprio; return 0; } #define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) #define rq_dma_dir(rq) \ (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) static inline int rq_list_empty(const struct rq_list *rl) { return rl->head == NULL; } static inline void rq_list_init(struct rq_list *rl) { rl->head = NULL; rl->tail = NULL; } static inline void rq_list_add_tail(struct rq_list *rl, struct request *rq) { rq->rq_next = NULL; if (rl->tail) rl->tail->rq_next = rq; else rl->head = rq; rl->tail = rq; } static inline void rq_list_add_head(struct rq_list *rl, struct request *rq) { rq->rq_next = rl->head; rl->head = rq; if (!rl->tail) rl->tail = rq; } static inline struct request *rq_list_pop(struct rq_list *rl) { struct request *rq = rl->head; if (rq) { rl->head = rl->head->rq_next; if (!rl->head) rl->tail = NULL; rq->rq_next = NULL; } return rq; } static inline struct request *rq_list_peek(struct rq_list *rl) { return rl->head; } #define rq_list_for_each(rl, pos) \ for (pos = rq_list_peek((rl)); (pos); pos = pos->rq_next) #define rq_list_for_each_safe(rl, pos, nxt) \ for (pos = rq_list_peek((rl)), nxt = pos->rq_next; \ pos; pos = nxt, nxt = pos ? pos->rq_next : NULL) /** * enum blk_eh_timer_return - How the timeout handler should proceed * @BLK_EH_DONE: The block driver completed the command or will complete it at * a later time. * @BLK_EH_RESET_TIMER: Reset the request timer and continue waiting for the * request to complete. */ enum blk_eh_timer_return { BLK_EH_DONE, BLK_EH_RESET_TIMER, }; /** * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware * block device */ struct blk_mq_hw_ctx { struct { /** @lock: Protects the dispatch list. */ spinlock_t lock; /** * @dispatch: Used for requests that are ready to be * dispatched to the hardware but for some reason (e.g. lack of * resources) could not be sent to the hardware. As soon as the * driver can send new requests, requests at this list will * be sent first for a fairer dispatch. */ struct list_head dispatch; /** * @state: BLK_MQ_S_* flags. Defines the state of the hw * queue (active, scheduled to restart, stopped). */ unsigned long state; } ____cacheline_aligned_in_smp; /** * @run_work: Used for scheduling a hardware queue run at a later time. */ struct delayed_work run_work; /** @cpumask: Map of available CPUs where this hctx can run. */ cpumask_var_t cpumask; /** * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU * selection from @cpumask. */ int next_cpu; /** * @next_cpu_batch: Counter of how many works left in the batch before * changing to the next CPU. */ int next_cpu_batch; /** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */ unsigned long flags; /** * @sched_data: Pointer owned by the IO scheduler attached to a request * queue. 
It's up to the IO scheduler how to use this pointer. */ void *sched_data; /** * @queue: Pointer to the request queue that owns this hardware context. */ struct request_queue *queue; /** @fq: Queue of requests that need to perform a flush operation. */ struct blk_flush_queue *fq; /** * @driver_data: Pointer to data owned by the block driver that created * this hctx */ void *driver_data; /** * @ctx_map: Bitmap for each software queue. If bit is on, there is a * pending request in that software queue. */ struct sbitmap ctx_map; /** * @dispatch_from: Software queue to be used when no scheduler was * selected. */ struct blk_mq_ctx *dispatch_from; /** * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to * decide if the hw_queue is busy using Exponential Weighted Moving * Average algorithm. */ unsigned int dispatch_busy; /** @type: HCTX_TYPE_* flags. Type of hardware queue. */ unsigned short type; /** @nr_ctx: Number of software queues. */ unsigned short nr_ctx; /** @ctxs: Array of software queues. */ struct blk_mq_ctx **ctxs; /** @dispatch_wait_lock: Lock for dispatch_wait queue. */ spinlock_t dispatch_wait_lock; /** * @dispatch_wait: Waitqueue to put requests when there is no tag * available at the moment, to wait for another try in the future. */ wait_queue_entry_t dispatch_wait; /** * @wait_index: Index of next available dispatch_wait queue to insert * requests. */ atomic_t wait_index; /** * @tags: Tags owned by the block driver. A tag at this set is only * assigned when a request is dispatched from a hardware queue. */ struct blk_mq_tags *tags; /** * @sched_tags: Tags owned by I/O scheduler. If there is an I/O * scheduler associated with a request queue, a tag is assigned when * that request is allocated. Else, this member is not used. */ struct blk_mq_tags *sched_tags; /** @numa_node: NUMA node the storage adapter has been connected to. */ unsigned int numa_node; /** @queue_num: Index of this hardware queue. */ unsigned int queue_num; /** * @nr_active: Number of active requests. Only used when a tag set is * shared across request queues. */ atomic_t nr_active; /** @cpuhp_online: List to store request if CPU is going to die */ struct hlist_node cpuhp_online; /** @cpuhp_dead: List to store request if some CPU die. */ struct hlist_node cpuhp_dead; /** @kobj: Kernel object for sysfs. */ struct kobject kobj; #ifdef CONFIG_BLK_DEBUG_FS /** * @debugfs_dir: debugfs directory for this hardware queue. Named * as cpu<cpu_number>. */ struct dentry *debugfs_dir; /** @sched_debugfs_dir: debugfs directory for the scheduler. */ struct dentry *sched_debugfs_dir; #endif /** * @hctx_list: if this hctx is not in use, this is an entry in * q->unused_hctx_list. */ struct list_head hctx_list; }; /** * struct blk_mq_queue_map - Map software queues to hardware queues * @mq_map: CPU ID to hardware queue index map. This is an array * with nr_cpu_ids elements. Each element has a value in the range * [@queue_offset, @queue_offset + @nr_queues). * @nr_queues: Number of hardware queues to map CPU IDs onto. * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe * driver to map each hardware queue type (enum hctx_type) onto a distinct * set of hardware queues. */ struct blk_mq_queue_map { unsigned int *mq_map; unsigned int nr_queues; unsigned int queue_offset; }; /** * enum hctx_type - Type of hardware queue * @HCTX_TYPE_DEFAULT: All I/O not otherwise accounted for. * @HCTX_TYPE_READ: Just for READ I/O. * @HCTX_TYPE_POLL: Polled I/O of any kind. * @HCTX_MAX_TYPES: Number of types of hctx. 
*/ enum hctx_type { HCTX_TYPE_DEFAULT, HCTX_TYPE_READ, HCTX_TYPE_POLL, HCTX_MAX_TYPES, }; /** * struct blk_mq_tag_set - tag set that can be shared between request queues * @ops: Pointers to functions that implement block driver behavior. * @map: One or more ctx -> hctx mappings. One map exists for each * hardware queue type (enum hctx_type) that the driver wishes * to support. There are no restrictions on maps being of the * same size, and it's perfectly legal to share maps between * types. * @nr_maps: Number of elements in the @map array. A number in the range * [1, HCTX_MAX_TYPES]. * @nr_hw_queues: Number of hardware queues supported by the block driver that * owns this data structure. * @queue_depth: Number of tags per hardware queue, reserved tags included. * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag * allocations. * @cmd_size: Number of additional bytes to allocate per request. The block * driver owns these additional bytes. * @numa_node: NUMA node the storage adapter has been connected to. * @timeout: Request processing timeout in jiffies. * @flags: Zero or more BLK_MQ_F_* flags. * @driver_data: Pointer to data owned by the block driver that created this * tag set. * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues * elements. * @shared_tags: * Shared set of tags. Has @nr_hw_queues elements. If set, * shared by all @tags. * @tag_list_lock: Serializes tag_list accesses. * @tag_list: List of the request queues that use this tag set. See also * request_queue.tag_set_list. * @srcu: Use as lock when type of the request queue is blocking * (BLK_MQ_F_BLOCKING). * @update_nr_hwq_lock: * Synchronize updating nr_hw_queues with add/del disk & * switching elevator. */ struct blk_mq_tag_set { const struct blk_mq_ops *ops; struct blk_mq_queue_map map[HCTX_MAX_TYPES]; unsigned int nr_maps; unsigned int nr_hw_queues; unsigned int queue_depth; unsigned int reserved_tags; unsigned int cmd_size; int numa_node; unsigned int timeout; unsigned int flags; void *driver_data; struct blk_mq_tags **tags; struct blk_mq_tags *shared_tags; struct mutex tag_list_lock; struct list_head tag_list; struct srcu_struct *srcu; struct rw_semaphore update_nr_hwq_lock; }; /** * struct blk_mq_queue_data - Data about a request inserted in a queue * * @rq: Request pointer. * @last: If it is the last request in the queue. */ struct blk_mq_queue_data { struct request *rq; bool last; }; typedef bool (busy_tag_iter_fn)(struct request *, void *); /** * struct blk_mq_ops - Callback functions that implements block driver * behaviour. */ struct blk_mq_ops { /** * @queue_rq: Queue a new request from block IO. */ blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); /** * @commit_rqs: If a driver uses bd->last to judge when to submit * requests to hardware, it must define this function. In case of errors * that make us stop issuing further requests, this hook serves the * purpose of kicking the hardware (which the last request otherwise * would have done). */ void (*commit_rqs)(struct blk_mq_hw_ctx *); /** * @queue_rqs: Queue a list of new requests. Driver is guaranteed * that each request belongs to the same queue. If the driver doesn't * empty the @rqlist completely, then the rest will be queued * individually by the block layer upon return. */ void (*queue_rqs)(struct rq_list *rqlist); /** * @get_budget: Reserve budget before queue request, once .queue_rq is * run, it is driver's responsibility to release the * reserved budget. 
Also we have to handle failure case * of .get_budget for avoiding I/O deadlock. */ int (*get_budget)(struct request_queue *); /** * @put_budget: Release the reserved budget. */ void (*put_budget)(struct request_queue *, int); /** * @set_rq_budget_token: store rq's budget token */ void (*set_rq_budget_token)(struct request *, int); /** * @get_rq_budget_token: retrieve rq's budget token */ int (*get_rq_budget_token)(struct request *); /** * @timeout: Called on request timeout. */ enum blk_eh_timer_return (*timeout)(struct request *); /** * @poll: Called to poll for completion of a specific tag. */ int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *); /** * @complete: Mark the request as complete. */ void (*complete)(struct request *); /** * @init_hctx: Called when the block layer side of a hardware queue has * been set up, allowing the driver to allocate/init matching * structures. */ int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int); /** * @exit_hctx: Ditto for exit/teardown. */ void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); /** * @init_request: Called for every command allocated by the block layer * to allow the driver to set up driver specific data. * * Tag greater than or equal to queue_depth is for setting up * flush request. */ int (*init_request)(struct blk_mq_tag_set *set, struct request *, unsigned int, unsigned int); /** * @exit_request: Ditto for exit/teardown. */ void (*exit_request)(struct blk_mq_tag_set *set, struct request *, unsigned int); /** * @cleanup_rq: Called before freeing one request which isn't completed * yet, and usually for freeing the driver private data. */ void (*cleanup_rq)(struct request *); /** * @busy: If set, returns whether or not this queue currently is busy. */ bool (*busy)(struct request_queue *); /** * @map_queues: This allows drivers specify their own queue mapping by * overriding the setup-time function that builds the mq_map. */ void (*map_queues)(struct blk_mq_tag_set *set); #ifdef CONFIG_BLK_DEBUG_FS /** * @show_rq: Used by the debugfs implementation to show driver-specific * information about a request. */ void (*show_rq)(struct seq_file *m, struct request *rq); #endif }; /* Keep hctx_flag_name[] in sync with the definitions below */ enum { BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1, /* * Set when this device requires underlying blk-mq device for * completing IO: */ BLK_MQ_F_STACKING = 1 << 2, BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, BLK_MQ_F_BLOCKING = 1 << 4, /* * Alloc tags on a round-robin base instead of the first available one. */ BLK_MQ_F_TAG_RR = 1 << 5, /* * Select 'none' during queue registration in case of a single hwq * or shared hwqs instead of 'mq-deadline'. 
*/ BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 6, BLK_MQ_F_MAX = 1 << 7, }; #define BLK_MQ_MAX_DEPTH (10240) #define BLK_MQ_NO_HCTX_IDX (-1U) enum { /* Keep hctx_state_name[] in sync with the definitions below */ BLK_MQ_S_STOPPED, BLK_MQ_S_TAG_ACTIVE, BLK_MQ_S_SCHED_RESTART, /* hw queue is inactive after all its CPUs become offline */ BLK_MQ_S_INACTIVE, BLK_MQ_S_MAX }; struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata, struct lock_class_key *lkclass); #define blk_mq_alloc_disk(set, lim, queuedata) \ ({ \ static struct lock_class_key __key; \ \ __blk_mq_alloc_disk(set, lim, queuedata, &__key); \ }) struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, struct lock_class_key *lkclass); struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, struct queue_limits *lim, void *queuedata); int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); void blk_mq_destroy_queue(struct request_queue *); int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int queue_depth, unsigned int set_flags); void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_free_request(struct request *rq); int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, unsigned int poll_flags); bool blk_mq_queue_inflight(struct request_queue *q); enum { /* return when out of requests */ BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), /* allocate from reserved pool */ BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), /* set RQF_PM */ BLK_MQ_REQ_PM = (__force blk_mq_req_flags_t)(1 << 2), }; struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx); /* * Tag address space map. */ struct blk_mq_tags { unsigned int nr_tags; unsigned int nr_reserved_tags; unsigned int active_queues; struct sbitmap_queue bitmap_tags; struct sbitmap_queue breserved_tags; struct request **rqs; struct request **static_rqs; struct list_head page_list; /* * used to clear request reference in rqs[] before freeing one * request pool */ spinlock_t lock; }; static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { if (tag < tags->nr_tags) { prefetch(tags->rqs[tag]); return tags->rqs[tag]; } return NULL; } enum { BLK_MQ_UNIQUE_TAG_BITS = 16, BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1, }; u32 blk_mq_unique_tag(struct request *rq); static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag) { return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS; } static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) { return unique_tag & BLK_MQ_UNIQUE_TAG_MASK; } /** * blk_mq_rq_state() - read the current MQ_RQ_* state of a request * @rq: target request. */ static inline enum mq_rq_state blk_mq_rq_state(struct request *rq) { return READ_ONCE(rq->state); } static inline int blk_mq_request_started(struct request *rq) { return blk_mq_rq_state(rq) != MQ_RQ_IDLE; } static inline int blk_mq_request_completed(struct request *rq) { return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE; } /* * * Set the state to complete when completing a request from inside ->queue_rq. * This is used by drivers that want to ensure special complete actions that * need access to the request are called on failure, e.g. by nvme for * multipathing. 
*/ static inline void blk_mq_set_request_complete(struct request *rq) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); } /* * Complete the request directly instead of deferring it to softirq or * completing it another CPU. Useful in preemptible instead of an interrupt. */ static inline void blk_mq_complete_request_direct(struct request *rq, void (*complete)(struct request *rq)) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); complete(rq); } void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); void blk_mq_end_request_batch(struct io_comp_batch *ib); /* * Only need start/end time stamping if we have iostat or * blk stats enabled, or using an IO scheduler. */ static inline bool blk_mq_need_time_stamp(struct request *rq) { return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED)); } static inline bool blk_mq_is_reserved_rq(struct request *rq) { return rq->rq_flags & RQF_RESV; } /** * blk_mq_add_to_batch() - add a request to the completion batch * @req: The request to add to batch * @iob: The batch to add the request * @is_error: Specify true if the request failed with an error * @complete: The completaion handler for the request * * Batched completions only work when there is no I/O error and no special * ->end_io handler. * * Return: true when the request was added to the batch, otherwise false */ static inline bool blk_mq_add_to_batch(struct request *req, struct io_comp_batch *iob, bool is_error, void (*complete)(struct io_comp_batch *)) { /* * Check various conditions that exclude batch processing: * 1) No batch container * 2) Has scheduler data attached * 3) Not a passthrough request and end_io set * 4) Not a passthrough request and failed with an error */ if (!iob) return false; if (req->rq_flags & RQF_SCHED_TAGS) return false; if (!blk_rq_is_passthrough(req)) { if (req->end_io) return false; if (is_error) return false; } if (!iob->complete) iob->complete = complete; else if (iob->complete != complete) return false; iob->need_ts |= blk_mq_need_time_stamp(req); rq_list_add_tail(&iob->req_list, req); return true; } void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_complete_request(struct request *rq); bool blk_mq_complete_request_remote(struct request *rq); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set); void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set); void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); void 
blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset); void blk_mq_freeze_queue_nomemsave(struct request_queue *q); void blk_mq_unfreeze_queue_nomemrestore(struct request_queue *q); static inline unsigned int __must_check blk_mq_freeze_queue(struct request_queue *q) { unsigned int memflags = memalloc_noio_save(); blk_mq_freeze_queue_nomemsave(q); return memflags; } static inline void blk_mq_unfreeze_queue(struct request_queue *q, unsigned int memflags) { blk_mq_unfreeze_queue_nomemrestore(q); memalloc_noio_restore(memflags); } void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); void blk_mq_unfreeze_queue_non_owner(struct request_queue *q); void blk_freeze_queue_start_non_owner(struct request_queue *q); unsigned int blk_mq_num_possible_queues(unsigned int max_queues); unsigned int blk_mq_num_online_queues(unsigned int max_queues); void blk_mq_map_queues(struct blk_mq_queue_map *qmap); void blk_mq_map_hw_queues(struct blk_mq_queue_map *qmap, struct device *dev, unsigned int offset); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); void blk_mq_quiesce_queue_nowait(struct request_queue *q); unsigned int blk_mq_rq_cpu(struct request *rq); bool __blk_should_fake_timeout(struct request_queue *q); static inline bool blk_should_fake_timeout(struct request_queue *q) { if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) && test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags)) return __blk_should_fake_timeout(q); return false; } /** * blk_mq_rq_from_pdu - cast a PDU to a request * @pdu: the PDU (Protocol Data Unit) to be casted * * Return: request * * Driver command data is immediately after the request. So subtract request * size to get back to the original request. */ static inline struct request *blk_mq_rq_from_pdu(void *pdu) { return pdu - sizeof(struct request); } /** * blk_mq_rq_to_pdu - cast a request to a PDU * @rq: the request to be casted * * Return: pointer to the PDU * * Driver command data is immediately after the request. So add request to get * the PDU. 
*/ static inline void *blk_mq_rq_to_pdu(struct request *rq) { return rq + 1; } #define queue_for_each_hw_ctx(q, hctx, i) \ xa_for_each(&(q)->hctx_table, (i), (hctx)) #define hctx_for_each_ctx(hctx, ctx, i) \ for ((i) = 0; (i) < (hctx)->nr_ctx && \ ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++) static inline void blk_mq_cleanup_rq(struct request *rq) { if (rq->q->mq_ops->cleanup_rq) rq->q->mq_ops->cleanup_rq(rq); } void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, struct lock_class_key *key); static inline bool rq_is_sync(struct request *rq) { return op_is_sync(rq->cmd_flags); } void blk_rq_init(struct request_queue *q, struct request *rq); int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data); void blk_rq_unprep_clone(struct request *rq); blk_status_t blk_insert_cloned_request(struct request *rq); struct rq_map_data { struct page **pages; unsigned long offset; unsigned short page_order; unsigned short nr_entries; bool null_mapped; bool from_user; }; int blk_rq_map_user(struct request_queue *, struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t); int blk_rq_map_user_io(struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t, bool, int, bool, int); int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); int blk_rq_unmap_user(struct bio *); int blk_rq_map_kern(struct request *rq, void *kbuf, unsigned int len, gfp_t gfp); int blk_rq_append_bio(struct request *rq, struct bio *bio); void blk_execute_rq_nowait(struct request *rq, bool at_head); blk_status_t blk_execute_rq(struct request *rq, bool at_head); bool blk_rq_is_poll(struct request *rq); struct req_iterator { struct bvec_iter iter; struct bio *bio; }; #define __rq_for_each_bio(_bio, rq) \ if ((rq->bio)) \ for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next) #define rq_for_each_segment(bvl, _rq, _iter) \ __rq_for_each_bio(_iter.bio, _rq) \ bio_for_each_segment(bvl, _iter.bio, _iter.iter) #define rq_for_each_bvec(bvl, _rq, _iter) \ __rq_for_each_bio(_iter.bio, _rq) \ bio_for_each_bvec(bvl, _iter.bio, _iter.iter) #define rq_iter_last(bvec, _iter) \ (_iter.bio->bi_next == NULL && \ bio_iter_last(bvec, _iter.iter)) /* * blk_rq_pos() : the current sector * blk_rq_bytes() : bytes left in the entire request * blk_rq_cur_bytes() : bytes left in the current segment * blk_rq_sectors() : sectors left in the entire request * blk_rq_cur_sectors() : sectors left in the current segment * blk_rq_stats_sectors() : sectors of the entire request used for stats */ static inline sector_t blk_rq_pos(const struct request *rq) { return rq->__sector; } static inline unsigned int blk_rq_bytes(const struct request *rq) { return rq->__data_len; } static inline int blk_rq_cur_bytes(const struct request *rq) { if (!rq->bio) return 0; if (!bio_has_data(rq->bio)) /* dataless requests such as discard */ return rq->bio->bi_iter.bi_size; return bio_iovec(rq->bio).bv_len; } static inline unsigned int blk_rq_sectors(const struct request *rq) { return blk_rq_bytes(rq) >> SECTOR_SHIFT; } static inline unsigned int blk_rq_cur_sectors(const struct request *rq) { return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT; } static inline unsigned int blk_rq_stats_sectors(const struct request *rq) { return rq->stats_sectors; } /* * Some commands like WRITE SAME have a payload or data transfer size which * is different from the size of the request. 
Any driver that supports such * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to * calculate the data transfer size. */ static inline unsigned int blk_rq_payload_bytes(struct request *rq) { if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) return rq->special_vec.bv_len; return blk_rq_bytes(rq); } /* * Return the first full biovec in the request. The caller needs to check that * there are any bvecs before calling this helper. */ static inline struct bio_vec req_bvec(struct request *rq) { if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) return rq->special_vec; return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter); } static inline unsigned int blk_rq_count_bios(struct request *rq) { unsigned int nr_bios = 0; struct bio *bio; __rq_for_each_bio(bio, rq) nr_bios++; return nr_bios; } void blk_steal_bios(struct bio_list *list, struct request *rq); /* * Request completion related functions. * * blk_update_request() completes given number of bytes and updates * the request without completing it. */ bool blk_update_request(struct request *rq, blk_status_t error, unsigned int nr_bytes); void blk_abort_request(struct request *); /* * Number of physical segments as sent to the device. * * Normally this is the number of discontiguous data segments sent by the * submitter. But for data-less command like discard we might have no * actual data segments submitted, but the driver might have to add it's * own special payload. In that case we still return 1 here so that this * special payload will be mapped. */ static inline unsigned short blk_rq_nr_phys_segments(struct request *rq) { if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) return 1; return rq->nr_phys_segments; } /* * Number of discard segments (or ranges) the driver needs to fill in. * Each discard bio merged into a request is counted as one segment. */ static inline unsigned short blk_rq_nr_discard_segments(struct request *rq) { return max_t(unsigned short, rq->nr_phys_segments, 1); } int __blk_rq_map_sg(struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg); static inline int blk_rq_map_sg(struct request *rq, struct scatterlist *sglist) { struct scatterlist *last_sg = NULL; return __blk_rq_map_sg(rq, sglist, &last_sg); } void blk_dump_rq_flags(struct request *, char *); #endif /* BLK_MQ_H */
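Below is a self-contained userspace illustration (not kernel code) of the PDU layout that the blk_mq_rq_from_pdu()/blk_mq_rq_to_pdu() helpers documented above rely on: the driver's per-command data is placed immediately after the request, so converting between the two is plain pointer arithmetic. The demo_* structures are stand-ins sized the way cmd_size would size a real tag set's per-request data; real requests also carry alignment guarantees this sketch ignores.

#include <stdio.h>
#include <stdlib.h>

struct demo_request {           /* stand-in for struct request */
        int tag;
};

struct demo_pdu {               /* stand-in for a driver's per-command data */
        int sense_len;
};

static void *demo_rq_to_pdu(struct demo_request *rq)
{
        return rq + 1;                          /* PDU starts right after the request */
}

static struct demo_request *demo_rq_from_pdu(void *pdu)
{
        return (struct demo_request *)pdu - 1;  /* step back over the request */
}

int main(void)
{
        /* One allocation holds the request plus cmd_size bytes of PDU. */
        struct demo_request *rq = malloc(sizeof(*rq) + sizeof(struct demo_pdu));
        struct demo_pdu *pdu;

        if (!rq)
                return 1;
        rq->tag = 7;

        pdu = demo_rq_to_pdu(rq);
        pdu->sense_len = 0;

        printf("round trip ok: %d\n", demo_rq_from_pdu(pdu)->tag == rq->tag);
        free(rq);
        return 0;
}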
// SPDX-License-Identifier: GPL-2.0
/*
 * udc.c - Core UDC Framework
 *
 * Copyright (C) 2010 Texas Instruments
 * Author: Felipe Balbi <balbi@ti.com>
 */

#define pr_fmt(fmt) "UDC core: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/list.h>
#include <linux/idr.h>
#include <linux/err.h>
#include <linux/dma-mapping.h>
#include <linux/sched/task_stack.h>
#include <linux/workqueue.h>
#include <linux/usb/ch9.h>
#include <linux/usb/gadget.h>
#include <linux/usb.h>

#include "trace.h"

static DEFINE_IDA(gadget_id_numbers);

static const struct bus_type gadget_bus_type;

/**
 * struct usb_udc - describes one usb device controller
 * @driver: the gadget driver pointer. For use by the class code
 * @dev: the child device to the actual controller
 * @gadget: the gadget. For use by the class code
 * @list: for use by the udc class driver
 * @vbus: for udcs who care about vbus status, this value is real vbus status;
 * for udcs who do not care about vbus status, this value is always true
 * @started: the UDC's started state. True if the UDC had started.
 * @allow_connect: Indicates whether UDC is allowed to be pulled up.
 * Set/cleared by gadget_(un)bind_driver() after gadget driver is bound or
 * unbound.
 * @vbus_work: work routine to handle VBUS status change notifications.
 * @connect_lock: protects udc->started, gadget->connect,
 * gadget->allow_connect and gadget->deactivate. The routines
 * usb_gadget_connect_locked(), usb_gadget_disconnect_locked(),
 * usb_udc_connect_control_locked(), usb_gadget_udc_start_locked() and
 * usb_gadget_udc_stop_locked() are called with this lock held.
 *
 * This represents the internal data structure which is used by the UDC-class
 * to hold information about udc driver and gadget together.
*/ struct usb_udc { struct usb_gadget_driver *driver; struct usb_gadget *gadget; struct device dev; struct list_head list; bool vbus; bool started; bool allow_connect; struct work_struct vbus_work; struct mutex connect_lock; }; static const struct class udc_class; static LIST_HEAD(udc_list); /* Protects udc_list, udc->driver, driver->is_bound, and related calls */ static DEFINE_MUTEX(udc_lock); /* ------------------------------------------------------------------------- */ /** * usb_ep_set_maxpacket_limit - set maximum packet size limit for endpoint * @ep:the endpoint being configured * @maxpacket_limit:value of maximum packet size limit * * This function should be used only in UDC drivers to initialize endpoint * (usually in probe function). */ void usb_ep_set_maxpacket_limit(struct usb_ep *ep, unsigned maxpacket_limit) { ep->maxpacket_limit = maxpacket_limit; ep->maxpacket = maxpacket_limit; trace_usb_ep_set_maxpacket_limit(ep, 0); } EXPORT_SYMBOL_GPL(usb_ep_set_maxpacket_limit); /** * usb_ep_enable - configure endpoint, making it usable * @ep:the endpoint being configured. may not be the endpoint named "ep0". * drivers discover endpoints through the ep_list of a usb_gadget. * * When configurations are set, or when interface settings change, the driver * will enable or disable the relevant endpoints. while it is enabled, an * endpoint may be used for i/o until the driver receives a disconnect() from * the host or until the endpoint is disabled. * * the ep0 implementation (which calls this routine) must ensure that the * hardware capabilities of each endpoint match the descriptor provided * for it. for example, an endpoint named "ep2in-bulk" would be usable * for interrupt transfers as well as bulk, but it likely couldn't be used * for iso transfers or for endpoint 14. some endpoints are fully * configurable, with more generic names like "ep-a". (remember that for * USB, "in" means "towards the USB host".) * * This routine may be called in an atomic (interrupt) context. * * returns zero, or a negative error code. */ int usb_ep_enable(struct usb_ep *ep) { int ret = 0; if (ep->enabled) goto out; /* UDC drivers can't handle endpoints with maxpacket size 0 */ if (!ep->desc || usb_endpoint_maxp(ep->desc) == 0) { WARN_ONCE(1, "%s: ep%d (%s) has %s\n", __func__, ep->address, ep->name, (!ep->desc) ? "NULL descriptor" : "maxpacket 0"); ret = -EINVAL; goto out; } ret = ep->ops->enable(ep, ep->desc); if (ret) goto out; ep->enabled = true; out: trace_usb_ep_enable(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_enable); /** * usb_ep_disable - endpoint is no longer usable * @ep:the endpoint being unconfigured. may not be the endpoint named "ep0". * * no other task may be using this endpoint when this is called. * any pending and uncompleted requests will complete with status * indicating disconnect (-ESHUTDOWN) before this call returns. * gadget drivers must call usb_ep_enable() again before queueing * requests to the endpoint. * * This routine may be called in an atomic (interrupt) context. * * returns zero, or a negative error code. 
*/ int usb_ep_disable(struct usb_ep *ep) { int ret = 0; if (!ep->enabled) goto out; ret = ep->ops->disable(ep); if (ret) goto out; ep->enabled = false; out: trace_usb_ep_disable(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_disable); /** * usb_ep_alloc_request - allocate a request object to use with this endpoint * @ep:the endpoint to be used with with the request * @gfp_flags:GFP_* flags to use * * Request objects must be allocated with this call, since they normally * need controller-specific setup and may even need endpoint-specific * resources such as allocation of DMA descriptors. * Requests may be submitted with usb_ep_queue(), and receive a single * completion callback. Free requests with usb_ep_free_request(), when * they are no longer needed. * * Returns the request, or null if one could not be allocated. */ struct usb_request *usb_ep_alloc_request(struct usb_ep *ep, gfp_t gfp_flags) { struct usb_request *req = NULL; req = ep->ops->alloc_request(ep, gfp_flags); trace_usb_ep_alloc_request(ep, req, req ? 0 : -ENOMEM); return req; } EXPORT_SYMBOL_GPL(usb_ep_alloc_request); /** * usb_ep_free_request - frees a request object * @ep:the endpoint associated with the request * @req:the request being freed * * Reverses the effect of usb_ep_alloc_request(). * Caller guarantees the request is not queued, and that it will * no longer be requeued (or otherwise used). */ void usb_ep_free_request(struct usb_ep *ep, struct usb_request *req) { trace_usb_ep_free_request(ep, req, 0); ep->ops->free_request(ep, req); } EXPORT_SYMBOL_GPL(usb_ep_free_request); /** * usb_ep_queue - queues (submits) an I/O request to an endpoint. * @ep:the endpoint associated with the request * @req:the request being submitted * @gfp_flags: GFP_* flags to use in case the lower level driver couldn't * pre-allocate all necessary memory with the request. * * This tells the device controller to perform the specified request through * that endpoint (reading or writing a buffer). When the request completes, * including being canceled by usb_ep_dequeue(), the request's completion * routine is called to return the request to the driver. Any endpoint * (except control endpoints like ep0) may have more than one transfer * request queued; they complete in FIFO order. Once a gadget driver * submits a request, that request may not be examined or modified until it * is given back to that driver through the completion callback. * * Each request is turned into one or more packets. The controller driver * never merges adjacent requests into the same packet. OUT transfers * will sometimes use data that's already buffered in the hardware. * Drivers can rely on the fact that the first byte of the request's buffer * always corresponds to the first byte of some USB packet, for both * IN and OUT transfers. * * Bulk endpoints can queue any amount of data; the transfer is packetized * automatically. The last packet will be short if the request doesn't fill it * out completely. Zero length packets (ZLPs) should be avoided in portable * protocols since not all usb hardware can successfully handle zero length * packets. (ZLPs may be explicitly written, and may be implicitly written if * the request 'zero' flag is set.) Bulk endpoints may also be used * for interrupt transfers; but the reverse is not true, and some endpoints * won't support every interrupt transfer. (Such as 768 byte packets.) 
* * Interrupt-only endpoints are less functional than bulk endpoints, for * example by not supporting queueing or not handling buffers that are * larger than the endpoint's maxpacket size. They may also treat data * toggle differently. * * Control endpoints ... after getting a setup() callback, the driver queues * one response (even if it would be zero length). That enables the * status ack, after transferring data as specified in the response. Setup * functions may return negative error codes to generate protocol stalls. * (Note that some USB device controllers disallow protocol stall responses * in some cases.) When control responses are deferred (the response is * written after the setup callback returns), then usb_ep_set_halt() may be * used on ep0 to trigger protocol stalls. Depending on the controller, * it may not be possible to trigger a status-stage protocol stall when the * data stage is over, that is, from within the response's completion * routine. * * For periodic endpoints, like interrupt or isochronous ones, the usb host * arranges to poll once per interval, and the gadget driver usually will * have queued some data to transfer at that time. * * Note that @req's ->complete() callback must never be called from * within usb_ep_queue() as that can create deadlock situations. * * This routine may be called in interrupt context. * * Returns zero, or a negative error code. Endpoints that are not enabled * report errors; errors will also be * reported when the usb peripheral is disconnected. * * If and only if @req is successfully queued (the return value is zero), * @req->complete() will be called exactly once, when the Gadget core and * UDC are finished with the request. When the completion function is called, * control of the request is returned to the device driver which submitted it. * The completion handler may then immediately free or reuse @req. */ int usb_ep_queue(struct usb_ep *ep, struct usb_request *req, gfp_t gfp_flags) { int ret = 0; if (!ep->enabled && ep->address) { pr_debug("USB gadget: queue request to disabled ep 0x%x (%s)\n", ep->address, ep->name); ret = -ESHUTDOWN; goto out; } ret = ep->ops->queue(ep, req, gfp_flags); out: trace_usb_ep_queue(ep, req, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_queue); /** * usb_ep_dequeue - dequeues (cancels, unlinks) an I/O request from an endpoint * @ep:the endpoint associated with the request * @req:the request being canceled * * If the request is still active on the endpoint, it is dequeued and * eventually its completion routine is called (with status -ECONNRESET); * else a negative error code is returned. This routine is asynchronous, * that is, it may return before the completion routine runs. * * Note that some hardware can't clear out write fifos (to unlink the request * at the head of the queue) except as part of disconnecting from usb. Such * restrictions prevent drivers from supporting configuration changes, * even to configuration zero (a "chapter 9" requirement). * * This routine may be called in interrupt context. */ int usb_ep_dequeue(struct usb_ep *ep, struct usb_request *req) { int ret; ret = ep->ops->dequeue(ep, req); trace_usb_ep_dequeue(ep, req, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_dequeue); /** * usb_ep_set_halt - sets the endpoint halt feature. * @ep: the non-isochronous endpoint being stalled * * Use this to stall an endpoint, perhaps as an error report. 
* Except for control endpoints, * the endpoint stays halted (will not stream any data) until the host * clears this feature; drivers may need to empty the endpoint's request * queue first, to make sure no inappropriate transfers happen. * * Note that while an endpoint CLEAR_FEATURE will be invisible to the * gadget driver, a SET_INTERFACE will not be. To reset endpoints for the * current altsetting, see usb_ep_clear_halt(). When switching altsettings, * it's simplest to use usb_ep_enable() or usb_ep_disable() for the endpoints. * * This routine may be called in interrupt context. * * Returns zero, or a negative error code. On success, this call sets * underlying hardware state that blocks data transfers. * Attempts to halt IN endpoints will fail (returning -EAGAIN) if any * transfer requests are still queued, or if the controller hardware * (usually a FIFO) still holds bytes that the host hasn't collected. */ int usb_ep_set_halt(struct usb_ep *ep) { int ret; ret = ep->ops->set_halt(ep, 1); trace_usb_ep_set_halt(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_set_halt); /** * usb_ep_clear_halt - clears endpoint halt, and resets toggle * @ep:the bulk or interrupt endpoint being reset * * Use this when responding to the standard usb "set interface" request, * for endpoints that aren't reconfigured, after clearing any other state * in the endpoint's i/o queue. * * This routine may be called in interrupt context. * * Returns zero, or a negative error code. On success, this call clears * the underlying hardware state reflecting endpoint halt and data toggle. * Note that some hardware can't support this request (like pxa2xx_udc), * and accordingly can't correctly implement interface altsettings. */ int usb_ep_clear_halt(struct usb_ep *ep) { int ret; ret = ep->ops->set_halt(ep, 0); trace_usb_ep_clear_halt(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_clear_halt); /** * usb_ep_set_wedge - sets the halt feature and ignores clear requests * @ep: the endpoint being wedged * * Use this to stall an endpoint and ignore CLEAR_FEATURE(HALT_ENDPOINT) * requests. If the gadget driver clears the halt status, it will * automatically unwedge the endpoint. * * This routine may be called in interrupt context. * * Returns zero on success, else negative errno. */ int usb_ep_set_wedge(struct usb_ep *ep) { int ret; if (ep->ops->set_wedge) ret = ep->ops->set_wedge(ep); else ret = ep->ops->set_halt(ep, 1); trace_usb_ep_set_wedge(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_set_wedge); /** * usb_ep_fifo_status - returns number of bytes in fifo, or error * @ep: the endpoint whose fifo status is being checked. * * FIFO endpoints may have "unclaimed data" in them in certain cases, * such as after aborted transfers. Hosts may not have collected all * the IN data written by the gadget driver (and reported by a request * completion). The gadget driver may not have collected all the data * written OUT to it by the host. Drivers that need precise handling for * fault reporting or recovery may need to use this call. * * This routine may be called in interrupt context. * * This returns the number of such bytes in the fifo, or a negative * errno if the endpoint doesn't use a FIFO or doesn't support such * precise handling. 
*/ int usb_ep_fifo_status(struct usb_ep *ep) { int ret; if (ep->ops->fifo_status) ret = ep->ops->fifo_status(ep); else ret = -EOPNOTSUPP; trace_usb_ep_fifo_status(ep, ret); return ret; } EXPORT_SYMBOL_GPL(usb_ep_fifo_status); /** * usb_ep_fifo_flush - flushes contents of a fifo * @ep: the endpoint whose fifo is being flushed. * * This call may be used to flush the "unclaimed data" that may exist in * an endpoint fifo after abnormal transaction terminations. The call * must never be used except when endpoint is not being used for any * protocol translation. * * This routine may be called in interrupt context. */ void usb_ep_fifo_flush(struct usb_ep *ep) { if (ep->ops->fifo_flush) ep->ops->fifo_flush(ep); trace_usb_ep_fifo_flush(ep, 0); } EXPORT_SYMBOL_GPL(usb_ep_fifo_flush); /* ------------------------------------------------------------------------- */ /** * usb_gadget_frame_number - returns the current frame number * @gadget: controller that reports the frame number * * Returns the usb frame number, normally eleven bits from a SOF packet, * or negative errno if this device doesn't support this capability. */ int usb_gadget_frame_number(struct usb_gadget *gadget) { int ret; ret = gadget->ops->get_frame(gadget); trace_usb_gadget_frame_number(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_frame_number); /** * usb_gadget_wakeup - tries to wake up the host connected to this gadget * @gadget: controller used to wake up the host * * Returns zero on success, else negative error code if the hardware * doesn't support such attempts, or its support has not been enabled * by the usb host. Drivers must return device descriptors that report * their ability to support this, or hosts won't enable it. * * This may also try to use SRP to wake the host and start enumeration, * even if OTG isn't otherwise in use. OTG devices may also start * remote wakeup even when hosts don't explicitly enable it. */ int usb_gadget_wakeup(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->wakeup) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->wakeup(gadget); out: trace_usb_gadget_wakeup(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_wakeup); /** * usb_gadget_set_remote_wakeup - configures the device remote wakeup feature. * @gadget:the device being configured for remote wakeup * @set:value to be configured. * * set to one to enable remote wakeup feature and zero to disable it. * * returns zero on success, else negative errno. */ int usb_gadget_set_remote_wakeup(struct usb_gadget *gadget, int set) { int ret = 0; if (!gadget->ops->set_remote_wakeup) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->set_remote_wakeup(gadget, set); out: trace_usb_gadget_set_remote_wakeup(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_set_remote_wakeup); /** * usb_gadget_set_selfpowered - sets the device selfpowered feature. * @gadget:the device being declared as self-powered * * this affects the device status reported by the hardware driver * to reflect that it now has a local power supply. * * returns zero on success, else negative errno. */ int usb_gadget_set_selfpowered(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->set_selfpowered) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->set_selfpowered(gadget, 1); out: trace_usb_gadget_set_selfpowered(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_set_selfpowered); /** * usb_gadget_clear_selfpowered - clear the device selfpowered feature. 
* @gadget:the device being declared as bus-powered * * this affects the device status reported by the hardware driver. * some hardware may not support bus-powered operation, in which * case this feature's value can never change. * * returns zero on success, else negative errno. */ int usb_gadget_clear_selfpowered(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->set_selfpowered) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->set_selfpowered(gadget, 0); out: trace_usb_gadget_clear_selfpowered(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_clear_selfpowered); /** * usb_gadget_vbus_connect - Notify controller that VBUS is powered * @gadget:The device which now has VBUS power. * Context: can sleep * * This call is used by a driver for an external transceiver (or GPIO) * that detects a VBUS power session starting. Common responses include * resuming the controller, activating the D+ (or D-) pullup to let the * host detect that a USB device is attached, and starting to draw power * (8mA or possibly more, especially after SET_CONFIGURATION). * * Returns zero on success, else negative errno. */ int usb_gadget_vbus_connect(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->vbus_session) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->vbus_session(gadget, 1); out: trace_usb_gadget_vbus_connect(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_vbus_connect); /** * usb_gadget_vbus_draw - constrain controller's VBUS power usage * @gadget:The device whose VBUS usage is being described * @mA:How much current to draw, in milliAmperes. This should be twice * the value listed in the configuration descriptor bMaxPower field. * * This call is used by gadget drivers during SET_CONFIGURATION calls, * reporting how much power the device may consume. For example, this * could affect how quickly batteries are recharged. * * Returns zero on success, else negative errno. */ int usb_gadget_vbus_draw(struct usb_gadget *gadget, unsigned mA) { int ret = 0; if (!gadget->ops->vbus_draw) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->vbus_draw(gadget, mA); if (!ret) gadget->mA = mA; out: trace_usb_gadget_vbus_draw(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_vbus_draw); /** * usb_gadget_vbus_disconnect - notify controller about VBUS session end * @gadget:the device whose VBUS supply is being described * Context: can sleep * * This call is used by a driver for an external transceiver (or GPIO) * that detects a VBUS power session ending. Common responses include * reversing everything done in usb_gadget_vbus_connect(). * * Returns zero on success, else negative errno. */ int usb_gadget_vbus_disconnect(struct usb_gadget *gadget) { int ret = 0; if (!gadget->ops->vbus_session) { ret = -EOPNOTSUPP; goto out; } ret = gadget->ops->vbus_session(gadget, 0); out: trace_usb_gadget_vbus_disconnect(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_vbus_disconnect); static int usb_gadget_connect_locked(struct usb_gadget *gadget) __must_hold(&gadget->udc->connect_lock) { int ret = 0; if (!gadget->ops->pullup) { ret = -EOPNOTSUPP; goto out; } if (gadget->deactivated || !gadget->udc->allow_connect || !gadget->udc->started) { /* * If the gadget isn't usable (because it is deactivated, * unbound, or not yet started), we only save the new state. * The gadget will be connected automatically when it is * activated/bound/started. 
*/ gadget->connected = true; goto out; } ret = gadget->ops->pullup(gadget, 1); if (!ret) gadget->connected = 1; out: trace_usb_gadget_connect(gadget, ret); return ret; } /** * usb_gadget_connect - software-controlled connect to USB host * @gadget:the peripheral being connected * * Enables the D+ (or potentially D-) pullup. The host will start * enumerating this gadget when the pullup is active and a VBUS session * is active (the link is powered). * * Returns zero on success, else negative errno. */ int usb_gadget_connect(struct usb_gadget *gadget) { int ret; mutex_lock(&gadget->udc->connect_lock); ret = usb_gadget_connect_locked(gadget); mutex_unlock(&gadget->udc->connect_lock); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_connect); static int usb_gadget_disconnect_locked(struct usb_gadget *gadget) __must_hold(&gadget->udc->connect_lock) { int ret = 0; if (!gadget->ops->pullup) { ret = -EOPNOTSUPP; goto out; } if (!gadget->connected) goto out; if (gadget->deactivated || !gadget->udc->started) { /* * If gadget is deactivated we only save new state. * Gadget will stay disconnected after activation. */ gadget->connected = false; goto out; } ret = gadget->ops->pullup(gadget, 0); if (!ret) gadget->connected = 0; mutex_lock(&udc_lock); if (gadget->udc->driver) gadget->udc->driver->disconnect(gadget); mutex_unlock(&udc_lock); out: trace_usb_gadget_disconnect(gadget, ret); return ret; } /** * usb_gadget_disconnect - software-controlled disconnect from USB host * @gadget:the peripheral being disconnected * * Disables the D+ (or potentially D-) pullup, which the host may see * as a disconnect (when a VBUS session is active). Not all systems * support software pullup controls. * * Following a successful disconnect, invoke the ->disconnect() callback * for the current gadget driver so that UDC drivers don't need to. * * Returns zero on success, else negative errno. */ int usb_gadget_disconnect(struct usb_gadget *gadget) { int ret; mutex_lock(&gadget->udc->connect_lock); ret = usb_gadget_disconnect_locked(gadget); mutex_unlock(&gadget->udc->connect_lock); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_disconnect); /** * usb_gadget_deactivate - deactivate function which is not ready to work * @gadget: the peripheral being deactivated * * This routine may be used during the gadget driver bind() call to prevent * the peripheral from ever being visible to the USB host, unless later * usb_gadget_activate() is called. For example, user mode components may * need to be activated before the system can talk to hosts. * * This routine may sleep; it must not be called in interrupt context * (such as from within a gadget driver's disconnect() callback). * * Returns zero on success, else negative errno. */ int usb_gadget_deactivate(struct usb_gadget *gadget) { int ret = 0; mutex_lock(&gadget->udc->connect_lock); if (gadget->deactivated) goto unlock; if (gadget->connected) { ret = usb_gadget_disconnect_locked(gadget); if (ret) goto unlock; /* * If gadget was being connected before deactivation, we want * to reconnect it in usb_gadget_activate(). */ gadget->connected = true; } gadget->deactivated = true; unlock: mutex_unlock(&gadget->udc->connect_lock); trace_usb_gadget_deactivate(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_deactivate); /** * usb_gadget_activate - activate function which is not ready to work * @gadget: the peripheral being activated * * This routine activates gadget which was previously deactivated with * usb_gadget_deactivate() call. It calls usb_gadget_connect() if needed. 
* * This routine may sleep; it must not be called in interrupt context. * * Returns zero on success, else negative errno. */ int usb_gadget_activate(struct usb_gadget *gadget) { int ret = 0; mutex_lock(&gadget->udc->connect_lock); if (!gadget->deactivated) goto unlock; gadget->deactivated = false; /* * If gadget has been connected before deactivation, or became connected * while it was being deactivated, we call usb_gadget_connect(). */ if (gadget->connected) ret = usb_gadget_connect_locked(gadget); unlock: mutex_unlock(&gadget->udc->connect_lock); trace_usb_gadget_activate(gadget, ret); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_activate); /* ------------------------------------------------------------------------- */ #ifdef CONFIG_HAS_DMA int usb_gadget_map_request_by_dev(struct device *dev, struct usb_request *req, int is_in) { if (req->length == 0) return 0; if (req->sg_was_mapped) { req->num_mapped_sgs = req->num_sgs; return 0; } if (req->num_sgs) { int mapped; mapped = dma_map_sg(dev, req->sg, req->num_sgs, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); if (mapped == 0) { dev_err(dev, "failed to map SGs\n"); return -EFAULT; } req->num_mapped_sgs = mapped; } else { if (is_vmalloc_addr(req->buf)) { dev_err(dev, "buffer is not dma capable\n"); return -EFAULT; } else if (object_is_on_stack(req->buf)) { dev_err(dev, "buffer is on stack\n"); return -EFAULT; } req->dma = dma_map_single(dev, req->buf, req->length, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); if (dma_mapping_error(dev, req->dma)) { dev_err(dev, "failed to map buffer\n"); return -EFAULT; } req->dma_mapped = 1; } return 0; } EXPORT_SYMBOL_GPL(usb_gadget_map_request_by_dev); int usb_gadget_map_request(struct usb_gadget *gadget, struct usb_request *req, int is_in) { return usb_gadget_map_request_by_dev(gadget->dev.parent, req, is_in); } EXPORT_SYMBOL_GPL(usb_gadget_map_request); void usb_gadget_unmap_request_by_dev(struct device *dev, struct usb_request *req, int is_in) { if (req->length == 0 || req->sg_was_mapped) return; if (req->num_mapped_sgs) { dma_unmap_sg(dev, req->sg, req->num_sgs, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); req->num_mapped_sgs = 0; } else if (req->dma_mapped) { dma_unmap_single(dev, req->dma, req->length, is_in ? DMA_TO_DEVICE : DMA_FROM_DEVICE); req->dma_mapped = 0; } } EXPORT_SYMBOL_GPL(usb_gadget_unmap_request_by_dev); void usb_gadget_unmap_request(struct usb_gadget *gadget, struct usb_request *req, int is_in) { usb_gadget_unmap_request_by_dev(gadget->dev.parent, req, is_in); } EXPORT_SYMBOL_GPL(usb_gadget_unmap_request); #endif /* CONFIG_HAS_DMA */ /* ------------------------------------------------------------------------- */ /** * usb_gadget_giveback_request - give the request back to the gadget layer * @ep: the endpoint to be used with with the request * @req: the request being given back * * This is called by device controller drivers in order to return the * completed request back to the gadget layer. 
*/ void usb_gadget_giveback_request(struct usb_ep *ep, struct usb_request *req) { if (likely(req->status == 0)) usb_led_activity(USB_LED_EVENT_GADGET); trace_usb_gadget_giveback_request(ep, req, 0); req->complete(ep, req); } EXPORT_SYMBOL_GPL(usb_gadget_giveback_request); /* ------------------------------------------------------------------------- */ /** * gadget_find_ep_by_name - returns ep whose name is the same as sting passed * in second parameter or NULL if searched endpoint not found * @g: controller to check for quirk * @name: name of searched endpoint */ struct usb_ep *gadget_find_ep_by_name(struct usb_gadget *g, const char *name) { struct usb_ep *ep; gadget_for_each_ep(ep, g) { if (!strcmp(ep->name, name)) return ep; } return NULL; } EXPORT_SYMBOL_GPL(gadget_find_ep_by_name); /* ------------------------------------------------------------------------- */ int usb_gadget_ep_match_desc(struct usb_gadget *gadget, struct usb_ep *ep, struct usb_endpoint_descriptor *desc, struct usb_ss_ep_comp_descriptor *ep_comp) { u8 type; u16 max; int num_req_streams = 0; /* endpoint already claimed? */ if (ep->claimed) return 0; type = usb_endpoint_type(desc); max = usb_endpoint_maxp(desc); if (usb_endpoint_dir_in(desc) && !ep->caps.dir_in) return 0; if (usb_endpoint_dir_out(desc) && !ep->caps.dir_out) return 0; if (max > ep->maxpacket_limit) return 0; /* "high bandwidth" works only at high speed */ if (!gadget_is_dualspeed(gadget) && usb_endpoint_maxp_mult(desc) > 1) return 0; switch (type) { case USB_ENDPOINT_XFER_CONTROL: /* only support ep0 for portable CONTROL traffic */ return 0; case USB_ENDPOINT_XFER_ISOC: if (!ep->caps.type_iso) return 0; /* ISO: limit 1023 bytes full speed, 1024 high/super speed */ if (!gadget_is_dualspeed(gadget) && max > 1023) return 0; break; case USB_ENDPOINT_XFER_BULK: if (!ep->caps.type_bulk) return 0; if (ep_comp && gadget_is_superspeed(gadget)) { /* Get the number of required streams from the * EP companion descriptor and see if the EP * matches it */ num_req_streams = ep_comp->bmAttributes & 0x1f; if (num_req_streams > ep->max_streams) return 0; } break; case USB_ENDPOINT_XFER_INT: /* Bulk endpoints handle interrupt transfers, * except the toggle-quirky iso-synch kind */ if (!ep->caps.type_int && !ep->caps.type_bulk) return 0; /* INT: limit 64 bytes full speed, 1024 high/super speed */ if (!gadget_is_dualspeed(gadget) && max > 64) return 0; break; } return 1; } EXPORT_SYMBOL_GPL(usb_gadget_ep_match_desc); /** * usb_gadget_check_config - checks if the UDC can support the binded * configuration * @gadget: controller to check the USB configuration * * Ensure that a UDC is able to support the requested resources by a * configuration, and that there are no resource limitations, such as * internal memory allocated to all requested endpoints. * * Returns zero on success, else a negative errno. 
*/ int usb_gadget_check_config(struct usb_gadget *gadget) { if (gadget->ops->check_config) return gadget->ops->check_config(gadget); return 0; } EXPORT_SYMBOL_GPL(usb_gadget_check_config); /* ------------------------------------------------------------------------- */ static void usb_gadget_state_work(struct work_struct *work) { struct usb_gadget *gadget = work_to_gadget(work); struct usb_udc *udc = gadget->udc; if (udc) sysfs_notify(&udc->dev.kobj, NULL, "state"); } void usb_gadget_set_state(struct usb_gadget *gadget, enum usb_device_state state) { gadget->state = state; schedule_work(&gadget->work); } EXPORT_SYMBOL_GPL(usb_gadget_set_state); /* ------------------------------------------------------------------------- */ /* Acquire connect_lock before calling this function. */ static int usb_udc_connect_control_locked(struct usb_udc *udc) __must_hold(&udc->connect_lock) { if (udc->vbus) return usb_gadget_connect_locked(udc->gadget); else return usb_gadget_disconnect_locked(udc->gadget); } static void vbus_event_work(struct work_struct *work) { struct usb_udc *udc = container_of(work, struct usb_udc, vbus_work); mutex_lock(&udc->connect_lock); usb_udc_connect_control_locked(udc); mutex_unlock(&udc->connect_lock); } /** * usb_udc_vbus_handler - updates the udc core vbus status, and try to * connect or disconnect gadget * @gadget: The gadget which vbus change occurs * @status: The vbus status * * The udc driver calls it when it wants to connect or disconnect gadget * according to vbus status. * * This function can be invoked from interrupt context by irq handlers of * the gadget drivers, however, usb_udc_connect_control() has to run in * non-atomic context due to the following: * a. Some of the gadget driver implementations expect the ->pullup * callback to be invoked in non-atomic context. * b. usb_gadget_disconnect() acquires udc_lock which is a mutex. * Hence offload invocation of usb_udc_connect_control() to workqueue. */ void usb_udc_vbus_handler(struct usb_gadget *gadget, bool status) { struct usb_udc *udc = gadget->udc; if (udc) { udc->vbus = status; schedule_work(&udc->vbus_work); } } EXPORT_SYMBOL_GPL(usb_udc_vbus_handler); /** * usb_gadget_udc_reset - notifies the udc core that bus reset occurs * @gadget: The gadget which bus reset occurs * @driver: The gadget driver we want to notify * * If the udc driver has bus reset handler, it needs to call this when the bus * reset occurs, it notifies the gadget driver that the bus reset occurs as * well as updates gadget state. */ void usb_gadget_udc_reset(struct usb_gadget *gadget, struct usb_gadget_driver *driver) { driver->reset(gadget); usb_gadget_set_state(gadget, USB_STATE_DEFAULT); } EXPORT_SYMBOL_GPL(usb_gadget_udc_reset); /** * usb_gadget_udc_start_locked - tells usb device controller to start up * @udc: The UDC to be started * * This call is issued by the UDC Class driver when it's about * to register a gadget driver to the device controller, before * calling gadget driver's bind() method. * * It allows the controller to be powered off until strictly * necessary to have it powered on. * * Returns zero on success, else negative errno. * * Caller should acquire connect_lock before invoking this function. 
*/ static inline int usb_gadget_udc_start_locked(struct usb_udc *udc) __must_hold(&udc->connect_lock) { int ret; if (udc->started) { dev_err(&udc->dev, "UDC had already started\n"); return -EBUSY; } ret = udc->gadget->ops->udc_start(udc->gadget, udc->driver); if (!ret) udc->started = true; return ret; } /** * usb_gadget_udc_stop_locked - tells usb device controller we don't need it anymore * @udc: The UDC to be stopped * * This call is issued by the UDC Class driver after calling * gadget driver's unbind() method. * * The details are implementation specific, but it can go as * far as powering off UDC completely and disable its data * line pullups. * * Caller should acquire connect lock before invoking this function. */ static inline void usb_gadget_udc_stop_locked(struct usb_udc *udc) __must_hold(&udc->connect_lock) { if (!udc->started) { dev_err(&udc->dev, "UDC had already stopped\n"); return; } udc->gadget->ops->udc_stop(udc->gadget); udc->started = false; } /** * usb_gadget_udc_set_speed - tells usb device controller speed supported by * current driver * @udc: The device we want to set maximum speed * @speed: The maximum speed to allowed to run * * This call is issued by the UDC Class driver before calling * usb_gadget_udc_start() in order to make sure that we don't try to * connect on speeds the gadget driver doesn't support. */ static inline void usb_gadget_udc_set_speed(struct usb_udc *udc, enum usb_device_speed speed) { struct usb_gadget *gadget = udc->gadget; enum usb_device_speed s; if (speed == USB_SPEED_UNKNOWN) s = gadget->max_speed; else s = min(speed, gadget->max_speed); if (s == USB_SPEED_SUPER_PLUS && gadget->ops->udc_set_ssp_rate) gadget->ops->udc_set_ssp_rate(gadget, gadget->max_ssp_rate); else if (gadget->ops->udc_set_speed) gadget->ops->udc_set_speed(gadget, s); } /** * usb_gadget_enable_async_callbacks - tell usb device controller to enable asynchronous callbacks * @udc: The UDC which should enable async callbacks * * This routine is used when binding gadget drivers. It undoes the effect * of usb_gadget_disable_async_callbacks(); the UDC driver should enable IRQs * (if necessary) and resume issuing callbacks. * * This routine will always be called in process context. */ static inline void usb_gadget_enable_async_callbacks(struct usb_udc *udc) { struct usb_gadget *gadget = udc->gadget; if (gadget->ops->udc_async_callbacks) gadget->ops->udc_async_callbacks(gadget, true); } /** * usb_gadget_disable_async_callbacks - tell usb device controller to disable asynchronous callbacks * @udc: The UDC which should disable async callbacks * * This routine is used when unbinding gadget drivers. It prevents a race: * The UDC driver doesn't know when the gadget driver's ->unbind callback * runs, so unless it is told to disable asynchronous callbacks, it might * issue a callback (such as ->disconnect) after the unbind has completed. * * After this function runs, the UDC driver must suppress all ->suspend, * ->resume, ->disconnect, ->reset, and ->setup callbacks to the gadget driver * until async callbacks are again enabled. A simple-minded but effective * way to accomplish this is to tell the UDC hardware not to generate any * more IRQs. * * Request completion callbacks must still be issued. However, it's okay * to defer them until the request is cancelled, since the pull-up will be * turned off during the time period when async callbacks are disabled. * * This routine will always be called in process context. 
*/ static inline void usb_gadget_disable_async_callbacks(struct usb_udc *udc) { struct usb_gadget *gadget = udc->gadget; if (gadget->ops->udc_async_callbacks) gadget->ops->udc_async_callbacks(gadget, false); } /** * usb_udc_release - release the usb_udc struct * @dev: the dev member within usb_udc * * This is called by driver's core in order to free memory once the last * reference is released. */ static void usb_udc_release(struct device *dev) { struct usb_udc *udc; udc = container_of(dev, struct usb_udc, dev); dev_dbg(dev, "releasing '%s'\n", dev_name(dev)); kfree(udc); } static const struct attribute_group *usb_udc_attr_groups[]; static void usb_udc_nop_release(struct device *dev) { dev_vdbg(dev, "%s\n", __func__); } /** * usb_initialize_gadget - initialize a gadget and its embedded struct device * @parent: the parent device to this udc. Usually the controller driver's * device. * @gadget: the gadget to be initialized. * @release: a gadget release function. */ void usb_initialize_gadget(struct device *parent, struct usb_gadget *gadget, void (*release)(struct device *dev)) { INIT_WORK(&gadget->work, usb_gadget_state_work); gadget->dev.parent = parent; if (release) gadget->dev.release = release; else gadget->dev.release = usb_udc_nop_release; device_initialize(&gadget->dev); gadget->dev.bus = &gadget_bus_type; } EXPORT_SYMBOL_GPL(usb_initialize_gadget); /** * usb_add_gadget - adds a new gadget to the udc class driver list * @gadget: the gadget to be added to the list. * * Returns zero on success, negative errno otherwise. * Does not do a final usb_put_gadget() if an error occurs. */ int usb_add_gadget(struct usb_gadget *gadget) { struct usb_udc *udc; int ret = -ENOMEM; udc = kzalloc(sizeof(*udc), GFP_KERNEL); if (!udc) goto error; device_initialize(&udc->dev); udc->dev.release = usb_udc_release; udc->dev.class = &udc_class; udc->dev.groups = usb_udc_attr_groups; udc->dev.parent = gadget->dev.parent; ret = dev_set_name(&udc->dev, "%s", kobject_name(&gadget->dev.parent->kobj)); if (ret) goto err_put_udc; udc->gadget = gadget; gadget->udc = udc; mutex_init(&udc->connect_lock); udc->started = false; mutex_lock(&udc_lock); list_add_tail(&udc->list, &udc_list); mutex_unlock(&udc_lock); INIT_WORK(&udc->vbus_work, vbus_event_work); ret = device_add(&udc->dev); if (ret) goto err_unlist_udc; usb_gadget_set_state(gadget, USB_STATE_NOTATTACHED); udc->vbus = true; ret = ida_alloc(&gadget_id_numbers, GFP_KERNEL); if (ret < 0) goto err_del_udc; gadget->id_number = ret; dev_set_name(&gadget->dev, "gadget.%d", ret); ret = device_add(&gadget->dev); if (ret) goto err_free_id; ret = sysfs_create_link(&udc->dev.kobj, &gadget->dev.kobj, "gadget"); if (ret) goto err_del_gadget; return 0; err_del_gadget: device_del(&gadget->dev); err_free_id: ida_free(&gadget_id_numbers, gadget->id_number); err_del_udc: flush_work(&gadget->work); device_del(&udc->dev); err_unlist_udc: mutex_lock(&udc_lock); list_del(&udc->list); mutex_unlock(&udc_lock); err_put_udc: put_device(&udc->dev); error: return ret; } EXPORT_SYMBOL_GPL(usb_add_gadget); /** * usb_add_gadget_udc_release - adds a new gadget to the udc class driver list * @parent: the parent device to this udc. Usually the controller driver's * device. * @gadget: the gadget to be added to the list. * @release: a gadget release function. * * Returns zero on success, negative errno otherwise. * Calls the gadget release function in the latter case. 
*/ int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget, void (*release)(struct device *dev)) { int ret; usb_initialize_gadget(parent, gadget, release); ret = usb_add_gadget(gadget); if (ret) usb_put_gadget(gadget); return ret; } EXPORT_SYMBOL_GPL(usb_add_gadget_udc_release); /** * usb_get_gadget_udc_name - get the name of the first UDC controller * This functions returns the name of the first UDC controller in the system. * Please note that this interface is usefull only for legacy drivers which * assume that there is only one UDC controller in the system and they need to * get its name before initialization. There is no guarantee that the UDC * of the returned name will be still available, when gadget driver registers * itself. * * Returns pointer to string with UDC controller name on success, NULL * otherwise. Caller should kfree() returned string. */ char *usb_get_gadget_udc_name(void) { struct usb_udc *udc; char *name = NULL; /* For now we take the first available UDC */ mutex_lock(&udc_lock); list_for_each_entry(udc, &udc_list, list) { if (!udc->driver) { name = kstrdup(udc->gadget->name, GFP_KERNEL); break; } } mutex_unlock(&udc_lock); return name; } EXPORT_SYMBOL_GPL(usb_get_gadget_udc_name); /** * usb_add_gadget_udc - adds a new gadget to the udc class driver list * @parent: the parent device to this udc. Usually the controller * driver's device. * @gadget: the gadget to be added to the list * * Returns zero on success, negative errno otherwise. */ int usb_add_gadget_udc(struct device *parent, struct usb_gadget *gadget) { return usb_add_gadget_udc_release(parent, gadget, NULL); } EXPORT_SYMBOL_GPL(usb_add_gadget_udc); /** * usb_del_gadget - deletes a gadget and unregisters its udc * @gadget: the gadget to be deleted. * * This will unbind @gadget, if it is bound. * It will not do a final usb_put_gadget(). */ void usb_del_gadget(struct usb_gadget *gadget) { struct usb_udc *udc = gadget->udc; if (!udc) return; dev_vdbg(gadget->dev.parent, "unregistering gadget\n"); mutex_lock(&udc_lock); list_del(&udc->list); mutex_unlock(&udc_lock); kobject_uevent(&udc->dev.kobj, KOBJ_REMOVE); sysfs_remove_link(&udc->dev.kobj, "gadget"); device_del(&gadget->dev); flush_work(&gadget->work); ida_free(&gadget_id_numbers, gadget->id_number); cancel_work_sync(&udc->vbus_work); device_unregister(&udc->dev); } EXPORT_SYMBOL_GPL(usb_del_gadget); /** * usb_del_gadget_udc - unregisters a gadget * @gadget: the gadget to be unregistered. * * Calls usb_del_gadget() and does a final usb_put_gadget(). 
*/ void usb_del_gadget_udc(struct usb_gadget *gadget) { usb_del_gadget(gadget); usb_put_gadget(gadget); } EXPORT_SYMBOL_GPL(usb_del_gadget_udc); /* ------------------------------------------------------------------------- */ static int gadget_match_driver(struct device *dev, const struct device_driver *drv) { struct usb_gadget *gadget = dev_to_usb_gadget(dev); struct usb_udc *udc = gadget->udc; const struct usb_gadget_driver *driver = container_of(drv, struct usb_gadget_driver, driver); /* If the driver specifies a udc_name, it must match the UDC's name */ if (driver->udc_name && strcmp(driver->udc_name, dev_name(&udc->dev)) != 0) return 0; /* If the driver is already bound to a gadget, it doesn't match */ if (driver->is_bound) return 0; /* Otherwise any gadget driver matches any UDC */ return 1; } static int gadget_bind_driver(struct device *dev) { struct usb_gadget *gadget = dev_to_usb_gadget(dev); struct usb_udc *udc = gadget->udc; struct usb_gadget_driver *driver = container_of(dev->driver, struct usb_gadget_driver, driver); int ret = 0; mutex_lock(&udc_lock); if (driver->is_bound) { mutex_unlock(&udc_lock); return -ENXIO; /* Driver binds to only one gadget */ } driver->is_bound = true; udc->driver = driver; mutex_unlock(&udc_lock); dev_dbg(&udc->dev, "binding gadget driver [%s]\n", driver->function); usb_gadget_udc_set_speed(udc, driver->max_speed); ret = driver->bind(udc->gadget, driver); if (ret) goto err_bind; mutex_lock(&udc->connect_lock); ret = usb_gadget_udc_start_locked(udc); if (ret) { mutex_unlock(&udc->connect_lock); goto err_start; } usb_gadget_enable_async_callbacks(udc); udc->allow_connect = true; ret = usb_udc_connect_control_locked(udc); if (ret) goto err_connect_control; mutex_unlock(&udc->connect_lock); kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE); return 0; err_connect_control: udc->allow_connect = false; usb_gadget_disable_async_callbacks(udc); if (gadget->irq) synchronize_irq(gadget->irq); usb_gadget_udc_stop_locked(udc); mutex_unlock(&udc->connect_lock); err_start: driver->unbind(udc->gadget); err_bind: if (ret != -EISNAM) dev_err(&udc->dev, "failed to start %s: %d\n", driver->function, ret); mutex_lock(&udc_lock); udc->driver = NULL; driver->is_bound = false; mutex_unlock(&udc_lock); return ret; } static void gadget_unbind_driver(struct device *dev) { struct usb_gadget *gadget = dev_to_usb_gadget(dev); struct usb_udc *udc = gadget->udc; struct usb_gadget_driver *driver = udc->driver; dev_dbg(&udc->dev, "unbinding gadget driver [%s]\n", driver->function); udc->allow_connect = false; cancel_work_sync(&udc->vbus_work); mutex_lock(&udc->connect_lock); usb_gadget_disconnect_locked(gadget); usb_gadget_disable_async_callbacks(udc); if (gadget->irq) synchronize_irq(gadget->irq); mutex_unlock(&udc->connect_lock); udc->driver->unbind(gadget); mutex_lock(&udc->connect_lock); usb_gadget_udc_stop_locked(udc); mutex_unlock(&udc->connect_lock); mutex_lock(&udc_lock); driver->is_bound = false; udc->driver = NULL; mutex_unlock(&udc_lock); kobject_uevent(&udc->dev.kobj, KOBJ_CHANGE); } /* ------------------------------------------------------------------------- */ int usb_gadget_register_driver_owner(struct usb_gadget_driver *driver, struct module *owner, const char *mod_name) { int ret; if (!driver || !driver->bind || !driver->setup) return -EINVAL; driver->driver.bus = &gadget_bus_type; driver->driver.owner = owner; driver->driver.mod_name = mod_name; driver->driver.probe_type = PROBE_FORCE_SYNCHRONOUS; ret = driver_register(&driver->driver); if (ret) { pr_warn("%s: driver 
registration failed: %d\n", driver->function, ret); return ret; } mutex_lock(&udc_lock); if (!driver->is_bound) { if (driver->match_existing_only) { pr_warn("%s: couldn't find an available UDC or it's busy\n", driver->function); ret = -EBUSY; } else { pr_info("%s: couldn't find an available UDC\n", driver->function); ret = 0; } } mutex_unlock(&udc_lock); if (ret) driver_unregister(&driver->driver); return ret; } EXPORT_SYMBOL_GPL(usb_gadget_register_driver_owner); int usb_gadget_unregister_driver(struct usb_gadget_driver *driver) { if (!driver || !driver->unbind) return -EINVAL; driver_unregister(&driver->driver); return 0; } EXPORT_SYMBOL_GPL(usb_gadget_unregister_driver); /* ------------------------------------------------------------------------- */ static ssize_t srp_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { struct usb_udc *udc = container_of(dev, struct usb_udc, dev); if (sysfs_streq(buf, "1")) usb_gadget_wakeup(udc->gadget); return n; } static DEVICE_ATTR_WO(srp); static ssize_t soft_connect_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t n) { struct usb_udc *udc = container_of(dev, struct usb_udc, dev); ssize_t ret; device_lock(&udc->gadget->dev); if (!udc->driver) { dev_err(dev, "soft-connect without a gadget driver\n"); ret = -EOPNOTSUPP; goto out; } if (sysfs_streq(buf, "connect")) { mutex_lock(&udc->connect_lock); usb_gadget_udc_start_locked(udc); usb_gadget_connect_locked(udc->gadget); mutex_unlock(&udc->connect_lock); } else if (sysfs_streq(buf, "disconnect")) { mutex_lock(&udc->connect_lock); usb_gadget_disconnect_locked(udc->gadget); usb_gadget_udc_stop_locked(udc); mutex_unlock(&udc->connect_lock); } else { dev_err(dev, "unsupported command '%s'\n", buf); ret = -EINVAL; goto out; } ret = n; out: device_unlock(&udc->gadget->dev); return ret; } static DEVICE_ATTR_WO(soft_connect); static ssize_t state_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_udc *udc = container_of(dev, struct usb_udc, dev); struct usb_gadget *gadget = udc->gadget; return sprintf(buf, "%s\n", usb_state_string(gadget->state)); } static DEVICE_ATTR_RO(state); static ssize_t function_show(struct device *dev, struct device_attribute *attr, char *buf) { struct usb_udc *udc = container_of(dev, struct usb_udc, dev); struct usb_gadget_driver *drv; int rc = 0; mutex_lock(&udc_lock); drv = udc->driver; if (drv && drv->function) rc = scnprintf(buf, PAGE_SIZE, "%s\n", drv->function); mutex_unlock(&udc_lock); return rc; } static DEVICE_ATTR_RO(function); #define USB_UDC_SPEED_ATTR(name, param) \ ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct usb_udc *udc = container_of(dev, struct usb_udc, dev); \ return scnprintf(buf, PAGE_SIZE, "%s\n", \ usb_speed_string(udc->gadget->param)); \ } \ static DEVICE_ATTR_RO(name) static USB_UDC_SPEED_ATTR(current_speed, speed); static USB_UDC_SPEED_ATTR(maximum_speed, max_speed); #define USB_UDC_ATTR(name) \ ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct usb_udc *udc = container_of(dev, struct usb_udc, dev); \ struct usb_gadget *gadget = udc->gadget; \ \ return scnprintf(buf, PAGE_SIZE, "%d\n", gadget->name); \ } \ static DEVICE_ATTR_RO(name) static USB_UDC_ATTR(is_otg); static USB_UDC_ATTR(is_a_peripheral); static USB_UDC_ATTR(b_hnp_enable); static USB_UDC_ATTR(a_hnp_support); static USB_UDC_ATTR(a_alt_hnp_support); static USB_UDC_ATTR(is_selfpowered); static struct attribute 
*usb_udc_attrs[] = { &dev_attr_srp.attr, &dev_attr_soft_connect.attr, &dev_attr_state.attr, &dev_attr_function.attr, &dev_attr_current_speed.attr, &dev_attr_maximum_speed.attr, &dev_attr_is_otg.attr, &dev_attr_is_a_peripheral.attr, &dev_attr_b_hnp_enable.attr, &dev_attr_a_hnp_support.attr, &dev_attr_a_alt_hnp_support.attr, &dev_attr_is_selfpowered.attr, NULL, }; static const struct attribute_group usb_udc_attr_group = { .attrs = usb_udc_attrs, }; static const struct attribute_group *usb_udc_attr_groups[] = { &usb_udc_attr_group, NULL, }; static int usb_udc_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct usb_udc *udc = container_of(dev, struct usb_udc, dev); int ret; ret = add_uevent_var(env, "USB_UDC_NAME=%s", udc->gadget->name); if (ret) { dev_err(dev, "failed to add uevent USB_UDC_NAME\n"); return ret; } mutex_lock(&udc_lock); if (udc->driver) ret = add_uevent_var(env, "USB_UDC_DRIVER=%s", udc->driver->function); mutex_unlock(&udc_lock); if (ret) { dev_err(dev, "failed to add uevent USB_UDC_DRIVER\n"); return ret; } return 0; } static const struct class udc_class = { .name = "udc", .dev_uevent = usb_udc_uevent, }; static const struct bus_type gadget_bus_type = { .name = "gadget", .probe = gadget_bind_driver, .remove = gadget_unbind_driver, .match = gadget_match_driver, }; static int __init usb_udc_init(void) { int rc; rc = class_register(&udc_class); if (rc) return rc; rc = bus_register(&gadget_bus_type); if (rc) class_unregister(&udc_class); return rc; } subsys_initcall(usb_udc_init); static void __exit usb_udc_exit(void) { bus_unregister(&gadget_bus_type); class_unregister(&udc_class); } module_exit(usb_udc_exit); MODULE_DESCRIPTION("UDC Framework"); MODULE_AUTHOR("Felipe Balbi <balbi@ti.com>"); MODULE_LICENSE("GPL v2");
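/*
 * Illustrative sketch (not part of the original file): how a gadget/function
 * driver typically uses the usb_ep_* API documented above -- enable an
 * endpoint, allocate a request, queue it, and release everything from the
 * completion callback.  The "example_*" names are made up; the endpoint
 * pointer and its descriptor are assumed to have been obtained elsewhere
 * (e.g. during bind()), and error handling is reduced to the minimum.
 */
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/usb/gadget.h>

static void example_complete(struct usb_ep *ep, struct usb_request *req)
{
	/* req->status is 0 on success, or e.g. -ECONNRESET if dequeued */
	kfree(req->buf);
	usb_ep_free_request(ep, req);
}

static int example_queue_in(struct usb_ep *ep, const void *data, size_t len)
{
	struct usb_request *req;
	int ret;

	/* ep->desc must already describe the endpoint before enabling it */
	ret = usb_ep_enable(ep);
	if (ret)
		return ret;

	req = usb_ep_alloc_request(ep, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	req->buf = kmemdup(data, len, GFP_KERNEL);
	if (!req->buf) {
		usb_ep_free_request(ep, req);
		return -ENOMEM;
	}
	req->length = len;
	req->zero = 0;			/* no trailing ZLP for this transfer */
	req->complete = example_complete;

	/* on success, example_complete() will be called exactly once later */
	ret = usb_ep_queue(ep, req, GFP_KERNEL);
	if (ret) {
		kfree(req->buf);
		usb_ep_free_request(ep, req);
	}
	return ret;
}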
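/*
 * Illustrative sketch (not part of the original file): the smallest shape of
 * a gadget driver that the bus code above (gadget_match_driver() and
 * gadget_bind_driver()) will bind to a UDC.  Real gadget drivers are normally
 * built on the composite framework; the callback bodies here are stubs and
 * all "example_*" names are made up for illustration.
 */
#include <linux/module.h>
#include <linux/usb/ch9.h>
#include <linux/usb/gadget.h>

static int example_bind(struct usb_gadget *gadget,
			struct usb_gadget_driver *driver)
{
	/* claim endpoints, allocate the ep0 request, set up driver state */
	return 0;
}

static void example_unbind(struct usb_gadget *gadget)
{
	/* release everything acquired in example_bind() */
}

static int example_setup(struct usb_gadget *gadget,
			 const struct usb_ctrlrequest *ctrl)
{
	/* answer control requests; a negative return stalls ep0 */
	return -EOPNOTSUPP;
}

static void example_disconnect(struct usb_gadget *gadget)
{
	/* host dropped the session; quiesce the function */
}

static struct usb_gadget_driver example_gadget_driver = {
	.function	= "example",
	.max_speed	= USB_SPEED_HIGH,
	.bind		= example_bind,
	.unbind		= example_unbind,
	.setup		= example_setup,
	.disconnect	= example_disconnect,
	.driver		= {
		.name	= "example_gadget",
	},
};

static int __init example_init(void)
{
	/* succeeds even if no UDC is free yet, unless match_existing_only */
	return usb_gadget_register_driver(&example_gadget_driver);
}
module_init(example_init);

static void __exit example_exit(void)
{
	usb_gadget_unregister_driver(&example_gadget_driver);
}
module_exit(example_exit);

MODULE_LICENSE("GPL");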
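/*
 * Illustrative sketch (not part of the original file): the UDC-driver side of
 * the notification helpers above.  A controller driver reports VBUS and bus
 * reset events from its interrupt handler; usb_udc_vbus_handler() defers the
 * actual pullup change to the vbus_work item so that ->pullup() runs in
 * process context.  "example_udc", its register layout and the status bits
 * are made up for illustration.
 */
#include <linux/bits.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/usb/gadget.h>

struct example_udc {
	struct usb_gadget gadget;
	struct usb_gadget_driver *driver;	/* as passed to ->udc_start() */
	void __iomem *regs;
};

static irqreturn_t example_udc_irq(int irq, void *data)
{
	struct example_udc *udc = data;
	u32 status = readl(udc->regs);		/* hypothetical status register */

	if (status & BIT(0))	/* VBUS session started */
		usb_udc_vbus_handler(&udc->gadget, true);
	if (status & BIT(1))	/* VBUS session ended */
		usb_udc_vbus_handler(&udc->gadget, false);
	if (status & BIT(2))	/* bus reset seen on the wire */
		usb_gadget_udc_reset(&udc->gadget, udc->driver);

	return IRQ_HANDLED;
}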
// SPDX-License-Identifier: GPL-2.0
/*
 * devtmpfs - kernel-maintained tmpfs-based /dev
 *
 * Copyright (C) 2009, Kay Sievers <kay.sievers@vrfy.org>
 *
 * During bootup, before any driver core device is registered,
 * devtmpfs, a tmpfs-based filesystem, is created. Every driver-core
 * device which requests a device node will add a node in this
 * filesystem.
 * By default, all devices are named after the name of the device,
 * owned by root and have a default mode of 0600. Subsystems can
 * overwrite the default setting if needed.
*/ #define pr_fmt(fmt) "devtmpfs: " fmt #include <linux/kernel.h> #include <linux/syscalls.h> #include <linux/mount.h> #include <linux/device.h> #include <linux/blkdev.h> #include <linux/namei.h> #include <linux/fs.h> #include <linux/shmem_fs.h> #include <linux/ramfs.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/kthread.h> #include <linux/init_syscalls.h> #include <uapi/linux/mount.h> #include "base.h" #ifdef CONFIG_DEVTMPFS_SAFE #define DEVTMPFS_MFLAGS (MS_SILENT | MS_NOEXEC | MS_NOSUID) #else #define DEVTMPFS_MFLAGS (MS_SILENT) #endif static struct task_struct *thread; static int __initdata mount_dev = IS_ENABLED(CONFIG_DEVTMPFS_MOUNT); static DEFINE_SPINLOCK(req_lock); static struct req { struct req *next; struct completion done; int err; const char *name; umode_t mode; /* 0 => delete */ kuid_t uid; kgid_t gid; struct device *dev; } *requests; static int __init mount_param(char *str) { mount_dev = simple_strtoul(str, NULL, 0); return 1; } __setup("devtmpfs.mount=", mount_param); static struct vfsmount *mnt; static struct file_system_type internal_fs_type = { .name = "devtmpfs", #ifdef CONFIG_TMPFS .init_fs_context = shmem_init_fs_context, #else .init_fs_context = ramfs_init_fs_context, #endif .kill_sb = kill_litter_super, }; /* Simply take a ref on the existing mount */ static int devtmpfs_get_tree(struct fs_context *fc) { struct super_block *sb = mnt->mnt_sb; atomic_inc(&sb->s_active); down_write(&sb->s_umount); fc->root = dget(sb->s_root); return 0; } /* Ops are filled in during init depending on underlying shmem or ramfs type */ struct fs_context_operations devtmpfs_context_ops = {}; /* Call the underlying initialization and set to our ops */ static int devtmpfs_init_fs_context(struct fs_context *fc) { int ret; #ifdef CONFIG_TMPFS ret = shmem_init_fs_context(fc); #else ret = ramfs_init_fs_context(fc); #endif if (ret < 0) return ret; fc->ops = &devtmpfs_context_ops; return 0; } static struct file_system_type dev_fs_type = { .name = "devtmpfs", .init_fs_context = devtmpfs_init_fs_context, }; static int devtmpfs_submit_req(struct req *req, const char *tmp) { init_completion(&req->done); spin_lock(&req_lock); req->next = requests; requests = req; spin_unlock(&req_lock); wake_up_process(thread); wait_for_completion(&req->done); kfree(tmp); return req->err; } int devtmpfs_create_node(struct device *dev) { const char *tmp = NULL; struct req req; if (!thread) return 0; req.mode = 0; req.uid = GLOBAL_ROOT_UID; req.gid = GLOBAL_ROOT_GID; req.name = device_get_devnode(dev, &req.mode, &req.uid, &req.gid, &tmp); if (!req.name) return -ENOMEM; if (req.mode == 0) req.mode = 0600; if (is_blockdev(dev)) req.mode |= S_IFBLK; else req.mode |= S_IFCHR; req.dev = dev; return devtmpfs_submit_req(&req, tmp); } int devtmpfs_delete_node(struct device *dev) { const char *tmp = NULL; struct req req; if (!thread) return 0; req.name = device_get_devnode(dev, NULL, NULL, NULL, &tmp); if (!req.name) return -ENOMEM; req.mode = 0; req.dev = dev; return devtmpfs_submit_req(&req, tmp); } static int dev_mkdir(const char *name, umode_t mode) { struct dentry *dentry; struct path path; dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode); if (!IS_ERR(dentry)) /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; done_path_create(&path, dentry); return PTR_ERR_OR_ZERO(dentry); } static int create_path(const char *nodepath) { char *path; char *s; int err = 
0; /* parent directories do not exist, create them */ path = kstrdup(nodepath, GFP_KERNEL); if (!path) return -ENOMEM; s = path; for (;;) { s = strchr(s, '/'); if (!s) break; s[0] = '\0'; err = dev_mkdir(path, 0755); if (err && err != -EEXIST) break; s[0] = '/'; s++; } kfree(path); return err; } static int handle_create(const char *nodename, umode_t mode, kuid_t uid, kgid_t gid, struct device *dev) { struct dentry *dentry; struct path path; int err; dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); if (dentry == ERR_PTR(-ENOENT)) { create_path(nodename); dentry = kern_path_create(AT_FDCWD, nodename, &path, 0); } if (IS_ERR(dentry)) return PTR_ERR(dentry); err = vfs_mknod(&nop_mnt_idmap, d_inode(path.dentry), dentry, mode, dev->devt); if (!err) { struct iattr newattrs; newattrs.ia_mode = mode; newattrs.ia_uid = uid; newattrs.ia_gid = gid; newattrs.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID; inode_lock(d_inode(dentry)); notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL); inode_unlock(d_inode(dentry)); /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; } done_path_create(&path, dentry); return err; } static int dev_rmdir(const char *name) { struct path parent; struct dentry *dentry; int err; dentry = kern_path_locked(name, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (d_inode(dentry)->i_private == &thread) err = vfs_rmdir(&nop_mnt_idmap, d_inode(parent.dentry), dentry); else err = -EPERM; dput(dentry); inode_unlock(d_inode(parent.dentry)); path_put(&parent); return err; } static int delete_path(const char *nodepath) { char *path; int err = 0; path = kstrdup(nodepath, GFP_KERNEL); if (!path) return -ENOMEM; for (;;) { char *base; base = strrchr(path, '/'); if (!base) break; base[0] = '\0'; err = dev_rmdir(path); if (err) break; } kfree(path); return err; } static int dev_mynode(struct device *dev, struct inode *inode) { /* did we create it */ if (inode->i_private != &thread) return 0; /* does the dev_t match */ if (is_blockdev(dev)) { if (!S_ISBLK(inode->i_mode)) return 0; } else { if (!S_ISCHR(inode->i_mode)) return 0; } if (inode->i_rdev != dev->devt) return 0; /* ours */ return 1; } static int handle_remove(const char *nodename, struct device *dev) { struct path parent; struct dentry *dentry; struct inode *inode; int deleted = 0; int err = 0; dentry = kern_path_locked(nodename, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); inode = d_inode(dentry); if (dev_mynode(dev, inode)) { struct iattr newattrs; /* * before unlinking this node, reset permissions * of possible references like hardlinks */ newattrs.ia_uid = GLOBAL_ROOT_UID; newattrs.ia_gid = GLOBAL_ROOT_GID; newattrs.ia_mode = inode->i_mode & ~0777; newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; inode_lock(d_inode(dentry)); notify_change(&nop_mnt_idmap, dentry, &newattrs, NULL); inode_unlock(d_inode(dentry)); err = vfs_unlink(&nop_mnt_idmap, d_inode(parent.dentry), dentry, NULL); if (!err || err == -ENOENT) deleted = 1; } dput(dentry); inode_unlock(d_inode(parent.dentry)); path_put(&parent); if (deleted && strchr(nodename, '/')) delete_path(nodename); return err; } /* * If configured, or requested by the commandline, devtmpfs will be * auto-mounted after the kernel mounted the root filesystem. 
*/ int __init devtmpfs_mount(void) { int err; if (!mount_dev) return 0; if (!thread) return 0; err = init_mount("devtmpfs", "dev", "devtmpfs", DEVTMPFS_MFLAGS, NULL); if (err) pr_info("error mounting %d\n", err); else pr_info("mounted\n"); return err; } static __initdata DECLARE_COMPLETION(setup_done); static int handle(const char *name, umode_t mode, kuid_t uid, kgid_t gid, struct device *dev) { if (mode) return handle_create(name, mode, uid, gid, dev); else return handle_remove(name, dev); } static void __noreturn devtmpfs_work_loop(void) { while (1) { spin_lock(&req_lock); while (requests) { struct req *req = requests; requests = NULL; spin_unlock(&req_lock); while (req) { struct req *next = req->next; req->err = handle(req->name, req->mode, req->uid, req->gid, req->dev); complete(&req->done); req = next; } spin_lock(&req_lock); } __set_current_state(TASK_INTERRUPTIBLE); spin_unlock(&req_lock); schedule(); } } static noinline int __init devtmpfs_setup(void *p) { int err; err = ksys_unshare(CLONE_NEWNS); if (err) goto out; err = init_mount("devtmpfs", "/", "devtmpfs", DEVTMPFS_MFLAGS, NULL); if (err) goto out; init_chdir("/.."); /* will traverse into overmounted root */ init_chroot("."); out: *(int *)p = err; return err; } /* * The __ref is because devtmpfs_setup needs to be __init for the routines it * calls. That call is done while devtmpfs_init, which is marked __init, * synchronously waits for it to complete. */ static int __ref devtmpfsd(void *p) { int err = devtmpfs_setup(p); complete(&setup_done); if (err) return err; devtmpfs_work_loop(); return 0; } /* * Get the underlying (shmem/ramfs) context ops to build ours */ static int devtmpfs_configure_context(void) { struct fs_context *fc; fc = fs_context_for_reconfigure(mnt->mnt_root, mnt->mnt_sb->s_flags, MS_RMT_MASK); if (IS_ERR(fc)) return PTR_ERR(fc); /* Set up devtmpfs_context_ops based on underlying type */ devtmpfs_context_ops.free = fc->ops->free; devtmpfs_context_ops.dup = fc->ops->dup; devtmpfs_context_ops.parse_param = fc->ops->parse_param; devtmpfs_context_ops.parse_monolithic = fc->ops->parse_monolithic; devtmpfs_context_ops.get_tree = &devtmpfs_get_tree; devtmpfs_context_ops.reconfigure = fc->ops->reconfigure; put_fs_context(fc); return 0; } /* * Create devtmpfs instance, driver-core devices will add their device * nodes here. */ int __init devtmpfs_init(void) { char opts[] = "mode=0755"; int err; mnt = vfs_kern_mount(&internal_fs_type, 0, "devtmpfs", opts); if (IS_ERR(mnt)) { pr_err("unable to create devtmpfs %ld\n", PTR_ERR(mnt)); return PTR_ERR(mnt); } err = devtmpfs_configure_context(); if (err) { pr_err("unable to configure devtmpfs type %d\n", err); return err; } err = register_filesystem(&dev_fs_type); if (err) { pr_err("unable to register devtmpfs type %d\n", err); return err; } thread = kthread_run(devtmpfsd, &err, "kdevtmpfs"); if (!IS_ERR(thread)) { wait_for_completion(&setup_done); } else { err = PTR_ERR(thread); thread = NULL; } if (err) { pr_err("unable to create devtmpfs %d\n", err); unregister_filesystem(&dev_fs_type); thread = NULL; return err; } pr_info("initialized\n"); return 0; }
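For context, a minimal sketch (not taken from this file) of how the request machinery above is normally exercised: a driver that calls device_create() ends up in device_add(), which queues a devtmpfs_create_node() request that the kdevtmpfs thread turns into a /dev node. The "demo" names are hypothetical, and class_create() is shown in the single-argument form used by recent kernels.

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/cdev.h>
#include <linux/device.h>

static dev_t demo_devt;
static struct cdev demo_cdev;
static struct class *demo_class;

static const struct file_operations demo_fops = {
	.owner = THIS_MODULE,
};

static int __init demo_init(void)
{
	struct device *dev;
	int err;

	err = alloc_chrdev_region(&demo_devt, 0, 1, "demo");
	if (err)
		return err;

	cdev_init(&demo_cdev, &demo_fops);
	err = cdev_add(&demo_cdev, demo_devt, 1);
	if (err)
		goto out_region;

	demo_class = class_create("demo");	/* hypothetical class name */
	if (IS_ERR(demo_class)) {
		err = PTR_ERR(demo_class);
		goto out_cdev;
	}

	/*
	 * device_add() inside device_create() calls devtmpfs_create_node(),
	 * so /dev/demo0 shows up without any userspace helper.
	 */
	dev = device_create(demo_class, NULL, demo_devt, NULL, "demo%d", 0);
	if (IS_ERR(dev)) {
		err = PTR_ERR(dev);
		goto out_class;
	}
	return 0;

out_class:
	class_destroy(demo_class);
out_cdev:
	cdev_del(&demo_cdev);
out_region:
	unregister_chrdev_region(demo_devt, 1);
	return err;
}

static void __exit demo_exit(void)
{
	/* The device_del() path calls devtmpfs_delete_node(): /dev/demo0 disappears. */
	device_destroy(demo_class, demo_devt);
	class_destroy(demo_class);
	cdev_del(&demo_cdev);
	unregister_chrdev_region(demo_devt, 1);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");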
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2004 IBM Corporation * * Author: Serge Hallyn <serue@us.ibm.com> */ #include <linux/export.h> #include <linux/uts.h> #include <linux/utsname.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/cred.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/sched/task.h> static struct kmem_cache *uts_ns_cache __ro_after_init; static struct ucounts *inc_uts_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_UTS_NAMESPACES); } static void dec_uts_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_UTS_NAMESPACES); } static struct uts_namespace *create_uts_ns(void) { struct uts_namespace *uts_ns; uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL); if (uts_ns) refcount_set(&uts_ns->ns.count, 1); return uts_ns; } /* * Clone a new ns copying an original utsname, setting refcount to 1 * @old_ns: namespace to clone * Return ERR_PTR(-ENOMEM) on error (failure to allocate), new ns otherwise */ static struct uts_namespace *clone_uts_ns(struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *ns; struct ucounts *ucounts; int err; err = -ENOSPC; ucounts = inc_uts_namespaces(user_ns); if (!ucounts) goto fail; err = -ENOMEM; ns = create_uts_ns(); if (!ns) goto fail_dec; err = ns_alloc_inum(&ns->ns); if (err) goto fail_free; ns->ucounts = ucounts; ns->ns.ops = &utsns_operations; down_read(&uts_sem); memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); ns->user_ns = get_user_ns(user_ns); up_read(&uts_sem); return ns; fail_free: kmem_cache_free(uts_ns_cache, ns); fail_dec: dec_uts_namespaces(ucounts); fail: return ERR_PTR(err); } /* * Copy task tsk's utsname namespace, or clone it if flags * specifies CLONE_NEWUTS. In the latter case, changes to the * utsname of this process won't be seen by the parent, and vice * versa. */ struct uts_namespace *copy_utsname(unsigned long flags, struct user_namespace *user_ns, struct uts_namespace *old_ns) { struct uts_namespace *new_ns; BUG_ON(!old_ns); get_uts_ns(old_ns); if (!(flags & CLONE_NEWUTS)) return old_ns; new_ns = clone_uts_ns(user_ns, old_ns); put_uts_ns(old_ns); return new_ns; } void free_uts_ns(struct uts_namespace *ns) { dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kmem_cache_free(uts_ns_cache, ns); } static inline struct uts_namespace *to_uts_ns(struct ns_common *ns) { return container_of(ns, struct uts_namespace, ns); } static struct ns_common *utsns_get(struct task_struct *task) { struct uts_namespace *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = nsproxy->uts_ns; get_uts_ns(ns); } task_unlock(task); return ns ?
&ns->ns : NULL; } static void utsns_put(struct ns_common *ns) { put_uts_ns(to_uts_ns(ns)); } static int utsns_install(struct nsset *nsset, struct ns_common *new) { struct nsproxy *nsproxy = nsset->nsproxy; struct uts_namespace *ns = to_uts_ns(new); if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) return -EPERM; get_uts_ns(ns); put_uts_ns(nsproxy->uts_ns); nsproxy->uts_ns = ns; return 0; } static struct user_namespace *utsns_owner(struct ns_common *ns) { return to_uts_ns(ns)->user_ns; } const struct proc_ns_operations utsns_operations = { .name = "uts", .type = CLONE_NEWUTS, .get = utsns_get, .put = utsns_put, .install = utsns_install, .owner = utsns_owner, }; void __init uts_ns_init(void) { uts_ns_cache = kmem_cache_create_usercopy( "uts_namespace", sizeof(struct uts_namespace), 0, SLAB_PANIC|SLAB_ACCOUNT, offsetof(struct uts_namespace, name), sizeof_field(struct uts_namespace, name), NULL); }
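A small userspace sketch (not part of this file; assumes a process with CAP_SYS_ADMIN in its user namespace) showing what clone_uts_ns()/copy_utsname() above provide: after unshare(CLONE_NEWUTS) the task holds a private copy of the UTS name data, so sethostname() is no longer visible in the parent namespace.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/utsname.h>

int main(void)
{
	struct utsname u;

	/* Triggers copy_utsname() with CLONE_NEWUTS, i.e. clone_uts_ns(). */
	if (unshare(CLONE_NEWUTS) != 0) {
		perror("unshare(CLONE_NEWUTS)");
		return 1;
	}

	/* Only the freshly cloned uts_namespace::name is modified. */
	if (sethostname("sandbox", strlen("sandbox")) != 0) {
		perror("sethostname");
		return 1;
	}

	uname(&u);
	printf("nodename inside new UTS namespace: %s\n", u.nodename);
	return 0;
}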
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PARAVIRT_H #define _ASM_X86_PARAVIRT_H /* Various instructions on x86 need to be replaced for * para-virtualization: those hooks are defined here.
*/ #include <asm/paravirt_types.h> #ifndef __ASSEMBLER__ struct mm_struct; #endif #ifdef CONFIG_PARAVIRT #include <asm/pgtable_types.h> #include <asm/asm.h> #include <asm/nospec-branch.h> #ifndef __ASSEMBLER__ #include <linux/bug.h> #include <linux/types.h> #include <linux/cpumask.h> #include <linux/static_call_types.h> #include <asm/frame.h> u64 dummy_steal_clock(int cpu); u64 dummy_sched_clock(void); DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock); DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock); void paravirt_set_sched_clock(u64 (*func)(void)); static __always_inline u64 paravirt_sched_clock(void) { return static_call(pv_sched_clock)(); } struct static_key; extern struct static_key paravirt_steal_enabled; extern struct static_key paravirt_steal_rq_enabled; __visible void __native_queued_spin_unlock(struct qspinlock *lock); bool pv_is_native_spin_unlock(void); __visible bool __native_vcpu_is_preempted(long cpu); bool pv_is_native_vcpu_is_preempted(void); static inline u64 paravirt_steal_clock(int cpu) { return static_call(pv_steal_clock)(cpu); } #ifdef CONFIG_PARAVIRT_SPINLOCKS void __init paravirt_set_cap(void); #endif /* The paravirtualized I/O functions */ static inline void slow_down_io(void) { PVOP_VCALL0(cpu.io_delay); #ifdef REALLY_SLOW_IO PVOP_VCALL0(cpu.io_delay); PVOP_VCALL0(cpu.io_delay); PVOP_VCALL0(cpu.io_delay); #endif } void native_flush_tlb_local(void); void native_flush_tlb_global(void); void native_flush_tlb_one_user(unsigned long addr); void native_flush_tlb_multi(const struct cpumask *cpumask, const struct flush_tlb_info *info); static inline void __flush_tlb_local(void) { PVOP_VCALL0(mmu.flush_tlb_user); } static inline void __flush_tlb_global(void) { PVOP_VCALL0(mmu.flush_tlb_kernel); } static inline void __flush_tlb_one_user(unsigned long addr) { PVOP_VCALL1(mmu.flush_tlb_one_user, addr); } static inline void __flush_tlb_multi(const struct cpumask *cpumask, const struct flush_tlb_info *info) { PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info); } static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { PVOP_VCALL1(mmu.exit_mmap, mm); } static inline void notify_page_enc_status_changed(unsigned long pfn, int npages, bool enc) { PVOP_VCALL3(mmu.notify_page_enc_status_changed, pfn, npages, enc); } static __always_inline void arch_safe_halt(void) { PVOP_VCALL0(irq.safe_halt); } static inline void halt(void) { PVOP_VCALL0(irq.halt); } #ifdef CONFIG_PARAVIRT_XXL static inline void load_sp0(unsigned long sp0) { PVOP_VCALL1(cpu.load_sp0, sp0); } /* The paravirtualized CPUID instruction. 
*/ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { PVOP_VCALL4(cpu.cpuid, eax, ebx, ecx, edx); } /* * These special macros can be used to get or set a debugging register */ static __always_inline unsigned long paravirt_get_debugreg(int reg) { return PVOP_CALL1(unsigned long, cpu.get_debugreg, reg); } #define get_debugreg(var, reg) var = paravirt_get_debugreg(reg) static __always_inline void set_debugreg(unsigned long val, int reg) { PVOP_VCALL2(cpu.set_debugreg, reg, val); } static inline unsigned long read_cr0(void) { return PVOP_CALL0(unsigned long, cpu.read_cr0); } static inline void write_cr0(unsigned long x) { PVOP_VCALL1(cpu.write_cr0, x); } static __always_inline unsigned long read_cr2(void) { return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2, "mov %%cr2, %%rax;", ALT_NOT_XEN); } static __always_inline void write_cr2(unsigned long x) { PVOP_VCALL1(mmu.write_cr2, x); } static inline unsigned long __read_cr3(void) { return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3, "mov %%cr3, %%rax;", ALT_NOT_XEN); } static inline void write_cr3(unsigned long x) { PVOP_ALT_VCALL1(mmu.write_cr3, x, "mov %%rdi, %%cr3", ALT_NOT_XEN); } static inline void __write_cr4(unsigned long x) { PVOP_VCALL1(cpu.write_cr4, x); } static inline u64 paravirt_read_msr(u32 msr) { return PVOP_CALL1(u64, cpu.read_msr, msr); } static inline void paravirt_write_msr(u32 msr, u64 val) { PVOP_VCALL2(cpu.write_msr, msr, val); } static inline int paravirt_read_msr_safe(u32 msr, u64 *val) { return PVOP_CALL2(int, cpu.read_msr_safe, msr, val); } static inline int paravirt_write_msr_safe(u32 msr, u64 val) { return PVOP_CALL2(int, cpu.write_msr_safe, msr, val); } #define rdmsr(msr, val1, val2) \ do { \ u64 _l = paravirt_read_msr(msr); \ val1 = (u32)_l; \ val2 = _l >> 32; \ } while (0) static __always_inline void wrmsr(u32 msr, u32 low, u32 high) { paravirt_write_msr(msr, (u64)high << 32 | low); } #define rdmsrq(msr, val) \ do { \ val = paravirt_read_msr(msr); \ } while (0) static inline void wrmsrq(u32 msr, u64 val) { paravirt_write_msr(msr, val); } static inline int wrmsrq_safe(u32 msr, u64 val) { return paravirt_write_msr_safe(msr, val); } /* rdmsr with exception handling */ #define rdmsr_safe(msr, a, b) \ ({ \ u64 _l; \ int _err = paravirt_read_msr_safe((msr), &_l); \ (*a) = (u32)_l; \ (*b) = (u32)(_l >> 32); \ _err; \ }) static __always_inline int rdmsrq_safe(u32 msr, u64 *p) { return paravirt_read_msr_safe(msr, p); } static __always_inline u64 rdpmc(int counter) { return PVOP_CALL1(u64, cpu.read_pmc, counter); } static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(cpu.alloc_ldt, ldt, entries); } static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(cpu.free_ldt, ldt, entries); } static inline void load_TR_desc(void) { PVOP_VCALL0(cpu.load_tr_desc); } static inline void load_gdt(const struct desc_ptr *dtr) { PVOP_VCALL1(cpu.load_gdt, dtr); } static inline void load_idt(const struct desc_ptr *dtr) { PVOP_VCALL1(cpu.load_idt, dtr); } static inline void set_ldt(const void *addr, unsigned entries) { PVOP_VCALL2(cpu.set_ldt, addr, entries); } static inline unsigned long paravirt_store_tr(void) { return PVOP_CALL0(unsigned long, cpu.store_tr); } #define store_tr(tr) ((tr) = paravirt_store_tr()) static inline void load_TLS(struct thread_struct *t, unsigned cpu) { PVOP_VCALL2(cpu.load_tls, t, cpu); } static inline void load_gs_index(unsigned int gs) { PVOP_VCALL1(cpu.load_gs_index, gs); } static 
inline void write_ldt_entry(struct desc_struct *dt, int entry, const void *desc) { PVOP_VCALL3(cpu.write_ldt_entry, dt, entry, desc); } static inline void write_gdt_entry(struct desc_struct *dt, int entry, void *desc, int type) { PVOP_VCALL4(cpu.write_gdt_entry, dt, entry, desc, type); } static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) { PVOP_VCALL3(cpu.write_idt_entry, dt, entry, g); } #ifdef CONFIG_X86_IOPL_IOPERM static inline void tss_invalidate_io_bitmap(void) { PVOP_VCALL0(cpu.invalidate_io_bitmap); } static inline void tss_update_io_bitmap(void) { PVOP_VCALL0(cpu.update_io_bitmap); } #endif static inline void paravirt_enter_mmap(struct mm_struct *next) { PVOP_VCALL1(mmu.enter_mmap, next); } static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return PVOP_CALL1(int, mmu.pgd_alloc, mm); } static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) { PVOP_VCALL2(mmu.pgd_free, mm, pgd); } static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_pte, mm, pfn); } static inline void paravirt_release_pte(unsigned long pfn) { PVOP_VCALL1(mmu.release_pte, pfn); } static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_pmd, mm, pfn); } static inline void paravirt_release_pmd(unsigned long pfn) { PVOP_VCALL1(mmu.release_pmd, pfn); } static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_pud, mm, pfn); } static inline void paravirt_release_pud(unsigned long pfn) { PVOP_VCALL1(mmu.release_pud, pfn); } static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_p4d, mm, pfn); } static inline void paravirt_release_p4d(unsigned long pfn) { PVOP_VCALL1(mmu.release_p4d, pfn); } static inline pte_t __pte(pteval_t val) { return (pte_t) { PVOP_ALT_CALLEE1(pteval_t, mmu.make_pte, val, "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pteval_t pte_val(pte_t pte) { return PVOP_ALT_CALLEE1(pteval_t, mmu.pte_val, pte.pte, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline pgd_t __pgd(pgdval_t val) { return (pgd_t) { PVOP_ALT_CALLEE1(pgdval_t, mmu.make_pgd, val, "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pgdval_t pgd_val(pgd_t pgd) { return PVOP_ALT_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd, "mov %%rdi, %%rax", ALT_NOT_XEN); } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pteval_t ret; ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, vma, addr, ptep); return (pte_t) { .pte = ret }; } static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { PVOP_VCALL4(mmu.ptep_modify_prot_commit, vma, addr, ptep, pte.pte); } static inline void set_pte(pte_t *ptep, pte_t pte) { PVOP_VCALL2(mmu.set_pte, ptep, pte.pte); } static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { PVOP_VCALL2(mmu.set_pmd, pmdp, native_pmd_val(pmd)); } static inline pmd_t __pmd(pmdval_t val) { return (pmd_t) { PVOP_ALT_CALLEE1(pmdval_t, mmu.make_pmd, val, "mov %%rdi, %%rax", ALT_NOT_XEN) }; } static inline pmdval_t pmd_val(pmd_t pmd) { return PVOP_ALT_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void set_pud(pud_t *pudp, pud_t pud) { PVOP_VCALL2(mmu.set_pud, pudp, native_pud_val(pud)); } static inline pud_t __pud(pudval_t val) { pudval_t ret; ret = PVOP_ALT_CALLEE1(pudval_t, 
mmu.make_pud, val, "mov %%rdi, %%rax", ALT_NOT_XEN); return (pud_t) { ret }; } static inline pudval_t pud_val(pud_t pud) { return PVOP_ALT_CALLEE1(pudval_t, mmu.pud_val, pud.pud, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void pud_clear(pud_t *pudp) { set_pud(pudp, native_make_pud(0)); } static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) { p4dval_t val = native_p4d_val(p4d); PVOP_VCALL2(mmu.set_p4d, p4dp, val); } static inline p4d_t __p4d(p4dval_t val) { p4dval_t ret = PVOP_ALT_CALLEE1(p4dval_t, mmu.make_p4d, val, "mov %%rdi, %%rax", ALT_NOT_XEN); return (p4d_t) { ret }; } static inline p4dval_t p4d_val(p4d_t p4d) { return PVOP_ALT_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d, "mov %%rdi, %%rax", ALT_NOT_XEN); } static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) { PVOP_VCALL2(mmu.set_pgd, pgdp, native_pgd_val(pgd)); } #define set_pgd(pgdp, pgdval) do { \ if (pgtable_l5_enabled()) \ __set_pgd(pgdp, pgdval); \ else \ set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd }); \ } while (0) #define pgd_clear(pgdp) do { \ if (pgtable_l5_enabled()) \ set_pgd(pgdp, native_make_pgd(0)); \ } while (0) static inline void p4d_clear(p4d_t *p4dp) { set_p4d(p4dp, native_make_p4d(0)); } static inline void set_pte_atomic(pte_t *ptep, pte_t pte) { set_pte(ptep, pte); } static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { set_pte(ptep, native_make_pte(0)); } static inline void pmd_clear(pmd_t *pmdp) { set_pmd(pmdp, native_make_pmd(0)); } #define __HAVE_ARCH_START_CONTEXT_SWITCH static inline void arch_start_context_switch(struct task_struct *prev) { PVOP_VCALL1(cpu.start_context_switch, prev); } static inline void arch_end_context_switch(struct task_struct *next) { PVOP_VCALL1(cpu.end_context_switch, next); } #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.enter); } static inline void arch_leave_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.leave); } static inline void arch_flush_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.flush); } static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags) { pv_ops.mmu.set_fixmap(idx, phys, flags); } #endif #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { PVOP_VCALL2(lock.queued_spin_lock_slowpath, lock, val); } static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock) { PVOP_ALT_VCALLEE1(lock.queued_spin_unlock, lock, "movb $0, (%%" _ASM_ARG1 ");", ALT_NOT(X86_FEATURE_PVUNLOCK)); } static __always_inline void pv_wait(u8 *ptr, u8 val) { PVOP_VCALL2(lock.wait, ptr, val); } static __always_inline void pv_kick(int cpu) { PVOP_VCALL1(lock.kick, cpu); } static __always_inline bool pv_vcpu_is_preempted(long cpu) { return PVOP_ALT_CALLEE1(bool, lock.vcpu_is_preempted, cpu, "xor %%" _ASM_AX ", %%" _ASM_AX ";", ALT_NOT(X86_FEATURE_VCPUPREEMPT)); } void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock); bool __raw_callee_save___native_vcpu_is_preempted(long cpu); #endif /* SMP && PARAVIRT_SPINLOCKS */ #ifdef CONFIG_X86_32 /* save and restore all caller-save registers, except return value */ #define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;" #define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;" #else /* save and restore all caller-save registers, except return value */ #define PV_SAVE_ALL_CALLER_REGS \ "push %rcx;" \ "push %rdx;" \ "push %rsi;" \ "push %rdi;" \ "push %r8;" \ "push %r9;" \ "push %r10;" \ 
"push %r11;" #define PV_RESTORE_ALL_CALLER_REGS \ "pop %r11;" \ "pop %r10;" \ "pop %r9;" \ "pop %r8;" \ "pop %rdi;" \ "pop %rsi;" \ "pop %rdx;" \ "pop %rcx;" #endif /* * Generate a thunk around a function which saves all caller-save * registers except for the return value. This allows C functions to * be called from assembler code where fewer than normal registers are * available. It may also help code generation around calls from C * code if the common case doesn't use many registers. * * When a callee is wrapped in a thunk, the caller can assume that all * arg regs and all scratch registers are preserved across the * call. The return value in rax/eax will not be saved, even for void * functions. */ #define PV_THUNK_NAME(func) "__raw_callee_save_" #func #define __PV_CALLEE_SAVE_REGS_THUNK(func, section) \ extern typeof(func) __raw_callee_save_##func; \ \ asm(".pushsection " section ", \"ax\";" \ ".globl " PV_THUNK_NAME(func) ";" \ ".type " PV_THUNK_NAME(func) ", @function;" \ ASM_FUNC_ALIGN \ PV_THUNK_NAME(func) ":" \ ASM_ENDBR \ FRAME_BEGIN \ PV_SAVE_ALL_CALLER_REGS \ "call " #func ";" \ PV_RESTORE_ALL_CALLER_REGS \ FRAME_END \ ASM_RET \ ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";" \ ".popsection") #define PV_CALLEE_SAVE_REGS_THUNK(func) \ __PV_CALLEE_SAVE_REGS_THUNK(func, ".text") /* Get a reference to a callee-save function */ #define PV_CALLEE_SAVE(func) \ ((struct paravirt_callee_save) { __raw_callee_save_##func }) /* Promise that "func" already uses the right calling convention */ #define __PV_IS_CALLEE_SAVE(func) \ ((struct paravirt_callee_save) { func }) #ifdef CONFIG_PARAVIRT_XXL static __always_inline unsigned long arch_local_save_flags(void) { return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;", ALT_NOT_XEN); } static __always_inline void arch_local_irq_disable(void) { PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT_XEN); } static __always_inline void arch_local_irq_enable(void) { PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT_XEN); } static __always_inline unsigned long arch_local_irq_save(void) { unsigned long f; f = arch_local_save_flags(); arch_local_irq_disable(); return f; } #endif /* Make sure as little as possible of this mess escapes. 
*/ #undef PARAVIRT_CALL #undef __PVOP_CALL #undef __PVOP_VCALL #undef PVOP_VCALL0 #undef PVOP_CALL0 #undef PVOP_VCALL1 #undef PVOP_CALL1 #undef PVOP_VCALL2 #undef PVOP_CALL2 #undef PVOP_VCALL3 #undef PVOP_CALL3 #undef PVOP_VCALL4 #undef PVOP_CALL4 extern void default_banner(void); void native_pv_lock_init(void) __init; #else /* __ASSEMBLER__ */ #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT_XXL #ifdef CONFIG_DEBUG_ENTRY #define PARA_INDIRECT(addr) *addr(%rip) .macro PARA_IRQ_save_fl ANNOTATE_RETPOLINE_SAFE; call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); .endm #define SAVE_FLAGS ALTERNATIVE_2 "PARA_IRQ_save_fl;", \ "ALT_CALL_INSTR;", ALT_CALL_ALWAYS, \ "pushf; pop %rax;", ALT_NOT_XEN #endif #endif /* CONFIG_PARAVIRT_XXL */ #endif /* CONFIG_X86_64 */ #endif /* __ASSEMBLER__ */ #else /* CONFIG_PARAVIRT */ # define default_banner x86_init_noop #ifndef __ASSEMBLER__ static inline void native_pv_lock_init(void) { } #endif #endif /* !CONFIG_PARAVIRT */ #ifndef __ASSEMBLER__ #ifndef CONFIG_PARAVIRT_XXL static inline void paravirt_enter_mmap(struct mm_struct *mm) { } #endif #ifndef CONFIG_PARAVIRT static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { } #endif #ifndef CONFIG_PARAVIRT_SPINLOCKS static inline void paravirt_set_cap(void) { } #endif #endif /* __ASSEMBLER__ */ #endif /* _ASM_X86_PARAVIRT_H */
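As a rough sketch of how the hooks declared above get wired up (not part of this header): a hypothetical guest-platform init routine can redirect the steal-clock static call and install a paravirtualized sched_clock. my_hv_steal_clock()/my_hv_sched_clock() are made-up names; real guests (KVM, Xen, Hyper-V) follow the same pattern from their early time setup, and this only applies with CONFIG_PARAVIRT enabled.

#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/static_call.h>
#include <asm/paravirt.h>

/* Hypothetical backend: would read stolen time published by the hypervisor. */
static u64 my_hv_steal_clock(int cpu)
{
	return 0;
}

/* Hypothetical backend: would read a paravirtualized clocksource. */
static u64 my_hv_sched_clock(void)
{
	return 0;
}

/* Would be called from the platform's early time/clock init in a real guest. */
static void __init my_hv_time_init(void)
{
	/* paravirt_steal_clock() callers now reach the hypervisor backend. */
	static_call_update(pv_steal_clock, my_hv_steal_clock);
	static_key_slow_inc(&paravirt_steal_enabled);

	/* paravirt_sched_clock() is routed through pv_sched_clock. */
	paravirt_set_sched_clock(my_hv_sched_clock);
}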
// SPDX-License-Identifier: GPL-2.0 /* * HugeTLB Vmemmap Optimization (HVO) * * Copyright (c) 2020, ByteDance. All rights reserved. * * Author: Muchun Song <songmuchun@bytedance.com> * * See Documentation/mm/vmemmap_dedup.rst */ #define pr_fmt(fmt) "HugeTLB: " fmt #include <linux/pgtable.h> #include <linux/moduleparam.h> #include <linux/bootmem_info.h> #include <linux/mmdebug.h> #include <linux/pagewalk.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include "hugetlb_vmemmap.h" /** * struct vmemmap_remap_walk - walk vmemmap page table * * @remap_pte: called for each lowest-level entry (PTE). * @nr_walked: the number of walked pte. * @reuse_page: the page which is reused for the tail vmemmap pages. * @reuse_addr: the virtual address of the @reuse_page page. * @vmemmap_pages: the list head of the vmemmap pages that can be freed * or is mapped from. * @flags: used to modify behavior in vmemmap page table walking * operations. */ struct vmemmap_remap_walk { void (*remap_pte)(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk); unsigned long nr_walked; struct page *reuse_page; unsigned long reuse_addr; struct list_head *vmemmap_pages; /* Skip the TLB flush when we split the PMD */ #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) /* Skip the TLB flush when we remap the PTE */ #define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1) /* synchronize_rcu() to avoid writes from page_ref_add_unless() */ #define VMEMMAP_SYNCHRONIZE_RCU BIT(2) unsigned long flags; }; static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start, struct vmemmap_remap_walk *walk) { pmd_t __pmd; int i; unsigned long addr = start; pte_t *pgtable; pgtable = pte_alloc_one_kernel(&init_mm); if (!pgtable) return -ENOMEM; pmd_populate_kernel(&init_mm, &__pmd, pgtable); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { pte_t entry, *pte; pgprot_t pgprot = PAGE_KERNEL; entry = mk_pte(head + i, pgprot); pte = pte_offset_kernel(&__pmd, addr); set_pte_at(&init_mm, addr, pte, entry); } spin_lock(&init_mm.page_table_lock); if (likely(pmd_leaf(*pmd))) { /* * Higher order allocations from buddy allocator must be able to * be treated as independent small pages (as they can be freed * individually). */ if (!PageReserved(head)) split_page(head, get_order(PMD_SIZE)); /* Make pte visible before pmd. See comment in pmd_install(). */ smp_wmb(); pmd_populate_kernel(&init_mm, pmd, pgtable); if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH)) flush_tlb_kernel_range(start, start + PMD_SIZE); } else { pte_free_kernel(&init_mm, pgtable); } spin_unlock(&init_mm.page_table_lock); return 0; } static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { int ret = 0; struct page *head; struct vmemmap_remap_walk *vmemmap_walk = walk->private; /* Only splitting, not remapping the vmemmap pages. */ if (!vmemmap_walk->remap_pte) walk->action = ACTION_CONTINUE; spin_lock(&init_mm.page_table_lock); head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL; /* * Due to HugeTLB alignment requirements, and the vmemmap pages being * at the start of the hotplugged memory region in the * memory_hotplug.memmap_on_memory case, checking whether the vmemmap * page associated with the first vmemmap page is self-hosted is * sufficient. * * [ hotplugged memory ] * [ section ][...][ section ] * [ vmemmap ][ usable memory ] * ^ | ^ | * +--+ | | * +------------------------+ */ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) { struct page *page = head ?
head + pte_index(addr) : pte_page(ptep_get(pte_offset_kernel(pmd, addr))); if (PageVmemmapSelfHosted(page)) ret = -ENOTSUPP; } spin_unlock(&init_mm.page_table_lock); if (!head || ret) return ret; return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk); } static int vmemmap_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct vmemmap_remap_walk *vmemmap_walk = walk->private; /* * The reuse_page is found 'first' in page table walking before * starting remapping. */ if (!vmemmap_walk->reuse_page) vmemmap_walk->reuse_page = pte_page(ptep_get(pte)); else vmemmap_walk->remap_pte(pte, addr, vmemmap_walk); vmemmap_walk->nr_walked++; return 0; } static const struct mm_walk_ops vmemmap_remap_ops = { .pmd_entry = vmemmap_pmd_entry, .pte_entry = vmemmap_pte_entry, }; static int vmemmap_remap_range(unsigned long start, unsigned long end, struct vmemmap_remap_walk *walk) { int ret; VM_BUG_ON(!PAGE_ALIGNED(start | end)); mmap_read_lock(&init_mm); ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops, NULL, walk); mmap_read_unlock(&init_mm); if (ret) return ret; if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH)) flush_tlb_kernel_range(start, end); return 0; } /* * Free a vmemmap page. A vmemmap page can be allocated from the memblock * allocator or buddy allocator. If the PG_reserved flag is set, it means * that it allocated from the memblock allocator, just free it via the * free_bootmem_page(). Otherwise, use __free_page(). */ static inline void free_vmemmap_page(struct page *page) { if (PageReserved(page)) { memmap_boot_pages_add(-1); free_bootmem_page(page); } else { memmap_pages_add(-1); __free_page(page); } } /* Free a list of the vmemmap pages */ static void free_vmemmap_page_list(struct list_head *list) { struct page *page, *next; list_for_each_entry_safe(page, next, list, lru) free_vmemmap_page(page); } static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { /* * Remap the tail pages as read-only to catch illegal write operation * to the tail pages. */ pgprot_t pgprot = PAGE_KERNEL_RO; struct page *page = pte_page(ptep_get(pte)); pte_t entry; /* Remapping the head page requires r/w */ if (unlikely(addr == walk->reuse_addr)) { pgprot = PAGE_KERNEL; list_del(&walk->reuse_page->lru); /* * Makes sure that preceding stores to the page contents from * vmemmap_remap_free() become visible before the set_pte_at() * write. */ smp_wmb(); } entry = mk_pte(walk->reuse_page, pgprot); list_add(&page->lru, walk->vmemmap_pages); set_pte_at(&init_mm, addr, pte, entry); } /* * How many struct page structs need to be reset. When we reuse the head * struct page, the special metadata (e.g. page->flags or page->mapping) * cannot copy to the tail struct page structs. The invalid value will be * checked in the free_tail_page_prepare(). In order to avoid the message * of "corrupted mapping in tail page". We need to reset at least 4 (one * head struct page struct and three tail struct page structs) struct page * structs. 
*/ #define NR_RESET_STRUCT_PAGE 4 static inline void reset_struct_pages(struct page *start) { struct page *from = start + NR_RESET_STRUCT_PAGE; BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page)); memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE); } static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { pgprot_t pgprot = PAGE_KERNEL; struct page *page; void *to; BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page); page = list_first_entry(walk->vmemmap_pages, struct page, lru); list_del(&page->lru); to = page_to_virt(page); copy_page(to, (void *)walk->reuse_addr); reset_struct_pages(to); /* * Makes sure that preceding stores to the page contents become visible * before the set_pte_at() write. */ smp_wmb(); set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); } /** * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end) * backing PMDs of the directmap into PTEs * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_split(unsigned long start, unsigned long end, unsigned long reuse) { struct vmemmap_remap_walk walk = { .remap_pte = NULL, .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH, }; /* See the comment in the vmemmap_remap_free(). */ BUG_ON(start - reuse != PAGE_SIZE); return vmemmap_remap_range(reuse, end, &walk); } /** * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) * to the page which @reuse is mapped to, then free vmemmap * which the range are mapped to. * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers * responsibility to free pages. * @flags: modifications to vmemmap_remap_walk flags * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse, struct list_head *vmemmap_pages, unsigned long flags) { int ret; struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_remap_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, .flags = flags, }; int nid = page_to_nid((struct page *)reuse); gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; /* * Allocate a new head vmemmap page to avoid breaking a contiguous * block of struct page memory when freeing it back to page allocator * in free_vmemmap_page_list(). This will allow the likely contiguous * struct page backing memory to be kept contiguous and allowing for * more allocations of hugepages. Fallback to the currently * mapped head page in case should it fail to allocate. */ walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0); if (walk.reuse_page) { copy_page(page_to_virt(walk.reuse_page), (void *)walk.reuse_addr); list_add(&walk.reuse_page->lru, vmemmap_pages); memmap_pages_add(1); } /* * In order to make remapping routine most efficient for the huge pages, * the routine of vmemmap page table walking has the following rules * (see more details from the vmemmap_pte_range()): * * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE) * should be continuous. 
* - The @reuse address is part of the range [@reuse, @end) that we are * walking which is passed to vmemmap_remap_range(). * - The @reuse address is the first in the complete range. * * So we need to make sure that @start and @reuse meet the above rules. */ BUG_ON(start - reuse != PAGE_SIZE); ret = vmemmap_remap_range(reuse, end, &walk); if (ret && walk.nr_walked) { end = reuse + walk.nr_walked * PAGE_SIZE; /* * vmemmap_pages contains pages from the previous * vmemmap_remap_range call which failed. These * are pages which were removed from the vmemmap. * They will be restored in the following call. */ walk = (struct vmemmap_remap_walk) { .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, .flags = 0, }; vmemmap_remap_range(reuse, end, &walk); } return ret; } static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, struct list_head *list) { gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL; unsigned long nr_pages = (end - start) >> PAGE_SHIFT; int nid = page_to_nid((struct page *)start); struct page *page, *next; int i; for (i = 0; i < nr_pages; i++) { page = alloc_pages_node(nid, gfp_mask, 0); if (!page) goto out; list_add(&page->lru, list); } memmap_pages_add(nr_pages); return 0; out: list_for_each_entry_safe(page, next, list, lru) __free_page(page); return -ENOMEM; } /** * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) * to the page which is from the @vmemmap_pages * respectively. * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. * @flags: modifications to vmemmap_remap_walk flags * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, unsigned long reuse, unsigned long flags) { LIST_HEAD(vmemmap_pages); struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, .vmemmap_pages = &vmemmap_pages, .flags = flags, }; /* See the comment in the vmemmap_remap_free(). */ BUG_ON(start - reuse != PAGE_SIZE); if (alloc_vmemmap_page_list(start, end, &vmemmap_pages)) return -ENOMEM; return vmemmap_remap_range(reuse, end, &walk); } DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); static int __init hugetlb_vmemmap_optimize_param(char *buf) { return kstrtobool(buf, &vmemmap_optimize_enabled); } early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param); static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio, unsigned long flags) { int ret; unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; unsigned long vmemmap_reuse; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); if (!folio_test_hugetlb_vmemmap_optimized(folio)) return 0; if (flags & VMEMMAP_SYNCHRONIZE_RCU) synchronize_rcu(); vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* * The pages which the vmemmap virtual address range [@vmemmap_start, * @vmemmap_end) are mapped to are freed to the buddy allocator, and * the range is mapped to the page which @vmemmap_reuse is mapped to. 
* When a HugeTLB page is freed to the buddy allocator, previously * discarded vmemmap pages must be allocated and remapping. */ ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags); if (!ret) { folio_clear_hugetlb_vmemmap_optimized(folio); static_branch_dec(&hugetlb_optimize_vmemmap_key); } return ret; } /** * hugetlb_vmemmap_restore_folio - restore previously optimized (by * hugetlb_vmemmap_optimize_folio()) vmemmap pages which * will be reallocated and remapped. * @h: struct hstate. * @folio: the folio whose vmemmap pages will be restored. * * Return: %0 if @folio's vmemmap pages have been reallocated and remapped, * negative error code otherwise. */ int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU); } /** * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list. * @h: hstate. * @folio_list: list of folios. * @non_hvo_folios: Output list of folios for which vmemmap exists. * * Return: number of folios for which vmemmap was restored, or an error code * if an error was encountered restoring vmemmap for a folio. * Folios that have vmemmap are moved to the non_hvo_folios * list. Processing of entries stops when the first error is * encountered. The folio that experienced the error and all * non-processed folios will remain on folio_list. */ long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios) { struct folio *folio, *t_folio; long restored = 0; long ret = 0; unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU; list_for_each_entry_safe(folio, t_folio, folio_list, lru) { if (folio_test_hugetlb_vmemmap_optimized(folio)) { ret = __hugetlb_vmemmap_restore_folio(h, folio, flags); /* only need to synchronize_rcu() once for each batch */ flags &= ~VMEMMAP_SYNCHRONIZE_RCU; if (ret) break; restored++; } /* Add non-optimized folios to output list */ list_move(&folio->lru, non_hvo_folios); } if (restored) flush_tlb_all(); if (!ret) ret = restored; return ret; } /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */ static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio) { if (folio_test_hugetlb_vmemmap_optimized(folio)) return false; if (!READ_ONCE(vmemmap_optimize_enabled)) return false; if (!hugetlb_vmemmap_optimizable(h)) return false; return true; } static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio, struct list_head *vmemmap_pages, unsigned long flags) { int ret = 0; unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; unsigned long vmemmap_reuse; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); if (!vmemmap_should_optimize_folio(h, folio)) return ret; static_branch_inc(&hugetlb_optimize_vmemmap_key); if (flags & VMEMMAP_SYNCHRONIZE_RCU) synchronize_rcu(); /* * Very Subtle * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed * immediately after remapping. As a result, subsequent accesses * and modifications to struct pages associated with the hugetlb * page could be to the OLD struct pages. Set the vmemmap optimized * flag here so that it is copied to the new head page. This keeps * the old and new struct pages in sync. * If there is an error during optimization, we will immediately FLUSH * the TLB and clear the flag below. 
*/ folio_set_hugetlb_vmemmap_optimized(folio); vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end) * to the page which @vmemmap_reuse is mapped to. Add pages previously * mapping the range to vmemmap_pages list so that they can be freed by * the caller. */ ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, vmemmap_pages, flags); if (ret) { static_branch_dec(&hugetlb_optimize_vmemmap_key); folio_clear_hugetlb_vmemmap_optimized(folio); } return ret; } /** * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages. * @h: struct hstate. * @folio: the folio whose vmemmap pages will be optimized. * * This function only tries to optimize @folio's vmemmap pages and does not * guarantee that the optimization will succeed after it returns. The caller * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's * vmemmap pages have been optimized. */ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { LIST_HEAD(vmemmap_pages); __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU); free_vmemmap_page_list(&vmemmap_pages); } static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio) { unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; unsigned long vmemmap_reuse; if (!vmemmap_should_optimize_folio(h, folio)) return 0; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* * Split PMDs on the vmemmap virtual address range [@vmemmap_start, * @vmemmap_end] */ return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse); } static void __hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list, bool boot) { struct folio *folio; int nr_to_optimize; LIST_HEAD(vmemmap_pages); unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU; nr_to_optimize = 0; list_for_each_entry(folio, folio_list, lru) { int ret; unsigned long spfn, epfn; if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) { /* * Already optimized by pre-HVO, just map the * mirrored tail page structs RO. */ spfn = (unsigned long)&folio->page; epfn = spfn + pages_per_huge_page(h); vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio), HUGETLB_VMEMMAP_RESERVE_SIZE); register_page_bootmem_memmap(pfn_to_section_nr(spfn), &folio->page, HUGETLB_VMEMMAP_RESERVE_SIZE); static_branch_inc(&hugetlb_optimize_vmemmap_key); continue; } nr_to_optimize++; ret = hugetlb_vmemmap_split_folio(h, folio); /* * Spliting the PMD requires allocating a page, thus lets fail * early once we encounter the first OOM. No point in retrying * as it can be dynamically done on remap with the memory * we get back from the vmemmap deduplication. */ if (ret == -ENOMEM) break; } if (!nr_to_optimize) /* * All pre-HVO folios, nothing left to do. It's ok if * there is a mix of pre-HVO and not yet HVO-ed folios * here, as __hugetlb_vmemmap_optimize_folio() will * skip any folios that already have the optimized flag * set, see vmemmap_should_optimize_folio(). */ goto out; flush_tlb_all(); list_for_each_entry(folio, folio_list, lru) { int ret; ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); /* only need to synchronize_rcu() once for each batch */ flags &= ~VMEMMAP_SYNCHRONIZE_RCU; /* * Pages to be freed may have been accumulated. 
If we * encounter an ENOMEM, free what we have and try again. * This can occur when splitting fails * halfway and head page allocation also fails. In this * case __hugetlb_vmemmap_optimize_folio() would free memory * allowing more vmemmap remaps to occur. */ if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) { flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); INIT_LIST_HEAD(&vmemmap_pages); __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); } } out: flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); } void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) { __hugetlb_vmemmap_optimize_folios(h, folio_list, false); } void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list) { __hugetlb_vmemmap_optimize_folios(h, folio_list, true); } #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT /* Return true if a bootmem-allocated HugeTLB page should be pre-HVO-ed */ static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m) { unsigned long section_size, psize, pmd_vmemmap_size; phys_addr_t paddr; if (!READ_ONCE(vmemmap_optimize_enabled)) return false; if (!hugetlb_vmemmap_optimizable(m->hstate)) return false; psize = huge_page_size(m->hstate); paddr = virt_to_phys(m); /* * Pre-HVO only works if the bootmem huge page * is aligned to the section size. */ section_size = (1UL << PA_SECTION_SHIFT); if (!IS_ALIGNED(paddr, section_size) || !IS_ALIGNED(psize, section_size)) return false; /* * The pre-HVO code does not deal with splitting PMDs, * so the bootmem page must be aligned to the number * of base pages that can be mapped with one vmemmap PMD. */ pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT; if (!IS_ALIGNED(paddr, pmd_vmemmap_size) || !IS_ALIGNED(psize, pmd_vmemmap_size)) return false; return true; } /* * Initialize memmap section for a gigantic page, HVO-style. */ void __init hugetlb_vmemmap_init_early(int nid) { unsigned long psize, paddr, section_size; unsigned long ns, i, pnum, pfn, nr_pages; unsigned long start, end; struct huge_bootmem_page *m = NULL; void *map; /* * Nothing to do if bootmem pages were not allocated * early in boot, or if HVO wasn't enabled in the * first place.
*/ if (!hugetlb_bootmem_allocated()) return; if (!READ_ONCE(vmemmap_optimize_enabled)) return; section_size = (1UL << PA_SECTION_SHIFT); list_for_each_entry(m, &huge_boot_pages[nid], list) { if (!vmemmap_should_optimize_bootmem_page(m)) continue; nr_pages = pages_per_huge_page(m->hstate); psize = nr_pages << PAGE_SHIFT; paddr = virt_to_phys(m); pfn = PHYS_PFN(paddr); map = pfn_to_page(pfn); start = (unsigned long)map; end = start + nr_pages * sizeof(struct page); if (vmemmap_populate_hvo(start, end, nid, HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) continue; memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE); pnum = pfn_to_section_nr(pfn); ns = psize / section_size; for (i = 0; i < ns; i++) { sparse_init_early_section(nid, map, pnum, SECTION_IS_VMEMMAP_PREINIT); map += section_map_size(); pnum++; } m->flags |= HUGE_BOOTMEM_HVO; } } void __init hugetlb_vmemmap_init_late(int nid) { struct huge_bootmem_page *m, *tm; unsigned long phys, nr_pages, start, end; unsigned long pfn, nr_mmap; struct hstate *h; void *map; if (!hugetlb_bootmem_allocated()) return; if (!READ_ONCE(vmemmap_optimize_enabled)) return; list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) { if (!(m->flags & HUGE_BOOTMEM_HVO)) continue; phys = virt_to_phys(m); h = m->hstate; pfn = PHYS_PFN(phys); nr_pages = pages_per_huge_page(h); if (!hugetlb_bootmem_page_zones_valid(nid, m)) { /* * Oops, the hugetlb page spans multiple zones. * Remove it from the list, and undo HVO. */ list_del(&m->list); map = pfn_to_page(pfn); start = (unsigned long)map; end = start + nr_pages * sizeof(struct page); vmemmap_undo_hvo(start, end, nid, HUGETLB_VMEMMAP_RESERVE_SIZE); nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE; memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE)); memblock_phys_free(phys, huge_page_size(h)); continue; } else m->flags |= HUGE_BOOTMEM_ZONES_VALID; } } #endif static const struct ctl_table hugetlb_vmemmap_sysctls[] = { { .procname = "hugetlb_optimize_vmemmap", .data = &vmemmap_optimize_enabled, .maxlen = sizeof(vmemmap_optimize_enabled), .mode = 0644, .proc_handler = proc_dobool, }, }; static int __init hugetlb_vmemmap_init(void) { const struct hstate *h; /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES); for_each_hstate(h) { if (hugetlb_vmemmap_optimizable(h)) { register_sysctl_init("vm", hugetlb_vmemmap_sysctls); break; } } return 0; } late_initcall(hugetlb_vmemmap_init);
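The optimization above is gated by the hugetlb_optimize_vmemmap sysctl registered in hugetlb_vmemmap_init() (a proc_dobool handler, so it takes 0 or 1). A minimal userspace sketch for reading and enabling it, assuming the kernel exposes the usual /proc/sys/vm/hugetlb_optimize_vmemmap file (path and error handling are illustrative only):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    const char *path = "/proc/sys/vm/hugetlb_optimize_vmemmap";
    char cur[8] = "";
    int fd = open(path, O_RDWR);

    if (fd < 0) {
        perror(path);   /* missing file: kernel built without HVO support */
        return 1;
    }
    if (read(fd, cur, sizeof(cur) - 1) > 0)
        printf("current value: %s", cur);
    /* Enable HVO for huge pages allocated from now on. */
    if (pwrite(fd, "1\n", 2, 0) != 2)
        perror("write");
    close(fd);
    return 0;
}

Note that toggling the sysctl only affects huge pages allocated afterwards; pages whose vmemmap was already remapped keep that layout until they are freed or restored.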
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 /* * fs/nfs/idmap.c * * UID and GID to name mapping for clients. * * Copyright (c) 2002 The Regents of the University of Michigan. * All rights reserved. * * Marius Aamodt Eriksen <marius@umich.edu> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. 
Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <linux/types.h> #include <linux/parser.h> #include <linux/fs.h> #include <net/net_namespace.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/nfs_fs.h> #include <linux/nfs_fs_sb.h> #include <linux/key.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> #include <keys/request_key_auth-type.h> #include <linux/module.h> #include <linux/user_namespace.h> #include "internal.h" #include "netns.h" #include "nfs4idmap.h" #include "nfs4trace.h" #define NFS_UINT_MAXLEN 11 static const struct cred *id_resolver_cache; static struct key_type key_type_id_resolver_legacy; struct idmap_legacy_upcalldata { struct rpc_pipe_msg pipe_msg; struct idmap_msg idmap_msg; struct key *authkey; struct idmap *idmap; }; struct idmap { struct rpc_pipe_dir_object idmap_pdo; struct rpc_pipe *idmap_pipe; struct idmap_legacy_upcalldata *idmap_upcall_data; struct mutex idmap_mutex; struct user_namespace *user_ns; }; static struct user_namespace *idmap_userns(const struct idmap *idmap) { if (idmap && idmap->user_ns) return idmap->user_ns; return &init_user_ns; } /** * nfs_fattr_init_names - initialise the nfs_fattr owner_name/group_name fields * @fattr: fully initialised struct nfs_fattr * @owner_name: owner name string cache * @group_name: group name string cache */ void nfs_fattr_init_names(struct nfs_fattr *fattr, struct nfs4_string *owner_name, struct nfs4_string *group_name) { fattr->owner_name = owner_name; fattr->group_name = group_name; } static void nfs_fattr_free_owner_name(struct nfs_fattr *fattr) { fattr->valid &= ~NFS_ATTR_FATTR_OWNER_NAME; kfree(fattr->owner_name->data); } static void nfs_fattr_free_group_name(struct nfs_fattr *fattr) { fattr->valid &= ~NFS_ATTR_FATTR_GROUP_NAME; kfree(fattr->group_name->data); } static bool nfs_fattr_map_owner_name(struct nfs_server *server, struct nfs_fattr *fattr) { struct nfs4_string *owner = fattr->owner_name; kuid_t uid; if (!(fattr->valid & NFS_ATTR_FATTR_OWNER_NAME)) return false; if (nfs_map_name_to_uid(server, owner->data, owner->len, &uid) == 0) { fattr->uid = uid; fattr->valid |= NFS_ATTR_FATTR_OWNER; } return true; } static bool nfs_fattr_map_group_name(struct nfs_server *server, struct nfs_fattr *fattr) { struct nfs4_string *group = fattr->group_name; kgid_t gid; if 
(!(fattr->valid & NFS_ATTR_FATTR_GROUP_NAME)) return false; if (nfs_map_group_to_gid(server, group->data, group->len, &gid) == 0) { fattr->gid = gid; fattr->valid |= NFS_ATTR_FATTR_GROUP; } return true; } /** * nfs_fattr_free_names - free up the NFSv4 owner and group strings * @fattr: a fully initialised nfs_fattr structure */ void nfs_fattr_free_names(struct nfs_fattr *fattr) { if (fattr->valid & NFS_ATTR_FATTR_OWNER_NAME) nfs_fattr_free_owner_name(fattr); if (fattr->valid & NFS_ATTR_FATTR_GROUP_NAME) nfs_fattr_free_group_name(fattr); } /** * nfs_fattr_map_and_free_names - map owner/group strings into uid/gid and free * @server: pointer to the filesystem nfs_server structure * @fattr: a fully initialised nfs_fattr structure * * This helper maps the cached NFSv4 owner/group strings in fattr into * their numeric uid/gid equivalents, and then frees the cached strings. */ void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *fattr) { if (nfs_fattr_map_owner_name(server, fattr)) nfs_fattr_free_owner_name(fattr); if (nfs_fattr_map_group_name(server, fattr)) nfs_fattr_free_group_name(fattr); } int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res) { unsigned long val; char buf[16]; if (memchr(name, '@', namelen) != NULL || namelen >= sizeof(buf)) return 0; memcpy(buf, name, namelen); buf[namelen] = '\0'; if (kstrtoul(buf, 0, &val) != 0) return 0; *res = val; return 1; } EXPORT_SYMBOL_GPL(nfs_map_string_to_numeric); static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen) { return snprintf(buf, buflen, "%u", id); } static struct key_type key_type_id_resolver = { .name = "id_resolver", .preparse = user_preparse, .free_preparse = user_free_preparse, .instantiate = generic_key_instantiate, .revoke = user_revoke, .destroy = user_destroy, .describe = user_describe, .read = user_read, }; int nfs_idmap_init(void) { struct cred *cred; struct key *keyring; int ret = 0; printk(KERN_NOTICE "NFS: Registering the %s key type\n", key_type_id_resolver.name); cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; keyring = keyring_alloc(".id_resolver", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; } ret = register_key_type(&key_type_id_resolver); if (ret < 0) goto failed_put_key; ret = register_key_type(&key_type_id_resolver_legacy); if (ret < 0) goto failed_reg_legacy; set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags); cred->thread_keyring = keyring; cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; id_resolver_cache = cred; return 0; failed_reg_legacy: unregister_key_type(&key_type_id_resolver); failed_put_key: key_put(keyring); failed_put_cred: put_cred(cred); return ret; } void nfs_idmap_quit(void) { key_revoke(id_resolver_cache->thread_keyring); unregister_key_type(&key_type_id_resolver); unregister_key_type(&key_type_id_resolver_legacy); put_cred(id_resolver_cache); } /* * Assemble the description to pass to request_key() * This function will allocate a new string and update dest to point * at it. The caller is responsible for freeing dest. * * On error 0 is returned. Otherwise, the length of dest is returned. 
*/ static ssize_t nfs_idmap_get_desc(const char *name, size_t namelen, const char *type, size_t typelen, char **desc) { char *cp; size_t desclen = typelen + namelen + 2; *desc = kmalloc(desclen, GFP_KERNEL); if (!*desc) return -ENOMEM; cp = *desc; memcpy(cp, type, typelen); cp += typelen; *cp++ = ':'; memcpy(cp, name, namelen); cp += namelen; *cp = '\0'; return desclen; } static struct key *nfs_idmap_request_key(const char *name, size_t namelen, const char *type, struct idmap *idmap) { char *desc; struct key *rkey = ERR_PTR(-EAGAIN); ssize_t ret; ret = nfs_idmap_get_desc(name, namelen, type, strlen(type), &desc); if (ret < 0) return ERR_PTR(ret); if (!idmap->user_ns || idmap->user_ns == &init_user_ns) rkey = request_key(&key_type_id_resolver, desc, ""); if (IS_ERR(rkey)) { mutex_lock(&idmap->idmap_mutex); rkey = request_key_with_auxdata(&key_type_id_resolver_legacy, desc, NULL, "", 0, idmap); mutex_unlock(&idmap->idmap_mutex); } if (!IS_ERR(rkey)) set_bit(KEY_FLAG_ROOT_CAN_INVAL, &rkey->flags); kfree(desc); return rkey; } static ssize_t nfs_idmap_get_key(const char *name, size_t namelen, const char *type, void *data, size_t data_size, struct idmap *idmap) { const struct cred *saved_cred; struct key *rkey; const struct user_key_payload *payload; ssize_t ret; saved_cred = override_creds(id_resolver_cache); rkey = nfs_idmap_request_key(name, namelen, type, idmap); revert_creds(saved_cred); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); goto out; } rcu_read_lock(); rkey->perm |= KEY_USR_VIEW; ret = key_validate(rkey); if (ret < 0) goto out_up; payload = user_key_payload_rcu(rkey); if (IS_ERR_OR_NULL(payload)) { ret = PTR_ERR(payload); goto out_up; } ret = payload->datalen; if (ret > 0 && ret <= data_size) memcpy(data, payload->data, ret); else ret = -EINVAL; out_up: rcu_read_unlock(); key_put(rkey); out: return ret; } /* ID -> Name */ static ssize_t nfs_idmap_lookup_name(__u32 id, const char *type, char *buf, size_t buflen, struct idmap *idmap) { char id_str[NFS_UINT_MAXLEN]; int id_len; ssize_t ret; id_len = nfs_map_numeric_to_string(id, id_str, sizeof(id_str)); ret = nfs_idmap_get_key(id_str, id_len, type, buf, buflen, idmap); if (ret < 0) return -EINVAL; return ret; } /* Name -> ID */ static int nfs_idmap_lookup_id(const char *name, size_t namelen, const char *type, __u32 *id, struct idmap *idmap) { char id_str[NFS_UINT_MAXLEN]; long id_long; ssize_t data_size; int ret = 0; data_size = nfs_idmap_get_key(name, namelen, type, id_str, NFS_UINT_MAXLEN, idmap); if (data_size <= 0) { ret = -EINVAL; } else { ret = kstrtol(id_str, 10, &id_long); if (!ret) *id = (__u32)id_long; } return ret; } /* idmap classic begins here */ enum { Opt_find_uid, Opt_find_gid, Opt_find_user, Opt_find_group, Opt_find_err }; static const match_table_t nfs_idmap_tokens = { { Opt_find_uid, "uid:%s" }, { Opt_find_gid, "gid:%s" }, { Opt_find_user, "user:%s" }, { Opt_find_group, "group:%s" }, { Opt_find_err, NULL } }; static int nfs_idmap_legacy_upcall(struct key *, void *); static ssize_t idmap_pipe_downcall(struct file *, const char __user *, size_t); static void idmap_release_pipe(struct inode *); static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); static const struct rpc_pipe_ops idmap_upcall_ops = { .upcall = rpc_pipe_generic_upcall, .downcall = idmap_pipe_downcall, .release_pipe = idmap_release_pipe, .destroy_msg = idmap_pipe_destroy_msg, }; static struct key_type key_type_id_resolver_legacy = { .name = "id_legacy", .preparse = user_preparse, .free_preparse = user_free_preparse, .instantiate = 
generic_key_instantiate, .revoke = user_revoke, .destroy = user_destroy, .describe = user_describe, .read = user_read, .request_key = nfs_idmap_legacy_upcall, }; static void nfs_idmap_pipe_destroy(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct idmap *idmap = pdo->pdo_data; rpc_unlink(idmap->idmap_pipe); } static int nfs_idmap_pipe_create(struct dentry *dir, struct rpc_pipe_dir_object *pdo) { struct idmap *idmap = pdo->pdo_data; return rpc_mkpipe_dentry(dir, "idmap", idmap, idmap->idmap_pipe); } static const struct rpc_pipe_dir_object_ops nfs_idmap_pipe_dir_object_ops = { .create = nfs_idmap_pipe_create, .destroy = nfs_idmap_pipe_destroy, }; int nfs_idmap_new(struct nfs_client *clp) { struct idmap *idmap; struct rpc_pipe *pipe; int error; idmap = kzalloc(sizeof(*idmap), GFP_KERNEL); if (idmap == NULL) return -ENOMEM; mutex_init(&idmap->idmap_mutex); idmap->user_ns = get_user_ns(clp->cl_rpcclient->cl_cred->user_ns); rpc_init_pipe_dir_object(&idmap->idmap_pdo, &nfs_idmap_pipe_dir_object_ops, idmap); pipe = rpc_mkpipe_data(&idmap_upcall_ops, 0); if (IS_ERR(pipe)) { error = PTR_ERR(pipe); goto err; } idmap->idmap_pipe = pipe; error = rpc_add_pipe_dir_object(clp->cl_net, &clp->cl_rpcclient->cl_pipedir_objects, &idmap->idmap_pdo); if (error) goto err_destroy_pipe; clp->cl_idmap = idmap; return 0; err_destroy_pipe: rpc_destroy_pipe_data(idmap->idmap_pipe); err: put_user_ns(idmap->user_ns); kfree(idmap); return error; } void nfs_idmap_delete(struct nfs_client *clp) { struct idmap *idmap = clp->cl_idmap; if (!idmap) return; clp->cl_idmap = NULL; rpc_remove_pipe_dir_object(clp->cl_net, &clp->cl_rpcclient->cl_pipedir_objects, &idmap->idmap_pdo); rpc_destroy_pipe_data(idmap->idmap_pipe); put_user_ns(idmap->user_ns); kfree(idmap); } static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap, struct idmap_msg *im, struct rpc_pipe_msg *msg) { substring_t substr; int token, ret; im->im_type = IDMAP_TYPE_GROUP; token = match_token(desc, nfs_idmap_tokens, &substr); switch (token) { case Opt_find_uid: im->im_type = IDMAP_TYPE_USER; fallthrough; case Opt_find_gid: im->im_conv = IDMAP_CONV_NAMETOID; ret = match_strlcpy(im->im_name, &substr, IDMAP_NAMESZ); break; case Opt_find_user: im->im_type = IDMAP_TYPE_USER; fallthrough; case Opt_find_group: im->im_conv = IDMAP_CONV_IDTONAME; ret = match_int(&substr, &im->im_id); if (ret) goto out; break; default: ret = -EINVAL; goto out; } msg->data = im; msg->len = sizeof(struct idmap_msg); out: return ret; } static bool nfs_idmap_prepare_pipe_upcall(struct idmap *idmap, struct idmap_legacy_upcalldata *data) { if (idmap->idmap_upcall_data != NULL) { WARN_ON_ONCE(1); return false; } idmap->idmap_upcall_data = data; return true; } static void nfs_idmap_complete_pipe_upcall(struct idmap_legacy_upcalldata *data, int ret) { complete_request_key(data->authkey, ret); key_put(data->authkey); kfree(data); } static void nfs_idmap_abort_pipe_upcall(struct idmap *idmap, struct idmap_legacy_upcalldata *data, int ret) { if (cmpxchg(&idmap->idmap_upcall_data, data, NULL) == data) nfs_idmap_complete_pipe_upcall(data, ret); } static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux) { struct idmap_legacy_upcalldata *data; struct request_key_auth *rka = get_request_key_auth(authkey); struct rpc_pipe_msg *msg; struct idmap_msg *im; struct idmap *idmap = aux; struct key *key = rka->target_key; int ret = -ENOKEY; if (!aux) goto out1; /* msg and im are freed in idmap_pipe_destroy_msg */ ret = -ENOMEM; data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto 
out1; msg = &data->pipe_msg; im = &data->idmap_msg; data->idmap = idmap; data->authkey = key_get(authkey); ret = nfs_idmap_prepare_message(key->description, idmap, im, msg); if (ret < 0) goto out2; ret = -EAGAIN; if (!nfs_idmap_prepare_pipe_upcall(idmap, data)) goto out2; ret = rpc_queue_upcall(idmap->idmap_pipe, msg); if (ret < 0) nfs_idmap_abort_pipe_upcall(idmap, data, ret); return ret; out2: kfree(data); out1: complete_request_key(authkey, ret); return ret; } static int nfs_idmap_instantiate(struct key *key, struct key *authkey, char *data, size_t datalen) { return key_instantiate_and_link(key, data, datalen, id_resolver_cache->thread_keyring, authkey); } static int nfs_idmap_read_and_verify_message(struct idmap_msg *im, struct idmap_msg *upcall, struct key *key, struct key *authkey) { char id_str[NFS_UINT_MAXLEN]; size_t len; int ret = -ENOKEY; /* ret = -ENOKEY */ if (upcall->im_type != im->im_type || upcall->im_conv != im->im_conv) goto out; switch (im->im_conv) { case IDMAP_CONV_NAMETOID: if (strcmp(upcall->im_name, im->im_name) != 0) break; /* Note: here we store the NUL terminator too */ len = 1 + nfs_map_numeric_to_string(im->im_id, id_str, sizeof(id_str)); ret = nfs_idmap_instantiate(key, authkey, id_str, len); break; case IDMAP_CONV_IDTONAME: if (upcall->im_id != im->im_id) break; len = strlen(im->im_name); ret = nfs_idmap_instantiate(key, authkey, im->im_name, len); break; default: ret = -EINVAL; } out: return ret; } static ssize_t idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { struct request_key_auth *rka; struct rpc_inode *rpci = RPC_I(file_inode(filp)); struct idmap *idmap = (struct idmap *)rpci->private; struct idmap_legacy_upcalldata *data; struct key *authkey; struct idmap_msg im; size_t namelen_in; int ret = -ENOKEY; /* If instantiation is successful, anyone waiting for key construction * will have been woken up and someone else may now have used * idmap_key_cons - so after this point we may no longer touch it. 
*/ data = xchg(&idmap->idmap_upcall_data, NULL); if (data == NULL) goto out_noupcall; authkey = data->authkey; rka = get_request_key_auth(authkey); if (mlen != sizeof(im)) { ret = -ENOSPC; goto out; } if (copy_from_user(&im, src, mlen) != 0) { ret = -EFAULT; goto out; } if (!(im.im_status & IDMAP_STATUS_SUCCESS)) { ret = -ENOKEY; goto out; } namelen_in = strnlen(im.im_name, IDMAP_NAMESZ); if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) { ret = -EINVAL; goto out; } ret = nfs_idmap_read_and_verify_message(&im, &data->idmap_msg, rka->target_key, authkey); if (ret >= 0) { key_set_timeout(rka->target_key, nfs_idmap_cache_timeout); ret = mlen; } out: nfs_idmap_complete_pipe_upcall(data, ret); out_noupcall: return ret; } static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg) { struct idmap_legacy_upcalldata *data = container_of(msg, struct idmap_legacy_upcalldata, pipe_msg); struct idmap *idmap = data->idmap; if (msg->errno) nfs_idmap_abort_pipe_upcall(idmap, data, msg->errno); } static void idmap_release_pipe(struct inode *inode) { struct rpc_inode *rpci = RPC_I(inode); struct idmap *idmap = (struct idmap *)rpci->private; struct idmap_legacy_upcalldata *data; data = xchg(&idmap->idmap_upcall_data, NULL); if (data) nfs_idmap_complete_pipe_upcall(data, -EPIPE); } int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid) { struct idmap *idmap = server->nfs_client->cl_idmap; __u32 id = -1; int ret = 0; if (!nfs_map_string_to_numeric(name, namelen, &id)) ret = nfs_idmap_lookup_id(name, namelen, "uid", &id, idmap); if (ret == 0) { *uid = make_kuid(idmap_userns(idmap), id); if (!uid_valid(*uid)) ret = -ERANGE; } trace_nfs4_map_name_to_uid(name, namelen, id, ret); return ret; } int nfs_map_group_to_gid(const struct nfs_server *server, const char *name, size_t namelen, kgid_t *gid) { struct idmap *idmap = server->nfs_client->cl_idmap; __u32 id = -1; int ret = 0; if (!nfs_map_string_to_numeric(name, namelen, &id)) ret = nfs_idmap_lookup_id(name, namelen, "gid", &id, idmap); if (ret == 0) { *gid = make_kgid(idmap_userns(idmap), id); if (!gid_valid(*gid)) ret = -ERANGE; } trace_nfs4_map_group_to_gid(name, namelen, id, ret); return ret; } int nfs_map_uid_to_name(const struct nfs_server *server, kuid_t uid, char *buf, size_t buflen) { struct idmap *idmap = server->nfs_client->cl_idmap; int ret = -EINVAL; __u32 id; id = from_kuid_munged(idmap_userns(idmap), uid); if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) ret = nfs_idmap_lookup_name(id, "user", buf, buflen, idmap); if (ret < 0) ret = nfs_map_numeric_to_string(id, buf, buflen); trace_nfs4_map_uid_to_name(buf, ret, id, ret); return ret; } int nfs_map_gid_to_group(const struct nfs_server *server, kgid_t gid, char *buf, size_t buflen) { struct idmap *idmap = server->nfs_client->cl_idmap; int ret = -EINVAL; __u32 id; id = from_kgid_munged(idmap_userns(idmap), gid); if (!(server->caps & NFS_CAP_UIDGID_NOMAP)) ret = nfs_idmap_lookup_name(id, "group", buf, buflen, idmap); if (ret < 0) ret = nfs_map_numeric_to_string(id, buf, buflen); trace_nfs4_map_gid_to_group(buf, ret, id, ret); return ret; }
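The legacy upcall path keys everything off the request_key() description string built by nfs_idmap_get_desc(): "<type>:<name-or-id>", matching the uid:/gid:/user:/group: tokens parsed above. A standalone sketch of the same formatting, with idmap_desc as a hypothetical userspace helper (not kernel code):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Build "<type>:<name>" the way nfs_idmap_get_desc() does, e.g.
 * "uid:alice" for a name-to-id lookup or "group:1000" for id-to-name. */
static char *idmap_desc(const char *type, const char *name)
{
    size_t len = strlen(type) + strlen(name) + 2;   /* ':' plus NUL */
    char *desc = malloc(len);

    if (desc)
        snprintf(desc, len, "%s:%s", type, name);
    return desc;
}

int main(void)
{
    char *d = idmap_desc("uid", "alice");

    if (d) {
        printf("%s\n", d);  /* prints "uid:alice" */
        free(d);
    }
    return 0;
}

In practice that description reaches a request-key handler (typically the nfsidmap helper from nfs-utils), which instantiates the key with the mapped id or name.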
10 10 14 14 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/preempt.h> #include <linux/smp.h> #include <linux/completion.h> #include <asm/msr.h> static void __rdmsr_on_cpu(void *info) { struct msr_info *rv = info; struct msr *reg; if (rv->msrs) reg = this_cpu_ptr(rv->msrs); else reg = &rv->reg; rdmsr(rv->msr_no, reg->l, reg->h); } static void __wrmsr_on_cpu(void *info) { struct msr_info *rv = info; struct msr *reg; if (rv->msrs) reg = this_cpu_ptr(rv->msrs); else reg = &rv->reg; wrmsr(rv->msr_no, reg->l, reg->h); } int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); *l = rv.reg.l; *h = rv.reg.h; return err; } EXPORT_SYMBOL(rdmsr_on_cpu); int rdmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); *q = rv.reg.q; return err; } EXPORT_SYMBOL(rdmsrq_on_cpu); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; rv.reg.l = l; rv.reg.h = h; err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); return err; } EXPORT_SYMBOL(wrmsr_on_cpu); int wrmsrq_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; rv.reg.q = q; err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); return err; } EXPORT_SYMBOL(wrmsrq_on_cpu); static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs, void (*msr_func) (void *info)) { struct msr_info rv; int this_cpu; memset(&rv, 0, sizeof(rv)); rv.msrs = msrs; rv.msr_no = msr_no; this_cpu = get_cpu(); if (cpumask_test_cpu(this_cpu, mask)) msr_func(&rv); smp_call_function_many(mask, msr_func, &rv, 1); put_cpu(); } /* rdmsr on a bunch of CPUs * * @mask: which CPUs * @msr_no: which MSR * @msrs: array of MSR values * */ void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs) { __rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu); } EXPORT_SYMBOL(rdmsr_on_cpus); /* * wrmsr on a bunch of CPUs * * @mask: which CPUs * @msr_no: which MSR * @msrs: array of MSR values * */ void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr __percpu *msrs) { __rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu); } EXPORT_SYMBOL(wrmsr_on_cpus); struct 
msr_info_completion { struct msr_info msr; struct completion done; }; /* These "safe" variants are slower and should be used when the target MSR may not actually exist. */ static void __rdmsr_safe_on_cpu(void *info) { struct msr_info_completion *rv = info; rv->msr.err = rdmsr_safe(rv->msr.msr_no, &rv->msr.reg.l, &rv->msr.reg.h); complete(&rv->done); } static void __wrmsr_safe_on_cpu(void *info) { struct msr_info *rv = info; rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h); } int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { struct msr_info_completion rv; call_single_data_t csd; int err; INIT_CSD(&csd, __rdmsr_safe_on_cpu, &rv); memset(&rv, 0, sizeof(rv)); init_completion(&rv.done); rv.msr.msr_no = msr_no; err = smp_call_function_single_async(cpu, &csd); if (!err) { wait_for_completion(&rv.done); err = rv.msr.err; } *l = rv.msr.reg.l; *h = rv.msr.reg.h; return err; } EXPORT_SYMBOL(rdmsr_safe_on_cpu); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; rv.reg.l = l; rv.reg.h = h; err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); return err ? err : rv.err; } EXPORT_SYMBOL(wrmsr_safe_on_cpu); int wrmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { int err; struct msr_info rv; memset(&rv, 0, sizeof(rv)); rv.msr_no = msr_no; rv.reg.q = q; err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); return err ? err : rv.err; } EXPORT_SYMBOL(wrmsrq_safe_on_cpu); int rdmsrq_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { u32 low, high; int err; err = rdmsr_safe_on_cpu(cpu, msr_no, &low, &high); *q = (u64)high << 32 | low; return err; } EXPORT_SYMBOL(rdmsrq_safe_on_cpu); /* * These variants are significantly slower, but allows control over * the entire 32-bit GPR set. */ static void __rdmsr_safe_regs_on_cpu(void *info) { struct msr_regs_info *rv = info; rv->err = rdmsr_safe_regs(rv->regs); } static void __wrmsr_safe_regs_on_cpu(void *info) { struct msr_regs_info *rv = info; rv->err = wrmsr_safe_regs(rv->regs); } int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { int err; struct msr_regs_info rv; rv.regs = regs; rv.err = -EIO; err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1); return err ? err : rv.err; } EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu); int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { int err; struct msr_regs_info rv; rv.regs = regs; rv.err = -EIO; err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1); return err ? err : rv.err; } EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu);
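These helpers exist for other kernel code that needs an MSR value from a specific CPU without migrating there. A hypothetical module sketch using the safe variant above (the MSR number and messages are illustrative; rdmsr_safe_on_cpu() returns non-zero if the MSR cannot be read on that CPU):

#include <linux/module.h>
#include <asm/msr.h>

static int __init msr_demo_init(void)
{
    u32 lo, hi;
    int err;

    /* Read IA32_MPERF (0xE7) on CPU 0 via the cross-CPU helper. */
    err = rdmsr_safe_on_cpu(0, 0xE7, &lo, &hi);
    if (err)
        pr_info("msr_demo: MSR 0xE7 not readable on CPU0 (%d)\n", err);
    else
        pr_info("msr_demo: IA32_MPERF = %#llx\n", ((u64)hi << 32) | lo);
    return 0;
}

static void __exit msr_demo_exit(void)
{
}

module_init(msr_demo_init);
module_exit(msr_demo_exit);
MODULE_LICENSE("GPL");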
54 53 1 2 2 84 1 83 82 29 6 47 85 83 2 1 1 4 63 33 2 4 58 30 81 1 4 1 54 29 100 100 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 // SPDX-License-Identifier: GPL-2.0 /* XDP user-space packet buffer * Copyright(c) 2018 Intel Corporation. */ #include <linux/init.h> #include <linux/sched/mm.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/uaccess.h> #include <linux/slab.h> #include <linux/bpf.h> #include <linux/mm.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/idr.h> #include <linux/vmalloc.h> #include "xdp_umem.h" #include "xsk_queue.h" static DEFINE_IDA(umem_ida); static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true); kvfree(umem->pgs); umem->pgs = NULL; } static void xdp_umem_unaccount_pages(struct xdp_umem *umem) { if (umem->user) { atomic_long_sub(umem->npgs, &umem->user->locked_vm); free_uid(umem->user); } } static void xdp_umem_addr_unmap(struct xdp_umem *umem) { vunmap(umem->addrs); umem->addrs = NULL; } static int xdp_umem_addr_map(struct xdp_umem *umem, struct page **pages, u32 nr_pages) { umem->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); if (!umem->addrs) return -ENOMEM; return 0; } static void xdp_umem_release(struct xdp_umem *umem) { umem->zc = false; ida_free(&umem_ida, umem->id); xdp_umem_addr_unmap(umem); xdp_umem_unpin_pages(umem); xdp_umem_unaccount_pages(umem); kfree(umem); } static void xdp_umem_release_deferred(struct work_struct *work) { struct xdp_umem *umem = container_of(work, struct xdp_umem, work); xdp_umem_release(umem); } void xdp_get_umem(struct xdp_umem *umem) { refcount_inc(&umem->users); } void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup) { if (!umem) return; if (refcount_dec_and_test(&umem->users)) { if (defer_cleanup) { INIT_WORK(&umem->work, xdp_umem_release_deferred); schedule_work(&umem->work); } else { xdp_umem_release(umem); } } } static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address) { unsigned int gup_flags = FOLL_WRITE; long npgs; int err; umem->pgs = kvcalloc(umem->npgs, sizeof(*umem->pgs), GFP_KERNEL | __GFP_NOWARN); if (!umem->pgs) return -ENOMEM; mmap_read_lock(current->mm); npgs = pin_user_pages(address, umem->npgs, gup_flags | FOLL_LONGTERM, &umem->pgs[0]); mmap_read_unlock(current->mm); if (npgs != umem->npgs) { if (npgs >= 0) { umem->npgs = npgs; err = -ENOMEM; goto out_pin; } err = npgs; goto out_pgs; } return 0; out_pin: xdp_umem_unpin_pages(umem); out_pgs: kvfree(umem->pgs); umem->pgs = NULL; return err; } static int xdp_umem_account_pages(struct xdp_umem *umem) { 
unsigned long lock_limit, new_npgs, old_npgs; if (capable(CAP_IPC_LOCK)) return 0; lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; umem->user = get_uid(current_user()); do { old_npgs = atomic_long_read(&umem->user->locked_vm); new_npgs = old_npgs + umem->npgs; if (new_npgs > lock_limit) { free_uid(umem->user); umem->user = NULL; return -ENOBUFS; } } while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs, new_npgs) != old_npgs); return 0; } #define XDP_UMEM_FLAGS_VALID ( \ XDP_UMEM_UNALIGNED_CHUNK_FLAG | \ XDP_UMEM_TX_SW_CSUM | \ XDP_UMEM_TX_METADATA_LEN | \ 0) static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) { bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG; u32 chunk_size = mr->chunk_size, headroom = mr->headroom; u64 addr = mr->addr, size = mr->len; u32 chunks_rem, npgs_rem; u64 chunks, npgs; int err; if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) { /* Strictly speaking we could support this, if: * - huge pages, or* * - using an IOMMU, or * - making sure the memory area is consecutive * but for now, we simply say "computer says no". */ return -EINVAL; } if (mr->flags & ~XDP_UMEM_FLAGS_VALID) return -EINVAL; if (!unaligned_chunks && !is_power_of_2(chunk_size)) return -EINVAL; if (!PAGE_ALIGNED(addr)) { /* Memory area has to be page size aligned. For * simplicity, this might change. */ return -EINVAL; } if ((addr + size) < addr) return -EINVAL; npgs = div_u64_rem(size, PAGE_SIZE, &npgs_rem); if (npgs_rem) npgs++; if (npgs > U32_MAX) return -EINVAL; chunks = div_u64_rem(size, chunk_size, &chunks_rem); if (!chunks || chunks > U32_MAX) return -EINVAL; if (!unaligned_chunks && chunks_rem) return -EINVAL; if (headroom >= chunk_size - XDP_PACKET_HEADROOM) return -EINVAL; if (mr->flags & XDP_UMEM_TX_METADATA_LEN) { if (mr->tx_metadata_len >= 256 || mr->tx_metadata_len % 8) return -EINVAL; umem->tx_metadata_len = mr->tx_metadata_len; } umem->size = size; umem->headroom = headroom; umem->chunk_size = chunk_size; umem->chunks = chunks; umem->npgs = npgs; umem->pgs = NULL; umem->user = NULL; umem->flags = mr->flags; INIT_LIST_HEAD(&umem->xsk_dma_list); refcount_set(&umem->users, 1); err = xdp_umem_account_pages(umem); if (err) return err; err = xdp_umem_pin_pages(umem, (unsigned long)addr); if (err) goto out_account; err = xdp_umem_addr_map(umem, umem->pgs, umem->npgs); if (err) goto out_unpin; return 0; out_unpin: xdp_umem_unpin_pages(umem); out_account: xdp_umem_unaccount_pages(umem); return err; } struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr) { struct xdp_umem *umem; int err; umem = kzalloc(sizeof(*umem), GFP_KERNEL); if (!umem) return ERR_PTR(-ENOMEM); err = ida_alloc(&umem_ida, GFP_KERNEL); if (err < 0) { kfree(umem); return ERR_PTR(err); } umem->id = err; err = xdp_umem_reg(umem, mr); if (err) { ida_free(&umem_ida, umem->id); kfree(umem); return ERR_PTR(err); } return umem; }
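xdp_umem_reg() is reached from the AF_XDP setsockopt path when userspace registers its packet buffer area. A minimal userspace sketch of that registration, assuming <linux/if_xdp.h> from reasonably recent kernel headers (sizes and the fallback defines are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <linux/if_xdp.h>

#ifndef AF_XDP
#define AF_XDP 44       /* older libc headers may lack these constants */
#endif
#ifndef SOL_XDP
#define SOL_XDP 283
#endif

int main(void)
{
    const size_t umem_len = 4096 * 1024;    /* 4 MiB, page aligned via mmap */
    struct xdp_umem_reg mr;
    void *area;
    int fd;

    fd = socket(AF_XDP, SOCK_RAW, 0);
    if (fd < 0) {
        perror("socket(AF_XDP)");
        return 1;
    }
    area = mmap(NULL, umem_len, PROT_READ | PROT_WRITE,
                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (area == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    memset(&mr, 0, sizeof(mr));
    mr.addr = (uintptr_t)area;
    mr.len = umem_len;
    mr.chunk_size = 2048;   /* power of two, within the kernel-side checks */
    mr.headroom = 0;
    if (setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)) < 0)
        perror("setsockopt(XDP_UMEM_REG)");
    return 0;
}

The checks in xdp_umem_reg() explain most EINVAL returns here: the area must be page aligned, the chunk size a power of two (unless the unaligned-chunk flag is set), and the headroom small enough to leave room for XDP_PACKET_HEADROOM.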
3258 6 3258 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UACCESS_64_H #define _ASM_X86_UACCESS_64_H /* * User space memory access functions */ #include <linux/compiler.h> #include <linux/lockdep.h> #include <linux/kasan-checks.h> #include <asm/alternative.h> #include <asm/cpufeatures.h> #include <asm/page.h> #include <asm/percpu.h> #include <asm/runtime-const.h> /* * Virtual variable: there's no actual backing store for this, * it can purely be used as 'runtime_const_ptr(USER_PTR_MAX)' */ extern unsigned long USER_PTR_MAX; #ifdef CONFIG_ADDRESS_MASKING /* * Mask out tag bits from the address. */ static inline unsigned long __untagged_addr(unsigned long addr) { asm_inline (ALTERNATIVE("", "and " __percpu_arg([mask]) ", %[addr]", X86_FEATURE_LAM) : [addr] "+r" (addr) : [mask] "m" (__my_cpu_var(tlbstate_untag_mask))); return addr; } #define untagged_addr(addr) ({ \ unsigned long __addr = (__force unsigned long)(addr); \ (__force __typeof__(addr))__untagged_addr(__addr); \ }) static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, unsigned long addr) { mmap_assert_locked(mm); return addr & (mm)->context.untag_mask; } #define untagged_addr_remote(mm, addr) ({ \ unsigned long __addr = (__force unsigned long)(addr); \ (__force __typeof__(addr))__untagged_addr_remote(mm, __addr); \ }) #endif #define valid_user_address(x) \ likely((__force unsigned long)(x) <= runtime_const_ptr(USER_PTR_MAX)) /* * Masking the user address is an alternative to a conditional * user_access_begin that can avoid the fencing. This only works * for dense accesses starting at the address. */ static inline void __user *mask_user_address(const void __user *ptr) { void __user *ret; asm("cmp %1,%0\n\t" "cmova %1,%0" :"=r" (ret) :"r" (runtime_const_ptr(USER_PTR_MAX)), "0" (ptr)); return ret; } #define masked_user_access_begin(x) ({ \ __auto_type __masked_ptr = (x); \ __masked_ptr = mask_user_address(__masked_ptr); \ __uaccess_begin(); __masked_ptr; }) /* * User pointers can have tag bits on x86-64. This scheme tolerates * arbitrary values in those bits rather then masking them off. * * Enforce two rules: * 1. 'ptr' must be in the user part of the address space * 2. 'ptr+size' must not overflow into kernel addresses * * Note that we always have at least one guard page between the * max user address and the non-canonical gap, allowing us to * ignore small sizes entirely. * * In fact, we could probably remove the size check entirely, since * any kernel accesses will be in increasing address order starting * at 'ptr'. * * That's a separate optimization, for now just handle the small * constant case. 
*/ static inline bool __access_ok(const void __user *ptr, unsigned long size) { if (__builtin_constant_p(size <= PAGE_SIZE) && size <= PAGE_SIZE) { return valid_user_address(ptr); } else { unsigned long sum = size + (__force unsigned long)ptr; return valid_user_address(sum) && sum >= (__force unsigned long)ptr; } } #define __access_ok __access_ok /* * Copy To/From Userspace */ /* Handles exceptions in both to and from, but doesn't do access_ok */ __must_check unsigned long rep_movs_alternative(void *to, const void *from, unsigned len); static __always_inline __must_check unsigned long copy_user_generic(void *to, const void *from, unsigned long len) { stac(); /* * If CPU has FSRM feature, use 'rep movs'. * Otherwise, use rep_movs_alternative. */ asm volatile( "1:\n\t" ALTERNATIVE("rep movsb", "call rep_movs_alternative", ALT_NOT(X86_FEATURE_FSRM)) "2:\n" _ASM_EXTABLE_UA(1b, 2b) :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT : : "memory", "rax"); clac(); return len; } static __always_inline __must_check unsigned long raw_copy_from_user(void *dst, const void __user *src, unsigned long size) { return copy_user_generic(dst, (__force void *)src, size); } static __always_inline __must_check unsigned long raw_copy_to_user(void __user *dst, const void *src, unsigned long size) { return copy_user_generic((__force void *)dst, src, size); } extern long __copy_user_nocache(void *dst, const void __user *src, unsigned size); extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size); static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size) { long ret; kasan_check_write(dst, size); stac(); ret = __copy_user_nocache(dst, src, size); clac(); return ret; } static inline int __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) { kasan_check_write(dst, size); return __copy_user_flushcache(dst, src, size); } /* * Zero Userspace. */ __must_check unsigned long rep_stos_alternative(void __user *addr, unsigned long len); static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size) { might_fault(); stac(); /* * No memory constraint because it doesn't change any memory gcc * knows about. */ asm volatile( "1:\n\t" ALTERNATIVE("rep stosb", "call rep_stos_alternative", ALT_NOT(X86_FEATURE_FSRS)) "2:\n" _ASM_EXTABLE_UA(1b, 2b) : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT : "a" (0)); clac(); return size; } static __always_inline unsigned long clear_user(void __user *to, unsigned long n) { if (__access_ok(to, n)) return __clear_user(to, n); return n; } #endif /* _ASM_X86_UACCESS_64_H */
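Drivers do not call the raw copy routines above directly; they use the generic copy_from_user()/copy_to_user() and clear_user() wrappers from linux/uaccess.h, which add the access_ok() and hardened-copy checks before landing in raw_copy_from_user()/__clear_user() on x86-64. A hypothetical driver-side sketch of the usual pattern (struct demo_args and the function are illustrative only):

#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/errno.h>

struct demo_args {
    __u64 src;
    __u64 len;
};

static long demo_copy_in(void __user *uarg)
{
    struct demo_args args;

    /* copy_from_user() fails cleanly on a bad pointer; on x86-64 the
     * actual byte copy ends up in raw_copy_from_user() above. */
    if (copy_from_user(&args, uarg, sizeof(args)))
        return -EFAULT;
    if (!args.len)
        return -EINVAL;
    /* args is now a private kernel copy and safe to validate/use. */
    return 0;
}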
125 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_KEXEC_H #define LINUX_KEXEC_H #define IND_DESTINATION_BIT 0 #define IND_INDIRECTION_BIT 1 #define IND_DONE_BIT 2 #define IND_SOURCE_BIT 3 #define IND_DESTINATION (1 << IND_DESTINATION_BIT) #define IND_INDIRECTION (1 << IND_INDIRECTION_BIT) #define IND_DONE (1 << IND_DONE_BIT) #define IND_SOURCE (1 << IND_SOURCE_BIT) #define IND_FLAGS (IND_DESTINATION | IND_INDIRECTION | IND_DONE | IND_SOURCE) #if !defined(__ASSEMBLY__) #include <linux/vmcore_info.h> #include <linux/crash_reserve.h> #include <asm/io.h> #include <linux/range.h> #include <uapi/linux/kexec.h> #include <linux/verification.h> extern note_buf_t __percpu *crash_notes; #ifdef CONFIG_CRASH_DUMP #include <linux/prandom.h> #endif #ifdef CONFIG_KEXEC_CORE #include <linux/list.h> #include <linux/compat.h> #include <linux/ioport.h> #include <linux/module.h> #include <linux/highmem.h> #include <asm/kexec.h> #include <linux/crash_core.h> /* Verify architecture specific macros are defined */ #ifndef KEXEC_SOURCE_MEMORY_LIMIT #error KEXEC_SOURCE_MEMORY_LIMIT not defined #endif #ifndef KEXEC_DESTINATION_MEMORY_LIMIT #error KEXEC_DESTINATION_MEMORY_LIMIT not defined #endif #ifndef KEXEC_CONTROL_MEMORY_LIMIT #error KEXEC_CONTROL_MEMORY_LIMIT not defined #endif #ifndef KEXEC_CONTROL_MEMORY_GFP #define KEXEC_CONTROL_MEMORY_GFP (GFP_KERNEL | __GFP_NORETRY) #endif #ifndef KEXEC_CONTROL_PAGE_SIZE #error 
KEXEC_CONTROL_PAGE_SIZE not defined #endif #ifndef KEXEC_ARCH #error KEXEC_ARCH not defined #endif #ifndef KEXEC_CRASH_CONTROL_MEMORY_LIMIT #define KEXEC_CRASH_CONTROL_MEMORY_LIMIT KEXEC_CONTROL_MEMORY_LIMIT #endif #ifndef KEXEC_CRASH_MEM_ALIGN #define KEXEC_CRASH_MEM_ALIGN PAGE_SIZE #endif /* * This structure is used to hold the arguments that are used when loading * kernel binaries. */ typedef unsigned long kimage_entry_t; /* * This is a copy of the UAPI struct kexec_segment and must be identical * to it because it gets copied straight from user space into kernel * memory. Do not modify this structure unless you change the way segments * get ingested from user space. */ struct kexec_segment { /* * This pointer can point to user memory if kexec_load() system * call is used or will point to kernel memory if * kexec_file_load() system call is used. * * Use ->buf when expecting to deal with user memory and use ->kbuf * when expecting to deal with kernel memory. */ union { void __user *buf; void *kbuf; }; size_t bufsz; unsigned long mem; size_t memsz; }; #ifdef CONFIG_COMPAT struct compat_kexec_segment { compat_uptr_t buf; compat_size_t bufsz; compat_ulong_t mem; /* User space sees this as a (void *) ... */ compat_size_t memsz; }; #endif #ifdef CONFIG_KEXEC_FILE struct purgatory_info { /* * Pointer to elf header at the beginning of kexec_purgatory. * Note: kexec_purgatory is read only */ const Elf_Ehdr *ehdr; /* * Temporary, modifiable buffer for sechdrs used for relocation. * This memory can be freed post image load. */ Elf_Shdr *sechdrs; /* * Temporary, modifiable buffer for stripped purgatory used for * relocation. This memory can be freed post image load. */ void *purgatory_buf; }; struct kimage; typedef int (kexec_probe_t)(const char *kernel_buf, unsigned long kernel_size); typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, unsigned long kernel_len, char *initrd, unsigned long initrd_len, char *cmdline, unsigned long cmdline_len); typedef int (kexec_cleanup_t)(void *loader_data); #ifdef CONFIG_KEXEC_SIG typedef int (kexec_verify_sig_t)(const char *kernel_buf, unsigned long kernel_len); #endif struct kexec_file_ops { kexec_probe_t *probe; kexec_load_t *load; kexec_cleanup_t *cleanup; #ifdef CONFIG_KEXEC_SIG kexec_verify_sig_t *verify_sig; #endif }; extern const struct kexec_file_ops * const kexec_file_loaders[]; int kexec_image_probe_default(struct kimage *image, void *buf, unsigned long buf_len); int kexec_image_post_load_cleanup_default(struct kimage *image); /* * If kexec_buf.mem is set to this value, kexec_locate_mem_hole() * will try to allocate free memory. Arch may overwrite it. */ #ifndef KEXEC_BUF_MEM_UNKNOWN #define KEXEC_BUF_MEM_UNKNOWN 0 #endif /** * struct kexec_buf - parameters for finding a place for a buffer in memory * @image: kexec image in which memory to search. * @buffer: Contents which will be copied to the allocated memory. * @bufsz: Size of @buffer. * @mem: On return will have address of the buffer in memory. * @memsz: Size for the buffer in memory. * @buf_align: Minimum alignment needed. * @buf_min: The buffer can't be placed below this address. * @buf_max: The buffer can't be placed above this address. * @cma: CMA page if the buffer is backed by CMA. * @top_down: Allocate from top of memory. * @random: Place the buffer at a random position. 
*/ struct kexec_buf { struct kimage *image; void *buffer; unsigned long bufsz; unsigned long mem; unsigned long memsz; unsigned long buf_align; unsigned long buf_min; unsigned long buf_max; struct page *cma; bool top_down; #ifdef CONFIG_CRASH_DUMP bool random; #endif }; #ifdef CONFIG_CRASH_DUMP static inline void kexec_random_range_start(unsigned long start, unsigned long end, struct kexec_buf *kbuf, unsigned long *temp_start) { unsigned short i; if (kbuf->random) { get_random_bytes(&i, sizeof(unsigned short)); *temp_start = start + (end - start) / USHRT_MAX * i; } } #else static inline void kexec_random_range_start(unsigned long start, unsigned long end, struct kexec_buf *kbuf, unsigned long *temp_start) {} #endif int kexec_load_purgatory(struct kimage *image, struct kexec_buf *kbuf); int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, void *buf, unsigned int size, bool get_value); void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name); #ifndef arch_kexec_kernel_image_probe static inline int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len) { return kexec_image_probe_default(image, buf, buf_len); } #endif #ifndef arch_kimage_file_post_load_cleanup static inline int arch_kimage_file_post_load_cleanup(struct kimage *image) { return kexec_image_post_load_cleanup_default(image); } #endif #ifndef arch_check_excluded_range static inline int arch_check_excluded_range(struct kimage *image, unsigned long start, unsigned long end) { return 0; } #endif #ifdef CONFIG_KEXEC_SIG #ifdef CONFIG_SIGNED_PE_FILE_VERIFICATION int kexec_kernel_verify_pe_sig(const char *kernel, unsigned long kernel_len); #endif #endif extern int kexec_add_buffer(struct kexec_buf *kbuf); int kexec_locate_mem_hole(struct kexec_buf *kbuf); #ifndef arch_kexec_locate_mem_hole /** * arch_kexec_locate_mem_hole - Find free memory to place the segments. * @kbuf: Parameters for the memory search. * * On success, kbuf->mem will have the start address of the memory region found. * * Return: 0 on success, negative errno on error. */ static inline int arch_kexec_locate_mem_hole(struct kexec_buf *kbuf) { return kexec_locate_mem_hole(kbuf); } #endif #ifndef arch_kexec_apply_relocations_add /* * arch_kexec_apply_relocations_add - apply relocations of type RELA * @pi: Purgatory to be relocated. * @section: Section relocations applying to. * @relsec: Section containing RELAs. * @symtab: Corresponding symtab. * * Return: 0 on success, negative errno on error. */ static inline int arch_kexec_apply_relocations_add(struct purgatory_info *pi, Elf_Shdr *section, const Elf_Shdr *relsec, const Elf_Shdr *symtab) { pr_err("RELA relocation unsupported.\n"); return -ENOEXEC; } #endif #ifndef arch_kexec_apply_relocations /* * arch_kexec_apply_relocations - apply relocations of type REL * @pi: Purgatory to be relocated. * @section: Section relocations applying to. * @relsec: Section containing RELs. * @symtab: Corresponding symtab. * * Return: 0 on success, negative errno on error. */ static inline int arch_kexec_apply_relocations(struct purgatory_info *pi, Elf_Shdr *section, const Elf_Shdr *relsec, const Elf_Shdr *symtab) { pr_err("REL relocation unsupported.\n"); return -ENOEXEC; } #endif #endif /* CONFIG_KEXEC_FILE */ #ifdef CONFIG_KEXEC_ELF struct kexec_elf_info { /* * Where the ELF binary contents are kept. * Memory managed by the user of the struct. 
*/ const char *buffer; const struct elfhdr *ehdr; const struct elf_phdr *proghdrs; }; int kexec_build_elf_info(const char *buf, size_t len, struct elfhdr *ehdr, struct kexec_elf_info *elf_info); int kexec_elf_load(struct kimage *image, struct elfhdr *ehdr, struct kexec_elf_info *elf_info, struct kexec_buf *kbuf, unsigned long *lowest_load_addr); void kexec_free_elf_info(struct kexec_elf_info *elf_info); int kexec_elf_probe(const char *buf, unsigned long len); #endif struct kimage { kimage_entry_t head; kimage_entry_t *entry; kimage_entry_t *last_entry; unsigned long start; struct page *control_code_page; struct page *swap_page; void *vmcoreinfo_data_copy; /* locates in the crash memory */ unsigned long nr_segments; struct kexec_segment segment[KEXEC_SEGMENT_MAX]; struct page *segment_cma[KEXEC_SEGMENT_MAX]; struct list_head control_pages; struct list_head dest_pages; struct list_head unusable_pages; /* Address of next control page to allocate for crash kernels. */ unsigned long control_page; /* Flags to indicate special processing */ unsigned int type : 1; #define KEXEC_TYPE_DEFAULT 0 #define KEXEC_TYPE_CRASH 1 unsigned int preserve_context : 1; /* If set, we are using file mode kexec syscall */ unsigned int file_mode:1; #ifdef CONFIG_CRASH_HOTPLUG /* If set, it is safe to update kexec segments that are * excluded from SHA calculation. */ unsigned int hotplug_support:1; #endif unsigned int no_cma:1; #ifdef ARCH_HAS_KIMAGE_ARCH struct kimage_arch arch; #endif #ifdef CONFIG_KEXEC_FILE /* Additional fields for file based kexec syscall */ void *kernel_buf; unsigned long kernel_buf_len; void *initrd_buf; unsigned long initrd_buf_len; char *cmdline_buf; unsigned long cmdline_buf_len; /* File operations provided by image loader */ const struct kexec_file_ops *fops; /* Image loader handling the kernel can store a pointer here */ void *image_loader_data; /* Information for loading purgatory */ struct purgatory_info purgatory_info; #endif #ifdef CONFIG_CRASH_HOTPLUG int hp_action; int elfcorehdr_index; bool elfcorehdr_updated; #endif #ifdef CONFIG_IMA_KEXEC /* Virtual address of IMA measurement buffer for kexec syscall */ void *ima_buffer; phys_addr_t ima_buffer_addr; size_t ima_buffer_size; unsigned long ima_segment_index; bool is_ima_segment_index_set; #endif struct { struct kexec_segment *scratch; phys_addr_t fdt; } kho; /* Core ELF header buffer */ void *elf_headers; unsigned long elf_headers_sz; unsigned long elf_load_addr; /* dm crypt keys buffer */ unsigned long dm_crypt_keys_addr; unsigned long dm_crypt_keys_sz; }; /* kexec interface functions */ extern void machine_kexec(struct kimage *image); extern int machine_kexec_prepare(struct kimage *image); extern void machine_kexec_cleanup(struct kimage *image); extern int kernel_kexec(void); extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); #ifndef machine_kexec_post_load static inline int machine_kexec_post_load(struct kimage *image) { return 0; } #endif extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; bool kexec_load_permitted(int kexec_image_type); #ifndef kexec_flush_icache_page #define kexec_flush_icache_page(page) #endif /* List of defined/legal kexec flags */ #ifndef CONFIG_KEXEC_JUMP #define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_UPDATE_ELFCOREHDR | KEXEC_CRASH_HOTPLUG_SUPPORT) #else #define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT | KEXEC_UPDATE_ELFCOREHDR | \ KEXEC_CRASH_HOTPLUG_SUPPORT) #endif /* List of defined/legal kexec file flags */ #define KEXEC_FILE_FLAGS 
(KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_DEBUG | \ KEXEC_FILE_NO_CMA) /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; #ifndef page_to_boot_pfn static inline unsigned long page_to_boot_pfn(struct page *page) { return page_to_pfn(page); } #endif #ifndef boot_pfn_to_page static inline struct page *boot_pfn_to_page(unsigned long boot_pfn) { return pfn_to_page(boot_pfn); } #endif #ifndef phys_to_boot_phys static inline unsigned long phys_to_boot_phys(phys_addr_t phys) { return phys; } #endif #ifndef boot_phys_to_phys static inline phys_addr_t boot_phys_to_phys(unsigned long boot_phys) { return boot_phys; } #endif #ifndef crash_free_reserved_phys_range static inline void crash_free_reserved_phys_range(unsigned long begin, unsigned long end) { unsigned long addr; for (addr = begin; addr < end; addr += PAGE_SIZE) free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT)); } #endif static inline unsigned long virt_to_boot_phys(void *addr) { return phys_to_boot_phys(__pa((unsigned long)addr)); } static inline void *boot_phys_to_virt(unsigned long entry) { return phys_to_virt(boot_phys_to_phys(entry)); } #ifndef arch_kexec_post_alloc_pages static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) { return 0; } #endif #ifndef arch_kexec_pre_free_pages static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { } #endif extern bool kexec_file_dbg_print; #define kexec_dprintk(fmt, arg...) \ do { if (kexec_file_dbg_print) pr_info(fmt, ##arg); } while (0) extern void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size); extern void kimage_unmap_segment(void *buffer); #else /* !CONFIG_KEXEC_CORE */ struct pt_regs; struct task_struct; struct kimage; static inline void __crash_kexec(struct pt_regs *regs) { } static inline void crash_kexec(struct pt_regs *regs) { } static inline int kexec_should_crash(struct task_struct *p) { return 0; } static inline int kexec_crash_loaded(void) { return 0; } static inline void *kimage_map_segment(struct kimage *image, unsigned long addr, unsigned long size) { return NULL; } static inline void kimage_unmap_segment(void *buffer) { } #define kexec_in_progress false #endif /* CONFIG_KEXEC_CORE */ #ifdef CONFIG_KEXEC_SIG void set_kexec_sig_enforced(void); #else static inline void set_kexec_sig_enforced(void) {} #endif #endif /* !defined(__ASSEMBLY__) */ #endif /* LINUX_KEXEC_H */
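The KEXEC_FLAGS and KEXEC_FILE_FLAGS masks above are whitelists: the load syscalls reject any flag bit outside them. A minimal standalone sketch of that validation pattern follows; the constants are copies of the values in include/uapi/linux/kexec.h as I recall them, so treat the exact numbers as illustrative rather than authoritative.

/* Sketch: whitelist-style flag validation, as done for KEXEC_FLAGS.
 * Constants mirror include/uapi/linux/kexec.h (illustrative copies). */
#include <stdio.h>

#define KEXEC_ON_CRASH			0x00000001
#define KEXEC_PRESERVE_CONTEXT		0x00000002
#define KEXEC_UPDATE_ELFCOREHDR		0x00000004
#define KEXEC_CRASH_HOTPLUG_SUPPORT	0x00000008
#define KEXEC_ARCH_MASK			0xffff0000

/* The non-CONFIG_KEXEC_JUMP variant: KEXEC_PRESERVE_CONTEXT is not legal. */
#define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_UPDATE_ELFCOREHDR | \
		     KEXEC_CRASH_HOTPLUG_SUPPORT)

static int kexec_flags_valid(unsigned long flags)
{
	/* Strip the architecture field first; any remaining bit outside
	 * the whitelist invalidates the request (-EINVAL in the kernel). */
	return (flags & ~(unsigned long)(KEXEC_ARCH_MASK | KEXEC_FLAGS)) == 0;
}

int main(void)
{
	printf("%d\n", kexec_flags_valid(KEXEC_ON_CRASH));		/* 1 */
	printf("%d\n", kexec_flags_valid(KEXEC_PRESERVE_CONTEXT));	/* 0 */
	return 0;
}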
/* inflate.c -- zlib decompression * Copyright (C) 1995-2005 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * * Based on zlib 1.2.3 but modified for the Linux Kernel by * Richard Purdie <richard@openedhand.com> * * Changes mainly for static instead of dynamic memory allocation * */ #include <linux/zutil.h> #include "inftrees.h" #include "inflate.h" #include
"inffast.h" #include "infutil.h" /* architecture-specific bits */ #ifdef CONFIG_ZLIB_DFLTCC # include "../zlib_dfltcc/dfltcc_inflate.h" #else #define INFLATE_RESET_HOOK(strm) do {} while (0) #define INFLATE_TYPEDO_HOOK(strm, flush) do {} while (0) #define INFLATE_NEED_UPDATEWINDOW(strm) 1 #define INFLATE_NEED_CHECKSUM(strm) 1 #endif int zlib_inflate_workspacesize(void) { return sizeof(struct inflate_workspace); } int zlib_inflateReset(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; strm->total_in = strm->total_out = state->total = 0; strm->msg = NULL; strm->adler = 1; /* to support ill-conceived Java test suite */ state->mode = HEAD; state->last = 0; state->havedict = 0; state->dmax = 32768U; state->hold = 0; state->bits = 0; state->lencode = state->distcode = state->next = state->codes; /* Initialise Window */ state->wsize = 1U << state->wbits; state->write = 0; state->whave = 0; INFLATE_RESET_HOOK(strm); return Z_OK; } int zlib_inflateInit2(z_streamp strm, int windowBits) { struct inflate_state *state; if (strm == NULL) return Z_STREAM_ERROR; strm->msg = NULL; /* in case we return an error */ state = &WS(strm)->inflate_state; strm->state = (struct internal_state *)state; if (windowBits < 0) { state->wrap = 0; windowBits = -windowBits; } else { state->wrap = (windowBits >> 4) + 1; } if (windowBits < 8 || windowBits > 15) { return Z_STREAM_ERROR; } state->wbits = (unsigned)windowBits; #ifdef CONFIG_ZLIB_DFLTCC /* * DFLTCC requires the window to be page aligned. * Thus, we overallocate and take the aligned portion of the buffer. */ state->window = PTR_ALIGN(&WS(strm)->working_window[0], PAGE_SIZE); #else state->window = &WS(strm)->working_window[0]; #endif return zlib_inflateReset(strm); } /* Return state with length and distance decoding tables and index sizes set to fixed code decoding. This returns fixed tables from inffixed.h. */ static void zlib_fixedtables(struct inflate_state *state) { # include "inffixed.h" state->lencode = lenfix; state->lenbits = 9; state->distcode = distfix; state->distbits = 5; } /* Update the window with the last wsize (normally 32K) bytes written before returning. This is only called when a window is already in use, or when output has been written during this inflate call, but the end of the deflate stream has not been reached yet. It is also called to window dictionary data when a dictionary is loaded. Providing output buffers larger than 32K to inflate() should provide a speed advantage, since only the last 32K of output is copied to the sliding window upon return from inflate(), and since all distances after the first 32K of output will fall in the output data, making match copies simpler and faster. The advantage may be dependent on the size of the processor's data caches. 
*/ static void zlib_updatewindow(z_streamp strm, unsigned out) { struct inflate_state *state; unsigned copy, dist; state = (struct inflate_state *)strm->state; /* copy state->wsize or less output bytes into the circular window */ copy = out - strm->avail_out; if (copy >= state->wsize) { memcpy(state->window, strm->next_out - state->wsize, state->wsize); state->write = 0; state->whave = state->wsize; } else { dist = state->wsize - state->write; if (dist > copy) dist = copy; memcpy(state->window + state->write, strm->next_out - copy, dist); copy -= dist; if (copy) { memcpy(state->window, strm->next_out - copy, copy); state->write = copy; state->whave = state->wsize; } else { state->write += dist; if (state->write == state->wsize) state->write = 0; if (state->whave < state->wsize) state->whave += dist; } } } /* * At the end of a Deflate-compressed PPP packet, we expect to have seen * a `stored' block type value but not the (zero) length bytes. */ /* Returns true if inflate is currently at the end of a block generated by Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP implementation to provide an additional safety check. PPP uses Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored block. When decompressing, PPP checks that at the end of input packet, inflate is waiting for these length bytes. */ static int zlib_inflateSyncPacket(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == STORED && state->bits == 0) { state->mode = TYPE; return Z_OK; } return Z_DATA_ERROR; } /* Macros for inflate(): */ /* check function to use adler32() for zlib or crc32() for gzip */ #define UPDATE(check, buf, len) zlib_adler32(check, buf, len) /* Load registers with state in inflate() for speed */ #define LOAD() \ do { \ put = strm->next_out; \ left = strm->avail_out; \ next = strm->next_in; \ have = strm->avail_in; \ hold = state->hold; \ bits = state->bits; \ } while (0) /* Restore state from registers in inflate() */ #define RESTORE() \ do { \ strm->next_out = put; \ strm->avail_out = left; \ strm->next_in = next; \ strm->avail_in = have; \ state->hold = hold; \ state->bits = bits; \ } while (0) /* Clear the input bit accumulator */ #define INITBITS() \ do { \ hold = 0; \ bits = 0; \ } while (0) /* Get a byte of input into the bit accumulator, or return from inflate() if there is no input available. */ #define PULLBYTE() \ do { \ if (have == 0) goto inf_leave; \ have--; \ hold += (unsigned long)(*next++) << bits; \ bits += 8; \ } while (0) /* Assure that there are at least n bits in the bit accumulator. If there is not enough available input to do that, then return from inflate(). */ #define NEEDBITS(n) \ do { \ while (bits < (unsigned)(n)) \ PULLBYTE(); \ } while (0) /* Return the low n bits of the bit accumulator (n < 16) */ #define BITS(n) \ ((unsigned)hold & ((1U << (n)) - 1)) /* Remove n bits from the bit accumulator */ #define DROPBITS(n) \ do { \ hold >>= (n); \ bits -= (unsigned)(n); \ } while (0) /* Remove zero to seven bits as needed to go to a byte boundary */ #define BYTEBITS() \ do { \ hold >>= bits & 7; \ bits -= bits & 7; \ } while (0) /* inflate() uses a state machine to process as much input data and generate as much output data as possible before returning. The state machine is structured roughly as follows: for (;;) switch (state) { ... case STATEn: if (not enough input data or output space to make progress) return; ... make progress ... 
state = STATEm; break; ... } so when inflate() is called again, the same case is attempted again, and if the appropriate resources are provided, the machine proceeds to the next state. The NEEDBITS() macro is usually the way the state evaluates whether it can proceed or should return. NEEDBITS() does the return if the requested bits are not available. The typical use of the BITS macros is: NEEDBITS(n); ... do something with BITS(n) ... DROPBITS(n); where NEEDBITS(n) either returns from inflate() if there isn't enough input left to load n bits into the accumulator, or it continues. BITS(n) gives the low n bits in the accumulator. When done, DROPBITS(n) drops the low n bits off the accumulator. INITBITS() clears the accumulator and sets the number of available bits to zero. BYTEBITS() discards just enough bits to put the accumulator on a byte boundary. After BYTEBITS() and a NEEDBITS(8), then BITS(8) would return the next byte in the stream. NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return if there is no input available. The decoding of variable length codes uses PULLBYTE() directly in order to pull just enough bytes to decode the next code, and no more. Some states loop until they get enough input, making sure that enough state information is maintained to continue the loop where it left off if NEEDBITS() returns in the loop. For example, want, need, and keep would all have to actually be part of the saved state in case NEEDBITS() returns: case STATEw: while (want < need) { NEEDBITS(n); keep[want++] = BITS(n); DROPBITS(n); } state = STATEx; case STATEx: As shown above, if the next state is also the next case, then the break is omitted. A state may also return if there is not enough output space available to complete that state. Those states are copying stored data, writing a literal byte, and copying a matching string. When returning, a "goto inf_leave" is used to update the total counters, update the check value, and determine whether any progress has been made during that inflate() call in order to return the proper return code. Progress is defined as a change in either strm->avail_in or strm->avail_out. When there is a window, goto inf_leave will update the window with the last output written. If a goto inf_leave occurs in the middle of decompression and there is no window currently, goto inf_leave will create one and copy output to the window for the next call of inflate(). In this implementation, the flush parameter of inflate() only affects the return code (per zlib.h). inflate() always writes as much as possible to strm->next_out, given the space available and the provided input--the effect documented in zlib.h of Z_SYNC_FLUSH. Furthermore, inflate() always defers the allocation of and copying into a sliding window until necessary, which provides the effect documented in zlib.h for Z_FINISH when the entire input stream is available. So the only thing the flush parameter actually does is: when flush is set to Z_FINISH, inflate() cannot return Z_OK. Instead it will return Z_BUF_ERROR if it has not reached the end of the stream.
*/ int zlib_inflate(z_streamp strm, int flush) { struct inflate_state *state; const unsigned char *next; /* next input */ unsigned char *put; /* next output */ unsigned have, left; /* available input and output */ unsigned long hold; /* bit buffer */ unsigned bits; /* bits in bit buffer */ unsigned in, out; /* save starting available input and output */ unsigned copy; /* number of stored or match bytes to copy */ unsigned char *from; /* where to copy match bytes from */ code this; /* current decoding table entry */ code last; /* parent table entry */ unsigned len; /* length to copy for repeats, bits to drop */ int ret; /* return code */ static const unsigned short order[19] = /* permutation of code lengths */ {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; /* Do not check for strm->next_out == NULL here as ppc zImage inflates to strm->next_out = 0 */ if (strm == NULL || strm->state == NULL || (strm->next_in == NULL && strm->avail_in != 0)) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == TYPE) state->mode = TYPEDO; /* skip check */ LOAD(); in = have; out = left; ret = Z_OK; for (;;) switch (state->mode) { case HEAD: if (state->wrap == 0) { state->mode = TYPEDO; break; } NEEDBITS(16); if ( ((BITS(8) << 8) + (hold >> 8)) % 31) { strm->msg = (char *)"incorrect header check"; state->mode = BAD; break; } if (BITS(4) != Z_DEFLATED) { strm->msg = (char *)"unknown compression method"; state->mode = BAD; break; } DROPBITS(4); len = BITS(4) + 8; if (len > state->wbits) { strm->msg = (char *)"invalid window size"; state->mode = BAD; break; } state->dmax = 1U << len; strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = hold & 0x200 ? DICTID : TYPE; INITBITS(); break; case DICTID: NEEDBITS(32); strm->adler = state->check = REVERSE(hold); INITBITS(); state->mode = DICT; fallthrough; case DICT: if (state->havedict == 0) { RESTORE(); return Z_NEED_DICT; } strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = TYPE; fallthrough; case TYPE: if (flush == Z_BLOCK) goto inf_leave; fallthrough; case TYPEDO: INFLATE_TYPEDO_HOOK(strm, flush); if (state->last) { BYTEBITS(); state->mode = CHECK; break; } NEEDBITS(3); state->last = BITS(1); DROPBITS(1); switch (BITS(2)) { case 0: /* stored block */ state->mode = STORED; break; case 1: /* fixed block */ zlib_fixedtables(state); state->mode = LEN; /* decode codes */ break; case 2: /* dynamic block */ state->mode = TABLE; break; case 3: strm->msg = (char *)"invalid block type"; state->mode = BAD; } DROPBITS(2); break; case STORED: BYTEBITS(); /* go to byte boundary */ NEEDBITS(32); if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) { strm->msg = (char *)"invalid stored block lengths"; state->mode = BAD; break; } state->length = (unsigned)hold & 0xffff; INITBITS(); state->mode = COPY; fallthrough; case COPY: copy = state->length; if (copy) { if (copy > have) copy = have; if (copy > left) copy = left; if (copy == 0) goto inf_leave; memcpy(put, next, copy); have -= copy; next += copy; left -= copy; put += copy; state->length -= copy; break; } state->mode = TYPE; break; case TABLE: NEEDBITS(14); state->nlen = BITS(5) + 257; DROPBITS(5); state->ndist = BITS(5) + 1; DROPBITS(5); state->ncode = BITS(4) + 4; DROPBITS(4); #ifndef PKZIP_BUG_WORKAROUND if (state->nlen > 286 || state->ndist > 30) { strm->msg = (char *)"too many length or distance symbols"; state->mode = BAD; break; } #endif state->have = 0; state->mode = LENLENS; fallthrough; case LENLENS: while (state->have < state->ncode) { 
NEEDBITS(3); state->lens[order[state->have++]] = (unsigned short)BITS(3); DROPBITS(3); } while (state->have < 19) state->lens[order[state->have++]] = 0; state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 7; ret = zlib_inflate_table(CODES, state->lens, 19, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid code lengths set"; state->mode = BAD; break; } state->have = 0; state->mode = CODELENS; fallthrough; case CODELENS: while (state->have < state->nlen + state->ndist) { for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.val < 16) { NEEDBITS(this.bits); DROPBITS(this.bits); state->lens[state->have++] = this.val; } else { if (this.val == 16) { NEEDBITS(this.bits + 2); DROPBITS(this.bits); if (state->have == 0) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } len = state->lens[state->have - 1]; copy = 3 + BITS(2); DROPBITS(2); } else if (this.val == 17) { NEEDBITS(this.bits + 3); DROPBITS(this.bits); len = 0; copy = 3 + BITS(3); DROPBITS(3); } else { NEEDBITS(this.bits + 7); DROPBITS(this.bits); len = 0; copy = 11 + BITS(7); DROPBITS(7); } if (state->have + copy > state->nlen + state->ndist) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } while (copy--) state->lens[state->have++] = (unsigned short)len; } } /* handle error breaks in while */ if (state->mode == BAD) break; /* build code tables */ state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 9; ret = zlib_inflate_table(LENS, state->lens, state->nlen, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid literal/lengths set"; state->mode = BAD; break; } state->distcode = (code const *)(state->next); state->distbits = 6; ret = zlib_inflate_table(DISTS, state->lens + state->nlen, state->ndist, &(state->next), &(state->distbits), state->work); if (ret) { strm->msg = (char *)"invalid distances set"; state->mode = BAD; break; } state->mode = LEN; fallthrough; case LEN: if (have >= 6 && left >= 258) { RESTORE(); inflate_fast(strm, out); LOAD(); break; } for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.op && (this.op & 0xf0) == 0) { last = this; for (;;) { this = state->lencode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); state->length = (unsigned)this.val; if ((int)(this.op) == 0) { state->mode = LIT; break; } if (this.op & 32) { state->mode = TYPE; break; } if (this.op & 64) { strm->msg = (char *)"invalid literal/length code"; state->mode = BAD; break; } state->extra = (unsigned)(this.op) & 15; state->mode = LENEXT; fallthrough; case LENEXT: if (state->extra) { NEEDBITS(state->extra); state->length += BITS(state->extra); DROPBITS(state->extra); } state->mode = DIST; fallthrough; case DIST: for (;;) { this = state->distcode[BITS(state->distbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if ((this.op & 0xf0) == 0) { last = this; for (;;) { this = state->distcode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); if (this.op & 64) { strm->msg = (char *)"invalid distance code"; state->mode = BAD; break; } state->offset = (unsigned)this.val; state->extra 
= (unsigned)(this.op) & 15; state->mode = DISTEXT; fallthrough; case DISTEXT: if (state->extra) { NEEDBITS(state->extra); state->offset += BITS(state->extra); DROPBITS(state->extra); } #ifdef INFLATE_STRICT if (state->offset > state->dmax) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } #endif if (state->offset > state->whave + out - left) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } state->mode = MATCH; fallthrough; case MATCH: if (left == 0) goto inf_leave; copy = out - left; if (state->offset > copy) { /* copy from window */ copy = state->offset - copy; if (copy > state->write) { copy -= state->write; from = state->window + (state->wsize - copy); } else from = state->window + (state->write - copy); if (copy > state->length) copy = state->length; } else { /* copy from output */ from = put - state->offset; copy = state->length; } if (copy > left) copy = left; left -= copy; state->length -= copy; do { *put++ = *from++; } while (--copy); if (state->length == 0) state->mode = LEN; break; case LIT: if (left == 0) goto inf_leave; *put++ = (unsigned char)(state->length); left--; state->mode = LEN; break; case CHECK: if (state->wrap) { NEEDBITS(32); out -= left; strm->total_out += out; state->total += out; if (INFLATE_NEED_CHECKSUM(strm) && out) strm->adler = state->check = UPDATE(state->check, put - out, out); out = left; if (( REVERSE(hold)) != state->check) { strm->msg = (char *)"incorrect data check"; state->mode = BAD; break; } INITBITS(); } state->mode = DONE; fallthrough; case DONE: ret = Z_STREAM_END; goto inf_leave; case BAD: ret = Z_DATA_ERROR; goto inf_leave; case MEM: return Z_MEM_ERROR; case SYNC: default: return Z_STREAM_ERROR; } /* Return from inflate(), updating the total counts and the check value. If there was no progress during the inflate() call, return a buffer error. Call zlib_updatewindow() to create and/or update the window state. */ inf_leave: RESTORE(); if (INFLATE_NEED_UPDATEWINDOW(strm) && (state->wsize || (state->mode < CHECK && out != strm->avail_out))) zlib_updatewindow(strm, out); in -= strm->avail_in; out -= strm->avail_out; strm->total_in += in; strm->total_out += out; state->total += out; if (INFLATE_NEED_CHECKSUM(strm) && state->wrap && out) strm->adler = state->check = UPDATE(state->check, strm->next_out - out, out); strm->data_type = state->bits + (state->last ? 64 : 0) + (state->mode == TYPE ? 128 : 0); if (flush == Z_PACKET_FLUSH && ret == Z_OK && strm->avail_out != 0 && strm->avail_in == 0) return zlib_inflateSyncPacket(strm); if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK) ret = Z_BUF_ERROR; return ret; } int zlib_inflateEnd(z_streamp strm) { if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; return Z_OK; } /* * This subroutine adds the data at next_in/avail_in to the output history * without performing any output. The output buffer must be "caught up"; * i.e. no pending output but this should always be the case. The state must * be waiting on the start of a block (i.e. mode == TYPE or HEAD). On exit, * the output will also be caught up, and the checksum will have been updated * if need be. 
*/ int zlib_inflateIncomp(z_stream *z) { struct inflate_state *state = (struct inflate_state *)z->state; Byte *saved_no = z->next_out; uInt saved_ao = z->avail_out; if (state->mode != TYPE && state->mode != HEAD) return Z_DATA_ERROR; /* Setup some variables to allow misuse of updateWindow */ z->avail_out = 0; z->next_out = (unsigned char*)z->next_in + z->avail_in; zlib_updatewindow(z, z->avail_in); /* Restore saved variables */ z->avail_out = saved_ao; z->next_out = saved_no; z->adler = state->check = UPDATE(state->check, z->next_in, z->avail_in); z->total_out += z->avail_in; z->total_in += z->avail_in; z->next_in += z->avail_in; state->total += z->avail_in; z->avail_in = 0; return Z_OK; }
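The NEEDBITS()/BITS()/DROPBITS() machinery above is easy to exercise outside the kernel. Below is a standalone sketch (not kernel code; PULLBYTE() omits the end-of-input check for brevity) of the same LSB-first bit accumulator, used to parse the 3-bit deflate block header that the TYPEDO state reads: BFINAL in the low bit, then the 2-bit BTYPE.

/* Standalone sketch of inflate()'s bit accumulator: new bytes enter
 * above the valid bits, so bits are consumed LSB-first, as deflate
 * requires. */
#include <stdio.h>

static unsigned long hold;		/* bit accumulator */
static unsigned bits;			/* valid bits in hold */
static const unsigned char *next;	/* next input byte */
static unsigned have;			/* bytes remaining */

#define PULLBYTE() \
	do { have--; hold += (unsigned long)(*next++) << bits; bits += 8; } while (0)
#define NEEDBITS(n) \
	do { while (bits < (unsigned)(n)) PULLBYTE(); } while (0)
#define BITS(n)     ((unsigned)hold & ((1U << (n)) - 1))
#define DROPBITS(n) do { hold >>= (n); bits -= (unsigned)(n); } while (0)

int main(void)
{
	static const unsigned char in[] = { 0x05 };	/* 00000101 binary */

	next = in; have = sizeof(in); hold = 0; bits = 0;

	NEEDBITS(3);
	unsigned last = BITS(1);	/* BFINAL: low bit first */
	DROPBITS(1);
	unsigned type = BITS(2);	/* BTYPE: 0 stored, 1 fixed, 2 dynamic */
	DROPBITS(2);

	printf("last=%u type=%u\n", last, type);	/* prints last=1 type=2 */
	return 0;
}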
// SPDX-License-Identifier: GPL-2.0-or-later /* * PTP 1588 clock support * * Copyright (C) 2010 OMICRON electronics GmbH */ #include <linux/device.h> #include <linux/err.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/posix-clock.h> #include <linux/pps_kernel.h> #include <linux/slab.h> #include <linux/syscalls.h> #include <linux/uaccess.h> #include <linux/debugfs.h> #include <linux/xarray.h> #include <uapi/linux/sched/types.h> #include "ptp_private.h" #define PTP_MAX_ALARMS 4 #define PTP_PPS_DEFAULTS (PPS_CAPTUREASSERT | PPS_OFFSETASSERT) #define PTP_PPS_EVENT PPS_CAPTUREASSERT #define PTP_PPS_MODE (PTP_PPS_DEFAULTS | PPS_CANWAIT | PPS_TSFMT_TSPEC) const struct class ptp_class = { .name = "ptp", .dev_groups = ptp_groups }; /* private globals */ static dev_t ptp_devt; static DEFINE_XARRAY_ALLOC(ptp_clocks_map); /* time stamp event queue operations */ static inline int queue_free(struct timestamp_event_queue *q) { return PTP_MAX_TIMESTAMPS - queue_cnt(q) - 1; } static void enqueue_external_timestamp(struct timestamp_event_queue *queue, struct ptp_clock_event *src) { struct ptp_extts_event *dst; struct timespec64 offset_ts; unsigned long flags; s64 seconds; u32 remainder; if (src->type == PTP_CLOCK_EXTTS) { seconds = div_u64_rem(src->timestamp, 1000000000, &remainder); } else if (src->type == PTP_CLOCK_EXTOFF) {
offset_ts = ns_to_timespec64(src->offset); seconds = offset_ts.tv_sec; remainder = offset_ts.tv_nsec; } else { WARN(1, "%s: unknown type %d\n", __func__, src->type); return; } spin_lock_irqsave(&queue->lock, flags); dst = &queue->buf[queue->tail]; dst->index = src->index; dst->flags = PTP_EXTTS_EVENT_VALID; dst->t.sec = seconds; dst->t.nsec = remainder; if (src->type == PTP_CLOCK_EXTOFF) dst->flags |= PTP_EXT_OFFSET; /* Both WRITE_ONCE() are paired with READ_ONCE() in queue_cnt() */ if (!queue_free(queue)) WRITE_ONCE(queue->head, (queue->head + 1) % PTP_MAX_TIMESTAMPS); WRITE_ONCE(queue->tail, (queue->tail + 1) % PTP_MAX_TIMESTAMPS); spin_unlock_irqrestore(&queue->lock, flags); } /* posix clock implementation */ static int ptp_clock_getres(struct posix_clock *pc, struct timespec64 *tp) { tp->tv_sec = 0; tp->tv_nsec = 1; return 0; } static int ptp_clock_settime(struct posix_clock *pc, const struct timespec64 *tp) { struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock); if (ptp_clock_freerun(ptp)) { pr_err_ratelimited("ptp: physical clock is free running\n"); return -EBUSY; } return ptp->info->settime64(ptp->info, tp); } static int ptp_clock_gettime(struct posix_clock *pc, struct timespec64 *tp) { struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock); int err; if (ptp->info->gettimex64) err = ptp->info->gettimex64(ptp->info, tp, NULL); else err = ptp->info->gettime64(ptp->info, tp); return err; } static int ptp_clock_adjtime(struct posix_clock *pc, struct __kernel_timex *tx) { struct ptp_clock *ptp = container_of(pc, struct ptp_clock, clock); struct ptp_clock_info *ops; int err = -EOPNOTSUPP; if (tx->modes & (ADJ_SETOFFSET | ADJ_FREQUENCY | ADJ_OFFSET) && ptp_clock_freerun(ptp)) { pr_err("ptp: physical clock is free running\n"); return -EBUSY; } ops = ptp->info; if (tx->modes & ADJ_SETOFFSET) { struct timespec64 ts; ktime_t kt; s64 delta; ts.tv_sec = tx->time.tv_sec; ts.tv_nsec = tx->time.tv_usec; if (!(tx->modes & ADJ_NANO)) ts.tv_nsec *= 1000; if ((unsigned long) ts.tv_nsec >= NSEC_PER_SEC) return -EINVAL; kt = timespec64_to_ktime(ts); delta = ktime_to_ns(kt); err = ops->adjtime(ops, delta); } else if (tx->modes & ADJ_FREQUENCY) { long ppb = scaled_ppm_to_ppb(tx->freq); if (ppb > ops->max_adj || ppb < -ops->max_adj) return -ERANGE; err = ops->adjfine(ops, tx->freq); if (!err) ptp->dialed_frequency = tx->freq; } else if (tx->modes & ADJ_OFFSET) { if (ops->adjphase) { s32 max_phase_adj = ops->getmaxphase(ops); s32 offset = tx->offset; if (!(tx->modes & ADJ_NANO)) offset *= NSEC_PER_USEC; if (offset > max_phase_adj || offset < -max_phase_adj) return -ERANGE; err = ops->adjphase(ops, offset); } } else if (tx->modes == 0) { tx->freq = ptp->dialed_frequency; err = 0; } return err; } static struct posix_clock_operations ptp_clock_ops = { .owner = THIS_MODULE, .clock_adjtime = ptp_clock_adjtime, .clock_gettime = ptp_clock_gettime, .clock_getres = ptp_clock_getres, .clock_settime = ptp_clock_settime, .ioctl = ptp_ioctl, .open = ptp_open, .release = ptp_release, .poll = ptp_poll, .read = ptp_read, }; static void ptp_clock_release(struct device *dev) { struct ptp_clock *ptp = container_of(dev, struct ptp_clock, dev); struct timestamp_event_queue *tsevq; unsigned long flags; ptp_cleanup_pin_groups(ptp); kfree(ptp->vclock_index); mutex_destroy(&ptp->pincfg_mux); mutex_destroy(&ptp->n_vclocks_mux); /* Delete first entry */ spin_lock_irqsave(&ptp->tsevqs_lock, flags); tsevq = list_first_entry(&ptp->tsevqs, struct timestamp_event_queue, qlist); list_del(&tsevq->qlist); 
spin_unlock_irqrestore(&ptp->tsevqs_lock, flags); bitmap_free(tsevq->mask); kfree(tsevq); debugfs_remove(ptp->debugfs_root); xa_erase(&ptp_clocks_map, ptp->index); kfree(ptp); } static int ptp_getcycles64(struct ptp_clock_info *info, struct timespec64 *ts) { if (info->getcyclesx64) return info->getcyclesx64(info, ts, NULL); else return info->gettime64(info, ts); } static int ptp_enable(struct ptp_clock_info *ptp, struct ptp_clock_request *request, int on) { return -EOPNOTSUPP; } static void ptp_aux_kworker(struct kthread_work *work) { struct ptp_clock *ptp = container_of(work, struct ptp_clock, aux_work.work); struct ptp_clock_info *info = ptp->info; long delay; delay = info->do_aux_work(info); if (delay >= 0) kthread_queue_delayed_work(ptp->kworker, &ptp->aux_work, delay); } /* public interface */ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, struct device *parent) { struct ptp_clock *ptp; struct timestamp_event_queue *queue = NULL; int err, index, major = MAJOR(ptp_devt); char debugfsname[16]; size_t size; if (info->n_alarm > PTP_MAX_ALARMS) return ERR_PTR(-EINVAL); /* Initialize a clock structure. */ ptp = kzalloc(sizeof(struct ptp_clock), GFP_KERNEL); if (!ptp) { err = -ENOMEM; goto no_memory; } err = xa_alloc(&ptp_clocks_map, &index, ptp, xa_limit_31b, GFP_KERNEL); if (err) goto no_slot; ptp->clock.ops = ptp_clock_ops; ptp->info = info; ptp->devid = MKDEV(major, index); ptp->index = index; INIT_LIST_HEAD(&ptp->tsevqs); queue = kzalloc(sizeof(*queue), GFP_KERNEL); if (!queue) { err = -ENOMEM; goto no_memory_queue; } list_add_tail(&queue->qlist, &ptp->tsevqs); spin_lock_init(&ptp->tsevqs_lock); queue->mask = bitmap_alloc(PTP_MAX_CHANNELS, GFP_KERNEL); if (!queue->mask) { err = -ENOMEM; goto no_memory_bitmap; } bitmap_set(queue->mask, 0, PTP_MAX_CHANNELS); spin_lock_init(&queue->lock); mutex_init(&ptp->pincfg_mux); mutex_init(&ptp->n_vclocks_mux); init_waitqueue_head(&ptp->tsev_wq); if (ptp->info->getcycles64 || ptp->info->getcyclesx64) { ptp->has_cycles = true; if (!ptp->info->getcycles64 && ptp->info->getcyclesx64) ptp->info->getcycles64 = ptp_getcycles64; } else { /* Free running cycle counter not supported, use time. */ ptp->info->getcycles64 = ptp_getcycles64; if (ptp->info->gettimex64) ptp->info->getcyclesx64 = ptp->info->gettimex64; if (ptp->info->getcrosststamp) ptp->info->getcrosscycles = ptp->info->getcrosststamp; } if (!ptp->info->enable) ptp->info->enable = ptp_enable; if (ptp->info->do_aux_work) { kthread_init_delayed_work(&ptp->aux_work, ptp_aux_kworker); ptp->kworker = kthread_run_worker(0, "ptp%d", ptp->index); if (IS_ERR(ptp->kworker)) { err = PTR_ERR(ptp->kworker); pr_err("failed to create ptp aux_worker %d\n", err); goto kworker_err; } } /* PTP virtual clock is being registered under physical clock */ if (parent && parent->class && parent->class->name && strcmp(parent->class->name, "ptp") == 0) ptp->is_virtual_clock = true; if (!ptp->is_virtual_clock) { ptp->max_vclocks = PTP_DEFAULT_MAX_VCLOCKS; size = sizeof(int) * ptp->max_vclocks; ptp->vclock_index = kzalloc(size, GFP_KERNEL); if (!ptp->vclock_index) { err = -ENOMEM; goto no_mem_for_vclocks; } } err = ptp_populate_pin_groups(ptp); if (err) goto no_pin_groups; /* Register a new PPS source. 
*/ if (info->pps) { struct pps_source_info pps; memset(&pps, 0, sizeof(pps)); snprintf(pps.name, PPS_MAX_NAME_LEN, "ptp%d", index); pps.mode = PTP_PPS_MODE; pps.owner = info->owner; ptp->pps_source = pps_register_source(&pps, PTP_PPS_DEFAULTS); if (IS_ERR(ptp->pps_source)) { err = PTR_ERR(ptp->pps_source); pr_err("failed to register pps source\n"); goto no_pps; } ptp->pps_source->lookup_cookie = ptp; } /* Initialize a new device of our class in our clock structure. */ device_initialize(&ptp->dev); ptp->dev.devt = ptp->devid; ptp->dev.class = &ptp_class; ptp->dev.parent = parent; ptp->dev.groups = ptp->pin_attr_groups; ptp->dev.release = ptp_clock_release; dev_set_drvdata(&ptp->dev, ptp); dev_set_name(&ptp->dev, "ptp%d", ptp->index); /* Create a posix clock and link it to the device. */ err = posix_clock_register(&ptp->clock, &ptp->dev); if (err) { if (ptp->pps_source) pps_unregister_source(ptp->pps_source); if (ptp->kworker) kthread_destroy_worker(ptp->kworker); put_device(&ptp->dev); pr_err("failed to create posix clock\n"); return ERR_PTR(err); } /* Debugfs initialization */ snprintf(debugfsname, sizeof(debugfsname), "ptp%d", ptp->index); ptp->debugfs_root = debugfs_create_dir(debugfsname, NULL); return ptp; no_pps: ptp_cleanup_pin_groups(ptp); no_pin_groups: kfree(ptp->vclock_index); no_mem_for_vclocks: if (ptp->kworker) kthread_destroy_worker(ptp->kworker); kworker_err: mutex_destroy(&ptp->pincfg_mux); mutex_destroy(&ptp->n_vclocks_mux); bitmap_free(queue->mask); no_memory_bitmap: list_del(&queue->qlist); kfree(queue); no_memory_queue: xa_erase(&ptp_clocks_map, index); no_slot: kfree(ptp); no_memory: return ERR_PTR(err); } EXPORT_SYMBOL(ptp_clock_register); static int unregister_vclock(struct device *dev, void *data) { struct ptp_clock *ptp = dev_get_drvdata(dev); ptp_vclock_unregister(info_to_vclock(ptp->info)); return 0; } int ptp_clock_unregister(struct ptp_clock *ptp) { if (ptp_vclock_in_use(ptp)) { device_for_each_child(&ptp->dev, NULL, unregister_vclock); } ptp->defunct = 1; wake_up_interruptible(&ptp->tsev_wq); if (ptp->kworker) { kthread_cancel_delayed_work_sync(&ptp->aux_work); kthread_destroy_worker(ptp->kworker); } /* Release the clock's resources. 
*/ if (ptp->pps_source) pps_unregister_source(ptp->pps_source); posix_clock_unregister(&ptp->clock); return 0; } EXPORT_SYMBOL(ptp_clock_unregister); void ptp_clock_event(struct ptp_clock *ptp, struct ptp_clock_event *event) { struct timestamp_event_queue *tsevq; struct pps_event_time evt; unsigned long flags; switch (event->type) { case PTP_CLOCK_ALARM: break; case PTP_CLOCK_EXTTS: case PTP_CLOCK_EXTOFF: /* Enqueue timestamp on selected queues */ spin_lock_irqsave(&ptp->tsevqs_lock, flags); list_for_each_entry(tsevq, &ptp->tsevqs, qlist) { if (test_bit((unsigned int)event->index, tsevq->mask)) enqueue_external_timestamp(tsevq, event); } spin_unlock_irqrestore(&ptp->tsevqs_lock, flags); wake_up_interruptible(&ptp->tsev_wq); break; case PTP_CLOCK_PPS: pps_get_ts(&evt); pps_event(ptp->pps_source, &evt, PTP_PPS_EVENT, NULL); break; case PTP_CLOCK_PPSUSR: pps_event(ptp->pps_source, &event->pps_times, PTP_PPS_EVENT, NULL); break; } } EXPORT_SYMBOL(ptp_clock_event); int ptp_clock_index(struct ptp_clock *ptp) { return ptp->index; } EXPORT_SYMBOL(ptp_clock_index); int ptp_find_pin(struct ptp_clock *ptp, enum ptp_pin_function func, unsigned int chan) { struct ptp_pin_desc *pin = NULL; int i; for (i = 0; i < ptp->info->n_pins; i++) { if (ptp->info->pin_config[i].func == func && ptp->info->pin_config[i].chan == chan) { pin = &ptp->info->pin_config[i]; break; } } return pin ? i : -1; } EXPORT_SYMBOL(ptp_find_pin); int ptp_find_pin_unlocked(struct ptp_clock *ptp, enum ptp_pin_function func, unsigned int chan) { int result; mutex_lock(&ptp->pincfg_mux); result = ptp_find_pin(ptp, func, chan); mutex_unlock(&ptp->pincfg_mux); return result; } EXPORT_SYMBOL(ptp_find_pin_unlocked); int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay) { return kthread_mod_delayed_work(ptp->kworker, &ptp->aux_work, delay); } EXPORT_SYMBOL(ptp_schedule_worker); void ptp_cancel_worker_sync(struct ptp_clock *ptp) { kthread_cancel_delayed_work_sync(&ptp->aux_work); } EXPORT_SYMBOL(ptp_cancel_worker_sync); /* module operations */ static void __exit ptp_exit(void) { class_unregister(&ptp_class); unregister_chrdev_region(ptp_devt, MINORMASK + 1); xa_destroy(&ptp_clocks_map); } static int __init ptp_init(void) { int err; err = class_register(&ptp_class); if (err) { pr_err("ptp: failed to allocate class\n"); return err; } err = alloc_chrdev_region(&ptp_devt, 0, MINORMASK + 1, "ptp"); if (err < 0) { pr_err("ptp: failed to allocate device region\n"); goto no_region; } pr_info("PTP clock support registered\n"); return 0; no_region: class_unregister(&ptp_class); return err; } subsys_initcall(ptp_init); module_exit(ptp_exit); MODULE_AUTHOR("Richard Cochran <richardcochran@gmail.com>"); MODULE_DESCRIPTION("PTP clocks support"); MODULE_LICENSE("GPL");
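Everything ptp_clock_register() sets up is consumed from user space as a dynamic posix clock: open the character device and fold the file descriptor into a clockid_t, and an ordinary clock_gettime() then reads the hardware clock. A minimal sketch follows; the FD_TO_CLOCKID encoding matches the kernel's posix-clock convention (as used by tools/testing/selftests/ptp/testptp.c), while /dev/ptp0 is just an example path.

/* Sketch: reading a PTP hardware clock through the posix-clock API. */
#include <fcntl.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

/* Dynamic posix clocks encode the open fd in the clockid. */
#define CLOCKFD 3
#define FD_TO_CLOCKID(fd) ((~(clockid_t)(fd) << 3) | CLOCKFD)

int main(void)
{
	struct timespec ts;
	int fd = open("/dev/ptp0", O_RDONLY);	/* device index is an example */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (clock_gettime(FD_TO_CLOCKID(fd), &ts)) {
		perror("clock_gettime");
		close(fd);
		return 1;
	}
	printf("ptp0 time: %lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);
	close(fd);
	return 0;
}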
/* * af_llc.c - LLC User Interface SAPs * Description: * Functions in this module implement socket-based llc * communication for the Linux operating system. Support for llc class * one and class two is provided via SOCK_DGRAM and SOCK_STREAM, * respectively. * * An llc2 connection is (mac + sap); only one llc2 sap connection * is allowed per mac, though one sap may have multiple mac + sap * connections. * * Copyright (c) 2001 by Jay Schulist <jschlst@samba.org> * 2002-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details.
*/ #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/rtnetlink.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <net/llc.h> #include <net/llc_sap.h> #include <net/llc_pdu.h> #include <net/llc_conn.h> #include <net/tcp_states.h> /* remember: uninitialized global data is zeroed because it's in .bss */ static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; static u16 llc_ui_sap_link_no_max[256]; static struct sockaddr_llc llc_ui_addrnull; static const struct proto_ops llc_ui_ops; static bool llc_ui_wait_for_conn(struct sock *sk, long timeout); static int llc_ui_wait_for_disc(struct sock *sk, long timeout); static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout); #if 0 #define dprintk(args...) printk(KERN_DEBUG args) #else #define dprintk(args...) do {} while (0) #endif /* Maybe we'll add some more in the future. */ #define LLC_CMSG_PKTINFO 1 /** * llc_ui_next_link_no - return the next unused link number for a sap * @sap: Address of sap to get link number from. * * Return the next unused link number for a given sap. */ static inline u16 llc_ui_next_link_no(int sap) { return llc_ui_sap_link_no_max[sap]++; } /** * llc_proto_type - return eth protocol for ARP header type * @arphrd: ARP header type. * * Given an ARP header type return the corresponding ethernet protocol. */ static inline __be16 llc_proto_type(u16 arphrd) { return htons(ETH_P_802_2); } /** * llc_ui_addr_null - determines if an address structure is null * @addr: Address to test if null. */ static inline u8 llc_ui_addr_null(struct sockaddr_llc *addr) { return !memcmp(addr, &llc_ui_addrnull, sizeof(*addr)); } /** * llc_ui_header_len - return length of llc header based on operation * @sk: Socket which contains a valid llc socket type. * @addr: Complete sockaddr_llc structure received from the user. * * Provide the length of the llc header depending on what kind of * operation the user would like to perform and the type of socket. * Returns the correct llc header length. */ static inline u8 llc_ui_header_len(struct sock *sk, struct sockaddr_llc *addr) { u8 rc = LLC_PDU_LEN_U; if (addr->sllc_test) rc = LLC_PDU_LEN_U; else if (addr->sllc_xid) /* We need to expand the header to sizeof(struct llc_xid_info) * since llc_pdu_init_as_xid_cmd() sets bytes 4, 5 and 6 of the * LLC header as the XID PDU. In llc_ui_sendmsg() we reserve the * header size and then fill all other space with user data. If * we don't reserve these bytes, llc_pdu_init_as_xid_cmd() will * overwrite user data */ rc = LLC_PDU_LEN_U_XID; else if (sk->sk_type == SOCK_STREAM) rc = LLC_PDU_LEN_I; return rc; } /** * llc_ui_send_data - send data via reliable llc2 connection * @sk: Connection the socket is using. * @skb: Data the user wishes to send. * @noblock: can we block waiting for data? * * Send data via reliable llc2 connection. * Returns 0 upon success, non-zero if action did not succeed. * * This function always consumes a reference to the skb.
*/ static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock) { struct llc_sock* llc = llc_sk(sk); if (unlikely(llc_data_accept_state(llc->state) || llc->remote_busy_flag || llc->p_flag)) { long timeout = sock_sndtimeo(sk, noblock); int rc; rc = llc_ui_wait_for_busy_core(sk, timeout); if (rc) { kfree_skb(skb); return rc; } } return llc_build_and_send_pkt(sk, skb); } static void llc_ui_sk_init(struct socket *sock, struct sock *sk) { sock_graft(sk, sock); sk->sk_type = sock->type; sock->ops = &llc_ui_ops; } static struct proto llc_proto = { .name = "LLC", .owner = THIS_MODULE, .obj_size = sizeof(struct llc_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, }; /** * llc_ui_create - alloc and init a new llc_ui socket * @net: network namespace (must be default network) * @sock: Socket to initialize and attach allocated sk to. * @protocol: Unused. * @kern: on behalf of kernel or userspace * * Allocate and initialize a new llc_ui socket, validate the user wants a * socket type we have available. * Returns 0 upon success, negative upon failure. */ static int llc_ui_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; int rc = -ESOCKTNOSUPPORT; if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; if (likely(sock->type == SOCK_DGRAM || sock->type == SOCK_STREAM)) { rc = -ENOMEM; sk = llc_sk_alloc(net, PF_LLC, GFP_KERNEL, &llc_proto, kern); if (sk) { rc = 0; llc_ui_sk_init(sock, sk); } } return rc; } /** * llc_ui_release - shutdown socket * @sock: Socket to release. * * Shutdown and deallocate an existing socket. */ static int llc_ui_release(struct socket *sock) { struct sock *sk = sock->sk; struct llc_sock *llc; if (unlikely(sk == NULL)) goto out; sock_hold(sk); lock_sock(sk); llc = llc_sk(sk); dprintk("%s: closing local(%02X) remote(%02X)\n", __func__, llc->laddr.lsap, llc->daddr.lsap); if (!llc_send_disc(sk)) llc_ui_wait_for_disc(sk, READ_ONCE(sk->sk_rcvtimeo)); if (!sock_flag(sk, SOCK_ZAPPED)) { struct llc_sap *sap = llc->sap; /* Hold this for release_sock(), so that llc_backlog_rcv() * could still use it. */ llc_sap_hold(sap); llc_sap_remove_socket(llc->sap, sk); release_sock(sk); llc_sap_put(sap); } else { release_sock(sk); } netdev_put(llc->dev, &llc->dev_tracker); sock_put(sk); sock_orphan(sk); sock->sk = NULL; llc_sk_free(sk); out: return 0; } /** * llc_ui_autoport - provide a dynamically allocated SAP number * * Provide the caller with a dynamically allocated SAP number according * to the rules that are set in this function. Returns: 0 upon failure, * the SAP number otherwise. */ static int llc_ui_autoport(void) { struct llc_sap *sap; int i, tries = 0; while (tries < LLC_SAP_DYN_TRIES) { for (i = llc_ui_sap_last_autoport; i < LLC_SAP_DYN_STOP; i += 2) { sap = llc_sap_find(i); if (!sap) { llc_ui_sap_last_autoport = i + 2; goto out; } llc_sap_put(sap); } llc_ui_sap_last_autoport = LLC_SAP_DYN_START; tries++; } i = 0; out: return i; } /** * llc_ui_autobind - automatically bind a socket to a sap * @sock: socket to bind * @addr: address to connect to * * Used by llc_ui_connect and llc_ui_sendmsg when the user hasn't * specifically used llc_ui_bind to bind to a specific address/sap * * Returns: 0 upon success, negative otherwise.
*/ static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); struct net_device *dev = NULL; struct llc_sap *sap; int rc = -EINVAL; if (!sock_flag(sk, SOCK_ZAPPED)) goto out; if (!addr->sllc_arphrd) addr->sllc_arphrd = ARPHRD_ETHER; if (addr->sllc_arphrd != ARPHRD_ETHER) goto out; rc = -ENODEV; if (sk->sk_bound_dev_if) { dev = dev_get_by_index(&init_net, sk->sk_bound_dev_if); if (dev && addr->sllc_arphrd != dev->type) { dev_put(dev); dev = NULL; } } else dev = dev_getfirstbyhwtype(&init_net, addr->sllc_arphrd); if (!dev) goto out; rc = -EUSERS; llc->laddr.lsap = llc_ui_autoport(); if (!llc->laddr.lsap) goto out; rc = -EBUSY; /* some other network layer is using the sap */ sap = llc_sap_open(llc->laddr.lsap, NULL); if (!sap) goto out; /* Note: We do not expect errors from this point. */ llc->dev = dev; netdev_tracker_alloc(llc->dev, &llc->dev_tracker, GFP_KERNEL); dev = NULL; memcpy(llc->laddr.mac, llc->dev->dev_addr, IFHWADDRLEN); memcpy(&llc->addr, addr, sizeof(llc->addr)); /* assign new connection to its SAP */ llc_sap_add_socket(sap, sk); sock_reset_flag(sk, SOCK_ZAPPED); rc = 0; out: dev_put(dev); return rc; } /** * llc_ui_bind - bind a socket to a specific address. * @sock: Socket to bind an address to. * @uaddr: Address the user wants the socket bound to. * @addrlen: Length of the uaddr structure. * * Bind a socket to a specific address. For llc a user is able to bind to * a specific sap only or mac + sap. * If the user desires to bind to a specific mac + sap, it is possible to * have multiple sap connections via multiple macs. * Bind and autobind for that matter must enforce the correct sap usage * otherwise all hell will break loose. * Returns: 0 upon success, negative otherwise. */ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) { struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); struct net_device *dev = NULL; struct llc_sap *sap; int rc = -EINVAL; lock_sock(sk); if (unlikely(!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr))) goto out; rc = -EAFNOSUPPORT; if (!addr->sllc_arphrd) addr->sllc_arphrd = ARPHRD_ETHER; if (unlikely(addr->sllc_family != AF_LLC || addr->sllc_arphrd != ARPHRD_ETHER)) goto out; dprintk("%s: binding %02X\n", __func__, addr->sllc_sap); rc = -ENODEV; rcu_read_lock(); if (sk->sk_bound_dev_if) { dev = dev_get_by_index_rcu(&init_net, sk->sk_bound_dev_if); if (dev) { if (is_zero_ether_addr(addr->sllc_mac)) memcpy(addr->sllc_mac, dev->dev_addr, IFHWADDRLEN); if (addr->sllc_arphrd != dev->type || !ether_addr_equal(addr->sllc_mac, dev->dev_addr)) { rc = -EINVAL; dev = NULL; } } } else { dev = dev_getbyhwaddr_rcu(&init_net, addr->sllc_arphrd, addr->sllc_mac); } dev_hold(dev); rcu_read_unlock(); if (!dev) goto out; if (!addr->sllc_sap) { rc = -EUSERS; addr->sllc_sap = llc_ui_autoport(); if (!addr->sllc_sap) goto out; } sap = llc_sap_find(addr->sllc_sap); if (!sap) { sap = llc_sap_open(addr->sllc_sap, NULL); rc = -EBUSY; /* some other network layer is using the sap */ if (!sap) goto out; } else { struct llc_addr laddr, daddr; struct sock *ask; memset(&laddr, 0, sizeof(laddr)); memset(&daddr, 0, sizeof(daddr)); /* * FIXME: check if the address is multicast, * only SOCK_DGRAM can do this. */ memcpy(laddr.mac, addr->sllc_mac, IFHWADDRLEN); laddr.lsap = addr->sllc_sap; rc = -EADDRINUSE; /* mac + sap clash. 
*/ ask = llc_lookup_established(sap, &daddr, &laddr, &init_net); if (ask) { sock_put(ask); goto out_put; } } /* Note: We do not expect errors from this point. */ llc->dev = dev; netdev_tracker_alloc(llc->dev, &llc->dev_tracker, GFP_KERNEL); dev = NULL; llc->laddr.lsap = addr->sllc_sap; memcpy(llc->laddr.mac, addr->sllc_mac, IFHWADDRLEN); memcpy(&llc->addr, addr, sizeof(llc->addr)); /* assign new connection to its SAP */ llc_sap_add_socket(sap, sk); sock_reset_flag(sk, SOCK_ZAPPED); rc = 0; out_put: llc_sap_put(sap); out: dev_put(dev); release_sock(sk); return rc; } /** * llc_ui_shutdown - shutdown a connect llc2 socket. * @sock: Socket to shutdown. * @how: What part of the socket to shutdown. * * Shutdown a connected llc2 socket. Currently this function only supports * shutting down both sends and receives (2), we could probably make this * function such that a user can shutdown only half the connection but not * right now. * Returns: 0 upon success, negative otherwise. */ static int llc_ui_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; int rc = -ENOTCONN; lock_sock(sk); if (unlikely(sk->sk_state != TCP_ESTABLISHED)) goto out; rc = -EINVAL; if (how != 2) goto out; rc = llc_send_disc(sk); if (!rc) rc = llc_ui_wait_for_disc(sk, READ_ONCE(sk->sk_rcvtimeo)); /* Wake up anyone sleeping in poll */ sk->sk_state_change(sk); out: release_sock(sk); return rc; } /** * llc_ui_connect - Connect to a remote llc2 mac + sap. * @sock: Socket which will be connected to the remote destination. * @uaddr: Remote and possibly the local address of the new connection. * @addrlen: Size of uaddr structure. * @flags: Operational flags specified by the user. * * Connect to a remote llc2 mac + sap. The caller must specify the * destination mac and address to connect to. If the user hasn't previously * called bind(2) with a smac the address of the first interface of the * specified arp type will be used. * This function will autobind if user did not previously call bind. * Returns: 0 upon success, negative otherwise. */ static int llc_ui_connect(struct socket *sock, struct sockaddr *uaddr, int addrlen, int flags) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr; int rc = -EINVAL; lock_sock(sk); if (unlikely(addrlen != sizeof(*addr))) goto out; rc = -EAFNOSUPPORT; if (unlikely(addr->sllc_family != AF_LLC)) goto out; if (unlikely(sk->sk_type != SOCK_STREAM)) goto out; rc = -EALREADY; if (unlikely(sock->state == SS_CONNECTING)) goto out; /* bind connection to sap if user hasn't done it. */ if (sock_flag(sk, SOCK_ZAPPED)) { /* bind to sap with null dev, exclusive */ rc = llc_ui_autobind(sock, addr); if (rc) goto out; } llc->daddr.lsap = addr->sllc_sap; memcpy(llc->daddr.mac, addr->sllc_mac, IFHWADDRLEN); sock->state = SS_CONNECTING; sk->sk_state = TCP_SYN_SENT; llc->link = llc_ui_next_link_no(llc->sap->laddr.lsap); rc = llc_establish_connection(sk, llc->dev->dev_addr, addr->sllc_mac, addr->sllc_sap); if (rc) { dprintk("%s: llc_ui_send_conn failed :-(\n", __func__); sock->state = SS_UNCONNECTED; sk->sk_state = TCP_CLOSE; goto out; } if (sk->sk_state == TCP_SYN_SENT) { const long timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); if (!timeo || !llc_ui_wait_for_conn(sk, timeo)) goto out; rc = sock_intr_errno(timeo); if (signal_pending(current)) goto out; } if (sk->sk_state == TCP_CLOSE) goto sock_error; sock->state = SS_CONNECTED; rc = 0; out: release_sock(sk); return rc; sock_error: rc = sock_error(sk) ? 
: -ECONNABORTED; sock->state = SS_UNCONNECTED; goto out; } /** * llc_ui_listen - allow a normal socket to accept incoming connections * @sock: Socket to allow incoming connections on. * @backlog: Number of connections to queue. * * Allow a normal socket to accept incoming connections. * Returns 0 upon success, negative otherwise. */ static int llc_ui_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int rc = -EINVAL; lock_sock(sk); if (unlikely(sock->state != SS_UNCONNECTED)) goto out; rc = -EOPNOTSUPP; if (unlikely(sk->sk_type != SOCK_STREAM)) goto out; rc = -EAGAIN; if (sock_flag(sk, SOCK_ZAPPED)) goto out; rc = 0; if (!(unsigned int)backlog) /* BSDism */ backlog = 1; sk->sk_max_ack_backlog = backlog; if (sk->sk_state != TCP_LISTEN) { sk->sk_ack_backlog = 0; sk->sk_state = TCP_LISTEN; } sk->sk_socket->flags |= __SO_ACCEPTCON; out: release_sock(sk); return rc; } static int llc_ui_wait_for_disc(struct sock *sk, long timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int rc = 0; add_wait_queue(sk_sleep(sk), &wait); while (1) { if (sk_wait_event(sk, &timeout, READ_ONCE(sk->sk_state) == TCP_CLOSE, &wait)) break; rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = -EAGAIN; if (!timeout) break; rc = 0; } remove_wait_queue(sk_sleep(sk), &wait); return rc; } static bool llc_ui_wait_for_conn(struct sock *sk, long timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(sk_sleep(sk), &wait); while (1) { if (sk_wait_event(sk, &timeout, READ_ONCE(sk->sk_state) != TCP_SYN_SENT, &wait)) break; if (signal_pending(current) || !timeout) break; } remove_wait_queue(sk_sleep(sk), &wait); return timeout; } static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct llc_sock *llc = llc_sk(sk); int rc; add_wait_queue(sk_sleep(sk), &wait); while (1) { rc = 0; if (sk_wait_event(sk, &timeout, (READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN) || (!llc_data_accept_state(llc->state) && !llc->remote_busy_flag && !llc->p_flag), &wait)) break; rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = -EAGAIN; if (!timeout) break; } remove_wait_queue(sk_sleep(sk), &wait); return rc; } static int llc_wait_data(struct sock *sk, long timeo) { int rc; while (1) { /* * POSIX 1003.1g mandates this order. */ rc = sock_error(sk); if (rc) break; rc = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) break; rc = -EAGAIN; if (!timeo) break; rc = sock_intr_errno(timeo); if (signal_pending(current)) break; rc = 0; if (sk_wait_data(sk, &timeo, NULL)) break; } return rc; } static void llc_cmsg_rcv(struct msghdr *msg, struct sk_buff *skb) { struct llc_sock *llc = llc_sk(skb->sk); if (llc->cmsg_flags & LLC_CMSG_PKTINFO) { struct llc_pktinfo info; memset(&info, 0, sizeof(info)); info.lpi_ifindex = llc_sk(skb->sk)->dev->ifindex; llc_pdu_decode_dsap(skb, &info.lpi_sap); llc_pdu_decode_da(skb, info.lpi_mac); put_cmsg(msg, SOL_LLC, LLC_OPT_PKTINFO, sizeof(info), &info); } } /** * llc_ui_accept - accept a new incoming connection. * @sock: Socket which connections arrive on. * @newsock: Socket to move incoming connection to. * @arg: User specified arguments * * Accept a new incoming connection. * Returns 0 upon success, negative otherwise. 
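 *
 * For orientation, a hedged sketch of the userspace call sequence that
 * reaches this function on connection-oriented (LLC2) sockets; laddr is a
 * placeholder struct sockaddr_llc carrying a local MAC + SAP:
 *
 *	fd = socket(AF_LLC, SOCK_STREAM, 0);
 *	bind(fd, (struct sockaddr *)&laddr, sizeof(laddr));
 *	listen(fd, 5);
 *	newfd = accept(fd, NULL, NULL);		(newfd is the established connection)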
*/ static int llc_ui_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *sk = sock->sk, *newsk; struct llc_sock *llc, *newllc; struct sk_buff *skb; int rc = -EOPNOTSUPP; dprintk("%s: accepting on %02X\n", __func__, llc_sk(sk)->laddr.lsap); lock_sock(sk); if (unlikely(sk->sk_type != SOCK_STREAM)) goto out; rc = -EINVAL; if (unlikely(sock->state != SS_UNCONNECTED || sk->sk_state != TCP_LISTEN)) goto out; /* wait for a connection to arrive. */ if (skb_queue_empty(&sk->sk_receive_queue)) { rc = llc_wait_data(sk, READ_ONCE(sk->sk_rcvtimeo)); if (rc) goto out; } dprintk("%s: got a new connection on %02X\n", __func__, llc_sk(sk)->laddr.lsap); skb = skb_dequeue(&sk->sk_receive_queue); rc = -EINVAL; if (!skb->sk) goto frees; rc = 0; newsk = skb->sk; /* attach connection to a new socket. */ llc_ui_sk_init(newsock, newsk); sock_reset_flag(newsk, SOCK_ZAPPED); newsk->sk_state = TCP_ESTABLISHED; newsock->state = SS_CONNECTED; llc = llc_sk(sk); newllc = llc_sk(newsk); memcpy(&newllc->addr, &llc->addr, sizeof(newllc->addr)); newllc->link = llc_ui_next_link_no(newllc->laddr.lsap); /* put original socket back into a clean listen state. */ sk->sk_state = TCP_LISTEN; sk_acceptq_removed(sk); dprintk("%s: ok success on %02X, client on %02X\n", __func__, llc_sk(sk)->addr.sllc_sap, newllc->daddr.lsap); frees: kfree_skb(skb); out: release_sock(sk); return rc; } /** * llc_ui_recvmsg - copy received data to the socket user. * @sock: Socket to copy data from. * @msg: Various user space related information. * @len: Size of user buffer. * @flags: User specified flags. * * Copy received data to the socket user. * Returns non-negative upon success, negative otherwise. */ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { DECLARE_SOCKADDR(struct sockaddr_llc *, uaddr, msg->msg_name); const int nonblock = flags & MSG_DONTWAIT; struct sk_buff *skb = NULL; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); size_t copied = 0; u32 peek_seq = 0; u32 *seq, skb_len; unsigned long used; int target; /* Read at least this many bytes */ long timeo; lock_sock(sk); copied = -ENOTCONN; if (unlikely(sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN)) goto out; timeo = sock_rcvtimeo(sk, nonblock); seq = &llc->copied_seq; if (flags & MSG_PEEK) { peek_seq = llc->copied_seq; seq = &peek_seq; } target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); copied = 0; do { u32 offset; /* * We need to check signals first, to get correct SIGURG * handling. FIXME: Need to check this doesn't impact 1003.1g * and move it down to the bottom of the loop */ if (signal_pending(current)) { if (copied) break; copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; break; } /* Next get a buffer. */ skb = skb_peek(&sk->sk_receive_queue); if (skb) { offset = *seq; goto found_ok_skb; } /* Well, if we have backlog, try to process it now yet. */ if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { if (sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || (flags & MSG_PEEK)) break; } else { if (sock_flag(sk, SOCK_DONE)) break; if (sk->sk_err) { copied = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_CLOSE) { if (!sock_flag(sk, SOCK_DONE)) { /* * This occurs when user tries to read * from never connected socket. 
*/ copied = -ENOTCONN; break; } break; } if (!timeo) { copied = -EAGAIN; break; } } if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); lock_sock(sk); } else sk_wait_data(sk, &timeo, NULL); if ((flags & MSG_PEEK) && peek_seq != llc->copied_seq) { net_dbg_ratelimited("LLC(%s:%d): Application bug, race in MSG_PEEK\n", current->comm, task_pid_nr(current)); peek_seq = llc->copied_seq; } continue; found_ok_skb: skb_len = skb->len; /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) used = len; if (!(flags & MSG_TRUNC)) { int rc = skb_copy_datagram_msg(skb, offset, msg, used); if (rc) { /* Exception. Bailout! */ if (!copied) copied = -EFAULT; break; } } *seq += used; copied += used; len -= used; /* For non stream protcols we get one packet per recvmsg call */ if (sk->sk_type != SOCK_STREAM) goto copy_uaddr; /* Partial read */ if (used + offset < skb_len) continue; if (!(flags & MSG_PEEK)) { skb_unlink(skb, &sk->sk_receive_queue); kfree_skb(skb); *seq = 0; } } while (len > 0); out: release_sock(sk); return copied; copy_uaddr: if (uaddr != NULL && skb != NULL) { memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr)); msg->msg_namelen = sizeof(*uaddr); } if (llc_sk(sk)->cmsg_flags) llc_cmsg_rcv(msg, skb); if (!(flags & MSG_PEEK)) { skb_unlink(skb, &sk->sk_receive_queue); kfree_skb(skb); *seq = 0; } goto out; } /** * llc_ui_sendmsg - Transmit data provided by the socket user. * @sock: Socket to transmit data from. * @msg: Various user related information. * @len: Length of data to transmit. * * Transmit data provided by the socket user. * Returns non-negative upon success, negative otherwise. */ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name); struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); int flags = msg->msg_flags; int noblock = flags & MSG_DONTWAIT; int rc = -EINVAL, copied = 0, hdrlen, hh_len; struct sk_buff *skb = NULL; struct net_device *dev; size_t size = 0; dprintk("%s: sending from %02X to %02X\n", __func__, llc->laddr.lsap, llc->daddr.lsap); lock_sock(sk); if (addr) { if (msg->msg_namelen < sizeof(*addr)) goto out; } else { if (llc_ui_addr_null(&llc->addr)) goto out; addr = &llc->addr; } /* must bind connection to sap if user hasn't done it. */ if (sock_flag(sk, SOCK_ZAPPED)) { /* bind to sap with null dev, exclusive. 
*/ rc = llc_ui_autobind(sock, addr); if (rc) goto out; } dev = llc->dev; hh_len = LL_RESERVED_SPACE(dev); hdrlen = llc_ui_header_len(sk, addr); size = hdrlen + len; size = min_t(size_t, size, READ_ONCE(dev->mtu)); copied = size - hdrlen; rc = -EINVAL; if (copied < 0) goto out; release_sock(sk); skb = sock_alloc_send_skb(sk, hh_len + size, noblock, &rc); lock_sock(sk); if (!skb) goto out; if (sock_flag(sk, SOCK_ZAPPED) || llc->dev != dev || hdrlen != llc_ui_header_len(sk, addr) || hh_len != LL_RESERVED_SPACE(dev) || size > READ_ONCE(dev->mtu)) goto out; skb->dev = dev; skb->protocol = llc_proto_type(addr->sllc_arphrd); skb_reserve(skb, hh_len + hdrlen); rc = memcpy_from_msg(skb_put(skb, copied), msg, copied); if (rc) goto out; if (sk->sk_type == SOCK_DGRAM || addr->sllc_ua) { llc_build_and_send_ui_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); skb = NULL; goto out; } if (addr->sllc_test) { llc_build_and_send_test_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); skb = NULL; goto out; } if (addr->sllc_xid) { llc_build_and_send_xid_pkt(llc->sap, skb, addr->sllc_mac, addr->sllc_sap); skb = NULL; goto out; } rc = -ENOPROTOOPT; if (!(sk->sk_type == SOCK_STREAM && !addr->sllc_ua)) goto out; rc = llc_ui_send_data(sk, skb, noblock); skb = NULL; out: kfree_skb(skb); if (rc) dprintk("%s: failed sending from %02X to %02X: %d\n", __func__, llc->laddr.lsap, llc->daddr.lsap, rc); release_sock(sk); return rc ? : copied; } /** * llc_ui_getname - return the address info of a socket * @sock: Socket to get address of. * @uaddr: Address structure to return information. * @peer: Does user want local or remote address information. * * Return the address information of a socket. */ static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_llc sllc; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); int rc = -EBADF; memset(&sllc, 0, sizeof(sllc)); lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) goto out; if (peer) { rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) goto out; if(llc->dev) sllc.sllc_arphrd = llc->dev->type; sllc.sllc_sap = llc->daddr.lsap; memcpy(&sllc.sllc_mac, &llc->daddr.mac, IFHWADDRLEN); } else { rc = -EINVAL; if (!llc->sap) goto out; sllc.sllc_sap = llc->sap->laddr.lsap; if (llc->dev) { sllc.sllc_arphrd = llc->dev->type; memcpy(&sllc.sllc_mac, llc->dev->dev_addr, IFHWADDRLEN); } } sllc.sllc_family = AF_LLC; memcpy(uaddr, &sllc, sizeof(sllc)); rc = sizeof(sllc); out: release_sock(sk); return rc; } /** * llc_ui_ioctl - io controls for PF_LLC * @sock: Socket to get/set info * @cmd: command * @arg: optional argument for cmd * * get/set info on llc sockets */ static int llc_ui_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { return -ENOIOCTLCMD; } /** * llc_ui_setsockopt - set various connection specific parameters. * @sock: Socket to set options on. * @level: Socket level user is requesting operations on. * @optname: Operation name. * @optval: User provided operation data. * @optlen: Length of optval. * * Set various connection specific parameters. 
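 *
 * Illustrative usage from userspace (a sketch, not taken from this file;
 * the fd and the value are placeholders, the option names come from the
 * LLC uapi header and the bounds are enforced below):
 *
 *	int txwin = 4;
 *	setsockopt(fd, SOL_LLC, LLC_OPT_TX_WIN, &txwin, sizeof(txwin));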
*/ static int llc_ui_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); unsigned int opt; int rc = -EINVAL; lock_sock(sk); if (unlikely(level != SOL_LLC || optlen != sizeof(int))) goto out; rc = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (rc) goto out; rc = -EINVAL; switch (optname) { case LLC_OPT_RETRY: if (opt > LLC_OPT_MAX_RETRY) goto out; llc->n2 = opt; break; case LLC_OPT_SIZE: if (opt > LLC_OPT_MAX_SIZE) goto out; llc->n1 = opt; break; case LLC_OPT_ACK_TMR_EXP: if (opt > LLC_OPT_MAX_ACK_TMR_EXP) goto out; llc->ack_timer.expire = opt * HZ; break; case LLC_OPT_P_TMR_EXP: if (opt > LLC_OPT_MAX_P_TMR_EXP) goto out; llc->pf_cycle_timer.expire = opt * HZ; break; case LLC_OPT_REJ_TMR_EXP: if (opt > LLC_OPT_MAX_REJ_TMR_EXP) goto out; llc->rej_sent_timer.expire = opt * HZ; break; case LLC_OPT_BUSY_TMR_EXP: if (opt > LLC_OPT_MAX_BUSY_TMR_EXP) goto out; llc->busy_state_timer.expire = opt * HZ; break; case LLC_OPT_TX_WIN: if (opt > LLC_OPT_MAX_WIN) goto out; llc->k = opt; break; case LLC_OPT_RX_WIN: if (opt > LLC_OPT_MAX_WIN) goto out; llc->rw = opt; break; case LLC_OPT_PKTINFO: if (opt) llc->cmsg_flags |= LLC_CMSG_PKTINFO; else llc->cmsg_flags &= ~LLC_CMSG_PKTINFO; break; default: rc = -ENOPROTOOPT; goto out; } rc = 0; out: release_sock(sk); return rc; } /** * llc_ui_getsockopt - get connection specific socket info * @sock: Socket to get information from. * @level: Socket level user is requesting operations on. * @optname: Operation name. * @optval: Variable to return operation data in. * @optlen: Length of optval. * * Get connection specific socket information. */ static int llc_ui_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); int val = 0, len = 0, rc = -EINVAL; lock_sock(sk); if (unlikely(level != SOL_LLC)) goto out; rc = get_user(len, optlen); if (rc) goto out; rc = -EINVAL; if (len != sizeof(int)) goto out; switch (optname) { case LLC_OPT_RETRY: val = llc->n2; break; case LLC_OPT_SIZE: val = llc->n1; break; case LLC_OPT_ACK_TMR_EXP: val = llc->ack_timer.expire / HZ; break; case LLC_OPT_P_TMR_EXP: val = llc->pf_cycle_timer.expire / HZ; break; case LLC_OPT_REJ_TMR_EXP: val = llc->rej_sent_timer.expire / HZ; break; case LLC_OPT_BUSY_TMR_EXP: val = llc->busy_state_timer.expire / HZ; break; case LLC_OPT_TX_WIN: val = llc->k; break; case LLC_OPT_RX_WIN: val = llc->rw; break; case LLC_OPT_PKTINFO: val = (llc->cmsg_flags & LLC_CMSG_PKTINFO) != 0; break; default: rc = -ENOPROTOOPT; goto out; } rc = 0; if (put_user(len, optlen) || copy_to_user(optval, &val, len)) rc = -EFAULT; out: release_sock(sk); return rc; } static const struct net_proto_family llc_ui_family_ops = { .family = PF_LLC, .create = llc_ui_create, .owner = THIS_MODULE, }; static const struct proto_ops llc_ui_ops = { .family = PF_LLC, .owner = THIS_MODULE, .release = llc_ui_release, .bind = llc_ui_bind, .connect = llc_ui_connect, .socketpair = sock_no_socketpair, .accept = llc_ui_accept, .getname = llc_ui_getname, .poll = datagram_poll, .ioctl = llc_ui_ioctl, .listen = llc_ui_listen, .shutdown = llc_ui_shutdown, .setsockopt = llc_ui_setsockopt, .getsockopt = llc_ui_getsockopt, .sendmsg = llc_ui_sendmsg, .recvmsg = llc_ui_recvmsg, .mmap = sock_no_mmap, }; static const char llc_proc_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the proc_fs entries\n"; static const char 
llc_sysctl_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the sysctl entries\n"; static const char llc_sock_err_msg[] __initconst = KERN_CRIT "LLC: Unable to register the network family\n"; static int __init llc2_init(void) { int rc = proto_register(&llc_proto, 0); if (rc != 0) goto out; llc_build_offset_table(); llc_station_init(); llc_ui_sap_last_autoport = LLC_SAP_DYN_START; rc = llc_proc_init(); if (rc != 0) { printk(llc_proc_err_msg); goto out_station; } rc = llc_sysctl_init(); if (rc) { printk(llc_sysctl_err_msg); goto out_proc; } rc = sock_register(&llc_ui_family_ops); if (rc) { printk(llc_sock_err_msg); goto out_sysctl; } llc_add_pack(LLC_DEST_SAP, llc_sap_handler); llc_add_pack(LLC_DEST_CONN, llc_conn_handler); out: return rc; out_sysctl: llc_sysctl_exit(); out_proc: llc_proc_exit(); out_station: llc_station_exit(); proto_unregister(&llc_proto); goto out; } static void __exit llc2_exit(void) { llc_station_exit(); llc_remove_pack(LLC_DEST_SAP); llc_remove_pack(LLC_DEST_CONN); sock_unregister(PF_LLC); llc_proc_exit(); llc_sysctl_exit(); proto_unregister(&llc_proto); } module_init(llc2_init); module_exit(llc2_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Procom 1997, Jay Schullist 2001, Arnaldo C. Melo 2001-2003"); MODULE_DESCRIPTION("IEEE 802.2 PF_LLC support"); MODULE_ALIAS_NETPROTO(PF_LLC);
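The file above wires PF_LLC into the normal sockets API, so plain socket(2)/bind(2)/sendto(2) calls drive it. Below is a hypothetical userspace sketch, not part of af_llc.c, showing one way to emit a single LLC UI datagram; the SAP number and MAC addresses are placeholders, the bind address must be the MAC of a local Ethernet interface, and the caller needs CAP_NET_RAW in the initial network namespace as enforced by llc_ui_create().

/* Hypothetical userspace sketch, not part of the kernel sources above. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if.h>		/* IFHWADDRLEN */
#include <linux/if_arp.h>	/* ARPHRD_ETHER */
#include <linux/llc.h>		/* struct sockaddr_llc */

int main(void)
{
	unsigned char local_mac[IFHWADDRLEN]  = { 0x02, 0, 0, 0, 0, 0x01 };
	unsigned char remote_mac[IFHWADDRLEN] = { 0x02, 0, 0, 0, 0, 0x02 };
	struct sockaddr_llc laddr = { 0 }, daddr = { 0 };
	const char payload[] = "ping";
	int fd = socket(AF_LLC, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket(AF_LLC)");
		return 1;
	}

	laddr.sllc_family = AF_LLC;
	laddr.sllc_arphrd = ARPHRD_ETHER;
	laddr.sllc_sap = 0x42;			/* 0 would trigger llc_ui_autoport() */
	memcpy(laddr.sllc_mac, local_mac, IFHWADDRLEN);
	if (bind(fd, (struct sockaddr *)&laddr, sizeof(laddr)) < 0) {
		perror("bind");
		return 1;
	}

	daddr.sllc_family = AF_LLC;
	daddr.sllc_arphrd = ARPHRD_ETHER;
	daddr.sllc_sap = 0x42;
	memcpy(daddr.sllc_mac, remote_mac, IFHWADDRLEN);

	/* SOCK_DGRAM sockets emit one UI frame per sendto()/sendmsg() call. */
	if (sendto(fd, payload, sizeof(payload), 0,
		   (struct sockaddr *)&daddr, sizeof(daddr)) < 0)
		perror("sendto");

	close(fd);
	return 0;
}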
/* * Copyright (c) 1982, 1986 Regents of the University of California. * All rights reserved. * * This code is derived from software contributed to Berkeley by * Robert Elz at The University of Melbourne. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #ifndef _LINUX_QUOTA_ #define _LINUX_QUOTA_ #include <linux/list.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/spinlock.h> #include <linux/wait.h> #include <linux/percpu_counter.h> #include <linux/dqblk_xfs.h> #include <linux/dqblk_v1.h> #include <linux/dqblk_v2.h> #include <linux/atomic.h> #include <linux/uidgid.h> #include <linux/projid.h> #include <uapi/linux/quota.h> #undef USRQUOTA #undef GRPQUOTA #undef PRJQUOTA enum quota_type { USRQUOTA = 0, /* element used for user quotas */ GRPQUOTA = 1, /* element used for group quotas */ PRJQUOTA = 2, /* element used for project quotas */ }; /* Masks for quota types when used as a bitmask */ #define QTYPE_MASK_USR (1 << USRQUOTA) #define QTYPE_MASK_GRP (1 << GRPQUOTA) #define QTYPE_MASK_PRJ (1 << PRJQUOTA) typedef __kernel_uid32_t qid_t; /* Type in which we store ids in memory */ typedef long long qsize_t; /* Type in which we store sizes */ struct kqid { /* Type in which we store the quota identifier */ union { kuid_t uid; kgid_t gid; kprojid_t projid; }; enum quota_type type; /* USRQUOTA (uid) or GRPQUOTA (gid) or PRJQUOTA (projid) */ }; extern bool qid_eq(struct kqid left, struct kqid right); extern bool qid_lt(struct kqid left, struct kqid right); extern qid_t from_kqid(struct user_namespace *to, struct kqid qid); extern qid_t from_kqid_munged(struct user_namespace *to, struct kqid qid); extern bool qid_valid(struct kqid qid); /** * make_kqid - Map a user-namespace, type, qid tuple into a kqid. * @from: User namespace that the qid is in * @type: The type of quota * @qid: Quota identifier * * Maps a user-namespace, type qid tuple into a kernel internal * kqid, and returns that kqid. * * When there is no mapping defined for the user-namespace, type, * qid tuple an invalid kqid is returned. Callers are expected to * test for and handle invalid kqids being returned. * Invalid kqids may be tested for using qid_valid(). */ static inline struct kqid make_kqid(struct user_namespace *from, enum quota_type type, qid_t qid) { struct kqid kqid; kqid.type = type; switch (type) { case USRQUOTA: kqid.uid = make_kuid(from, qid); break; case GRPQUOTA: kqid.gid = make_kgid(from, qid); break; case PRJQUOTA: kqid.projid = make_kprojid(from, qid); break; default: BUG(); } return kqid; } /** * make_kqid_invalid - Explicitly make an invalid kqid * @type: The type of quota identifier * * Returns an invalid kqid with the specified type. 
*/ static inline struct kqid make_kqid_invalid(enum quota_type type) { struct kqid kqid; kqid.type = type; switch (type) { case USRQUOTA: kqid.uid = INVALID_UID; break; case GRPQUOTA: kqid.gid = INVALID_GID; break; case PRJQUOTA: kqid.projid = INVALID_PROJID; break; default: BUG(); } return kqid; } /** * make_kqid_uid - Make a kqid from a kuid * @uid: The kuid to make the quota identifier from */ static inline struct kqid make_kqid_uid(kuid_t uid) { struct kqid kqid; kqid.type = USRQUOTA; kqid.uid = uid; return kqid; } /** * make_kqid_gid - Make a kqid from a kgid * @gid: The kgid to make the quota identifier from */ static inline struct kqid make_kqid_gid(kgid_t gid) { struct kqid kqid; kqid.type = GRPQUOTA; kqid.gid = gid; return kqid; } /** * make_kqid_projid - Make a kqid from a projid * @projid: The kprojid to make the quota identifier from */ static inline struct kqid make_kqid_projid(kprojid_t projid) { struct kqid kqid; kqid.type = PRJQUOTA; kqid.projid = projid; return kqid; } /** * qid_has_mapping - Report if a qid maps into a user namespace. * @ns: The user namespace to see if a value maps into. * @qid: The kernel internal quota identifier to test. */ static inline bool qid_has_mapping(struct user_namespace *ns, struct kqid qid) { return from_kqid(ns, qid) != (qid_t) -1; } extern spinlock_t dq_data_lock; /* Maximal numbers of writes for quota operation (insert/delete/update) * (over VFS all formats) */ #define DQUOT_INIT_ALLOC max(V1_INIT_ALLOC, V2_INIT_ALLOC) #define DQUOT_INIT_REWRITE max(V1_INIT_REWRITE, V2_INIT_REWRITE) #define DQUOT_DEL_ALLOC max(V1_DEL_ALLOC, V2_DEL_ALLOC) #define DQUOT_DEL_REWRITE max(V1_DEL_REWRITE, V2_DEL_REWRITE) /* * Data for one user/group kept in memory */ struct mem_dqblk { qsize_t dqb_bhardlimit; /* absolute limit on disk blks alloc */ qsize_t dqb_bsoftlimit; /* preferred limit on disk blks */ qsize_t dqb_curspace; /* current used space */ qsize_t dqb_rsvspace; /* current reserved space for delalloc*/ qsize_t dqb_ihardlimit; /* absolute limit on allocated inodes */ qsize_t dqb_isoftlimit; /* preferred inode limit */ qsize_t dqb_curinodes; /* current # allocated inodes */ time64_t dqb_btime; /* time limit for excessive disk use */ time64_t dqb_itime; /* time limit for excessive inode use */ }; /* * Data for one quotafile kept in memory */ struct quota_format_type; struct mem_dqinfo { struct quota_format_type *dqi_format; int dqi_fmt_id; /* Id of the dqi_format - used when turning * quotas on after remount RW */ struct list_head dqi_dirty_list; /* List of dirty dquots [dq_list_lock] */ unsigned long dqi_flags; /* DFQ_ flags [dq_data_lock] */ unsigned int dqi_bgrace; /* Space grace time [dq_data_lock] */ unsigned int dqi_igrace; /* Inode grace time [dq_data_lock] */ qsize_t dqi_max_spc_limit; /* Maximum space limit [static] */ qsize_t dqi_max_ino_limit; /* Maximum inode limit [static] */ void *dqi_priv; }; struct super_block; /* Mask for flags passed to userspace */ #define DQF_GETINFO_MASK (DQF_ROOT_SQUASH | DQF_SYS_FILE) /* Mask for flags modifiable from userspace */ #define DQF_SETINFO_MASK DQF_ROOT_SQUASH enum { DQF_INFO_DIRTY_B = DQF_PRIVATE, }; #define DQF_INFO_DIRTY (1 << DQF_INFO_DIRTY_B) /* Is info dirty? 
*/ extern void mark_info_dirty(struct super_block *sb, int type); static inline int info_dirty(struct mem_dqinfo *info) { return test_bit(DQF_INFO_DIRTY_B, &info->dqi_flags); } enum { DQST_LOOKUPS, DQST_DROPS, DQST_READS, DQST_WRITES, DQST_CACHE_HITS, DQST_ALLOC_DQUOTS, DQST_FREE_DQUOTS, DQST_SYNCS, _DQST_DQSTAT_LAST }; struct dqstats { unsigned long stat[_DQST_DQSTAT_LAST]; struct percpu_counter counter[_DQST_DQSTAT_LAST]; }; extern struct dqstats dqstats; static inline void dqstats_inc(unsigned int type) { percpu_counter_inc(&dqstats.counter[type]); } static inline void dqstats_dec(unsigned int type) { percpu_counter_dec(&dqstats.counter[type]); } #define DQ_MOD_B 0 /* dquot modified since read */ #define DQ_BLKS_B 1 /* uid/gid has been warned about blk limit */ #define DQ_INODES_B 2 /* uid/gid has been warned about inode limit */ #define DQ_FAKE_B 3 /* no limits only usage */ #define DQ_READ_B 4 /* dquot was read into memory */ #define DQ_ACTIVE_B 5 /* dquot is active (dquot_release not called) */ #define DQ_RELEASING_B 6 /* dquot is in releasing_dquots list waiting * to be cleaned up */ #define DQ_LASTSET_B 7 /* Following 6 bits (see QIF_) are reserved\ * for the mask of entries set via SETQUOTA\ * quotactl. They are set under dq_data_lock\ * and the quota format handling dquot can\ * clear them when it sees fit. */ struct dquot { struct hlist_node dq_hash; /* Hash list in memory [dq_list_lock] */ struct list_head dq_inuse; /* List of all quotas [dq_list_lock] */ struct list_head dq_free; /* Free list element [dq_list_lock] */ struct list_head dq_dirty; /* List of dirty dquots [dq_list_lock] */ struct mutex dq_lock; /* dquot IO lock */ spinlock_t dq_dqb_lock; /* Lock protecting dq_dqb changes */ atomic_t dq_count; /* Use count */ struct super_block *dq_sb; /* superblock this applies to */ struct kqid dq_id; /* ID this applies to (uid, gid, projid) */ loff_t dq_off; /* Offset of dquot on disk [dq_lock, stable once set] */ unsigned long dq_flags; /* See DQ_* */ struct mem_dqblk dq_dqb; /* Diskquota usage [dq_dqb_lock] */ }; /* Operations which must be implemented by each quota format */ struct quota_format_ops { int (*check_quota_file)(struct super_block *sb, int type); /* Detect whether file is in our format */ int (*read_file_info)(struct super_block *sb, int type); /* Read main info about file - called on quotaon() */ int (*write_file_info)(struct super_block *sb, int type); /* Write main info about file */ int (*free_file_info)(struct super_block *sb, int type); /* Called on quotaoff() */ int (*read_dqblk)(struct dquot *dquot); /* Read structure for one user */ int (*commit_dqblk)(struct dquot *dquot); /* Write structure for one user */ int (*release_dqblk)(struct dquot *dquot); /* Called when last reference to dquot is being dropped */ int (*get_next_id)(struct super_block *sb, struct kqid *qid); /* Get next ID with existing structure in the quota file */ }; /* Operations working with dquots */ struct dquot_operations { int (*write_dquot) (struct dquot *); /* Ordinary dquot write */ struct dquot *(*alloc_dquot)(struct super_block *, int); /* Allocate memory for new dquot */ void (*destroy_dquot)(struct dquot *); /* Free memory for dquot */ int (*acquire_dquot) (struct dquot *); /* Quota is going to be created on disk */ int (*release_dquot) (struct dquot *); /* Quota is going to be deleted from disk */ int (*mark_dirty) (struct dquot *); /* Dquot is marked dirty */ int (*write_info) (struct super_block *, int); /* Write of quota "superblock" */ /* get reserved quota for delayed 
alloc, value returned is managed by * quota code only */ qsize_t *(*get_reserved_space) (struct inode *); int (*get_projid) (struct inode *, kprojid_t *);/* Get project ID */ /* Get number of inodes that were charged for a given inode */ int (*get_inode_usage) (struct inode *, qsize_t *); /* Get next ID with active quota structure */ int (*get_next_id) (struct super_block *sb, struct kqid *qid); }; struct path; /* Structure for communicating via ->get_dqblk() & ->set_dqblk() */ struct qc_dqblk { int d_fieldmask; /* mask of fields to change in ->set_dqblk() */ u64 d_spc_hardlimit; /* absolute limit on used space */ u64 d_spc_softlimit; /* preferred limit on used space */ u64 d_ino_hardlimit; /* maximum # allocated inodes */ u64 d_ino_softlimit; /* preferred inode limit */ u64 d_space; /* Space owned by the user */ u64 d_ino_count; /* # inodes owned by the user */ s64 d_ino_timer; /* zero if within inode limits */ /* if not, we refuse service */ s64 d_spc_timer; /* similar to above; for space */ int d_ino_warns; /* # warnings issued wrt num inodes */ int d_spc_warns; /* # warnings issued wrt used space */ u64 d_rt_spc_hardlimit; /* absolute limit on realtime space */ u64 d_rt_spc_softlimit; /* preferred limit on RT space */ u64 d_rt_space; /* realtime space owned */ s64 d_rt_spc_timer; /* similar to above; for RT space */ int d_rt_spc_warns; /* # warnings issued wrt RT space */ }; /* * Field specifiers for ->set_dqblk() in struct qc_dqblk and also for * ->set_info() in struct qc_info */ #define QC_INO_SOFT (1<<0) #define QC_INO_HARD (1<<1) #define QC_SPC_SOFT (1<<2) #define QC_SPC_HARD (1<<3) #define QC_RT_SPC_SOFT (1<<4) #define QC_RT_SPC_HARD (1<<5) #define QC_LIMIT_MASK (QC_INO_SOFT | QC_INO_HARD | QC_SPC_SOFT | QC_SPC_HARD | \ QC_RT_SPC_SOFT | QC_RT_SPC_HARD) #define QC_SPC_TIMER (1<<6) #define QC_INO_TIMER (1<<7) #define QC_RT_SPC_TIMER (1<<8) #define QC_TIMER_MASK (QC_SPC_TIMER | QC_INO_TIMER | QC_RT_SPC_TIMER) #define QC_SPC_WARNS (1<<9) #define QC_INO_WARNS (1<<10) #define QC_RT_SPC_WARNS (1<<11) #define QC_WARNS_MASK (QC_SPC_WARNS | QC_INO_WARNS | QC_RT_SPC_WARNS) #define QC_SPACE (1<<12) #define QC_INO_COUNT (1<<13) #define QC_RT_SPACE (1<<14) #define QC_ACCT_MASK (QC_SPACE | QC_INO_COUNT | QC_RT_SPACE) #define QC_FLAGS (1<<15) #define QCI_SYSFILE (1 << 0) /* Quota file is hidden from userspace */ #define QCI_ROOT_SQUASH (1 << 1) /* Root squash turned on */ #define QCI_ACCT_ENABLED (1 << 2) /* Quota accounting enabled */ #define QCI_LIMITS_ENFORCED (1 << 3) /* Quota limits enforced */ /* Structures for communicating via ->get_state */ struct qc_type_state { unsigned int flags; /* Flags QCI_* */ unsigned int spc_timelimit; /* Time after which space softlimit is * enforced */ unsigned int ino_timelimit; /* Ditto for inode softlimit */ unsigned int rt_spc_timelimit; /* Ditto for real-time space */ unsigned int spc_warnlimit; /* Limit for number of space warnings */ unsigned int ino_warnlimit; /* Ditto for inodes */ unsigned int rt_spc_warnlimit; /* Ditto for real-time space */ unsigned long long ino; /* Inode number of quota file */ blkcnt_t blocks; /* Number of 512-byte blocks in the file */ blkcnt_t nextents; /* Number of extents in the file */ }; struct qc_state { unsigned int s_incoredqs; /* Number of dquots in core */ struct qc_type_state s_state[MAXQUOTAS]; /* Per quota type information */ }; /* Structure for communicating via ->set_info */ struct qc_info { int i_fieldmask; /* mask of fields to change in ->set_info() */ unsigned int i_flags; /* Flags QCI_* */ unsigned int 
i_spc_timelimit; /* Time after which space softlimit is * enforced */ unsigned int i_ino_timelimit; /* Ditto for inode softlimit */ unsigned int i_rt_spc_timelimit;/* Ditto for real-time space */ unsigned int i_spc_warnlimit; /* Limit for number of space warnings */ unsigned int i_ino_warnlimit; /* Limit for number of inode warnings */ unsigned int i_rt_spc_warnlimit; /* Ditto for real-time space */ }; /* Operations handling requests from userspace */ struct quotactl_ops { int (*quota_on)(struct super_block *, int, int, const struct path *); int (*quota_off)(struct super_block *, int); int (*quota_enable)(struct super_block *, unsigned int); int (*quota_disable)(struct super_block *, unsigned int); int (*quota_sync)(struct super_block *, int); int (*set_info)(struct super_block *, int, struct qc_info *); int (*get_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *); int (*get_nextdqblk)(struct super_block *, struct kqid *, struct qc_dqblk *); int (*set_dqblk)(struct super_block *, struct kqid, struct qc_dqblk *); int (*get_state)(struct super_block *, struct qc_state *); int (*rm_xquota)(struct super_block *, unsigned int); }; struct quota_format_type { int qf_fmt_id; /* Quota format id */ const struct quota_format_ops *qf_ops; /* Operations of format */ struct module *qf_owner; /* Module implementing quota format */ struct quota_format_type *qf_next; }; /** * Quota state flags - they come in three flavors - for users, groups and projects. * * Actual typed flags layout: * USRQUOTA GRPQUOTA PRJQUOTA * DQUOT_USAGE_ENABLED 0x0001 0x0002 0x0004 * DQUOT_LIMITS_ENABLED 0x0008 0x0010 0x0020 * DQUOT_SUSPENDED 0x0040 0x0080 0x0100 * * Following bits are used for non-typed flags: * DQUOT_QUOTA_SYS_FILE 0x0200 * DQUOT_NEGATIVE_USAGE 0x0400 * DQUOT_NOLIST_DIRTY 0x0800 */ enum { _DQUOT_USAGE_ENABLED = 0, /* Track disk usage for users */ _DQUOT_LIMITS_ENABLED, /* Enforce quota limits for users */ _DQUOT_SUSPENDED, /* User diskquotas are off, but * we have necessary info in * memory to turn them on */ _DQUOT_STATE_FLAGS }; #define DQUOT_USAGE_ENABLED (1 << _DQUOT_USAGE_ENABLED * MAXQUOTAS) #define DQUOT_LIMITS_ENABLED (1 << _DQUOT_LIMITS_ENABLED * MAXQUOTAS) #define DQUOT_SUSPENDED (1 << _DQUOT_SUSPENDED * MAXQUOTAS) #define DQUOT_STATE_FLAGS (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED | \ DQUOT_SUSPENDED) /* Other quota flags */ #define DQUOT_STATE_LAST (_DQUOT_STATE_FLAGS * MAXQUOTAS) #define DQUOT_QUOTA_SYS_FILE (1 << DQUOT_STATE_LAST) /* Quota file is a special * system file and user cannot * touch it. 
Filesystem is * responsible for setting * S_NOQUOTA, S_NOATIME flags */ #define DQUOT_NEGATIVE_USAGE (1 << (DQUOT_STATE_LAST + 1)) /* Allow negative quota usage */ /* Do not track dirty dquots in a list */ #define DQUOT_NOLIST_DIRTY (1 << (DQUOT_STATE_LAST + 2)) static inline unsigned int dquot_state_flag(unsigned int flags, int type) { return flags << type; } static inline unsigned int dquot_generic_flag(unsigned int flags, int type) { return (flags >> type) & DQUOT_STATE_FLAGS; } /* Bitmap of quota types where flag is set in flags */ static __always_inline unsigned dquot_state_types(unsigned flags, unsigned flag) { BUILD_BUG_ON_NOT_POWER_OF_2(flag); return (flags / flag) & ((1 << MAXQUOTAS) - 1); } #ifdef CONFIG_QUOTA_NETLINK_INTERFACE extern void quota_send_warning(struct kqid qid, dev_t dev, const char warntype); #else static inline void quota_send_warning(struct kqid qid, dev_t dev, const char warntype) { return; } #endif /* CONFIG_QUOTA_NETLINK_INTERFACE */ struct quota_info { unsigned int flags; /* Flags for diskquotas on this device */ struct rw_semaphore dqio_sem; /* Lock quota file while I/O in progress */ struct inode *files[MAXQUOTAS]; /* inodes of quotafiles */ struct mem_dqinfo info[MAXQUOTAS]; /* Information for each quota type */ const struct quota_format_ops *ops[MAXQUOTAS]; /* Operations for each type */ }; void register_quota_format(struct quota_format_type *fmt); void unregister_quota_format(struct quota_format_type *fmt); struct quota_module_name { int qm_fmt_id; char *qm_mod_name; }; #define INIT_QUOTA_MODULE_NAMES {\ {QFMT_VFS_OLD, "quota_v1"},\ {QFMT_VFS_V0, "quota_v2"},\ {QFMT_VFS_V1, "quota_v2"},\ {0, NULL}} #endif /* _QUOTA_ */
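The typed quota state flags documented above reserve one bit per (flag, quota type) pair, grouped by flag. The following standalone sketch is illustrative only: it re-declares just enough of the header (assuming MAXQUOTAS == 3, which is what the hex table reflects) to show how dquot_state_flag() places a typed bit and how a per-type test reads it back.

#include <stdio.h>

#define MAXQUOTAS 3				/* user, group, project */
enum quota_type { USRQUOTA = 0, GRPQUOTA = 1, PRJQUOTA = 2 };

enum { _DQUOT_USAGE_ENABLED = 0, _DQUOT_LIMITS_ENABLED, _DQUOT_SUSPENDED };
#define DQUOT_USAGE_ENABLED	(1 << _DQUOT_USAGE_ENABLED * MAXQUOTAS)		/* 0x0001 */
#define DQUOT_LIMITS_ENABLED	(1 << _DQUOT_LIMITS_ENABLED * MAXQUOTAS)	/* 0x0008 */
#define DQUOT_SUSPENDED		(1 << _DQUOT_SUSPENDED * MAXQUOTAS)		/* 0x0040 */

/* Same shift rule as dquot_state_flag() in the header above. */
static unsigned int dquot_state_flag(unsigned int flags, int type)
{
	return flags << type;
}

int main(void)
{
	unsigned int sb_flags = 0;

	/* Track usage for users and groups, enforce limits for users only. */
	sb_flags |= dquot_state_flag(DQUOT_USAGE_ENABLED, USRQUOTA);	/* 0x0001 */
	sb_flags |= dquot_state_flag(DQUOT_USAGE_ENABLED, GRPQUOTA);	/* 0x0002 */
	sb_flags |= dquot_state_flag(DQUOT_LIMITS_ENABLED, USRQUOTA);	/* 0x0008 */

	printf("flags = %#x\n", sb_flags);				/* 0xb */

	/* Per-type readback, undoing the same shift as dquot_generic_flag(). */
	printf("group usage enabled:  %d\n",
	       !!((sb_flags >> GRPQUOTA) & DQUOT_USAGE_ENABLED));	/* 1 */
	printf("group limits enabled: %d\n",
	       !!((sb_flags >> GRPQUOTA) & DQUOT_LIMITS_ENABLED));	/* 0 */
	return 0;
}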
// SPDX-License-Identifier: GPL-2.0 #include <linux/mm.h> #include <linux/gfp.h> #include <linux/hugetlb.h> #include
<asm/pgalloc.h> #include <asm/tlb.h> #include <asm/fixmap.h> #include <asm/mtrr.h> #ifdef CONFIG_DYNAMIC_PHYSICAL_MASK phys_addr_t physical_mask __ro_after_init = (1ULL << __PHYSICAL_MASK_SHIFT) - 1; EXPORT_SYMBOL(physical_mask); SYM_PIC_ALIAS(physical_mask); #endif pgtable_t pte_alloc_one(struct mm_struct *mm) { return __pte_alloc_one(mm, GFP_PGTABLE_USER); } void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { paravirt_release_pte(page_to_pfn(pte)); tlb_remove_ptdesc(tlb, page_ptdesc(pte)); } #if CONFIG_PGTABLE_LEVELS > 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); /* * NOTE! For PAE, any changes to the top page-directory-pointer-table * entries need a full cr3 reload to flush. */ #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif tlb_remove_ptdesc(tlb, virt_to_ptdesc(pmd)); } #if CONFIG_PGTABLE_LEVELS > 3 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); tlb_remove_ptdesc(tlb, virt_to_ptdesc(pud)); } #if CONFIG_PGTABLE_LEVELS > 4 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) { paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); tlb_remove_ptdesc(tlb, virt_to_ptdesc(p4d)); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* CONFIG_PGTABLE_LEVELS > 2 */ static inline void pgd_list_add(pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); list_add(&ptdesc->pt_list, &pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { struct ptdesc *ptdesc = virt_to_ptdesc(pgd); list_del(&ptdesc->pt_list); } static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) { virt_to_ptdesc(pgd)->pt_mm = mm; } struct mm_struct *pgd_page_get_mm(struct page *page) { return page_ptdesc(page)->pt_mm; } static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { /* PAE preallocates all its PMDs. No cloning needed. */ if (!IS_ENABLED(CONFIG_X86_PAE)) clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); /* List used to sync kernel mapping updates */ pgd_set_mm(pgd, mm); pgd_list_add(pgd); } static void pgd_dtor(pgd_t *pgd) { spin_lock(&pgd_lock); pgd_list_del(pgd); spin_unlock(&pgd_lock); } /* * List of all pgd's needed for non-PAE so it can invalidate entries * in both cached and uncached pgd's; not needed for PAE since the * kernel pmd is shared. If PAE were not to share the pmd a similar * tactic would be needed. This is essentially codepath-based locking * against pageattr.c; it is the unique case in which a valid change * of kernel pagetables can't be lazily synchronized by vmalloc faults. * vmalloc faults work because attached pagetables are never freed. * -- nyc */ #ifdef CONFIG_X86_PAE /* * In PAE mode, we need to do a cr3 reload (=tlb flush) when * updating the top-level pagetable entries to guarantee the * processor notices the update. Since this is expensive, and * all 4 top-level entries are used almost immediately in a * new process's life, we just pre-populate them here. */ #define PREALLOCATED_PMDS PTRS_PER_PGD /* * "USER_PMDS" are the PMDs for the user copy of the page tables when * PTI is enabled. They do not exist when PTI is disabled. Note that * this is distinct from the user _portion_ of the kernel page tables * which always exists. * * We allocate separate PMDs for the kernel part of the user page-table * when PTI is enabled. We need them to map the per-process LDT into the * user-space page-table. */ #define PREALLOCATED_USER_PMDS (boot_cpu_has(X86_FEATURE_PTI) ? 
\ KERNEL_PGD_PTRS : 0) #define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) { paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); /* Note: almost everything apart from _PAGE_PRESENT is reserved at the pmd (PDPT) level. */ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); /* * According to Intel App note "TLBs, Paging-Structure Caches, * and Their Invalidation", April 2007, document 317080-001, * section 8.1: in PAE mode we explicitly have to flush the * TLB via cr3 if the top-level pgd is changed... */ flush_tlb_mm(mm); } #else /* !CONFIG_X86_PAE */ /* No need to prepopulate any pagetable entries in non-PAE modes. */ #define PREALLOCATED_PMDS 0 #define PREALLOCATED_USER_PMDS 0 #define MAX_PREALLOCATED_USER_PMDS 0 #endif /* CONFIG_X86_PAE */ static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; struct ptdesc *ptdesc; for (i = 0; i < count; i++) if (pmds[i]) { ptdesc = virt_to_ptdesc(pmds[i]); pagetable_dtor(ptdesc); pagetable_free(ptdesc); mm_dec_nr_pmds(mm); } } static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; bool failed = false; gfp_t gfp = GFP_PGTABLE_USER; if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; gfp &= ~__GFP_HIGHMEM; for (i = 0; i < count; i++) { pmd_t *pmd = NULL; struct ptdesc *ptdesc = pagetable_alloc(gfp, 0); if (!ptdesc) failed = true; if (ptdesc && !pagetable_pmd_ctor(mm, ptdesc)) { pagetable_free(ptdesc); ptdesc = NULL; failed = true; } if (ptdesc) { mm_inc_nr_pmds(mm); pmd = ptdesc_address(ptdesc); } pmds[i] = pmd; } if (failed) { free_pmds(mm, pmds, count); return -ENOMEM; } return 0; } /* * Mop up any pmd pages which may still be attached to the pgd. * Normally they will be freed by munmap/exit_mmap, but any pmd we * preallocate which never got a corresponding vma will need to be * freed manually. 
*/ static void mop_up_one_pmd(struct mm_struct *mm, pgd_t *pgdp) { pgd_t pgd = *pgdp; if (pgd_val(pgd) != 0) { pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd); pgd_clear(pgdp); paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); mm_dec_nr_pmds(mm); } } static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) { int i; for (i = 0; i < PREALLOCATED_PMDS; i++) mop_up_one_pmd(mm, &pgdp[i]); #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION if (!boot_cpu_has(X86_FEATURE_PTI)) return; pgdp = kernel_to_user_pgdp(pgdp); for (i = 0; i < PREALLOCATED_USER_PMDS; i++) mop_up_one_pmd(mm, &pgdp[i + KERNEL_PGD_BOUNDARY]); #endif } static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { p4d_t *p4d; pud_t *pud; int i; p4d = p4d_offset(pgd, 0); pud = pud_offset(p4d, 0); for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; if (i >= KERNEL_PGD_BOUNDARY) memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]), sizeof(pmd_t) * PTRS_PER_PMD); pud_populate(mm, pud, pmd); } } #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION static void pgd_prepopulate_user_pmd(struct mm_struct *mm, pgd_t *k_pgd, pmd_t *pmds[]) { pgd_t *s_pgd = kernel_to_user_pgdp(swapper_pg_dir); pgd_t *u_pgd = kernel_to_user_pgdp(k_pgd); p4d_t *u_p4d; pud_t *u_pud; int i; u_p4d = p4d_offset(u_pgd, 0); u_pud = pud_offset(u_p4d, 0); s_pgd += KERNEL_PGD_BOUNDARY; u_pud += KERNEL_PGD_BOUNDARY; for (i = 0; i < PREALLOCATED_USER_PMDS; i++, u_pud++, s_pgd++) { pmd_t *pmd = pmds[i]; memcpy(pmd, (pmd_t *)pgd_page_vaddr(*s_pgd), sizeof(pmd_t) * PTRS_PER_PMD); pud_populate(mm, u_pud, pmd); } } #else static void pgd_prepopulate_user_pmd(struct mm_struct *mm, pgd_t *k_pgd, pmd_t *pmds[]) { } #endif static inline pgd_t *_pgd_alloc(struct mm_struct *mm) { /* * PTI and Xen need a whole page for the PAE PGD * even though the hardware only needs 32 bytes. * * For simplicity, allocate a page for all users. */ return __pgd_alloc(mm, pgd_allocation_order()); } static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) { __pgd_free(mm, pgd); } pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS]; pmd_t *pmds[PREALLOCATED_PMDS]; pgd = _pgd_alloc(mm); if (pgd == NULL) goto out; mm->pgd = pgd; if (sizeof(pmds) != 0 && preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0) goto out_free_pgd; if (sizeof(u_pmds) != 0 && preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0) goto out_free_pmds; if (paravirt_pgd_alloc(mm) != 0) goto out_free_user_pmds; /* * Make sure that pre-populating the pmds is atomic with * respect to anything walking the pgd_list, so that they * never see a partially populated pgd. */ spin_lock(&pgd_lock); pgd_ctor(mm, pgd); if (sizeof(pmds) != 0) pgd_prepopulate_pmd(mm, pgd, pmds); if (sizeof(u_pmds) != 0) pgd_prepopulate_user_pmd(mm, pgd, u_pmds); spin_unlock(&pgd_lock); return pgd; out_free_user_pmds: if (sizeof(u_pmds) != 0) free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS); out_free_pmds: if (sizeof(pmds) != 0) free_pmds(mm, pmds, PREALLOCATED_PMDS); out_free_pgd: _pgd_free(mm, pgd); out: return NULL; } void pgd_free(struct mm_struct *mm, pgd_t *pgd) { pgd_mop_up_pmds(mm, pgd); pgd_dtor(pgd); paravirt_pgd_free(mm, pgd); _pgd_free(mm, pgd); } /* * Used to set accessed or dirty bits in the page table entries * on other architectures. On x86, the accessed and dirty bits * are tracked by hardware. However, do_wp_page calls this function * to also make the pte writeable at the same time the dirty bit is * set. 
In that case we do actually need to write the PTE. */ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) { int changed = !pte_same(*ptep, entry); if (changed && dirty) set_pte(ptep, entry); return changed; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { int changed = !pmd_same(*pmdp, entry); VM_BUG_ON(address & ~HPAGE_PMD_MASK); if (changed && dirty) { set_pmd(pmdp, entry); /* * We had a write-protection fault here and changed the pmd * to to more permissive. No need to flush the TLB for that, * #PF is architecturally guaranteed to do that and in the * worst-case we'll generate a spurious fault. */ } return changed; } int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty) { int changed = !pud_same(*pudp, entry); VM_BUG_ON(address & ~HPAGE_PUD_MASK); if (changed && dirty) { set_pud(pudp, entry); /* * We had a write-protection fault here and changed the pud * to to more permissive. No need to flush the TLB for that, * #PF is architecturally guaranteed to do that and in the * worst-case we'll generate a spurious fault. */ } return changed; } #endif int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { int ret = 0; if (pte_young(*ptep)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *) &ptep->pte); return ret; } #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { int ret = 0; if (pmd_young(*pmdp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmdp); return ret; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pudp_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pud_t *pudp) { int ret = 0; if (pud_young(*pudp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pudp); return ret; } #endif int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { /* * On x86 CPUs, clearing the accessed bit without a TLB flush * doesn't cause data corruption. [ It could cause incorrect * page aging and the (mistaken) reclaim of hot pages, but the * chance of that should be relatively low. ] * * So as a performance optimization don't flush the TLB when * clearing the accessed bit, it will eventually be flushed by * a context switch or a VM operation anyway. [ In the rare * event of it not getting flushed for a long time the delay * shouldn't really matter because there's no real memory * pressure for swapout to react to. ] */ return ptep_test_and_clear_young(vma, address, ptep); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { int young; VM_BUG_ON(address & ~HPAGE_PMD_MASK); young = pmdp_test_and_clear_young(vma, address, pmdp); if (young) flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return young; } pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { VM_WARN_ON_ONCE(!pmd_present(*pmdp)); /* * No flush is necessary. Once an invalid PTE is established, the PTE's * access and dirty bits cannot be updated. 
*/ return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); } #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) pud_t pudp_invalidate(struct vm_area_struct *vma, unsigned long address, pud_t *pudp) { VM_WARN_ON_ONCE(!pud_present(*pudp)); pud_t old = pudp_establish(vma, address, pudp, pud_mkinvalid(*pudp)); flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE); return old; } #endif /** * reserve_top_address - Reserve a hole in the top of the kernel address space * @reserve: Size of hole to reserve * * Can be used to relocate the fixmap area and poke a hole in the top * of the kernel address space to make room for a hypervisor. */ void __init reserve_top_address(unsigned long reserve) { #ifdef CONFIG_X86_32 BUG_ON(fixmaps_set > 0); __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n", -reserve, __FIXADDR_TOP + PAGE_SIZE); #endif } int fixmaps_set; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte) { unsigned long address = __fix_to_virt(idx); #ifdef CONFIG_X86_64 /* * Ensure that the static initial page tables are covering the * fixmap completely. */ BUILD_BUG_ON(__end_of_permanent_fixed_addresses > (FIXMAP_PMD_NUM * PTRS_PER_PTE)); #endif if (idx >= __end_of_fixed_addresses) { BUG(); return; } set_pte_vaddr(address, pte); fixmaps_set++; } void native_set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags) { /* Sanitize 'prot' against any unsupported bits: */ pgprot_val(flags) &= __default_kernel_pte_mask; __native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags)); } #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP #if CONFIG_PGTABLE_LEVELS > 4 /** * p4d_set_huge - Set up kernel P4D mapping * @p4d: Pointer to the P4D entry * @addr: Virtual address associated with the P4D entry * @prot: Protection bits to use * * No 512GB pages yet -- always return 0 */ int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } /** * p4d_clear_huge - Clear kernel P4D mapping when it is set * @p4d: Pointer to the P4D entry to clear * * No 512GB pages yet -- do nothing */ void p4d_clear_huge(p4d_t *p4d) { } #endif /** * pud_set_huge - Set up kernel PUD mapping * @pud: Pointer to the PUD entry * @addr: Virtual address associated with the PUD entry * @prot: Protection bits to use * * MTRRs can override PAT memory types with 4KiB granularity. Therefore, this * function sets up a huge page only if the complete range has the same MTRR * caching mode. * * Callers should try to decrease page size (1GB -> 2MB -> 4K) if the bigger * page mapping attempt fails. * * Returns 1 on success and 0 on failure. */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { u8 uniform; mtrr_type_lookup(addr, addr + PUD_SIZE, &uniform); if (!uniform) return 0; /* Bail out if we are we on a populated non-leaf entry: */ if (pud_present(*pud) && !pud_leaf(*pud)) return 0; set_pte((pte_t *)pud, pfn_pte( (u64)addr >> PAGE_SHIFT, __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } /** * pmd_set_huge - Set up kernel PMD mapping * @pmd: Pointer to the PMD entry * @addr: Virtual address associated with the PMD entry * @prot: Protection bits to use * * See text over pud_set_huge() above. * * Returns 1 on success and 0 on failure. 
*/ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { u8 uniform; mtrr_type_lookup(addr, addr + PMD_SIZE, &uniform); if (!uniform) { pr_warn_once("%s: Cannot satisfy [mem %#010llx-%#010llx] with a huge-page mapping due to MTRR override.\n", __func__, addr, addr + PMD_SIZE); return 0; } /* Bail out if we are we on a populated non-leaf entry: */ if (pmd_present(*pmd) && !pmd_leaf(*pmd)) return 0; set_pte((pte_t *)pmd, pfn_pte( (u64)addr >> PAGE_SHIFT, __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } /** * pud_clear_huge - Clear kernel PUD mapping when it is set * @pud: Pointer to the PUD entry to clear. * * Returns 1 on success and 0 on failure (no PUD map is found). */ int pud_clear_huge(pud_t *pud) { if (pud_leaf(*pud)) { pud_clear(pud); return 1; } return 0; } /** * pmd_clear_huge - Clear kernel PMD mapping when it is set * @pmd: Pointer to the PMD entry to clear. * * Returns 1 on success and 0 on failure (no PMD map is found). */ int pmd_clear_huge(pmd_t *pmd) { if (pmd_leaf(*pmd)) { pmd_clear(pmd); return 1; } return 0; } #ifdef CONFIG_X86_64 /** * pud_free_pmd_page - Clear PUD entry and free PMD page * @pud: Pointer to a PUD * @addr: Virtual address associated with PUD * * Context: The PUD range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. * * NOTE: Callers must allow a single page allocation. */ int pud_free_pmd_page(pud_t *pud, unsigned long addr) { pmd_t *pmd, *pmd_sv; pte_t *pte; int i; pmd = pud_pgtable(*pud); pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL); if (!pmd_sv) return 0; for (i = 0; i < PTRS_PER_PMD; i++) { pmd_sv[i] = pmd[i]; if (!pmd_none(pmd[i])) pmd_clear(&pmd[i]); } pud_clear(pud); /* INVLPG to clear all paging-structure caches */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); for (i = 0; i < PTRS_PER_PMD; i++) { if (!pmd_none(pmd_sv[i])) { pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]); pte_free_kernel(&init_mm, pte); } } free_page((unsigned long)pmd_sv); pmd_free(&init_mm, pmd); return 1; } /** * pmd_free_pte_page - Clear PMD entry and free PTE page. * @pmd: Pointer to the PMD * @addr: Virtual address associated with PMD * * Context: The PMD range has been unmapped and TLB purged. * Return: 1 if clearing the entry succeeded. 0 otherwise. */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { pte_t *pte; pte = (pte_t *)pmd_page_vaddr(*pmd); pmd_clear(pmd); /* INVLPG to clear all paging-structure caches */ flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1); pte_free_kernel(&init_mm, pte); return 1; } #else /* !CONFIG_X86_64 */ /* * Disable free page handling on x86-PAE. This assures that ioremap() * does not update sync'd PMD entries. See vmalloc_sync_one(). */ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { return pmd_none(*pmd); } #endif /* CONFIG_X86_64 */ #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ pte_t pte_mkwrite(pte_t pte, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHADOW_STACK) return pte_mkwrite_shstk(pte); pte = pte_mkwrite_novma(pte); return pte_clear_saveddirty(pte); } pmd_t pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHADOW_STACK) return pmd_mkwrite_shstk(pmd); pmd = pmd_mkwrite_novma(pmd); return pmd_clear_saveddirty(pmd); } void arch_check_zapped_pte(struct vm_area_struct *vma, pte_t pte) { /* * Hardware before shadow stack can (rarely) set Dirty=1 * on a Write=0 PTE. So the below condition * only indicates a software bug when shadow stack is * supported by the HW. This checking is covered in * pte_shstk(). 
*/ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pte_shstk(pte)); } void arch_check_zapped_pmd(struct vm_area_struct *vma, pmd_t pmd) { /* See note in arch_check_zapped_pte() */ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pmd_shstk(pmd)); } void arch_check_zapped_pud(struct vm_area_struct *vma, pud_t pud) { /* See note in arch_check_zapped_pte() */ VM_WARN_ON_ONCE(!(vma->vm_flags & VM_SHADOW_STACK) && pud_shstk(pud)); }
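The accessed-bit helpers above (ptep_test_and_clear_young() and the deliberately flush-less ptep_clear_flush_young()) boil down to an atomic test-and-clear of one bit in the page-table entry: clear the Accessed bit and report whether it had been set. A minimal userspace sketch of that pattern follows; the bit position mirrors x86's _PAGE_BIT_ACCESSED and a GCC atomic builtin stands in for the kernel's test_and_clear_bit(), so this is an illustration, not kernel code.

/* Sketch of the test-and-clear pattern used for page aging above.
 * PAGE_BIT_ACCESSED mirrors x86's _PAGE_BIT_ACCESSED (bit 5); the
 * atomic builtin stands in for the kernel's test_and_clear_bit(). */
#include <stdint.h>
#include <stdio.h>

#define PAGE_BIT_ACCESSED 5

static int test_and_clear_bit64(unsigned int nr, uint64_t *word)
{
	uint64_t old = __atomic_fetch_and(word, ~(1ULL << nr), __ATOMIC_SEQ_CST);

	return (old >> nr) & 1;
}

int main(void)
{
	uint64_t pte = (1ULL << PAGE_BIT_ACCESSED) | 0x1;	/* Accessed + Present */

	printf("young before clear: %d\n", test_and_clear_bit64(PAGE_BIT_ACCESSED, &pte));
	printf("young after clear:  %d\n", test_and_clear_bit64(PAGE_BIT_ACCESSED, &pte));
	return 0;
}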
// SPDX-License-Identifier: GPL-2.0-or-later /* Filesystem access-by-fd. * * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <linux/security.h> #include <linux/anon_inodes.h> #include <linux/namei.h> #include <linux/file.h> #include <uapi/linux/mount.h> #include "internal.h" #include "mount.h" /* * Allow the user to read back any error, warning or informational messages.
*/ static ssize_t fscontext_read(struct file *file, char __user *_buf, size_t len, loff_t *pos) { struct fs_context *fc = file->private_data; struct fc_log *log = fc->log.log; unsigned int logsize = ARRAY_SIZE(log->buffer); ssize_t ret; char *p; bool need_free; int index, n; ret = mutex_lock_interruptible(&fc->uapi_mutex); if (ret < 0) return ret; if (log->head == log->tail) { mutex_unlock(&fc->uapi_mutex); return -ENODATA; } index = log->tail & (logsize - 1); p = log->buffer[index]; need_free = log->need_free & (1 << index); log->buffer[index] = NULL; log->need_free &= ~(1 << index); log->tail++; mutex_unlock(&fc->uapi_mutex); ret = -EMSGSIZE; n = strlen(p); if (n > len) goto err_free; ret = -EFAULT; if (copy_to_user(_buf, p, n) != 0) goto err_free; ret = n; err_free: if (need_free) kfree(p); return ret; } static int fscontext_release(struct inode *inode, struct file *file) { struct fs_context *fc = file->private_data; if (fc) { file->private_data = NULL; put_fs_context(fc); } return 0; } const struct file_operations fscontext_fops = { .read = fscontext_read, .release = fscontext_release, }; /* * Attach a filesystem context to a file and an fd. */ static int fscontext_create_fd(struct fs_context *fc, unsigned int o_flags) { int fd; fd = anon_inode_getfd("[fscontext]", &fscontext_fops, fc, O_RDWR | o_flags); if (fd < 0) put_fs_context(fc); return fd; } static int fscontext_alloc_log(struct fs_context *fc) { fc->log.log = kzalloc(sizeof(*fc->log.log), GFP_KERNEL); if (!fc->log.log) return -ENOMEM; refcount_set(&fc->log.log->usage, 1); fc->log.log->owner = fc->fs_type->owner; return 0; } /* * Open a filesystem by name so that it can be configured for mounting. * * We are allowed to specify a container in which the filesystem will be * opened, thereby indicating which namespaces will be used (notably, which * network namespace will be used for network filesystems). */ SYSCALL_DEFINE2(fsopen, const char __user *, _fs_name, unsigned int, flags) { struct file_system_type *fs_type; struct fs_context *fc; const char *fs_name; int ret; if (!may_mount()) return -EPERM; if (flags & ~FSOPEN_CLOEXEC) return -EINVAL; fs_name = strndup_user(_fs_name, PAGE_SIZE); if (IS_ERR(fs_name)) return PTR_ERR(fs_name); fs_type = get_fs_type(fs_name); kfree(fs_name); if (!fs_type) return -ENODEV; fc = fs_context_for_mount(fs_type, 0); put_filesystem(fs_type); if (IS_ERR(fc)) return PTR_ERR(fc); fc->phase = FS_CONTEXT_CREATE_PARAMS; ret = fscontext_alloc_log(fc); if (ret < 0) goto err_fc; return fscontext_create_fd(fc, flags & FSOPEN_CLOEXEC ? O_CLOEXEC : 0); err_fc: put_fs_context(fc); return ret; } /* * Pick a superblock into a context for reconfiguration. 
*/ SYSCALL_DEFINE3(fspick, int, dfd, const char __user *, path, unsigned int, flags) { struct fs_context *fc; struct path target; unsigned int lookup_flags; int ret; if (!may_mount()) return -EPERM; if ((flags & ~(FSPICK_CLOEXEC | FSPICK_SYMLINK_NOFOLLOW | FSPICK_NO_AUTOMOUNT | FSPICK_EMPTY_PATH)) != 0) return -EINVAL; lookup_flags = LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT; if (flags & FSPICK_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; if (flags & FSPICK_NO_AUTOMOUNT) lookup_flags &= ~LOOKUP_AUTOMOUNT; if (flags & FSPICK_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; ret = user_path_at(dfd, path, lookup_flags, &target); if (ret < 0) goto err; ret = -EINVAL; if (target.mnt->mnt_root != target.dentry) goto err_path; fc = fs_context_for_reconfigure(target.dentry, 0, 0); if (IS_ERR(fc)) { ret = PTR_ERR(fc); goto err_path; } fc->phase = FS_CONTEXT_RECONF_PARAMS; ret = fscontext_alloc_log(fc); if (ret < 0) goto err_fc; path_put(&target); return fscontext_create_fd(fc, flags & FSPICK_CLOEXEC ? O_CLOEXEC : 0); err_fc: put_fs_context(fc); err_path: path_put(&target); err: return ret; } static int vfs_cmd_create(struct fs_context *fc, bool exclusive) { struct super_block *sb; int ret; if (fc->phase != FS_CONTEXT_CREATE_PARAMS) return -EBUSY; if (!mount_capable(fc)) return -EPERM; fc->phase = FS_CONTEXT_CREATING; fc->exclusive = exclusive; ret = vfs_get_tree(fc); if (ret) { fc->phase = FS_CONTEXT_FAILED; return ret; } sb = fc->root->d_sb; ret = security_sb_kern_mount(sb); if (unlikely(ret)) { fc_drop_locked(fc); fc->phase = FS_CONTEXT_FAILED; return ret; } /* vfs_get_tree() callchains will have grabbed @s_umount */ up_write(&sb->s_umount); fc->phase = FS_CONTEXT_AWAITING_MOUNT; return 0; } static int vfs_cmd_reconfigure(struct fs_context *fc) { struct super_block *sb; int ret; if (fc->phase != FS_CONTEXT_RECONF_PARAMS) return -EBUSY; fc->phase = FS_CONTEXT_RECONFIGURING; sb = fc->root->d_sb; if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { fc->phase = FS_CONTEXT_FAILED; return -EPERM; } down_write(&sb->s_umount); ret = reconfigure_super(fc); up_write(&sb->s_umount); if (ret) { fc->phase = FS_CONTEXT_FAILED; return ret; } vfs_clean_context(fc); return 0; } /* * Check the state and apply the configuration. Note that this function is * allowed to 'steal' the value by setting param->xxx to NULL before returning. */ static int vfs_fsconfig_locked(struct fs_context *fc, int cmd, struct fs_parameter *param) { int ret; ret = finish_clean_context(fc); if (ret) return ret; switch (cmd) { case FSCONFIG_CMD_CREATE: return vfs_cmd_create(fc, false); case FSCONFIG_CMD_CREATE_EXCL: return vfs_cmd_create(fc, true); case FSCONFIG_CMD_RECONFIGURE: return vfs_cmd_reconfigure(fc); default: if (fc->phase != FS_CONTEXT_CREATE_PARAMS && fc->phase != FS_CONTEXT_RECONF_PARAMS) return -EBUSY; return vfs_parse_fs_param(fc, param); } } /** * sys_fsconfig - Set parameters and trigger actions on a context * @fd: The filesystem context to act upon * @cmd: The action to take * @_key: Where appropriate, the parameter key to set * @_value: Where appropriate, the parameter value to set * @aux: Additional information for the value * * This system call is used to set parameters on a context, including * superblock settings, data source and security labelling. * * Actions include triggering the creation of a superblock and the * reconfiguration of the superblock attached to the specified context. * * When setting a parameter, @cmd indicates the type of value being proposed * and @_key indicates the parameter to be altered. 
* * @_value and @aux are used to specify the value, should a value be required: * * (*) fsconfig_set_flag: No value is specified. The parameter must be boolean * in nature. The key may be prefixed with "no" to invert the * setting. @_value must be NULL and @aux must be 0. * * (*) fsconfig_set_string: A string value is specified. The parameter can be * expecting boolean, integer, string or take a path. A conversion to an * appropriate type will be attempted (which may include looking up as a * path). @_value points to a NUL-terminated string and @aux must be 0. * * (*) fsconfig_set_binary: A binary blob is specified. @_value points to the * blob and @aux indicates its size. The parameter must be expecting a * blob. * * (*) fsconfig_set_path: A non-empty path is specified. The parameter must be * expecting a path object. @_value points to a NUL-terminated string that * is the path and @aux is a file descriptor at which to start a relative * lookup or AT_FDCWD. * * (*) fsconfig_set_path_empty: As fsconfig_set_path, but with AT_EMPTY_PATH * implied. * * (*) fsconfig_set_fd: An open file descriptor is specified. @_value must be * NULL and @aux indicates the file descriptor. */ SYSCALL_DEFINE5(fsconfig, int, fd, unsigned int, cmd, const char __user *, _key, const void __user *, _value, int, aux) { struct fs_context *fc; int ret; int lookup_flags = 0; struct fs_parameter param = { .type = fs_value_is_undefined, }; if (fd < 0) return -EINVAL; switch (cmd) { case FSCONFIG_SET_FLAG: if (!_key || _value || aux) return -EINVAL; break; case FSCONFIG_SET_STRING: if (!_key || !_value || aux) return -EINVAL; break; case FSCONFIG_SET_BINARY: if (!_key || !_value || aux <= 0 || aux > 1024 * 1024) return -EINVAL; break; case FSCONFIG_SET_PATH: case FSCONFIG_SET_PATH_EMPTY: if (!_key || !_value || (aux != AT_FDCWD && aux < 0)) return -EINVAL; break; case FSCONFIG_SET_FD: if (!_key || _value || aux < 0) return -EINVAL; break; case FSCONFIG_CMD_CREATE: case FSCONFIG_CMD_CREATE_EXCL: case FSCONFIG_CMD_RECONFIGURE: if (_key || _value || aux) return -EINVAL; break; default: return -EOPNOTSUPP; } CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADF; if (fd_file(f)->f_op != &fscontext_fops) return -EINVAL; fc = fd_file(f)->private_data; if (fc->ops == &legacy_fs_context_ops) { switch (cmd) { case FSCONFIG_SET_BINARY: case FSCONFIG_SET_PATH: case FSCONFIG_SET_PATH_EMPTY: case FSCONFIG_SET_FD: case FSCONFIG_CMD_CREATE_EXCL: return -EOPNOTSUPP; } } if (_key) { param.key = strndup_user(_key, 256); if (IS_ERR(param.key)) return PTR_ERR(param.key); } switch (cmd) { case FSCONFIG_SET_FLAG: param.type = fs_value_is_flag; break; case FSCONFIG_SET_STRING: param.type = fs_value_is_string; param.string = strndup_user(_value, 256); if (IS_ERR(param.string)) { ret = PTR_ERR(param.string); goto out_key; } param.size = strlen(param.string); break; case FSCONFIG_SET_BINARY: param.type = fs_value_is_blob; param.size = aux; param.blob = memdup_user_nul(_value, aux); if (IS_ERR(param.blob)) { ret = PTR_ERR(param.blob); goto out_key; } break; case FSCONFIG_SET_PATH_EMPTY: lookup_flags = LOOKUP_EMPTY; fallthrough; case FSCONFIG_SET_PATH: param.type = fs_value_is_filename; param.name = getname_flags(_value, lookup_flags); if (IS_ERR(param.name)) { ret = PTR_ERR(param.name); goto out_key; } param.dirfd = aux; param.size = strlen(param.name->name); break; case FSCONFIG_SET_FD: param.type = fs_value_is_file; ret = -EBADF; param.file = fget_raw(aux); if (!param.file) goto out_key; param.dirfd = aux; break; default: break; } ret = 
mutex_lock_interruptible(&fc->uapi_mutex); if (ret == 0) { ret = vfs_fsconfig_locked(fc, cmd, &param); mutex_unlock(&fc->uapi_mutex); } /* Clean up the our record of any value that we obtained from * userspace. Note that the value may have been stolen by the LSM or * filesystem, in which case the value pointer will have been cleared. */ switch (cmd) { case FSCONFIG_SET_STRING: case FSCONFIG_SET_BINARY: kfree(param.string); break; case FSCONFIG_SET_PATH: case FSCONFIG_SET_PATH_EMPTY: if (param.name) putname(param.name); break; case FSCONFIG_SET_FD: if (param.file) fput(param.file); break; default: break; } out_key: kfree(param.key); return ret; }
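The fsopen()/fsconfig() implementation above is driven from userspace roughly as follows. This is a hedged sketch of the create-and-mount flow: it assumes kernel headers and a libc recent enough to define SYS_fsopen, SYS_fsconfig, SYS_fsmount and SYS_move_mount plus the FSOPEN_*/FSCONFIG_* constants in <linux/mount.h>; the tmpfs parameters and the /mnt target are purely illustrative. Reading the context fd on failure exercises fscontext_read() above to fetch any logged messages.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>

static void drain_log(int fsfd)
{
	/* fscontext_read() hands back one logged message per read() */
	char buf[512];
	ssize_t n;

	while ((n = read(fsfd, buf, sizeof(buf) - 1)) > 0) {
		buf[n] = '\0';
		fprintf(stderr, "fscontext: %s\n", buf);
	}
}

int main(void)
{
	int fsfd, mntfd;

	fsfd = syscall(SYS_fsopen, "tmpfs", FSOPEN_CLOEXEC);
	if (fsfd < 0)
		return 1;

	/* FSCONFIG_SET_STRING: key plus NUL-terminated value, aux must be 0 */
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "size", "16M", 0);

	/* FSCONFIG_CMD_CREATE: no key/value/aux, triggers vfs_cmd_create() */
	if (syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
		drain_log(fsfd);
		return 1;
	}

	mntfd = syscall(SYS_fsmount, fsfd, FSMOUNT_CLOEXEC, 0);
	if (mntfd >= 0)
		syscall(SYS_move_mount, mntfd, "", AT_FDCWD, "/mnt",
			MOVE_MOUNT_F_EMPTY_PATH);

	close(fsfd);
	return 0;
}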
// SPDX-License-Identifier: GPL-2.0-only /* * File: pn_dev.c * * Phonet network device * * Copyright (C) 2008 Nokia Corporation. * * Authors: Sakari Ailus <sakari.ailus@nokia.com> * Rémi Denis-Courmont */ #include <linux/kernel.h> #include <linux/net.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/phonet.h> #include <linux/proc_fs.h> #include <linux/if_arp.h> #include <net/sock.h> #include <net/netns/generic.h> #include <net/phonet/pn_dev.h> struct phonet_routes { spinlock_t lock; struct net_device __rcu *table[64]; }; struct phonet_net { struct phonet_device_list pndevs; struct phonet_routes routes; }; static unsigned int phonet_net_id __read_mostly; static struct phonet_net *phonet_pernet(struct net *net) { return net_generic(net, phonet_net_id); } struct phonet_device_list *phonet_device_list(struct net *net) { struct phonet_net *pnn = phonet_pernet(net); return &pnn->pndevs; } /* Allocate new Phonet device.
*/ static struct phonet_device *__phonet_device_alloc(struct net_device *dev) { struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); struct phonet_device *pnd = kmalloc(sizeof(*pnd), GFP_ATOMIC); if (pnd == NULL) return NULL; pnd->netdev = dev; bitmap_zero(pnd->addrs, 64); lockdep_assert_held(&pndevs->lock); list_add_rcu(&pnd->list, &pndevs->list); return pnd; } static struct phonet_device *__phonet_get(struct net_device *dev) { struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); struct phonet_device *pnd; lockdep_assert_held(&pndevs->lock); list_for_each_entry(pnd, &pndevs->list, list) { if (pnd->netdev == dev) return pnd; } return NULL; } static struct phonet_device *__phonet_get_rcu(struct net_device *dev) { struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); struct phonet_device *pnd; list_for_each_entry_rcu(pnd, &pndevs->list, list) { if (pnd->netdev == dev) return pnd; } return NULL; } static void phonet_device_destroy(struct net_device *dev) { struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); struct phonet_device *pnd; ASSERT_RTNL(); spin_lock(&pndevs->lock); pnd = __phonet_get(dev); if (pnd) list_del_rcu(&pnd->list); spin_unlock(&pndevs->lock); if (pnd) { struct net *net = dev_net(dev); u32 ifindex = dev->ifindex; u8 addr; for_each_set_bit(addr, pnd->addrs, 64) phonet_address_notify(net, RTM_DELADDR, ifindex, addr); kfree(pnd); } } struct net_device *phonet_device_get(struct net *net) { struct phonet_device_list *pndevs = phonet_device_list(net); struct phonet_device *pnd; struct net_device *dev = NULL; rcu_read_lock(); list_for_each_entry_rcu(pnd, &pndevs->list, list) { dev = pnd->netdev; BUG_ON(!dev); if ((dev->reg_state == NETREG_REGISTERED) && ((pnd->netdev->flags & IFF_UP)) == IFF_UP) break; dev = NULL; } dev_hold(dev); rcu_read_unlock(); return dev; } int phonet_address_add(struct net_device *dev, u8 addr) { struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); struct phonet_device *pnd; int err = 0; spin_lock(&pndevs->lock); /* Find or create Phonet-specific device data */ pnd = __phonet_get(dev); if (pnd == NULL) pnd = __phonet_device_alloc(dev); if (unlikely(pnd == NULL)) err = -ENOMEM; else if (test_and_set_bit(addr >> 2, pnd->addrs)) err = -EEXIST; spin_unlock(&pndevs->lock); return err; } int phonet_address_del(struct net_device *dev, u8 addr) { struct phonet_device_list *pndevs = phonet_device_list(dev_net(dev)); struct phonet_device *pnd; int err = 0; spin_lock(&pndevs->lock); pnd = __phonet_get(dev); if (!pnd || !test_and_clear_bit(addr >> 2, pnd->addrs)) { err = -EADDRNOTAVAIL; pnd = NULL; } else if (bitmap_empty(pnd->addrs, 64)) list_del_rcu(&pnd->list); else pnd = NULL; spin_unlock(&pndevs->lock); if (pnd) kfree_rcu(pnd, rcu); return err; } /* Gets a source address toward a destination, through a interface. 
*/ u8 phonet_address_get(struct net_device *dev, u8 daddr) { struct phonet_device *pnd; u8 saddr; rcu_read_lock(); pnd = __phonet_get_rcu(dev); if (pnd) { BUG_ON(bitmap_empty(pnd->addrs, 64)); /* Use same source address as destination, if possible */ if (test_bit(daddr >> 2, pnd->addrs)) saddr = daddr; else saddr = find_first_bit(pnd->addrs, 64) << 2; } else saddr = PN_NO_ADDR; rcu_read_unlock(); if (saddr == PN_NO_ADDR) { /* Fallback to another device */ struct net_device *def_dev; def_dev = phonet_device_get(dev_net(dev)); if (def_dev) { if (def_dev != dev) saddr = phonet_address_get(def_dev, daddr); dev_put(def_dev); } } return saddr; } int phonet_address_lookup(struct net *net, u8 addr) { struct phonet_device_list *pndevs = phonet_device_list(net); struct phonet_device *pnd; int err = -EADDRNOTAVAIL; rcu_read_lock(); list_for_each_entry_rcu(pnd, &pndevs->list, list) { /* Don't allow unregistering devices! */ if ((pnd->netdev->reg_state != NETREG_REGISTERED) || ((pnd->netdev->flags & IFF_UP)) != IFF_UP) continue; if (test_bit(addr >> 2, pnd->addrs)) { err = 0; goto found; } } found: rcu_read_unlock(); return err; } /* automatically configure a Phonet device, if supported */ static int phonet_device_autoconf(struct net_device *dev) { struct if_phonet_req req; int ret; if (!dev->netdev_ops->ndo_siocdevprivate) return -EOPNOTSUPP; ret = dev->netdev_ops->ndo_siocdevprivate(dev, (struct ifreq *)&req, NULL, SIOCPNGAUTOCONF); if (ret < 0) return ret; ASSERT_RTNL(); ret = phonet_address_add(dev, req.ifr_phonet_autoconf.device); if (ret) return ret; phonet_address_notify(dev_net(dev), RTM_NEWADDR, dev->ifindex, req.ifr_phonet_autoconf.device); return 0; } static void phonet_route_autodel(struct net_device *dev) { struct net *net = dev_net(dev); DECLARE_BITMAP(deleted, 64); u32 ifindex = dev->ifindex; struct phonet_net *pnn; unsigned int i; pnn = phonet_pernet(net); /* Remove left-over Phonet routes */ bitmap_zero(deleted, 64); spin_lock(&pnn->routes.lock); for (i = 0; i < 64; i++) { if (rcu_access_pointer(pnn->routes.table[i]) == dev) { RCU_INIT_POINTER(pnn->routes.table[i], NULL); set_bit(i, deleted); } } spin_unlock(&pnn->routes.lock); if (bitmap_empty(deleted, 64)) return; /* short-circuit RCU */ synchronize_rcu(); for_each_set_bit(i, deleted, 64) { rtm_phonet_notify(net, RTM_DELROUTE, ifindex, i); dev_put(dev); } } /* notify Phonet of device events */ static int phonet_device_notify(struct notifier_block *me, unsigned long what, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (what) { case NETDEV_REGISTER: if (dev->type == ARPHRD_PHONET) phonet_device_autoconf(dev); break; case NETDEV_UNREGISTER: phonet_device_destroy(dev); phonet_route_autodel(dev); break; } return 0; } static struct notifier_block phonet_device_notifier = { .notifier_call = phonet_device_notify, .priority = 0, }; /* Per-namespace Phonet devices handling */ static int __net_init phonet_init_net(struct net *net) { struct phonet_net *pnn = phonet_pernet(net); if (!proc_create_net("phonet", 0, net->proc_net, &pn_sock_seq_ops, sizeof(struct seq_net_private))) return -ENOMEM; INIT_LIST_HEAD(&pnn->pndevs.list); spin_lock_init(&pnn->pndevs.lock); spin_lock_init(&pnn->routes.lock); return 0; } static void __net_exit phonet_exit_net(struct net *net) { struct phonet_net *pnn = phonet_pernet(net); remove_proc_entry("phonet", net->proc_net); WARN_ON_ONCE(!list_empty(&pnn->pndevs.list)); } static struct pernet_operations phonet_net_ops = { .init = phonet_init_net, .exit = phonet_exit_net, .id = 
&phonet_net_id, .size = sizeof(struct phonet_net), }; /* Initialize Phonet devices list */ int __init phonet_device_init(void) { int err = register_pernet_subsys(&phonet_net_ops); if (err) return err; proc_create_net("pnresource", 0, init_net.proc_net, &pn_res_seq_ops, sizeof(struct seq_net_private)); register_netdevice_notifier(&phonet_device_notifier); err = phonet_netlink_register(); if (err) phonet_device_exit(); return err; } void phonet_device_exit(void) { rtnl_unregister_all(PF_PHONET); unregister_netdevice_notifier(&phonet_device_notifier); unregister_pernet_subsys(&phonet_net_ops); remove_proc_entry("pnresource", init_net.proc_net); } int phonet_route_add(struct net_device *dev, u8 daddr) { struct phonet_net *pnn = phonet_pernet(dev_net(dev)); struct phonet_routes *routes = &pnn->routes; int err = -EEXIST; daddr = daddr >> 2; spin_lock(&routes->lock); if (routes->table[daddr] == NULL) { rcu_assign_pointer(routes->table[daddr], dev); dev_hold(dev); err = 0; } spin_unlock(&routes->lock); return err; } int phonet_route_del(struct net_device *dev, u8 daddr) { struct phonet_net *pnn = phonet_pernet(dev_net(dev)); struct phonet_routes *routes = &pnn->routes; daddr = daddr >> 2; spin_lock(&routes->lock); if (rcu_access_pointer(routes->table[daddr]) == dev) RCU_INIT_POINTER(routes->table[daddr], NULL); else dev = NULL; spin_unlock(&routes->lock); if (!dev) return -ENOENT; /* Note : our caller must call synchronize_rcu() and dev_put(dev) */ return 0; } struct net_device *phonet_route_get_rcu(struct net *net, u8 daddr) { struct phonet_net *pnn = phonet_pernet(net); struct phonet_routes *routes = &pnn->routes; struct net_device *dev; daddr >>= 2; dev = rcu_dereference(routes->table[daddr]); return dev; } struct net_device *phonet_route_output(struct net *net, u8 daddr) { struct phonet_net *pnn = phonet_pernet(net); struct phonet_routes *routes = &pnn->routes; struct net_device *dev; daddr >>= 2; rcu_read_lock(); dev = rcu_dereference(routes->table[daddr]); dev_hold(dev); rcu_read_unlock(); if (!dev) dev = phonet_device_get(net); /* Default route */ return dev; }
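A structural note on the code above: phonet_address_add(), phonet_route_add() and friends index their 64-entry bitmaps and routing table with addr >> 2, so only the upper six bits of the 8-bit Phonet address select a slot, and phonet_address_get() turns a bit index back into an address with << 2. A standalone sketch of that packing (illustrative, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t addrs = 0;		/* stands in for the 64-bit address bitmap */
	uint8_t addr = 0x60;		/* example 8-bit Phonet address */
	unsigned int idx = addr >> 2;	/* slot index, as in phonet_address_add() */

	addrs |= 1ULL << idx;		/* test_and_set_bit(addr >> 2, ...) */

	/* phonet_address_get() style: first set bit, shifted back into an address */
	for (unsigned int i = 0; i < 64; i++) {
		if (addrs & (1ULL << i)) {
			printf("slot %u -> address %#x\n", i, i << 2);
			break;
		}
	}
	return 0;
}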
// SPDX-License-Identifier: GPL-2.0-or-later /* * RAW sockets for IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Adapted from linux/net/ipv4/raw.c * * Fixes: * Hideaki YOSHIFUJI : sin6_scope_id support * YOSHIFUJI,H.@USAGI : raw checksum (RFC2292(bis) compliance) * Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/slab.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <linux/skbuff.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/ip6_route.h> #include <net/ip6_checksum.h> #include
<net/addrconf.h> #include <net/transp_v6.h> #include <net/udp.h> #include <net/inet_common.h> #include <net/tcp_states.h> #if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/mip6.h> #endif #include <linux/mroute6.h> #include <net/raw.h> #include <net/rawv6.h> #include <net/xfrm.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/export.h> #define ICMPV6_HDRLEN 4 /* ICMPv6 header, RFC 4443 Section 2.1 */ struct raw_hashinfo raw_v6_hashinfo; EXPORT_SYMBOL_GPL(raw_v6_hashinfo); bool raw_v6_match(struct net *net, const struct sock *sk, unsigned short num, const struct in6_addr *loc_addr, const struct in6_addr *rmt_addr, int dif, int sdif) { if (inet_sk(sk)->inet_num != num || !net_eq(sock_net(sk), net) || (!ipv6_addr_any(&sk->sk_v6_daddr) && !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) || !raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return false; if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) || ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr) || (ipv6_addr_is_multicast(loc_addr) && inet6_mc_check(sk, loc_addr, rmt_addr))) return true; return false; } EXPORT_SYMBOL_GPL(raw_v6_match); /* * 0 - deliver * 1 - block */ static int icmpv6_filter(const struct sock *sk, const struct sk_buff *skb) { struct icmp6hdr _hdr; const struct icmp6hdr *hdr; /* We require only the four bytes of the ICMPv6 header, not any * additional bytes of message body in "struct icmp6hdr". */ hdr = skb_header_pointer(skb, skb_transport_offset(skb), ICMPV6_HDRLEN, &_hdr); if (hdr) { const __u32 *data = &raw6_sk(sk)->filter.data[0]; unsigned int type = hdr->icmp6_type; return (data[type >> 5] & (1U << (type & 31))) != 0; } return 1; } #if IS_ENABLED(CONFIG_IPV6_MIP6) typedef int mh_filter_t(struct sock *sock, struct sk_buff *skb); static mh_filter_t __rcu *mh_filter __read_mostly; int rawv6_mh_filter_register(mh_filter_t filter) { rcu_assign_pointer(mh_filter, filter); return 0; } EXPORT_SYMBOL(rawv6_mh_filter_register); int rawv6_mh_filter_unregister(mh_filter_t filter) { RCU_INIT_POINTER(mh_filter, NULL); synchronize_rcu(); return 0; } EXPORT_SYMBOL(rawv6_mh_filter_unregister); #endif /* * demultiplex raw sockets. * (should consider queueing the skb in the sock receive_queue * without calling rawv6.c) * * Caller owns SKB so we must make clones. */ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) { struct net *net = dev_net(skb->dev); const struct in6_addr *saddr; const struct in6_addr *daddr; struct hlist_head *hlist; struct sock *sk; bool delivered = false; __u8 hash; saddr = &ipv6_hdr(skb)->saddr; daddr = saddr + 1; hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); sk_for_each_rcu(sk, hlist) { int filtered; if (!raw_v6_match(net, sk, nexthdr, daddr, saddr, inet6_iif(skb), inet6_sdif(skb))) continue; if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { atomic_inc(&sk->sk_drops); continue; } delivered = true; switch (nexthdr) { case IPPROTO_ICMPV6: filtered = icmpv6_filter(sk, skb); break; #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPPROTO_MH: { /* XXX: To validate MH only once for each packet, * this is placed here. It should be after checking * xfrm policy, however it doesn't. The checking xfrm * policy is placed in rawv6_rcv() because it is * required for each socket. */ mh_filter_t *filter; filter = rcu_dereference(mh_filter); filtered = filter ? 
(*filter)(sk, skb) : 0; break; } #endif default: filtered = 0; break; } if (filtered < 0) break; if (filtered == 0) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! */ if (clone) rawv6_rcv(sk, clone); } } rcu_read_unlock(); return delivered; } bool raw6_local_deliver(struct sk_buff *skb, int nexthdr) { return ipv6_raw_deliver(skb, nexthdr); } /* This cleans up af_inet6 a bit. -DaveM */ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr; __be32 v4addr = 0; int addr_type; int err; if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (addr->sin6_family != AF_INET6) return -EINVAL; addr_type = ipv6_addr_type(&addr->sin6_addr); /* Raw sockets are IPv6 only */ if (addr_type == IPV6_ADDR_MAPPED) return -EADDRNOTAVAIL; lock_sock(sk); err = -EINVAL; if (sk->sk_state != TCP_CLOSE) goto out; rcu_read_lock(); /* Check if the address belongs to the host. */ if (addr_type != IPV6_ADDR_ANY) { struct net_device *dev = NULL; if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && addr->sin6_scope_id) { /* Override any existing binding, if another * one is supplied by user. */ sk->sk_bound_dev_if = addr->sin6_scope_id; } /* Binding to link-local address requires an interface */ if (!sk->sk_bound_dev_if) goto out_unlock; } if (sk->sk_bound_dev_if) { err = -ENODEV; dev = dev_get_by_index_rcu(sock_net(sk), sk->sk_bound_dev_if); if (!dev) goto out_unlock; } /* ipv4 addr of the socket is invalid. Only the * unspecified and mapped address have a v4 equivalent. */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST) && !ipv6_can_nonlocal_bind(sock_net(sk), inet)) { err = -EADDRNOTAVAIL; if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr, dev, 0)) { goto out_unlock; } } } inet->inet_rcv_saddr = inet->inet_saddr = v4addr; sk->sk_v6_rcv_saddr = addr->sin6_addr; if (!(addr_type & IPV6_ADDR_MULTICAST)) np->saddr = addr->sin6_addr; err = 0; out_unlock: rcu_read_unlock(); out: release_sock(sk); return err; } static void rawv6_err(struct sock *sk, struct sk_buff *skb, u8 type, u8 code, int offset, __be32 info) { bool recverr = inet6_test_bit(RECVERR6, sk); struct ipv6_pinfo *np = inet6_sk(sk); int err; int harderr; /* Report error on raw socket, if: 1. User requested recverr. 2. Socket is connected (otherwise the error indication is useless without recverr and error is hard. 
*/ if (!recverr && sk->sk_state != TCP_ESTABLISHED) return; harderr = icmpv6_err_convert(type, code, &err); if (type == ICMPV6_PKT_TOOBIG) { ip6_sk_update_pmtu(skb, sk, info); harderr = (READ_ONCE(np->pmtudisc) == IPV6_PMTUDISC_DO); } if (type == NDISC_REDIRECT) { ip6_sk_redirect(skb, sk); return; } if (recverr) { u8 *payload = skb->data; if (!inet_test_bit(HDRINCL, sk)) payload += offset; ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload); } if (recverr || harderr) { sk->sk_err = err; sk_error_report(sk); } } void raw6_icmp_error(struct sk_buff *skb, int nexthdr, u8 type, u8 code, int inner_offset, __be32 info) { struct net *net = dev_net(skb->dev); struct hlist_head *hlist; struct sock *sk; int hash; hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); sk_for_each_rcu(sk, hlist) { /* Note: ipv6_hdr(skb) != skb->data */ const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data; if (!raw_v6_match(net, sk, nexthdr, &ip6h->saddr, &ip6h->daddr, inet6_iif(skb), inet6_iif(skb))) continue; rawv6_err(sk, skb, type, code, inner_offset, info); } rcu_read_unlock(); } static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason; if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) && skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } /* Charge it to the socket. */ skb_dst_drop(skb); if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { sk_skb_reason_drop(sk, skb, reason); return NET_RX_DROP; } return 0; } /* * This is next to useless... * if we demultiplex in network layer we don't need the extra call * just to queue the skb... * maybe we could have the network decide upon a hint if it * should call raw_rcv for demultiplexing */ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) { struct inet_sock *inet = inet_sk(sk); struct raw6_sock *rp = raw6_sk(sk); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { atomic_inc(&sk->sk_drops); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } nf_reset_ct(skb); if (!rp->checksum) skb->ip_summed = CHECKSUM_UNNECESSARY; if (skb->ip_summed == CHECKSUM_COMPLETE) { skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); if (!csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len, inet->inet_num, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; } if (!skb_csum_unnecessary(skb)) skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len, inet->inet_num, 0)); if (inet_test_bit(HDRINCL, sk)) { if (skb_checksum_complete(skb)) { atomic_inc(&sk->sk_drops); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } } rawv6_rcv_skb(sk, skb); return 0; } /* * This should be easy, if there is something there * we return it, otherwise we block. 
*/ static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct sk_buff *skb; size_t copied; int err; if (flags & MSG_OOB) return -EOPNOTSUPP; if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len, addr_len); if (np->rxpmtu && np->rxopt.bits.rxpmtu) return ipv6_recv_rxpmtu(sk, msg, len, addr_len); skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (copied > len) { copied = len; msg->msg_flags |= MSG_TRUNC; } if (skb_csum_unnecessary(skb)) { err = skb_copy_datagram_msg(skb, 0, msg, copied); } else if (msg->msg_flags&MSG_TRUNC) { if (__skb_checksum_complete(skb)) goto csum_copy_err; err = skb_copy_datagram_msg(skb, 0, msg, copied); } else { err = skb_copy_and_csum_datagram_msg(skb, 0, msg); if (err == -EINVAL) goto csum_copy_err; } if (err) goto out_free; /* Copy the address. */ if (sin6) { sin6->sin6_family = AF_INET6; sin6->sin6_port = 0; sin6->sin6_addr = ipv6_hdr(skb)->saddr; sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, inet6_iif(skb)); *addr_len = sizeof(*sin6); } sock_recv_cmsgs(msg, sk, skb); if (np->rxopt.all) ip6_datagram_recv_ctl(sk, msg, skb); err = copied; if (flags & MSG_TRUNC) err = skb->len; out_free: skb_free_datagram(sk, skb); out: return err; csum_copy_err: skb_kill_datagram(sk, skb, flags); /* Error for blocking case is chosen to masquerade as some normal condition. */ err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; goto out; } static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct raw6_sock *rp) { struct ipv6_txoptions *opt; struct sk_buff *skb; int err = 0; int offset; int len; int total_len; __wsum tmp_csum; __sum16 csum; if (!rp->checksum) goto send; skb = skb_peek(&sk->sk_write_queue); if (!skb) goto out; offset = rp->offset; total_len = inet_sk(sk)->cork.base.length; opt = inet6_sk(sk)->cork.opt; total_len -= opt ? opt->opt_flen : 0; if (offset >= total_len - 1) { err = -EINVAL; ip6_flush_pending_frames(sk); goto out; } /* should be check HW csum miyazawa */ if (skb_queue_len(&sk->sk_write_queue) == 1) { /* * Only one fragment on the socket. 
*/ tmp_csum = skb->csum; } else { struct sk_buff *csum_skb = NULL; tmp_csum = 0; skb_queue_walk(&sk->sk_write_queue, skb) { tmp_csum = csum_add(tmp_csum, skb->csum); if (csum_skb) continue; len = skb->len - skb_transport_offset(skb); if (offset >= len) { offset -= len; continue; } csum_skb = skb; } skb = csum_skb; } offset += skb_transport_offset(skb); err = skb_copy_bits(skb, offset, &csum, 2); if (err < 0) { ip6_flush_pending_frames(sk); goto out; } /* in case cksum was not initialized */ if (unlikely(csum)) tmp_csum = csum_sub(tmp_csum, csum_unfold(csum)); csum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, total_len, fl6->flowi6_proto, tmp_csum); if (csum == 0 && fl6->flowi6_proto == IPPROTO_UDP) csum = CSUM_MANGLED_0; BUG_ON(skb_store_bits(skb, offset, &csum, 2)); send: err = ip6_push_pending_frames(sk); out: return err; } static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, struct flowi6 *fl6, struct dst_entry **dstp, unsigned int flags, const struct sockcm_cookie *sockc) { struct net *net = sock_net(sk); struct ipv6hdr *iph; struct sk_buff *skb; int err; struct rt6_info *rt = dst_rt6_info(*dstp); int hlen = LL_RESERVED_SPACE(rt->dst.dev); int tlen = rt->dst.dev->needed_tailroom; if (length > rt->dst.dev->mtu) { ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu); return -EMSGSIZE; } if (length < sizeof(struct ipv6hdr)) return -EINVAL; if (flags&MSG_PROBE) goto out; skb = sock_alloc_send_skb(sk, length + hlen + tlen + 15, flags & MSG_DONTWAIT, &err); if (!skb) goto error; skb_reserve(skb, hlen); skb->protocol = htons(ETH_P_IPV6); skb->priority = sockc->priority; skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); skb_put(skb, length); skb_reset_network_header(skb); iph = ipv6_hdr(skb); skb->ip_summed = CHECKSUM_NONE; skb_setup_tx_timestamp(skb, sockc); if (flags & MSG_CONFIRM) skb_set_dst_pending_confirm(skb, 1); skb->transport_header = skb->network_header; err = memcpy_from_msg(iph, msg, length); if (err) { err = -EFAULT; kfree_skb(skb); goto error; } skb_dst_set(skb, &rt->dst); *dstp = NULL; /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip6_out(sk, skb); if (unlikely(!skb)) return 0; /* Acquire rcu_read_lock() in case we need to use rt->rt6i_idev * in the error path. Since skb has been freed, the dst could * have been queued for deletion. 
*/ rcu_read_lock(); IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); if (err) { IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); goto error_check; } rcu_read_unlock(); out: return 0; error: IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); error_check: if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk)) err = 0; return err; } struct raw6_frag_vec { struct msghdr *msg; int hlen; char c[4]; }; static int rawv6_probe_proto_opt(struct raw6_frag_vec *rfv, struct flowi6 *fl6) { int err = 0; switch (fl6->flowi6_proto) { case IPPROTO_ICMPV6: rfv->hlen = 2; err = memcpy_from_msg(rfv->c, rfv->msg, rfv->hlen); if (!err) { fl6->fl6_icmp_type = rfv->c[0]; fl6->fl6_icmp_code = rfv->c[1]; } break; case IPPROTO_MH: rfv->hlen = 4; err = memcpy_from_msg(rfv->c, rfv->msg, rfv->hlen); if (!err) fl6->fl6_mh_type = rfv->c[2]; } return err; } static int raw6_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct raw6_frag_vec *rfv = from; if (offset < rfv->hlen) { int copy = min(rfv->hlen - offset, len); if (skb->ip_summed == CHECKSUM_PARTIAL) memcpy(to, rfv->c + offset, copy); else skb->csum = csum_block_add( skb->csum, csum_partial_copy_nocheck(rfv->c + offset, to, copy), odd); odd = 0; offset += copy; to += copy; len -= copy; if (!len) return 0; } offset -= rfv->hlen; return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb); } static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions opt_space; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct raw6_sock *rp = raw6_sk(sk); struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; struct raw6_frag_vec rfv; struct flowi6 fl6; struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; int hdrincl; u16 proto; int err; /* Rough check on arithmetic overflow, better check is made in ip6_append_data(). */ if (len > INT_MAX) return -EMSGSIZE; /* Mirror BSD error message compatibility */ if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; hdrincl = inet_test_bit(HDRINCL, sk); ipcm6_init_sk(&ipc6, sk); /* * Get and verify the address. */ memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = ipc6.sockc.mark; fl6.flowi6_uid = sk_uid(sk); if (sin6) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (sin6->sin6_family && sin6->sin6_family != AF_INET6) return -EAFNOSUPPORT; /* port is the proto value [0..255] carried in nexthdr */ proto = ntohs(sin6->sin6_port); if (!proto) proto = inet->inet_num; else if (proto != inet->inet_num && inet->inet_num != IPPROTO_RAW) return -EINVAL; if (proto > 255) return -EINVAL; daddr = &sin6->sin6_addr; if (inet6_test_bit(SNDFLOW, sk)) { fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } } /* * Otherwise it will be difficult to maintain * sk->sk_dst_cache. 
*/ if (sk->sk_state == TCP_ESTABLISHED && ipv6_addr_equal(daddr, &sk->sk_v6_daddr)) daddr = &sk->sk_v6_daddr; if (addr_len >= sizeof(struct sockaddr_in6) && sin6->sin6_scope_id && __ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr))) fl6.flowi6_oif = sin6->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; proto = inet->inet_num; daddr = &sk->sk_v6_daddr; fl6.flowlabel = np->flow_label; } if (fl6.flowi6_oif == 0) fl6.flowi6_oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); ipc6.opt = opt; err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); if (err < 0) { fl6_sock_release(flowlabel); return err; } if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen|opt->opt_flen)) opt = NULL; } if (!opt) { opt = txopt_get(np); opt_to_free = opt; } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); fl6.flowi6_proto = proto; fl6.flowi6_mark = ipc6.sockc.mark; if (!hdrincl) { rfv.msg = msg; rfv.hlen = 0; err = rawv6_probe_proto_opt(&rfv, &fl6); if (err) goto out; } if (!ipv6_addr_any(daddr)) fl6.daddr = *daddr; else fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) fl6.saddr = np->saddr; final_p = fl6_update_dst(&fl6, opt, &final); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) fl6.flowi6_oif = READ_ONCE(np->ucast_oif); security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); if (hdrincl) fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; } if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; back_from_confirm: if (hdrincl) err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, msg->msg_flags, &ipc6.sockc); else { ipc6.opt = opt; lock_sock(sk); err = ip6_append_data(sk, raw6_getfrag, &rfv, len, 0, &ipc6, &fl6, dst_rt6_info(dst), msg->msg_flags); if (err) ip6_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) err = rawv6_push_pending_frames(sk, &fl6, rp); release_sock(sk); } done: dst_release(dst); out: fl6_sock_release(flowlabel); txopt_put(opt_to_free); return err < 0 ? 
err : len; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(dst, &fl6.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; goto done; } static int rawv6_seticmpfilter(struct sock *sk, int optname, sockptr_t optval, int optlen) { switch (optname) { case ICMPV6_FILTER: if (optlen > sizeof(struct icmp6_filter)) optlen = sizeof(struct icmp6_filter); if (copy_from_sockptr(&raw6_sk(sk)->filter, optval, optlen)) return -EFAULT; return 0; default: return -ENOPROTOOPT; } return 0; } static int rawv6_geticmpfilter(struct sock *sk, int optname, char __user *optval, int __user *optlen) { int len; switch (optname) { case ICMPV6_FILTER: if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; if (len > sizeof(struct icmp6_filter)) len = sizeof(struct icmp6_filter); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &raw6_sk(sk)->filter, len)) return -EFAULT; return 0; default: return -ENOPROTOOPT; } return 0; } static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct raw6_sock *rp = raw6_sk(sk); int val; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (optname) { case IPV6_HDRINCL: if (sk->sk_type != SOCK_RAW) return -EINVAL; inet_assign_bit(HDRINCL, sk, val); return 0; case IPV6_CHECKSUM: if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 && level == IPPROTO_IPV6) { /* * RFC3542 tells that IPV6_CHECKSUM socket * option in the IPPROTO_IPV6 level is not * allowed on ICMPv6 sockets. * If you want to set it, use IPPROTO_RAW * level IPV6_CHECKSUM socket option * (Linux extension). */ return -EINVAL; } /* You may get strange result with a positive odd offset; RFC2292bis agrees with me. */ if (val > 0 && (val&1)) return -EINVAL; if (val < 0) { rp->checksum = 0; } else { rp->checksum = 1; rp->offset = val; } return 0; default: return -ENOPROTOOPT; } } static int rawv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { switch (level) { case SOL_RAW: break; case SOL_ICMPV6: if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; return rawv6_seticmpfilter(sk, optname, optval, optlen); case SOL_IPV6: if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; fallthrough; default: return ipv6_setsockopt(sk, level, optname, optval, optlen); } return do_rawv6_setsockopt(sk, level, optname, optval, optlen); } static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct raw6_sock *rp = raw6_sk(sk); int val, len; if (get_user(len, optlen)) return -EFAULT; switch (optname) { case IPV6_HDRINCL: val = inet_test_bit(HDRINCL, sk); break; case IPV6_CHECKSUM: /* * We allow getsockopt() for IPPROTO_IPV6-level * IPV6_CHECKSUM socket option on ICMPv6 sockets * since RFC3542 is silent about it. 
*/ if (rp->checksum == 0) val = -1; else val = rp->offset; break; default: return -ENOPROTOOPT; } len = min_t(unsigned int, sizeof(int), len); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int rawv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { switch (level) { case SOL_RAW: break; case SOL_ICMPV6: if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; return rawv6_geticmpfilter(sk, optname, optval, optlen); case SOL_IPV6: if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; fallthrough; default: return ipv6_getsockopt(sk, level, optname, optval, optlen); } return do_rawv6_getsockopt(sk, level, optname, optval, optlen); } static int rawv6_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); return 0; } case SIOCINQ: { struct sk_buff *skb; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) *karg = skb->len; else *karg = 0; spin_unlock_bh(&sk->sk_receive_queue.lock); return 0; } default: #ifdef CONFIG_IPV6_MROUTE return ip6mr_ioctl(sk, cmd, karg); #else return -ENOIOCTLCMD; #endif } } #ifdef CONFIG_COMPAT static int compat_rawv6_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) { switch (cmd) { case SIOCOUTQ: case SIOCINQ: return -ENOIOCTLCMD; default: #ifdef CONFIG_IPV6_MROUTE return ip6mr_compat_ioctl(sk, cmd, compat_ptr(arg)); #else return -ENOIOCTLCMD; #endif } } #endif static void rawv6_close(struct sock *sk, long timeout) { if (inet_sk(sk)->inet_num == IPPROTO_RAW) ip6_ra_control(sk, -1); ip6mr_sk_done(sk); sk_common_release(sk); } static void raw6_destroy(struct sock *sk) { lock_sock(sk); ip6_flush_pending_frames(sk); release_sock(sk); } static int rawv6_init_sk(struct sock *sk) { struct raw6_sock *rp = raw6_sk(sk); switch (inet_sk(sk)->inet_num) { case IPPROTO_ICMPV6: rp->checksum = 1; rp->offset = 2; break; case IPPROTO_MH: rp->checksum = 1; rp->offset = 4; break; default: break; } return 0; } struct proto rawv6_prot = { .name = "RAWv6", .owner = THIS_MODULE, .close = rawv6_close, .destroy = raw6_destroy, .connect = ip6_datagram_connect_v6_only, .disconnect = __udp_disconnect, .ioctl = rawv6_ioctl, .init = rawv6_init_sk, .setsockopt = rawv6_setsockopt, .getsockopt = rawv6_getsockopt, .sendmsg = rawv6_sendmsg, .recvmsg = rawv6_recvmsg, .bind = rawv6_bind, .backlog_rcv = rawv6_rcv_skb, .hash = raw_hash_sk, .unhash = raw_unhash_sk, .obj_size = sizeof(struct raw6_sock), .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6), .useroffset = offsetof(struct raw6_sock, filter), .usersize = sizeof_field(struct raw6_sock, filter), .h.raw_hash = &raw_v6_hashinfo, #ifdef CONFIG_COMPAT .compat_ioctl = compat_rawv6_ioctl, #endif .diag_destroy = raw_abort, }; #ifdef CONFIG_PROC_FS static int raw6_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, IPV6_SEQ_DGRAM_HEADER); } else { struct sock *sp = v; __u16 srcp = inet_sk(sp)->inet_num; ip6_dgram_sock_seq_show(seq, v, srcp, 0, raw_seq_private(seq)->bucket); } return 0; } static const struct seq_operations raw6_seq_ops = { .start = raw_seq_start, .next = raw_seq_next, .stop = raw_seq_stop, .show = raw6_seq_show, }; static int __net_init raw6_init_net(struct net *net) { if (!proc_create_net_data("raw6", 0444, net->proc_net, &raw6_seq_ops, sizeof(struct raw_iter_state), &raw_v6_hashinfo)) return -ENOMEM; return 0; } static void __net_exit raw6_exit_net(struct net *net) { 
remove_proc_entry("raw6", net->proc_net); } static struct pernet_operations raw6_net_ops = { .init = raw6_init_net, .exit = raw6_exit_net, }; int __init raw6_proc_init(void) { return register_pernet_subsys(&raw6_net_ops); } void raw6_proc_exit(void) { unregister_pernet_subsys(&raw6_net_ops); } #endif /* CONFIG_PROC_FS */ /* Same as inet6_dgram_ops, sans udp_poll. */ const struct proto_ops inet6_sockraw_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, .connect = inet_dgram_connect, /* ok */ .socketpair = sock_no_socketpair, /* a do nothing */ .accept = sock_no_accept, /* a do nothing */ .getname = inet6_getname, .poll = datagram_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .gettstamp = sock_gettstamp, .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet_sendmsg, /* ok */ .recvmsg = sock_common_recvmsg, /* ok */ .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif }; static struct inet_protosw rawv6_protosw = { .type = SOCK_RAW, .protocol = IPPROTO_IP, /* wild card */ .prot = &rawv6_prot, .ops = &inet6_sockraw_ops, .flags = INET_PROTOSW_REUSE, }; int __init rawv6_init(void) { return inet6_register_protosw(&rawv6_protosw); } void rawv6_exit(void) { inet6_unregister_protosw(&rawv6_protosw); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 /* SPDX-License-Identifier: GPL-2.0 */ /* * PHY device list allow maintaining a list of PHY devices that are * part of a netdevice's link topology. PHYs can for example be chained, * as is the case when using a PHY that exposes an SFP module, on which an * SFP transceiver that embeds a PHY is connected. * * This list can then be used by userspace to leverage individual PHY * capabilities. */ #ifndef __PHY_LINK_TOPOLOGY_H #define __PHY_LINK_TOPOLOGY_H #include <linux/ethtool.h> #include <linux/netdevice.h> struct xarray; struct phy_device; struct sfp_bus; struct phy_link_topology { struct xarray phys; u32 next_phy_index; }; struct phy_device_node { enum phy_upstream upstream_type; union { struct net_device *netdev; struct phy_device *phydev; } upstream; struct sfp_bus *parent_sfp_bus; struct phy_device *phy; }; #if IS_ENABLED(CONFIG_PHYLIB) int phy_link_topo_add_phy(struct net_device *dev, struct phy_device *phy, enum phy_upstream upt, void *upstream); void phy_link_topo_del_phy(struct net_device *dev, struct phy_device *phy); static inline struct phy_device * phy_link_topo_get_phy(struct net_device *dev, u32 phyindex) { struct phy_link_topology *topo = dev->link_topo; struct phy_device_node *pdn; if (!topo) return NULL; pdn = xa_load(&topo->phys, phyindex); if (pdn) return pdn->phy; return NULL; } #else static inline int phy_link_topo_add_phy(struct net_device *dev, struct phy_device *phy, enum phy_upstream upt, void *upstream) { return 0; } static inline void phy_link_topo_del_phy(struct net_device *dev, struct phy_device *phy) { } static inline struct phy_device * phy_link_topo_get_phy(struct net_device *dev, u32 phyindex) { return NULL; } #endif #endif /* __PHY_LINK_TOPOLOGY_H */
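A hypothetical driver-side sketch (not from the kernel tree) of how the API declared above is meant to be used: a MAC driver publishes its directly attached PHY with PHY_UPSTREAM_MAC as the upstream type, and a consumer later resolves a phyindex back to the phy_device:

/* Hedged sketch only; example_* names are made up. Assumes PHY_UPSTREAM_MAC
 * from enum phy_upstream and a registered net_device with a link_topo.
 */
#include <linux/netdevice.h>
#include <linux/phy.h>
#include <linux/phy_link_topology.h>

static int example_publish_phy(struct net_device *ndev, struct phy_device *phy)
{
	/* The PHY hangs directly off the MAC, so the upstream is the netdev. */
	return phy_link_topo_add_phy(ndev, phy, PHY_UPSTREAM_MAC, ndev);
}

static void example_resolve_phy(struct net_device *ndev, u32 phyindex)
{
	struct phy_device *phy = phy_link_topo_get_phy(ndev, phyindex);

	if (phy)
		netdev_info(ndev, "phyindex %u -> %s\n", phyindex, phydev_name(phy));
}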
79409 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 // SPDX-License-Identifier: GPL-2.0-only #include <linux/fault-inject.h> #include <linux/fault-inject-usercopy.h> static struct { struct fault_attr attr; } fail_usercopy = { .attr = FAULT_ATTR_INITIALIZER, }; static int __init setup_fail_usercopy(char *str) { return setup_fault_attr(&fail_usercopy.attr, str); } __setup("fail_usercopy=", setup_fail_usercopy); #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS static int __init fail_usercopy_debugfs(void) { struct dentry *dir; dir = fault_create_debugfs_attr("fail_usercopy", NULL, &fail_usercopy.attr); if (IS_ERR(dir)) return PTR_ERR(dir); return 0; } late_initcall(fail_usercopy_debugfs); #endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ bool should_fail_usercopy(void) { return should_fail(&fail_usercopy.attr, 1); } EXPORT_SYMBOL_GPL(should_fail_usercopy);
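The fault_attr registered above is normally tuned at run time through the generic fault-injection debugfs knobs. A hedged userspace sketch, assuming debugfs is mounted at /sys/kernel/debug and the standard attribute names (probability, interval, times) created by fault_create_debugfs_attr():

/* Hypothetical userspace sketch (not kernel code): enable usercopy fault
 * injection so roughly 10% of user copies fail, at most 100 times.
 */
#include <stdio.h>

static int write_knob(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	fclose(f);
	return 0;
}

int main(void)
{
	write_knob("/sys/kernel/debug/fail_usercopy/probability", "10");
	write_knob("/sys/kernel/debug/fail_usercopy/interval", "1");
	write_knob("/sys/kernel/debug/fail_usercopy/times", "100");
	return 0;
}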
568 494 568 927 515 465 546 249 336 573 62 62 533 574 577 251 336 577 1042 1047 806 162 161 162 160 160 162 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 // SPDX-License-Identifier: GPL-2.0-only /* * Network node table * * SELinux must keep a mapping of network nodes to labels/SIDs. This * mapping is maintained as part of the normal policy but a fast cache is * needed to reduce the lookup overhead since most of these queries happen on * a per-packet basis. * * Author: Paul Moore <paul@paul-moore.com> * * This code is heavily based on the "netif" concept originally developed by * James Morris <jmorris@redhat.com> * (see security/selinux/netif.c for more information) */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2007 */ #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/ip.h> #include <net/ipv6.h> #include "netnode.h" #include "objsec.h" #define SEL_NETNODE_HASH_SIZE 256 #define SEL_NETNODE_HASH_BKT_LIMIT 16 struct sel_netnode_bkt { unsigned int size; struct list_head list; }; struct sel_netnode { struct netnode_security_struct nsec; struct list_head list; struct rcu_head rcu; }; /* NOTE: we are using a combined hash table for both IPv4 and IPv6, the reason * for this is that I suspect most users will not make heavy use of both * address families at the same time so one table will usually end up wasted, * if this becomes a problem we can always add a hash table for each address * family later */ static DEFINE_SPINLOCK(sel_netnode_lock); static struct sel_netnode_bkt sel_netnode_hash[SEL_NETNODE_HASH_SIZE]; /** * sel_netnode_hashfn_ipv4 - IPv4 hashing function for the node table * @addr: IPv4 address * * Description: * This is the IPv4 hashing function for the node interface table, it returns * the bucket number for the given IP address. * */ static unsigned int sel_netnode_hashfn_ipv4(__be32 addr) { /* at some point we should determine if the mismatch in byte order * affects the hash function dramatically */ return (addr & (SEL_NETNODE_HASH_SIZE - 1)); } /** * sel_netnode_hashfn_ipv6 - IPv6 hashing function for the node table * @addr: IPv6 address * * Description: * This is the IPv6 hashing function for the node interface table, it returns * the bucket number for the given IP address. 
* */ static unsigned int sel_netnode_hashfn_ipv6(const struct in6_addr *addr) { /* just hash the least significant 32 bits to keep things fast (they * are the most likely to be different anyway), we can revisit this * later if needed */ return (addr->s6_addr32[3] & (SEL_NETNODE_HASH_SIZE - 1)); } /** * sel_netnode_find - Search for a node record * @addr: IP address * @family: address family * * Description: * Search the network node table and return the record matching @addr. If an * entry can not be found in the table return NULL. * */ static struct sel_netnode *sel_netnode_find(const void *addr, u16 family) { unsigned int idx; struct sel_netnode *node; switch (family) { case PF_INET: idx = sel_netnode_hashfn_ipv4(*(const __be32 *)addr); break; case PF_INET6: idx = sel_netnode_hashfn_ipv6(addr); break; default: BUG(); return NULL; } list_for_each_entry_rcu(node, &sel_netnode_hash[idx].list, list) if (node->nsec.family == family) switch (family) { case PF_INET: if (node->nsec.addr.ipv4 == *(const __be32 *)addr) return node; break; case PF_INET6: if (ipv6_addr_equal(&node->nsec.addr.ipv6, addr)) return node; break; } return NULL; } /** * sel_netnode_insert - Insert a new node into the table * @node: the new node record * * Description: * Add a new node record to the network address hash table. * */ static void sel_netnode_insert(struct sel_netnode *node) { unsigned int idx; switch (node->nsec.family) { case PF_INET: idx = sel_netnode_hashfn_ipv4(node->nsec.addr.ipv4); break; case PF_INET6: idx = sel_netnode_hashfn_ipv6(&node->nsec.addr.ipv6); break; default: BUG(); return; } /* we need to impose a limit on the growth of the hash table so check * this bucket to make sure it is within the specified bounds */ list_add_rcu(&node->list, &sel_netnode_hash[idx].list); if (sel_netnode_hash[idx].size == SEL_NETNODE_HASH_BKT_LIMIT) { struct sel_netnode *tail; tail = list_entry( rcu_dereference_protected( list_tail_rcu(&sel_netnode_hash[idx].list), lockdep_is_held(&sel_netnode_lock)), struct sel_netnode, list); list_del_rcu(&tail->list); kfree_rcu(tail, rcu); } else sel_netnode_hash[idx].size++; } /** * sel_netnode_sid_slow - Lookup the SID of a network address using the policy * @addr: the IP address * @family: the address family * @sid: node SID * * Description: * This function determines the SID of a network address by querying the * security policy. The result is added to the network address table to * speedup future queries. Returns zero on success, negative values on * failure. * */ static int sel_netnode_sid_slow(const void *addr, u16 family, u32 *sid) { int ret; struct sel_netnode *node; struct sel_netnode *new; spin_lock_bh(&sel_netnode_lock); node = sel_netnode_find(addr, family); if (node != NULL) { *sid = node->nsec.sid; spin_unlock_bh(&sel_netnode_lock); return 0; } /* If this memory allocation fails still return 0. The SID * is valid, it just won't be added to the cache. 
*/ new = kmalloc(sizeof(*new), GFP_ATOMIC); switch (family) { case PF_INET: ret = security_node_sid(PF_INET, addr, sizeof(struct in_addr), sid); if (new) new->nsec.addr.ipv4 = *(const __be32 *)addr; break; case PF_INET6: ret = security_node_sid(PF_INET6, addr, sizeof(struct in6_addr), sid); if (new) new->nsec.addr.ipv6 = *(const struct in6_addr *)addr; break; default: BUG(); ret = -EINVAL; } if (ret == 0 && new) { new->nsec.family = family; new->nsec.sid = *sid; sel_netnode_insert(new); } else kfree(new); spin_unlock_bh(&sel_netnode_lock); if (unlikely(ret)) pr_warn("SELinux: failure in %s(), unable to determine network node label\n", __func__); return ret; } /** * sel_netnode_sid - Lookup the SID of a network address * @addr: the IP address * @family: the address family * @sid: node SID * * Description: * This function determines the SID of a network address using the fastest * method possible. First the address table is queried, but if an entry * can't be found then the policy is queried and the result is added to the * table to speedup future queries. Returns zero on success, negative values * on failure. * */ int sel_netnode_sid(const void *addr, u16 family, u32 *sid) { struct sel_netnode *node; rcu_read_lock(); node = sel_netnode_find(addr, family); if (likely(node != NULL)) { *sid = node->nsec.sid; rcu_read_unlock(); return 0; } rcu_read_unlock(); return sel_netnode_sid_slow(addr, family, sid); } /** * sel_netnode_flush - Flush the entire network address table * * Description: * Remove all entries from the network address table. * */ void sel_netnode_flush(void) { unsigned int idx; struct sel_netnode *node, *node_tmp; spin_lock_bh(&sel_netnode_lock); for (idx = 0; idx < SEL_NETNODE_HASH_SIZE; idx++) { list_for_each_entry_safe(node, node_tmp, &sel_netnode_hash[idx].list, list) { list_del_rcu(&node->list); kfree_rcu(node, rcu); } sel_netnode_hash[idx].size = 0; } spin_unlock_bh(&sel_netnode_lock); } static __init int sel_netnode_init(void) { int iter; if (!selinux_enabled_boot) return 0; for (iter = 0; iter < SEL_NETNODE_HASH_SIZE; iter++) { INIT_LIST_HEAD(&sel_netnode_hash[iter].list); sel_netnode_hash[iter].size = 0; } return 0; } __initcall(sel_netnode_init);
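A standalone sketch (not kernel code) of the bucket selection performed by sel_netnode_hashfn_ipv4() and sel_netnode_hashfn_ipv6() above: IPv4 keys use the low bits of the network-byte-order address, IPv6 keys the least significant 32-bit word, both masked to the table size:

/* Illustrative only; mirrors the hashing arithmetic, not the cache itself. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

#define HASH_SIZE 256	/* mirrors SEL_NETNODE_HASH_SIZE */

static unsigned int bucket_ipv4(uint32_t addr_be)
{
	return addr_be & (HASH_SIZE - 1);
}

static unsigned int bucket_ipv6(const struct in6_addr *addr)
{
	uint32_t last;

	/* same word as s6_addr32[3] in the kernel code */
	memcpy(&last, &addr->s6_addr[12], sizeof(last));
	return last & (HASH_SIZE - 1);
}

int main(void)
{
	struct in_addr v4;
	struct in6_addr v6;

	inet_pton(AF_INET, "192.0.2.1", &v4);
	inet_pton(AF_INET6, "2001:db8::1", &v6);

	printf("IPv4 bucket: %u\n", bucket_ipv4(v4.s_addr));
	printf("IPv6 bucket: %u\n", bucket_ipv6(&v6));
	return 0;
}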
19 14 8 3 19 19 14 19 19 9 19 14 12 3 10 3 2 12 2 2 1 2 2 2 1 14 14 14 2 19 19 14 496 496 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 // SPDX-License-Identifier: GPL-2.0 #include <linux/in.h> #include <linux/inet.h> #include <linux/list.h> #include <linux/module.h> #include <linux/net.h> #include <linux/proc_fs.h> #include <linux/rculist.h> #include <linux/seq_file.h> #include <linux/socket.h> #include <net/inet_sock.h> #include <net/kcm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/tcp.h> #ifdef CONFIG_PROC_FS static struct kcm_mux *kcm_get_first(struct seq_file *seq) { struct net *net = seq_file_net(seq); struct kcm_net *knet = net_generic(net, kcm_net_id); return list_first_or_null_rcu(&knet->mux_list, struct kcm_mux, kcm_mux_list); } static struct kcm_mux *kcm_get_next(struct kcm_mux *mux) { struct kcm_net *knet = mux->knet; return list_next_or_null_rcu(&knet->mux_list, &mux->kcm_mux_list, struct kcm_mux, kcm_mux_list); } static struct kcm_mux *kcm_get_idx(struct seq_file *seq, loff_t pos) { struct net *net = seq_file_net(seq); struct kcm_net *knet = net_generic(net, kcm_net_id); struct kcm_mux *m; list_for_each_entry_rcu(m, &knet->mux_list, kcm_mux_list) { if (!pos) return m; --pos; } return NULL; } static void *kcm_seq_next(struct seq_file *seq, void *v, loff_t *pos) { void *p; if (v == SEQ_START_TOKEN) p = kcm_get_first(seq); else p = kcm_get_next(v); ++*pos; return p; } static void *kcm_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { rcu_read_lock(); if (!*pos) return SEQ_START_TOKEN; else return kcm_get_idx(seq, *pos - 1); } static void kcm_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { rcu_read_unlock(); } struct kcm_proc_mux_state { struct seq_net_private p; int idx; }; static void kcm_format_mux_header(struct seq_file *seq) { struct net *net = seq_file_net(seq); struct kcm_net *knet = net_generic(net, kcm_net_id); seq_printf(seq, "*** KCM statistics (%d MUX) ****\n", knet->count); seq_printf(seq, "%-14s %-10s %-16s %-10s %-16s %-8s %-8s %-8s %-8s %s", "Object", "RX-Msgs", "RX-Bytes", "TX-Msgs", "TX-Bytes", "Recv-Q", "Rmem", "Send-Q", "Smem", 
"Status"); /* XXX: pdsts header stuff here */ seq_puts(seq, "\n"); } static void kcm_format_sock(struct kcm_sock *kcm, struct seq_file *seq, int i, int *len) { seq_printf(seq, " kcm-%-7u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8s ", kcm->index, kcm->stats.rx_msgs, kcm->stats.rx_bytes, kcm->stats.tx_msgs, kcm->stats.tx_bytes, kcm->sk.sk_receive_queue.qlen, sk_rmem_alloc_get(&kcm->sk), kcm->sk.sk_write_queue.qlen, "-"); if (kcm->tx_psock) seq_printf(seq, "Psck-%u ", kcm->tx_psock->index); if (kcm->tx_wait) seq_puts(seq, "TxWait "); if (kcm->tx_wait_more) seq_puts(seq, "WMore "); if (kcm->rx_wait) seq_puts(seq, "RxWait "); seq_puts(seq, "\n"); } static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq, int i, int *len) { seq_printf(seq, " psock-%-5u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8d ", psock->index, psock->strp.stats.msgs, psock->strp.stats.bytes, psock->stats.tx_msgs, psock->stats.tx_bytes, psock->sk->sk_receive_queue.qlen, atomic_read(&psock->sk->sk_rmem_alloc), psock->sk->sk_write_queue.qlen, refcount_read(&psock->sk->sk_wmem_alloc)); if (psock->done) seq_puts(seq, "Done "); if (psock->tx_stopped) seq_puts(seq, "TxStop "); if (psock->strp.stopped) seq_puts(seq, "RxStop "); if (psock->tx_kcm) seq_printf(seq, "Rsvd-%d ", psock->tx_kcm->index); if (!psock->strp.paused && !psock->ready_rx_msg) { if (psock->sk->sk_receive_queue.qlen) { if (psock->strp.need_bytes) seq_printf(seq, "RxWait=%u ", psock->strp.need_bytes); else seq_printf(seq, "RxWait "); } } else { if (psock->strp.paused) seq_puts(seq, "RxPause "); if (psock->ready_rx_msg) seq_puts(seq, "RdyRx "); } seq_puts(seq, "\n"); } static void kcm_format_mux(struct kcm_mux *mux, loff_t idx, struct seq_file *seq) { int i, len; struct kcm_sock *kcm; struct kcm_psock *psock; /* mux information */ seq_printf(seq, "%-6s%-8s %-10llu %-16llu %-10llu %-16llu %-8s %-8s %-8s %-8s ", "mux", "", mux->stats.rx_msgs, mux->stats.rx_bytes, mux->stats.tx_msgs, mux->stats.tx_bytes, "-", "-", "-", "-"); seq_printf(seq, "KCMs: %d, Psocks %d\n", mux->kcm_socks_cnt, mux->psocks_cnt); /* kcm sock information */ i = 0; spin_lock_bh(&mux->lock); list_for_each_entry(kcm, &mux->kcm_socks, kcm_sock_list) { kcm_format_sock(kcm, seq, i, &len); i++; } i = 0; list_for_each_entry(psock, &mux->psocks, psock_list) { kcm_format_psock(psock, seq, i, &len); i++; } spin_unlock_bh(&mux->lock); } static int kcm_seq_show(struct seq_file *seq, void *v) { struct kcm_proc_mux_state *mux_state; mux_state = seq->private; if (v == SEQ_START_TOKEN) { mux_state->idx = 0; kcm_format_mux_header(seq); } else { kcm_format_mux(v, mux_state->idx, seq); mux_state->idx++; } return 0; } static const struct seq_operations kcm_seq_ops = { .show = kcm_seq_show, .start = kcm_seq_start, .next = kcm_seq_next, .stop = kcm_seq_stop, }; static int kcm_stats_seq_show(struct seq_file *seq, void *v) { struct kcm_psock_stats psock_stats; struct kcm_mux_stats mux_stats; struct strp_aggr_stats strp_stats; struct kcm_mux *mux; struct kcm_psock *psock; struct net *net = seq->private; struct kcm_net *knet = net_generic(net, kcm_net_id); memset(&mux_stats, 0, sizeof(mux_stats)); memset(&psock_stats, 0, sizeof(psock_stats)); memset(&strp_stats, 0, sizeof(strp_stats)); mutex_lock(&knet->mutex); aggregate_mux_stats(&knet->aggregate_mux_stats, &mux_stats); aggregate_psock_stats(&knet->aggregate_psock_stats, &psock_stats); aggregate_strp_stats(&knet->aggregate_strp_stats, &strp_stats); list_for_each_entry(mux, &knet->mux_list, kcm_mux_list) { spin_lock_bh(&mux->lock); 
aggregate_mux_stats(&mux->stats, &mux_stats); aggregate_psock_stats(&mux->aggregate_psock_stats, &psock_stats); aggregate_strp_stats(&mux->aggregate_strp_stats, &strp_stats); list_for_each_entry(psock, &mux->psocks, psock_list) { aggregate_psock_stats(&psock->stats, &psock_stats); save_strp_stats(&psock->strp, &strp_stats); } spin_unlock_bh(&mux->lock); } mutex_unlock(&knet->mutex); seq_printf(seq, "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s\n", "MUX", "RX-Msgs", "RX-Bytes", "TX-Msgs", "TX-Bytes", "TX-Retries", "Attach", "Unattach", "UnattchRsvd", "RX-RdyDrops"); seq_printf(seq, "%-8s %-10llu %-16llu %-10llu %-16llu %-10u %-10u %-10u %-10u %-10u\n", "", mux_stats.rx_msgs, mux_stats.rx_bytes, mux_stats.tx_msgs, mux_stats.tx_bytes, mux_stats.tx_retries, mux_stats.psock_attach, mux_stats.psock_unattach_rsvd, mux_stats.psock_unattach, mux_stats.rx_ready_drops); seq_printf(seq, "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n", "Psock", "RX-Msgs", "RX-Bytes", "TX-Msgs", "TX-Bytes", "Reserved", "Unreserved", "RX-Aborts", "RX-Intr", "RX-Unrecov", "RX-MemFail", "RX-NeedMor", "RX-BadLen", "RX-TooBig", "RX-Timeout", "TX-Aborts"); seq_printf(seq, "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n", "", strp_stats.msgs, strp_stats.bytes, psock_stats.tx_msgs, psock_stats.tx_bytes, psock_stats.reserved, psock_stats.unreserved, strp_stats.aborts, strp_stats.interrupted, strp_stats.unrecov_intr, strp_stats.mem_fail, strp_stats.need_more_hdr, strp_stats.bad_hdr_len, strp_stats.msg_too_big, strp_stats.msg_timeouts, psock_stats.tx_aborts); return 0; } static int kcm_proc_init_net(struct net *net) { if (!proc_create_net_single("kcm_stats", 0444, net->proc_net, kcm_stats_seq_show, NULL)) goto out_kcm_stats; if (!proc_create_net("kcm", 0444, net->proc_net, &kcm_seq_ops, sizeof(struct kcm_proc_mux_state))) goto out_kcm; return 0; out_kcm: remove_proc_entry("kcm_stats", net->proc_net); out_kcm_stats: return -ENOMEM; } static void kcm_proc_exit_net(struct net *net) { remove_proc_entry("kcm", net->proc_net); remove_proc_entry("kcm_stats", net->proc_net); } static struct pernet_operations kcm_net_ops = { .init = kcm_proc_init_net, .exit = kcm_proc_exit_net, }; int __init kcm_proc_init(void) { return register_pernet_subsys(&kcm_net_ops); } void __exit kcm_proc_exit(void) { unregister_pernet_subsys(&kcm_net_ops); } #endif /* CONFIG_PROC_FS */
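The two proc entries registered above are plain text files. A minimal userspace sketch that dumps them, assuming a kernel built with CONFIG_AF_KCM and procfs mounted at /proc:

/* Hypothetical userspace sketch (not kernel code). */
#include <stdio.h>

static void dump(const char *path)
{
	char line[512];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
}

int main(void)
{
	dump("/proc/net/kcm_stats");	/* aggregated MUX/psock counters */
	dump("/proc/net/kcm");		/* per-MUX, per-socket listing */
	return 0;
}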
145 70 1759 2224 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 // SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/extents_status.h * * Written by Yongqiang Yang <xiaoqiangnk@gmail.com> * Modified by * Allison Henderson <achender@linux.vnet.ibm.com> * Zheng Liu <wenqing.lz@taobao.com> * */ #ifndef _EXT4_EXTENTS_STATUS_H #define _EXT4_EXTENTS_STATUS_H /* * Turn on ES_DEBUG__ to get lots of info about extent status operations. */ #ifdef ES_DEBUG__ #define es_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) #else #define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* * With ES_AGGRESSIVE_TEST defined, the result of es caching will be * checked with old map_block's result. */ #define ES_AGGRESSIVE_TEST__ /* * These flags live in the high bits of extent_status.es_pblk */ enum { ES_WRITTEN_B, ES_UNWRITTEN_B, ES_DELAYED_B, ES_HOLE_B, ES_REFERENCED_B, ES_FLAGS }; #define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) #define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) /* * Besides EXTENT_STATUS_REFERENCED, all these extent type masks * are exclusive, only one type can be set at a time. */ #define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) #define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) #define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) #define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) #define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) #define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ EXTENT_STATUS_UNWRITTEN | \ EXTENT_STATUS_DELAYED | \ EXTENT_STATUS_HOLE)) #define ES_TYPE_VALID(type) ((type) && !((type) & ((type) - 1))) struct ext4_sb_info; struct ext4_extent; struct extent_status { struct rb_node rb_node; ext4_lblk_t es_lblk; /* first logical block extent covers */ ext4_lblk_t es_len; /* length of extent in block */ ext4_fsblk_t es_pblk; /* first physical block */ }; struct ext4_es_tree { struct rb_root root; struct extent_status *cache_es; /* recently accessed extent */ }; struct ext4_es_stats { unsigned long es_stats_shrunk; struct percpu_counter es_stats_cache_hits; struct percpu_counter es_stats_cache_misses; u64 es_stats_scan_time; u64 es_stats_max_scan_time; struct percpu_counter es_stats_all_cnt; struct percpu_counter es_stats_shk_cnt; }; /* * Pending cluster reservations for bigalloc file systems * * A cluster with a pending reservation is a logical cluster shared by at * least one extent in the extents status tree with delayed and unwritten * status and at least one other written or unwritten extent. 
The * reservation is said to be pending because a cluster reservation would * have to be taken in the event all blocks in the cluster shared with * written or unwritten extents were deleted while the delayed and * unwritten blocks remained. * * The set of pending cluster reservations is an auxiliary data structure * used with the extents status tree to implement reserved cluster/block * accounting for bigalloc file systems. The set is kept in memory and * records all pending cluster reservations. * * Its primary function is to avoid the need to read extents from the * disk when invalidating pages as a result of a truncate, punch hole, or * collapse range operation. Page invalidation requires a decrease in the * reserved cluster count if it results in the removal of all delayed * and unwritten extents (blocks) from a cluster that is not shared with a * written or unwritten extent, and no decrease otherwise. Determining * whether the cluster is shared can be done by searching for a pending * reservation on it. * * Secondarily, it provides a potentially faster method for determining * whether the reserved cluster count should be increased when a physical * cluster is deallocated as a result of a truncate, punch hole, or * collapse range operation. The necessary information is also present * in the extents status tree, but might be more rapidly accessed in * the pending reservation set in many cases due to smaller size. * * The pending cluster reservation set is implemented as a red-black tree * with the goal of minimizing per page search time overhead. */ struct pending_reservation { struct rb_node rb_node; ext4_lblk_t lclu; }; struct ext4_pending_tree { struct rb_root root; }; extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status, bool delalloc_reserve_used); extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk, unsigned int status); extern void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_es_find_extent_range(struct inode *inode, int (*match_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end, struct extent_status *es); extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t *next_lblk, struct extent_status *es); extern bool ext4_es_scan_range(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk, ext4_lblk_t end); extern bool ext4_es_scan_clu(struct inode *inode, int (*matching_fn)(struct extent_status *es), ext4_lblk_t lblk); static inline unsigned int ext4_es_status(struct extent_status *es) { return es->es_pblk >> ES_SHIFT; } static inline unsigned int ext4_es_type(struct extent_status *es) { return (es->es_pblk >> ES_SHIFT) & ES_TYPE_MASK; } static inline int ext4_es_is_written(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; } static inline int ext4_es_is_unwritten(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; } static inline int ext4_es_is_delayed(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; } static inline int ext4_es_is_hole(struct extent_status *es) { return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; } static inline int ext4_es_is_mapped(struct extent_status *es) { return 
(ext4_es_is_written(es) || ext4_es_is_unwritten(es)); } static inline void ext4_es_set_referenced(struct extent_status *es) { es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; } static inline void ext4_es_clear_referenced(struct extent_status *es) { es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); } static inline int ext4_es_is_referenced(struct extent_status *es) { return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; } static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) { return es->es_pblk & ~ES_MASK; } static inline ext4_fsblk_t ext4_es_show_pblock(struct extent_status *es) { ext4_fsblk_t pblock = ext4_es_pblock(es); return pblock == ~ES_MASK ? 0 : pblock; } static inline void ext4_es_store_pblock(struct extent_status *es, ext4_fsblk_t pb) { ext4_fsblk_t block; block = (pb & ~ES_MASK) | (es->es_pblk & ES_MASK); es->es_pblk = block; } static inline void ext4_es_store_pblock_status(struct extent_status *es, ext4_fsblk_t pb, unsigned int status) { WARN_ON_ONCE(!ES_TYPE_VALID(status & ES_TYPE_MASK)); es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | (pb & ~ES_MASK); } extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v); extern int __init ext4_init_pending(void); extern void ext4_exit_pending(void); extern void ext4_init_pending_tree(struct ext4_pending_tree *tree); extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk); extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk); extern void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, bool lclu_allocated, bool end_allocated); extern void ext4_clear_inode_es(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */
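A standalone sketch (not ext4 code) of the es_pblk packing the inline helpers above implement: the status bits occupy the top ES_FLAGS bits of the 64-bit field and the physical block number the remainder, mirroring ext4_es_store_pblock_status() and ext4_es_pblock():

/* Illustrative only; redefines the shift/mask locally for a userspace build. */
#include <stdio.h>
#include <stdint.h>

#define ES_FLAGS	5	/* ES_WRITTEN_B .. ES_REFERENCED_B */
#define ES_SHIFT	(sizeof(uint64_t) * 8 - ES_FLAGS)
#define ES_MASK		(~(uint64_t)0 << ES_SHIFT)

static uint64_t pack(uint64_t pblk, unsigned int status)
{
	return (((uint64_t)status << ES_SHIFT) & ES_MASK) | (pblk & ~ES_MASK);
}

int main(void)
{
	/* 1 << 0 corresponds to EXTENT_STATUS_WRITTEN */
	uint64_t es_pblk = pack(0x12345678ULL, 1 << 0);

	printf("pblock: 0x%llx\n", (unsigned long long)(es_pblk & ~ES_MASK));
	printf("status: 0x%llx\n", (unsigned long long)(es_pblk >> ES_SHIFT));
	return 0;
}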
325 324 325 226 219 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 // SPDX-License-Identifier: GPL-2.0+ /* * Serial core port device driver * * Copyright (C) 2023 Texas Instruments Incorporated - https://www.ti.com/ * Author: Tony Lindgren <tony@atomide.com> */ #include <linux/device.h> #include <linux/module.h> #include <linux/of.h> #include <linux/platform_device.h> #include <linux/pm_runtime.h> #include <linux/pnp.h> #include <linux/property.h> #include <linux/serial_core.h> #include <linux/spinlock.h> #include "serial_base.h" #define SERIAL_PORT_AUTOSUSPEND_DELAY_MS 500 /* Only considers pending TX for now. Caller must take care of locking */ static int __serial_port_busy(struct uart_port *port) { return !uart_tx_stopped(port) && !kfifo_is_empty(&port->state->port.xmit_fifo); } static int serial_port_runtime_resume(struct device *dev) { struct serial_port_device *port_dev = to_serial_base_port_device(dev); struct uart_port *port; unsigned long flags; port = port_dev->port; if (port->flags & UPF_DEAD) goto out; /* Flush any pending TX for the port */ uart_port_lock_irqsave(port, &flags); if (!port_dev->tx_enabled) goto unlock; if (__serial_port_busy(port)) port->ops->start_tx(port); unlock: uart_port_unlock_irqrestore(port, flags); out: pm_runtime_mark_last_busy(dev); return 0; } static int serial_port_runtime_suspend(struct device *dev) { struct serial_port_device *port_dev = to_serial_base_port_device(dev); struct uart_port *port = port_dev->port; unsigned long flags; bool busy; if (port->flags & UPF_DEAD) return 0; /* * Nothing to do on pm_runtime_force_suspend(), see * DEFINE_RUNTIME_DEV_PM_OPS. */ if (!pm_runtime_enabled(dev)) return 0; uart_port_lock_irqsave(port, &flags); if (!port_dev->tx_enabled) { uart_port_unlock_irqrestore(port, flags); return 0; } busy = __serial_port_busy(port); if (busy) port->ops->start_tx(port); uart_port_unlock_irqrestore(port, flags); if (busy) pm_runtime_mark_last_busy(dev); return busy ? 
-EBUSY : 0; } static void serial_base_port_set_tx(struct uart_port *port, struct serial_port_device *port_dev, bool enabled) { unsigned long flags; uart_port_lock_irqsave(port, &flags); port_dev->tx_enabled = enabled; uart_port_unlock_irqrestore(port, flags); } void serial_base_port_startup(struct uart_port *port) { struct serial_port_device *port_dev = port->port_dev; serial_base_port_set_tx(port, port_dev, true); } void serial_base_port_shutdown(struct uart_port *port) { struct serial_port_device *port_dev = port->port_dev; serial_base_port_set_tx(port, port_dev, false); } static DEFINE_RUNTIME_DEV_PM_OPS(serial_port_pm, serial_port_runtime_suspend, serial_port_runtime_resume, NULL); static int serial_port_probe(struct device *dev) { pm_runtime_enable(dev); pm_runtime_set_autosuspend_delay(dev, SERIAL_PORT_AUTOSUSPEND_DELAY_MS); pm_runtime_use_autosuspend(dev); return 0; } static int serial_port_remove(struct device *dev) { pm_runtime_dont_use_autosuspend(dev); pm_runtime_disable(dev); return 0; } /* * Serial core port device init functions. Note that the physical serial * port device driver may not have completed probe at this point. */ int uart_add_one_port(struct uart_driver *drv, struct uart_port *port) { return serial_ctrl_register_port(drv, port); } EXPORT_SYMBOL(uart_add_one_port); void uart_remove_one_port(struct uart_driver *drv, struct uart_port *port) { serial_ctrl_unregister_port(drv, port); } EXPORT_SYMBOL(uart_remove_one_port); /** * __uart_read_properties - read firmware properties of the given UART port * @port: corresponding port * @use_defaults: apply defaults (when %true) or validate the values (when %false) * * The following device properties are supported: * - clock-frequency (optional) * - fifo-size (optional) * - no-loopback-test (optional) * - reg-shift (defaults may apply) * - reg-offset (value may be validated) * - reg-io-width (defaults may apply or value may be validated) * - interrupts (OF only) * - serial [alias ID] (OF only) * * If the port->dev is of struct platform_device type the interrupt line * will be retrieved via platform_get_irq() call against that device. * Otherwise it will be assigned by fwnode_irq_get() call. In both cases * the index 0 of the resource is used. * * The caller is responsible to initialize the following fields of the @port * ->dev (must be valid) * ->flags * ->iobase * ->mapbase * ->mapsize * ->regshift (if @use_defaults is false) * before calling this function. Alternatively the above mentioned fields * may be zeroed, in such case the only ones, that have associated properties * found, will be set to the respective values. * * If no error happened, the ->irq, ->mapbase, ->mapsize will be altered. * The ->iotype is always altered. * * When @use_defaults is true and the respective property is not found * the following values will be applied: * ->regshift = 0 * In this case IRQ must be provided, otherwise an error will be returned. 
* * When @use_defaults is false and the respective property is found * the following values will be validated: * - reg-io-width (->iotype) * - reg-offset (->mapsize against ->mapbase) * * Returns: 0 on success or negative errno on failure */ static int __uart_read_properties(struct uart_port *port, bool use_defaults) { struct device *dev = port->dev; u32 value; int ret; /* Read optional UART functional clock frequency */ device_property_read_u32(dev, "clock-frequency", &port->uartclk); /* Read the registers alignment (default: 8-bit) */ ret = device_property_read_u32(dev, "reg-shift", &value); if (ret) port->regshift = use_defaults ? 0 : port->regshift; else port->regshift = value; /* Read the registers I/O access type (default: MMIO 8-bit) */ ret = device_property_read_u32(dev, "reg-io-width", &value); if (ret) { port->iotype = port->iobase ? UPIO_PORT : UPIO_MEM; } else { switch (value) { case 1: port->iotype = UPIO_MEM; break; case 2: port->iotype = UPIO_MEM16; break; case 4: port->iotype = device_is_big_endian(dev) ? UPIO_MEM32BE : UPIO_MEM32; break; default: port->iotype = UPIO_UNKNOWN; break; } } if (!use_defaults && port->iotype == UPIO_UNKNOWN) { dev_err(dev, "Unsupported reg-io-width (%u)\n", value); return -EINVAL; } /* Read the address mapping base offset (default: no offset) */ ret = device_property_read_u32(dev, "reg-offset", &value); if (ret) value = 0; /* Check for shifted address mapping overflow */ if (!use_defaults && port->mapsize < value) { dev_err(dev, "reg-offset %u exceeds region size %pa\n", value, &port->mapsize); return -EINVAL; } port->mapbase += value; port->mapsize -= value; /* Read optional FIFO size */ device_property_read_u32(dev, "fifo-size", &port->fifosize); if (device_property_read_bool(dev, "no-loopback-test")) port->flags |= UPF_SKIP_TEST; /* Get index of serial line, if found in DT aliases */ ret = of_alias_get_id(dev_of_node(dev), "serial"); if (ret >= 0) port->line = ret; if (dev_is_platform(dev)) ret = platform_get_irq(to_platform_device(dev), 0); else if (dev_is_pnp(dev)) { ret = pnp_irq(to_pnp_dev(dev), 0); if (ret < 0) ret = -ENXIO; } else ret = fwnode_irq_get(dev_fwnode(dev), 0); if (ret == -EPROBE_DEFER) return ret; if (ret > 0) port->irq = ret; else if (use_defaults) /* By default IRQ support is mandatory */ return ret; else port->irq = 0; port->flags |= UPF_SHARE_IRQ; return 0; } int uart_read_port_properties(struct uart_port *port) { return __uart_read_properties(port, true); } EXPORT_SYMBOL_GPL(uart_read_port_properties); int uart_read_and_validate_port_properties(struct uart_port *port) { return __uart_read_properties(port, false); } EXPORT_SYMBOL_GPL(uart_read_and_validate_port_properties); static struct device_driver serial_port_driver = { .name = "port", .suppress_bind_attrs = true, .probe = serial_port_probe, .remove = serial_port_remove, .pm = pm_ptr(&serial_port_pm), }; int serial_base_port_init(void) { return serial_base_driver_register(&serial_port_driver); } void serial_base_port_exit(void) { serial_base_driver_unregister(&serial_port_driver); } MODULE_AUTHOR("Tony Lindgren <tony@atomide.com>"); MODULE_DESCRIPTION("Serial controller port driver"); MODULE_LICENSE("GPL");
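A hypothetical platform-driver probe sketch (not from the kernel tree) showing the call pattern the kerneldoc above describes: fill ->dev, ->mapbase and ->mapsize from the platform resources, then let uart_read_port_properties() apply defaults and read the firmware properties. All example_* names are made up:

#include <linux/device.h>
#include <linux/platform_device.h>
#include <linux/serial_core.h>

static int example_serial_probe(struct platform_device *pdev)
{
	struct uart_port *port;
	struct resource *res;
	int ret;

	port = devm_kzalloc(&pdev->dev, sizeof(*port), GFP_KERNEL);
	if (!port)
		return -ENOMEM;

	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	if (!res)
		return -ENODEV;

	port->dev = &pdev->dev;
	port->mapbase = res->start;
	port->mapsize = resource_size(res);

	/* Applies defaults and insists on an interrupt being present. */
	ret = uart_read_port_properties(port);
	if (ret)
		return ret;

	/* port->irq, ->iotype, ->regshift, ->uartclk, ... are now populated. */
	return 0;
}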
169 169 89 89 50 39 89 80 80 80 43 38 80 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2015 Intel Corporation This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. 
*/ #include <linux/unaligned.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/hci_mon.h> #include <net/bluetooth/mgmt.h> #include "mgmt_util.h" static struct sk_buff *create_monitor_ctrl_event(__le16 index, u32 cookie, u16 opcode, u16 len, void *buf) { struct hci_mon_hdr *hdr; struct sk_buff *skb; skb = bt_skb_alloc(6 + len, GFP_ATOMIC); if (!skb) return NULL; put_unaligned_le32(cookie, skb_put(skb, 4)); put_unaligned_le16(opcode, skb_put(skb, 2)); if (buf) skb_put_data(skb, buf, len); __net_timestamp(skb); hdr = skb_push(skb, HCI_MON_HDR_SIZE); hdr->opcode = cpu_to_le16(HCI_MON_CTRL_EVENT); hdr->index = index; hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE); return skb; } struct sk_buff *mgmt_alloc_skb(struct hci_dev *hdev, u16 opcode, unsigned int size) { struct sk_buff *skb; skb = alloc_skb(sizeof(struct mgmt_hdr) + size, GFP_KERNEL); if (!skb) return skb; skb_reserve(skb, sizeof(struct mgmt_hdr)); bt_cb(skb)->mgmt.hdev = hdev; bt_cb(skb)->mgmt.opcode = opcode; return skb; } int mgmt_send_event_skb(unsigned short channel, struct sk_buff *skb, int flag, struct sock *skip_sk) { struct hci_dev *hdev; struct mgmt_hdr *hdr; int len; if (!skb) return -EINVAL; len = skb->len; hdev = bt_cb(skb)->mgmt.hdev; /* Time stamp */ __net_timestamp(skb); /* Send just the data, without headers, to the monitor */ if (channel == HCI_CHANNEL_CONTROL) hci_send_monitor_ctrl_event(hdev, bt_cb(skb)->mgmt.opcode, skb->data, skb->len, skb_get_ktime(skb), flag, skip_sk); hdr = skb_push(skb, sizeof(*hdr)); hdr->opcode = cpu_to_le16(bt_cb(skb)->mgmt.opcode); if (hdev) hdr->index = cpu_to_le16(hdev->id); else hdr->index = cpu_to_le16(MGMT_INDEX_NONE); hdr->len = cpu_to_le16(len); hci_send_to_channel(channel, skb, flag, skip_sk); kfree_skb(skb); return 0; } int mgmt_send_event(u16 event, struct hci_dev *hdev, unsigned short channel, void *data, u16 data_len, int flag, struct sock *skip_sk) { struct sk_buff *skb; skb = mgmt_alloc_skb(hdev, event, data_len); if (!skb) return -ENOMEM; if (data) skb_put_data(skb, data, data_len); return mgmt_send_event_skb(channel, skb, flag, skip_sk); } int mgmt_cmd_status(struct sock *sk, u16 index, u16 cmd, u8 status) { struct sk_buff *skb, *mskb; struct mgmt_hdr *hdr; struct mgmt_ev_cmd_status *ev; int err; BT_DBG("sock %p, index %u, cmd %u, status %u", sk, index, cmd, status); skb = alloc_skb(sizeof(*hdr) + sizeof(*ev), GFP_KERNEL); if (!skb) return -ENOMEM; hdr = skb_put(skb, sizeof(*hdr)); hdr->opcode = cpu_to_le16(MGMT_EV_CMD_STATUS); hdr->index = cpu_to_le16(index); hdr->len = cpu_to_le16(sizeof(*ev)); ev = skb_put(skb, sizeof(*ev)); ev->status = status; ev->opcode = cpu_to_le16(cmd); mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk), MGMT_EV_CMD_STATUS, sizeof(*ev), ev); if (mskb) skb->tstamp = mskb->tstamp; else __net_timestamp(skb); err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); if (mskb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb, HCI_SOCK_TRUSTED, NULL); kfree_skb(mskb); } return err; } int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, void *rp, size_t rp_len) { struct sk_buff *skb, *mskb; struct mgmt_hdr *hdr; struct mgmt_ev_cmd_complete *ev; int err; BT_DBG("sock %p", sk); skb = alloc_skb(sizeof(*hdr) + sizeof(*ev) + rp_len, GFP_KERNEL); if (!skb) return -ENOMEM; hdr = skb_put(skb, sizeof(*hdr)); hdr->opcode = cpu_to_le16(MGMT_EV_CMD_COMPLETE); hdr->index = cpu_to_le16(index); hdr->len = cpu_to_le16(sizeof(*ev) + rp_len); ev = skb_put(skb, sizeof(*ev) + 
rp_len); ev->opcode = cpu_to_le16(cmd); ev->status = status; if (rp) memcpy(ev->data, rp, rp_len); mskb = create_monitor_ctrl_event(hdr->index, hci_sock_get_cookie(sk), MGMT_EV_CMD_COMPLETE, sizeof(*ev) + rp_len, ev); if (mskb) skb->tstamp = mskb->tstamp; else __net_timestamp(skb); err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); if (mskb) { hci_send_to_channel(HCI_CHANNEL_MONITOR, mskb, HCI_SOCK_TRUSTED, NULL); kfree_skb(mskb); } return err; } struct mgmt_pending_cmd *mgmt_pending_find(unsigned short channel, u16 opcode, struct hci_dev *hdev) { struct mgmt_pending_cmd *cmd, *tmp; mutex_lock(&hdev->mgmt_pending_lock); list_for_each_entry_safe(cmd, tmp, &hdev->mgmt_pending, list) { if (hci_sock_get_channel(cmd->sk) != channel) continue; if (cmd->opcode == opcode) { mutex_unlock(&hdev->mgmt_pending_lock); return cmd; } } mutex_unlock(&hdev->mgmt_pending_lock); return NULL; } void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, bool remove, void (*cb)(struct mgmt_pending_cmd *cmd, void *data), void *data) { struct mgmt_pending_cmd *cmd, *tmp; mutex_lock(&hdev->mgmt_pending_lock); list_for_each_entry_safe(cmd, tmp, &hdev->mgmt_pending, list) { if (opcode > 0 && cmd->opcode != opcode) continue; if (remove) list_del(&cmd->list); cb(cmd, data); if (remove) mgmt_pending_free(cmd); } mutex_unlock(&hdev->mgmt_pending_lock); } struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_pending_cmd *cmd; cmd = kzalloc(sizeof(*cmd), GFP_KERNEL); if (!cmd) return NULL; cmd->opcode = opcode; cmd->hdev = hdev; cmd->param = kmemdup(data, len, GFP_KERNEL); if (!cmd->param) { kfree(cmd); return NULL; } cmd->param_len = len; cmd->sk = sk; sock_hold(sk); return cmd; } struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_pending_cmd *cmd; cmd = mgmt_pending_new(sk, opcode, hdev, data, len); if (!cmd) return NULL; mutex_lock(&hdev->mgmt_pending_lock); list_add_tail(&cmd->list, &hdev->mgmt_pending); mutex_unlock(&hdev->mgmt_pending_lock); return cmd; } void mgmt_pending_free(struct mgmt_pending_cmd *cmd) { sock_put(cmd->sk); kfree(cmd->param); kfree(cmd); } void mgmt_pending_remove(struct mgmt_pending_cmd *cmd) { mutex_lock(&cmd->hdev->mgmt_pending_lock); list_del(&cmd->list); mutex_unlock(&cmd->hdev->mgmt_pending_lock); mgmt_pending_free(cmd); } void mgmt_mesh_foreach(struct hci_dev *hdev, void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data), void *data, struct sock *sk) { struct mgmt_mesh_tx *mesh_tx, *tmp; list_for_each_entry_safe(mesh_tx, tmp, &hdev->mesh_pending, list) { if (!sk || mesh_tx->sk == sk) cb(mesh_tx, data); } } struct mgmt_mesh_tx *mgmt_mesh_next(struct hci_dev *hdev, struct sock *sk) { struct mgmt_mesh_tx *mesh_tx; if (list_empty(&hdev->mesh_pending)) return NULL; list_for_each_entry(mesh_tx, &hdev->mesh_pending, list) { if (!sk || mesh_tx->sk == sk) return mesh_tx; } return NULL; } struct mgmt_mesh_tx *mgmt_mesh_find(struct hci_dev *hdev, u8 handle) { struct mgmt_mesh_tx *mesh_tx; if (list_empty(&hdev->mesh_pending)) return NULL; list_for_each_entry(mesh_tx, &hdev->mesh_pending, list) { if (mesh_tx->handle == handle) return mesh_tx; } return NULL; } struct mgmt_mesh_tx *mgmt_mesh_add(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mesh_tx *mesh_tx; mesh_tx = kzalloc(sizeof(*mesh_tx), GFP_KERNEL); if (!mesh_tx) return NULL; hdev->mesh_send_ref++; if (!hdev->mesh_send_ref) hdev->mesh_send_ref++; 
	mesh_tx->handle = hdev->mesh_send_ref;
	mesh_tx->index = hdev->id;
	memcpy(mesh_tx->param, data, len);
	mesh_tx->param_len = len;
	mesh_tx->sk = sk;
	sock_hold(sk);

	list_add_tail(&mesh_tx->list, &hdev->mesh_pending);

	return mesh_tx;
}

void mgmt_mesh_remove(struct mgmt_mesh_tx *mesh_tx)
{
	list_del(&mesh_tx->list);
	sock_put(mesh_tx->sk);
	kfree(mesh_tx);
}
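A standalone sketch (not kernel code) of the handle allocation pattern mgmt_mesh_add() uses above: a wrapping counter that skips zero so that zero can keep meaning "no handle":

#include <stdio.h>
#include <stdint.h>

static uint8_t next_handle(uint8_t *counter)
{
	(*counter)++;
	if (!*counter)		/* wrapped to 0, skip it */
		(*counter)++;
	return *counter;
}

int main(void)
{
	uint8_t counter = 254;

	/* 255 is handed out, then the wrap skips 0 and hands out 1. */
	printf("%u\n", next_handle(&counter));
	printf("%u\n", next_handle(&counter));
	return 0;
}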
56 2 39 1 5 6 3 4 2 2 18 6 6 6 18 12 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_offload.h> void nft_immediate_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); nft_data_copy(&regs->data[priv->dreg], &priv->data, priv->dlen); } static const struct nla_policy nft_immediate_policy[NFTA_IMMEDIATE_MAX + 1] = { [NFTA_IMMEDIATE_DREG] = { .type = NLA_U32 }, [NFTA_IMMEDIATE_DATA] = { .type = NLA_NESTED }, }; static enum nft_data_types nft_reg_to_type(const struct nlattr *nla) { enum nft_data_types type; u8 reg; reg = ntohl(nla_get_be32(nla)); if (reg == NFT_REG_VERDICT) type = NFT_DATA_VERDICT; else type = NFT_DATA_VALUE; return type; } static int nft_immediate_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_immediate_expr *priv = nft_expr_priv(expr); struct nft_data_desc desc = { .size = sizeof(priv->data), }; int err; if (tb[NFTA_IMMEDIATE_DREG] == NULL || tb[NFTA_IMMEDIATE_DATA] == NULL) return -EINVAL; desc.type = nft_reg_to_type(tb[NFTA_IMMEDIATE_DREG]); err = nft_data_init(ctx, &priv->data, &desc, tb[NFTA_IMMEDIATE_DATA]); if (err < 0) return err; priv->dlen = desc.len; err = nft_parse_register_store(ctx, tb[NFTA_IMMEDIATE_DREG], &priv->dreg, &priv->data, desc.type, desc.len); if (err < 0) goto err1; if (priv->dreg == NFT_REG_VERDICT) { struct nft_chain *chain = priv->data.verdict.chain; switch (priv->data.verdict.code) { case NFT_JUMP: case NFT_GOTO: err = nf_tables_bind_chain(ctx, chain); if (err < 0) goto err1; break; default: break; } } return 0; err1: nft_data_release(&priv->data, desc.type); return err; } static void nft_immediate_activate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct 
nft_immediate_expr *priv = nft_expr_priv(expr); const struct nft_data *data = &priv->data; struct nft_ctx chain_ctx; struct nft_chain *chain; struct nft_rule *rule; if (priv->dreg == NFT_REG_VERDICT) { switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; if (!nft_chain_binding(chain)) break; chain_ctx = *ctx; chain_ctx.chain = chain; list_for_each_entry(rule, &chain->rules, list) nft_rule_expr_activate(&chain_ctx, rule); nft_clear(ctx->net, chain); break; default: break; } } return nft_data_hold(&priv->data, nft_dreg_to_type(priv->dreg)); } static void nft_immediate_chain_deactivate(const struct nft_ctx *ctx, struct nft_chain *chain, enum nft_trans_phase phase) { struct nft_ctx chain_ctx; struct nft_rule *rule; chain_ctx = *ctx; chain_ctx.chain = chain; list_for_each_entry(rule, &chain->rules, list) nft_rule_expr_deactivate(&chain_ctx, rule, phase); } static void nft_immediate_deactivate(const struct nft_ctx *ctx, const struct nft_expr *expr, enum nft_trans_phase phase) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); const struct nft_data *data = &priv->data; struct nft_chain *chain; if (priv->dreg == NFT_REG_VERDICT) { switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; if (!nft_chain_binding(chain)) break; switch (phase) { case NFT_TRANS_PREPARE_ERROR: nf_tables_unbind_chain(ctx, chain); nft_deactivate_next(ctx->net, chain); break; case NFT_TRANS_PREPARE: nft_immediate_chain_deactivate(ctx, chain, phase); nft_deactivate_next(ctx->net, chain); break; default: nft_immediate_chain_deactivate(ctx, chain, phase); nft_chain_del(chain); chain->bound = false; nft_use_dec(&chain->table->use); break; } break; default: break; } } if (phase == NFT_TRANS_COMMIT) return; return nft_data_release(&priv->data, nft_dreg_to_type(priv->dreg)); } static void nft_immediate_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); const struct nft_data *data = &priv->data; struct nft_rule *rule, *n; struct nft_ctx chain_ctx; struct nft_chain *chain; if (priv->dreg != NFT_REG_VERDICT) return; switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: chain = data->verdict.chain; if (!nft_chain_binding(chain)) break; /* Rule construction failed, but chain is already bound: * let the transaction records release this chain and its rules. */ if (chain->bound) { nft_use_dec(&chain->use); break; } /* Rule has been deleted, release chain and its rules. 
*/ chain_ctx = *ctx; chain_ctx.chain = chain; nft_use_dec(&chain->use); list_for_each_entry_safe(rule, n, &chain->rules, list) { nft_use_dec(&chain->use); list_del(&rule->list); nf_tables_rule_destroy(&chain_ctx, rule); } nf_tables_chain_destroy(chain); break; default: break; } } static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_IMMEDIATE_DREG, priv->dreg)) goto nla_put_failure; return nft_data_dump(skb, NFTA_IMMEDIATE_DATA, &priv->data, nft_dreg_to_type(priv->dreg), priv->dlen); nla_put_failure: return -1; } static int nft_immediate_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); struct nft_ctx *pctx = (struct nft_ctx *)ctx; const struct nft_data *data; int err; if (priv->dreg != NFT_REG_VERDICT) return 0; data = &priv->data; switch (data->verdict.code) { case NFT_JUMP: case NFT_GOTO: pctx->level++; err = nft_chain_validate(ctx, data->verdict.chain); if (err < 0) return err; pctx->level--; break; default: break; } return 0; } static int nft_immediate_offload_verdict(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_immediate_expr *priv) { struct flow_action_entry *entry; const struct nft_data *data; entry = &flow->rule->action.entries[ctx->num_actions++]; data = &priv->data; switch (data->verdict.code) { case NF_ACCEPT: entry->id = FLOW_ACTION_ACCEPT; break; case NF_DROP: entry->id = FLOW_ACTION_DROP; break; default: return -EOPNOTSUPP; } return 0; } static int nft_immediate_offload(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); if (priv->dreg == NFT_REG_VERDICT) return nft_immediate_offload_verdict(ctx, flow, priv); memcpy(&ctx->regs[priv->dreg].data, &priv->data, sizeof(priv->data)); return 0; } static bool nft_immediate_offload_action(const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); if (priv->dreg == NFT_REG_VERDICT) return true; return false; } static bool nft_immediate_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); if (priv->dreg != NFT_REG_VERDICT) nft_reg_track_cancel(track, priv->dreg, priv->dlen); return false; } static const struct nft_expr_ops nft_imm_ops = { .type = &nft_imm_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_immediate_expr)), .eval = nft_immediate_eval, .init = nft_immediate_init, .activate = nft_immediate_activate, .deactivate = nft_immediate_deactivate, .destroy = nft_immediate_destroy, .dump = nft_immediate_dump, .validate = nft_immediate_validate, .reduce = nft_immediate_reduce, .offload = nft_immediate_offload, .offload_action = nft_immediate_offload_action, }; struct nft_expr_type nft_imm_type __read_mostly = { .name = "immediate", .ops = &nft_imm_ops, .policy = nft_immediate_policy, .maxattr = NFTA_IMMEDIATE_MAX, .owner = THIS_MODULE, };
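/*
 * Editor's illustration (userspace sketch, not the kernel API): what the
 * "immediate" expression boils down to at evaluation time -- a constant,
 * captured once from netlink attributes at rule-creation time, is copied into
 * the destination register for every packet. The register-file layout and all
 * demo_* names are assumptions for this sketch, not the real nft_regs.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define DEMO_NUM_REGS 16

struct demo_regs {
    uint32_t data[DEMO_NUM_REGS];
};

struct demo_immediate {
    unsigned int dreg;      /* destination register index */
    unsigned int dlen;      /* length of the constant in bytes */
    uint32_t data[4];       /* constant parsed at init time */
};

static void demo_immediate_eval(const struct demo_immediate *priv,
                                struct demo_regs *regs)
{
    /* mirrors nft_data_copy(&regs->data[priv->dreg], &priv->data, priv->dlen) */
    memcpy(&regs->data[priv->dreg], priv->data, priv->dlen);
}

int main(void)
{
    struct demo_regs regs = { { 0 } };
    struct demo_immediate imm = {
        .dreg = 1,
        .dlen = sizeof(uint32_t),
        .data = { 0xc0a80001 },     /* e.g. an IPv4 address constant */
    };

    demo_immediate_eval(&imm, &regs);
    printf("reg[1] = 0x%08x\n", (unsigned)regs.data[1]);
    return 0;
}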
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_BITOPS_H
#define _ASM_X86_BITOPS_H

/*
 * Copyright 1992, Linus Torvalds.
 *
 * Note: inlines with more than a single statement should be marked
 * __always_inline to avoid problems with older gcc's inlining heuristics.
 */

#ifndef _LINUX_BITOPS_H
#error only <linux/bitops.h> can be included directly
#endif

#include <linux/compiler.h>
#include <asm/alternative.h>
#include <asm/rmwcc.h>
#include <asm/barrier.h>

#if BITS_PER_LONG == 32
# define _BITOPS_LONG_SHIFT 5
#elif BITS_PER_LONG == 64
# define _BITOPS_LONG_SHIFT 6
#else
# error "Unexpected BITS_PER_LONG"
#endif

#define BIT_64(n) (U64_C(1) << (n))

/*
 * These have to be done with inline assembly: that way the bit-setting
 * is guaranteed to be atomic. All bit operations return 0 if the bit
 * was cleared before the operation and != 0 if it was not.
 *
 * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
 */

#define RLONG_ADDR(x) "m" (*(volatile long *) (x))
#define WBYTE_ADDR(x) "+m" (*(volatile char *) (x))

#define ADDR RLONG_ADDR(addr)

/*
 * We do the locked ops that don't return the old value as
 * a mask operation on a byte.
*/ #define CONST_MASK_ADDR(nr, addr) WBYTE_ADDR((void *)(addr) + ((nr)>>3)) #define CONST_MASK(nr) (1 << ((nr) & 7)) static __always_inline void arch_set_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { asm_inline volatile(LOCK_PREFIX "orb %b1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" (CONST_MASK(nr)) : "memory"); } else { asm_inline volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } static __always_inline void arch___set_bit(unsigned long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } static __always_inline void arch_clear_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { asm_inline volatile(LOCK_PREFIX "andb %b1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" (~CONST_MASK(nr))); } else { asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } static __always_inline void arch_clear_bit_unlock(long nr, volatile unsigned long *addr) { barrier(); arch_clear_bit(nr, addr); } static __always_inline void arch___clear_bit(unsigned long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } static __always_inline bool arch_xor_unlock_is_negative_byte(unsigned long mask, volatile unsigned long *addr) { bool negative; asm_inline volatile(LOCK_PREFIX "xorb %2,%1" CC_SET(s) : CC_OUT(s) (negative), WBYTE_ADDR(addr) : "iq" ((char)mask) : "memory"); return negative; } #define arch_xor_unlock_is_negative_byte arch_xor_unlock_is_negative_byte static __always_inline void arch___clear_bit_unlock(long nr, volatile unsigned long *addr) { arch___clear_bit(nr, addr); } static __always_inline void arch___change_bit(unsigned long nr, volatile unsigned long *addr) { asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } static __always_inline void arch_change_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { asm_inline volatile(LOCK_PREFIX "xorb %b1,%0" : CONST_MASK_ADDR(nr, addr) : "iq" (CONST_MASK(nr))); } else { asm_inline volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } static __always_inline bool arch_test_and_set_bit(long nr, volatile unsigned long *addr) { return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(bts), *addr, c, "Ir", nr); } static __always_inline bool arch_test_and_set_bit_lock(long nr, volatile unsigned long *addr) { return arch_test_and_set_bit(nr, addr); } static __always_inline bool arch___test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { bool oldbit; asm(__ASM_SIZE(bts) " %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) : ADDR, "Ir" (nr) : "memory"); return oldbit; } static __always_inline bool arch_test_and_clear_bit(long nr, volatile unsigned long *addr) { return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btr), *addr, c, "Ir", nr); } /* * Note: the operation is performed atomically with respect to * the local CPU, but not other CPUs. Portable code should not * rely on this behaviour. 
* KVM relies on this behaviour on x86 for modifying memory that is also * accessed from a hypervisor on the same CPU if running in a VM: don't change * this without also updating arch/x86/kernel/kvm.c */ static __always_inline bool arch___test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { bool oldbit; asm volatile(__ASM_SIZE(btr) " %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) : ADDR, "Ir" (nr) : "memory"); return oldbit; } static __always_inline bool arch___test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { bool oldbit; asm volatile(__ASM_SIZE(btc) " %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) : ADDR, "Ir" (nr) : "memory"); return oldbit; } static __always_inline bool arch_test_and_change_bit(long nr, volatile unsigned long *addr) { return GEN_BINARY_RMWcc(LOCK_PREFIX __ASM_SIZE(btc), *addr, c, "Ir", nr); } static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr) { return ((1UL << (nr & (BITS_PER_LONG-1))) & (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } static __always_inline bool constant_test_bit_acquire(long nr, const volatile unsigned long *addr) { bool oldbit; asm volatile("testb %2,%1" CC_SET(nz) : CC_OUT(nz) (oldbit) : "m" (((unsigned char *)addr)[nr >> 3]), "i" (1 << (nr & 7)) :"memory"); return oldbit; } static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr) { bool oldbit; asm volatile(__ASM_SIZE(bt) " %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory"); return oldbit; } static __always_inline bool arch_test_bit(unsigned long nr, const volatile unsigned long *addr) { return __builtin_constant_p(nr) ? constant_test_bit(nr, addr) : variable_test_bit(nr, addr); } static __always_inline bool arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr) { return __builtin_constant_p(nr) ? constant_test_bit_acquire(nr, addr) : variable_test_bit(nr, addr); } static __always_inline unsigned long variable__ffs(unsigned long word) { asm("tzcnt %1,%0" : "=r" (word) : ASM_INPUT_RM (word)); return word; } /** * __ffs - find first set bit in word * @word: The word to search * * Undefined if no bit exists, so code should check against 0 first. */ #define __ffs(word) \ (__builtin_constant_p(word) ? \ (unsigned long)__builtin_ctzl(word) : \ variable__ffs(word)) static __always_inline unsigned long variable_ffz(unsigned long word) { return variable__ffs(~word); } /** * ffz - find first zero bit in word * @word: The word to search * * Undefined if no zero exists, so code should check against ~0UL first. */ #define ffz(word) \ (__builtin_constant_p(word) ? \ (unsigned long)__builtin_ctzl(~word) : \ variable_ffz(word)) /* * __fls: find last set bit in word * @word: The word to search * * Undefined if no set bit exists, so code should check against 0 first. */ static __always_inline unsigned long __fls(unsigned long word) { if (__builtin_constant_p(word)) return BITS_PER_LONG - 1 - __builtin_clzl(word); asm("bsr %1,%0" : "=r" (word) : ASM_INPUT_RM (word)); return word; } #undef ADDR #ifdef __KERNEL__ static __always_inline int variable_ffs(int x) { int r; #ifdef CONFIG_X86_64 /* * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the * dest reg is undefined if x==0, but their CPU architect says its * value is written to set it to the same as before, except that the * top 32 bits will be cleared. * * We cannot do this on 32 bits because at the very least some * 486 CPUs did not behave this way. 
*/ asm("bsfl %1,%0" : "=r" (r) : ASM_INPUT_RM (x), "0" (-1)); #elif defined(CONFIG_X86_CMOV) asm("bsfl %1,%0\n\t" "cmovzl %2,%0" : "=&r" (r) : "rm" (x), "r" (-1)); #else asm("bsfl %1,%0\n\t" "jnz 1f\n\t" "movl $-1,%0\n" "1:" : "=r" (r) : "rm" (x)); #endif return r + 1; } /** * ffs - find first set bit in word * @x: the word to search * * This is defined the same way as the libc and compiler builtin ffs * routines, therefore differs in spirit from the other bitops. * * ffs(value) returns 0 if value is 0 or the position of the first * set bit if value is nonzero. The first (least significant) bit * is at position 1. */ #define ffs(x) (__builtin_constant_p(x) ? __builtin_ffs(x) : variable_ffs(x)) /** * fls - find last set bit in word * @x: the word to search * * This is defined in a similar way as the libc and compiler builtin * ffs, but returns the position of the most significant set bit. * * fls(value) returns 0 if value is 0 or the position of the last * set bit if value is nonzero. The last (most significant) bit is * at position 32. */ static __always_inline int fls(unsigned int x) { int r; if (__builtin_constant_p(x)) return x ? 32 - __builtin_clz(x) : 0; #ifdef CONFIG_X86_64 /* * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the * dest reg is undefined if x==0, but their CPU architect says its * value is written to set it to the same as before, except that the * top 32 bits will be cleared. * * We cannot do this on 32 bits because at the very least some * 486 CPUs did not behave this way. */ asm("bsrl %1,%0" : "=r" (r) : ASM_INPUT_RM (x), "0" (-1)); #elif defined(CONFIG_X86_CMOV) asm("bsrl %1,%0\n\t" "cmovzl %2,%0" : "=&r" (r) : "rm" (x), "rm" (-1)); #else asm("bsrl %1,%0\n\t" "jnz 1f\n\t" "movl $-1,%0\n" "1:" : "=r" (r) : "rm" (x)); #endif return r + 1; } /** * fls64 - find last set bit in a 64-bit word * @x: the word to search * * This is defined in a similar way as the libc and compiler builtin * ffsll, but returns the position of the most significant set bit. * * fls64(value) returns 0 if value is 0 or the position of the last * set bit if value is nonzero. The last (most significant) bit is * at position 64. */ #ifdef CONFIG_X86_64 static __always_inline int fls64(__u64 x) { int bitpos = -1; if (__builtin_constant_p(x)) return x ? 64 - __builtin_clzll(x) : 0; /* * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the * dest reg is undefined if x==0, but their CPU architect says its * value is written to set it to the same as before. */ asm("bsrq %1,%q0" : "+r" (bitpos) : ASM_INPUT_RM (x)); return bitpos + 1; } #else #include <asm-generic/bitops/fls64.h> #endif #include <asm-generic/bitops/sched.h> #include <asm/arch_hweight.h> #include <asm-generic/bitops/const_hweight.h> #include <asm-generic/bitops/instrumented-atomic.h> #include <asm-generic/bitops/instrumented-non-atomic.h> #include <asm-generic/bitops/instrumented-lock.h> #include <asm-generic/bitops/le.h> #include <asm-generic/bitops/ext2-atomic-setbit.h> #endif /* __KERNEL__ */ #endif /* _ASM_X86_BITOPS_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_DESC_H
#define _ASM_X86_DESC_H

#include <asm/desc_defs.h>
#include <asm/ldt.h>
#include <asm/mmu.h>
#include <asm/fixmap.h>
#include <asm/irq_vectors.h>
#include <asm/cpu_entry_area.h>

#include <linux/debug_locks.h>
#include <linux/smp.h>
#include <linux/percpu.h>

static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info)
{
    desc->limit0 = info->limit & 0x0ffff;

    desc->base0 = (info->base_addr & 0x0000ffff);
    desc->base1 = (info->base_addr & 0x00ff0000) >> 16;

    desc->type = (info->read_exec_only ^ 1) << 1;
    desc->type |= info->contents << 2;
    /* Set the ACCESS bit so it can be mapped RO */
    desc->type |= 1;

    desc->s = 1;
    desc->dpl = 0x3;
    desc->p = info->seg_not_present ^ 1;
    desc->limit1 = (info->limit & 0xf0000) >> 16;
    desc->avl = info->useable;
    desc->d = info->seg_32bit;
    desc->g = info->limit_in_pages;

    desc->base2 = (info->base_addr & 0xff000000) >> 24;
    /*
     * Don't allow setting of the lm bit. It would confuse
     * user_64bit_mode and would get overridden by sysret anyway.
*/ desc->l = 0; } struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; } __attribute__((aligned(PAGE_SIZE))); DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); /* Provide the original GDT */ static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu) { return per_cpu(gdt_page, cpu).gdt; } /* Provide the current original GDT */ static inline struct desc_struct *get_current_gdt_rw(void) { return this_cpu_ptr(&gdt_page)->gdt; } /* Provide the fixmap address of the remapped GDT */ static inline struct desc_struct *get_cpu_gdt_ro(int cpu) { return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt; } /* Provide the current read-only GDT */ static inline struct desc_struct *get_current_gdt_ro(void) { return get_cpu_gdt_ro(smp_processor_id()); } /* Provide the physical address of the GDT page. */ static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu) { return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu)); } static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, unsigned dpl, unsigned ist, unsigned seg) { gate->offset_low = (u16) func; gate->bits.p = 1; gate->bits.dpl = dpl; gate->bits.zero = 0; gate->bits.type = type; gate->offset_middle = (u16) (func >> 16); #ifdef CONFIG_X86_64 gate->segment = __KERNEL_CS; gate->bits.ist = ist; gate->reserved = 0; gate->offset_high = (u32) (func >> 32); #else gate->segment = seg; gate->bits.ist = 0; #endif } static inline int desc_empty(const void *ptr) { const u32 *desc = ptr; return !(desc[0] | desc[1]); } #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else #define load_TR_desc() native_load_tr_desc() #define load_gdt(dtr) native_load_gdt(dtr) #define load_idt(dtr) native_load_idt(dtr) #define load_tr(tr) asm volatile("ltr %0"::"m" (tr)) #define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt)) #define store_gdt(dtr) native_store_gdt(dtr) #define store_tr(tr) (tr = native_store_tr()) #define load_TLS(t, cpu) native_load_tls(t, cpu) #define set_ldt native_set_ldt #define write_ldt_entry(dt, entry, desc) native_write_ldt_entry(dt, entry, desc) #define write_gdt_entry(dt, entry, desc, type) native_write_gdt_entry(dt, entry, desc, type) #define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g) static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) { } static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries) { } #endif /* CONFIG_PARAVIRT_XXL */ #define store_ldt(ldt) asm("sldt %0" : "=m"(ldt)) static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate) { memcpy(&idt[entry], gate, sizeof(*gate)); } static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc) { memcpy(&ldt[entry], desc, 8); } static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int type) { unsigned int size; switch (type) { case DESC_TSS: size = sizeof(tss_desc); break; case DESC_LDT: size = sizeof(ldt_desc); break; default: size = sizeof(*gdt); break; } memcpy(&gdt[entry], desc, size); } static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size) { struct ldttss_desc *desc = d; memset(desc, 0, sizeof(*desc)); desc->limit0 = (u16) size; desc->base0 = (u16) addr; desc->base1 = (addr >> 16) & 0xFF; desc->type = type; desc->p = 1; desc->limit1 = (size >> 16) & 0xF; desc->base2 = (addr >> 24) & 0xFF; #ifdef CONFIG_X86_64 desc->base3 = (u32) (addr >> 32); #endif } static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct 
x86_hw_tss *addr) { struct desc_struct *d = get_cpu_gdt_rw(cpu); tss_desc tss; set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS, __KERNEL_TSS_LIMIT); write_gdt_entry(d, entry, &tss, DESC_TSS); } #define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr) static inline void native_set_ldt(const void *addr, unsigned int entries) { if (likely(entries == 0)) asm volatile("lldt %w0"::"q" (0)); else { unsigned cpu = smp_processor_id(); ldt_desc ldt; set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT, entries * LDT_ENTRY_SIZE - 1); write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_LDT, &ldt, DESC_LDT); asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); } } static inline void native_load_gdt(const struct desc_ptr *dtr) { asm volatile("lgdt %0"::"m" (*dtr)); } static __always_inline void native_load_idt(const struct desc_ptr *dtr) { asm volatile("lidt %0"::"m" (*dtr)); } static inline void native_store_gdt(struct desc_ptr *dtr) { asm volatile("sgdt %0":"=m" (*dtr)); } static inline void store_idt(struct desc_ptr *dtr) { asm volatile("sidt %0":"=m" (*dtr)); } static inline void native_gdt_invalidate(void) { const struct desc_ptr invalid_gdt = { .address = 0, .size = 0 }; native_load_gdt(&invalid_gdt); } static inline void native_idt_invalidate(void) { const struct desc_ptr invalid_idt = { .address = 0, .size = 0 }; native_load_idt(&invalid_idt); } /* * The LTR instruction marks the TSS GDT entry as busy. On 64-bit, the GDT is * a read-only remapping. To prevent a page fault, the GDT is switched to the * original writeable version when needed. */ #ifdef CONFIG_X86_64 static inline void native_load_tr_desc(void) { struct desc_ptr gdt; int cpu = raw_smp_processor_id(); bool restore = 0; struct desc_struct *fixmap_gdt; native_store_gdt(&gdt); fixmap_gdt = get_cpu_gdt_ro(cpu); /* * If the current GDT is the read-only fixmap, swap to the original * writeable version. Swap back at the end. */ if (gdt.address == (unsigned long)fixmap_gdt) { load_direct_gdt(cpu); restore = 1; } asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); if (restore) load_fixmap_gdt(cpu); } #else static inline void native_load_tr_desc(void) { asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); } #endif static inline unsigned long native_store_tr(void) { unsigned long tr; asm volatile("str %0":"=r" (tr)); return tr; } static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) { struct desc_struct *gdt = get_cpu_gdt_rw(cpu); unsigned int i; for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; } DECLARE_PER_CPU(bool, __tss_limit_invalid); static inline void force_reload_TR(void) { struct desc_struct *d = get_current_gdt_rw(); tss_desc tss; memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc)); /* * LTR requires an available TSS, and the TSS is currently * busy. Make it be available so that LTR will work. */ tss.type = DESC_TSS; write_gdt_entry(d, GDT_ENTRY_TSS, &tss, DESC_TSS); load_TR_desc(); this_cpu_write(__tss_limit_invalid, false); } /* * Call this if you need the TSS limit to be correct, which should be the case * if and only if you have TIF_IO_BITMAP set or you're switching to a task * with TIF_IO_BITMAP set. */ static inline void refresh_tss_limit(void) { DEBUG_LOCKS_WARN_ON(preemptible()); if (unlikely(this_cpu_read(__tss_limit_invalid))) force_reload_TR(); } /* * If you do something evil that corrupts the cached TSS limit (I'm looking * at you, VMX exits), call this function. 
* * The optimization here is that the TSS limit only matters for Linux if the * IO bitmap is in use. If the TSS limit gets forced to its minimum value, * everything works except that IO bitmap will be ignored and all CPL 3 IO * instructions will #GP, which is exactly what we want for normal tasks. */ static inline void invalidate_tss_limit(void) { DEBUG_LOCKS_WARN_ON(preemptible()); if (unlikely(test_thread_flag(TIF_IO_BITMAP))) force_reload_TR(); else this_cpu_write(__tss_limit_invalid, true); } /* This intentionally ignores lm, since 32-bit apps don't have that field. */ #define LDT_empty(info) \ ((info)->base_addr == 0 && \ (info)->limit == 0 && \ (info)->contents == 0 && \ (info)->read_exec_only == 1 && \ (info)->seg_32bit == 0 && \ (info)->limit_in_pages == 0 && \ (info)->seg_not_present == 1 && \ (info)->useable == 0) /* Lots of programs expect an all-zero user_desc to mean "no segment at all". */ static inline bool LDT_zero(const struct user_desc *info) { return (info->base_addr == 0 && info->limit == 0 && info->contents == 0 && info->read_exec_only == 0 && info->seg_32bit == 0 && info->limit_in_pages == 0 && info->seg_not_present == 0 && info->useable == 0); } static inline void clear_LDT(void) { set_ldt(NULL, 0); } static inline unsigned long get_desc_base(const struct desc_struct *desc) { return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24)); } static inline void set_desc_base(struct desc_struct *desc, unsigned long base) { desc->base0 = base & 0xffff; desc->base1 = (base >> 16) & 0xff; desc->base2 = (base >> 24) & 0xff; } static inline unsigned long get_desc_limit(const struct desc_struct *desc) { return desc->limit0 | (desc->limit1 << 16); } static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) { desc->limit0 = limit & 0xffff; desc->limit1 = (limit >> 16) & 0xf; } static inline void init_idt_data(struct idt_data *data, unsigned int n, const void *addr) { BUG_ON(n > 0xFF); memset(data, 0, sizeof(*data)); data->vector = n; data->addr = addr; data->segment = __KERNEL_CS; data->bits.type = GATE_INTERRUPT; data->bits.p = 1; } static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d) { unsigned long addr = (unsigned long) d->addr; gate->offset_low = (u16) addr; gate->segment = (u16) d->segment; gate->bits = d->bits; gate->offset_middle = (u16) (addr >> 16); #ifdef CONFIG_X86_64 gate->offset_high = (u32) (addr >> 32); gate->reserved = 0; #endif } extern unsigned long system_vectors[]; extern void load_current_idt(void); extern void idt_setup_early_handler(void); extern void idt_setup_early_traps(void); extern void idt_setup_traps(void); extern void idt_setup_apic_and_irq_gates(void); extern bool idt_is_f00f_address(unsigned long address); #ifdef CONFIG_X86_64 extern void idt_setup_early_pf(void); #else static inline void idt_setup_early_pf(void) { } #endif extern void idt_invalidate(void); #endif /* _ASM_X86_DESC_H */
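/*
 * Editor's illustration (plain userspace C, not the real struct desc_struct):
 * a 32-bit segment base is scattered across three descriptor fields of
 * 16 + 8 + 8 bits, the same split performed by set_desc_base() and
 * reassembled by get_desc_base() above. demo_* names are stand-ins.
 */
#include <stdint.h>
#include <stdio.h>
#include <assert.h>

struct demo_desc {
    uint16_t base0;     /* base bits  0..15 */
    uint8_t  base1;     /* base bits 16..23 */
    uint8_t  base2;     /* base bits 24..31 */
};

static void demo_set_base(struct demo_desc *d, uint32_t base)
{
    d->base0 = base & 0xffff;
    d->base1 = (base >> 16) & 0xff;
    d->base2 = (base >> 24) & 0xff;
}

static uint32_t demo_get_base(const struct demo_desc *d)
{
    return d->base0 | ((uint32_t)d->base1 << 16) | ((uint32_t)d->base2 << 24);
}

int main(void)
{
    struct demo_desc d;

    demo_set_base(&d, 0xdeadbeef);
    assert(demo_get_base(&d) == 0xdeadbeefu);   /* round-trips losslessly */
    printf("base = 0x%08x\n", (unsigned)demo_get_base(&d));
    return 0;
}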
/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
/*
 * include/uapi/linux/tipc_config.h: Header for TIPC configuration interface
 *
 * Copyright (c) 2003-2006, Ericsson AB
 * Copyright (c) 2005-2007, 2010-2011, Wind River Systems
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * Alternatively, this software may be distributed under the terms of the
 * GNU General Public License ("GPL") version 2 as published by the Free
 * Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
*/ #ifndef _LINUX_TIPC_CONFIG_H_ #define _LINUX_TIPC_CONFIG_H_ #include <linux/types.h> #include <linux/string.h> #include <linux/tipc.h> #include <asm/byteorder.h> /* * Configuration * * All configuration management messaging involves sending a request message * to the TIPC configuration service on a node, which sends a reply message * back. (In the future multi-message replies may be supported.) * * Both request and reply messages consist of a transport header and payload. * The transport header contains info about the desired operation; * the payload consists of zero or more type/length/value (TLV) items * which specify parameters or results for the operation. * * For many operations, the request and reply messages have a fixed number * of TLVs (usually zero or one); however, some reply messages may return * a variable number of TLVs. A failed request is denoted by the presence * of an "error string" TLV in the reply message instead of the TLV(s) the * reply should contain if the request succeeds. */ /* * Public commands: * May be issued by any process. * Accepted by own node, or by remote node only if remote management enabled. */ #define TIPC_CMD_NOOP 0x0000 /* tx none, rx none */ #define TIPC_CMD_GET_NODES 0x0001 /* tx net_addr, rx node_info(s) */ #define TIPC_CMD_GET_MEDIA_NAMES 0x0002 /* tx none, rx media_name(s) */ #define TIPC_CMD_GET_BEARER_NAMES 0x0003 /* tx none, rx bearer_name(s) */ #define TIPC_CMD_GET_LINKS 0x0004 /* tx net_addr, rx link_info(s) */ #define TIPC_CMD_SHOW_NAME_TABLE 0x0005 /* tx name_tbl_query, rx ultra_string */ #define TIPC_CMD_SHOW_PORTS 0x0006 /* tx none, rx ultra_string */ #define TIPC_CMD_SHOW_LINK_STATS 0x000B /* tx link_name, rx ultra_string */ #define TIPC_CMD_SHOW_STATS 0x000F /* tx unsigned, rx ultra_string */ /* * Protected commands: * May only be issued by "network administration capable" process. * Accepted by own node, or by remote node only if remote management enabled * and this node is zone manager. */ #define TIPC_CMD_GET_REMOTE_MNG 0x4003 /* tx none, rx unsigned */ #define TIPC_CMD_GET_MAX_PORTS 0x4004 /* tx none, rx unsigned */ #define TIPC_CMD_GET_MAX_PUBL 0x4005 /* obsoleted */ #define TIPC_CMD_GET_MAX_SUBSCR 0x4006 /* obsoleted */ #define TIPC_CMD_GET_MAX_ZONES 0x4007 /* obsoleted */ #define TIPC_CMD_GET_MAX_CLUSTERS 0x4008 /* obsoleted */ #define TIPC_CMD_GET_MAX_NODES 0x4009 /* obsoleted */ #define TIPC_CMD_GET_MAX_SLAVES 0x400A /* obsoleted */ #define TIPC_CMD_GET_NETID 0x400B /* tx none, rx unsigned */ #define TIPC_CMD_ENABLE_BEARER 0x4101 /* tx bearer_config, rx none */ #define TIPC_CMD_DISABLE_BEARER 0x4102 /* tx bearer_name, rx none */ #define TIPC_CMD_SET_LINK_TOL 0x4107 /* tx link_config, rx none */ #define TIPC_CMD_SET_LINK_PRI 0x4108 /* tx link_config, rx none */ #define TIPC_CMD_SET_LINK_WINDOW 0x4109 /* tx link_config, rx none */ #define TIPC_CMD_SET_LOG_SIZE 0x410A /* obsoleted */ #define TIPC_CMD_DUMP_LOG 0x410B /* obsoleted */ #define TIPC_CMD_RESET_LINK_STATS 0x410C /* tx link_name, rx none */ /* * Private commands: * May only be issued by "network administration capable" process. * Accepted by own node only; cannot be used on a remote node. 
*/ #define TIPC_CMD_SET_NODE_ADDR 0x8001 /* tx net_addr, rx none */ #define TIPC_CMD_SET_REMOTE_MNG 0x8003 /* tx unsigned, rx none */ #define TIPC_CMD_SET_MAX_PORTS 0x8004 /* tx unsigned, rx none */ #define TIPC_CMD_SET_MAX_PUBL 0x8005 /* obsoleted */ #define TIPC_CMD_SET_MAX_SUBSCR 0x8006 /* obsoleted */ #define TIPC_CMD_SET_MAX_ZONES 0x8007 /* obsoleted */ #define TIPC_CMD_SET_MAX_CLUSTERS 0x8008 /* obsoleted */ #define TIPC_CMD_SET_MAX_NODES 0x8009 /* obsoleted */ #define TIPC_CMD_SET_MAX_SLAVES 0x800A /* obsoleted */ #define TIPC_CMD_SET_NETID 0x800B /* tx unsigned, rx none */ /* * Reserved commands: * May not be issued by any process. * Used internally by TIPC. */ #define TIPC_CMD_NOT_NET_ADMIN 0xC001 /* tx none, rx none */ /* * TLV types defined for TIPC */ #define TIPC_TLV_NONE 0 /* no TLV present */ #define TIPC_TLV_VOID 1 /* empty TLV (0 data bytes)*/ #define TIPC_TLV_UNSIGNED 2 /* 32-bit integer */ #define TIPC_TLV_STRING 3 /* char[128] (max) */ #define TIPC_TLV_LARGE_STRING 4 /* char[2048] (max) */ #define TIPC_TLV_ULTRA_STRING 5 /* char[32768] (max) */ #define TIPC_TLV_ERROR_STRING 16 /* char[128] containing "error code" */ #define TIPC_TLV_NET_ADDR 17 /* 32-bit integer denoting <Z.C.N> */ #define TIPC_TLV_MEDIA_NAME 18 /* char[TIPC_MAX_MEDIA_NAME] */ #define TIPC_TLV_BEARER_NAME 19 /* char[TIPC_MAX_BEARER_NAME] */ #define TIPC_TLV_LINK_NAME 20 /* char[TIPC_MAX_LINK_NAME] */ #define TIPC_TLV_NODE_INFO 21 /* struct tipc_node_info */ #define TIPC_TLV_LINK_INFO 22 /* struct tipc_link_info */ #define TIPC_TLV_BEARER_CONFIG 23 /* struct tipc_bearer_config */ #define TIPC_TLV_LINK_CONFIG 24 /* struct tipc_link_config */ #define TIPC_TLV_NAME_TBL_QUERY 25 /* struct tipc_name_table_query */ #define TIPC_TLV_PORT_REF 26 /* 32-bit port reference */ /* * Link priority limits (min, default, max, media default) */ #define TIPC_MIN_LINK_PRI 0 #define TIPC_DEF_LINK_PRI 10 #define TIPC_MAX_LINK_PRI 31 #define TIPC_MEDIA_LINK_PRI (TIPC_MAX_LINK_PRI + 1) /* * Link tolerance limits (min, default, max), in ms */ #define TIPC_MIN_LINK_TOL 50 #define TIPC_DEF_LINK_TOL 1500 #define TIPC_MAX_LINK_TOL 30000 #if (TIPC_MIN_LINK_TOL < 16) #error "TIPC_MIN_LINK_TOL is too small (abort limit may be NaN)" #endif /* * Link window limits (min, default, max), in packets */ #define TIPC_MIN_LINK_WIN 16 #define TIPC_DEF_LINK_WIN 50 #define TIPC_MAX_LINK_WIN 8191 /* * Default MTU for UDP media */ #define TIPC_DEF_LINK_UDP_MTU 14000 struct tipc_node_info { __be32 addr; /* network address of node */ __be32 up; /* 0=down, 1= up */ }; struct tipc_link_info { __be32 dest; /* network address of peer node */ __be32 up; /* 0=down, 1=up */ char str[TIPC_MAX_LINK_NAME]; /* link name */ }; struct tipc_bearer_config { __be32 priority; /* Range [1,31]. Override per link */ __be32 disc_domain; /* <Z.C.N> describing desired nodes */ char name[TIPC_MAX_BEARER_NAME]; }; struct tipc_link_config { __be32 value; char name[TIPC_MAX_LINK_NAME]; }; #define TIPC_NTQ_ALLTYPES 0x80000000 struct tipc_name_table_query { __be32 depth; /* 1:type, 2:+name info, 3:+port info, 4+:+debug info */ __be32 type; /* {t,l,u} info ignored if high bit of "depth" is set */ __be32 lowbound; /* (i.e. displays all entries of name table) */ __be32 upbound; }; /* * The error string TLV is a null-terminated string describing the cause * of the request failure. To simplify error processing (and to save space) * the first character of the string can be a special error code character * (lying by the range 0x80 to 0xFF) which represents a pre-defined reason. 
*/ #define TIPC_CFG_TLV_ERROR "\x80" /* request contains incorrect TLV(s) */ #define TIPC_CFG_NOT_NET_ADMIN "\x81" /* must be network administrator */ #define TIPC_CFG_NOT_ZONE_MSTR "\x82" /* must be zone master */ #define TIPC_CFG_NO_REMOTE "\x83" /* remote management not enabled */ #define TIPC_CFG_NOT_SUPPORTED "\x84" /* request is not supported by TIPC */ #define TIPC_CFG_INVALID_VALUE "\x85" /* request has invalid argument value */ /* * A TLV consists of a descriptor, followed by the TLV value. * TLV descriptor fields are stored in network byte order; * TLV values must also be stored in network byte order (where applicable). * TLV descriptors must be aligned to addresses which are multiple of 4, * so up to 3 bytes of padding may exist at the end of the TLV value area. * There must not be any padding between the TLV descriptor and its value. */ struct tlv_desc { __be16 tlv_len; /* TLV length (descriptor + value) */ __be16 tlv_type; /* TLV identifier */ }; #define TLV_ALIGNTO 4 #define TLV_ALIGN(datalen) (((datalen)+(TLV_ALIGNTO-1)) & ~(TLV_ALIGNTO-1)) #define TLV_LENGTH(datalen) (sizeof(struct tlv_desc) + (datalen)) #define TLV_SPACE(datalen) (TLV_ALIGN(TLV_LENGTH(datalen))) #define TLV_DATA(tlv) ((void *)((char *)(tlv) + TLV_LENGTH(0))) static inline int TLV_OK(const void *tlv, __u16 space) { /* * Would also like to check that "tlv" is a multiple of 4, * but don't know how to do this in a portable way. * - Tried doing (!(tlv & (TLV_ALIGNTO-1))), but GCC compiler * won't allow binary "&" with a pointer. * - Tried casting "tlv" to integer type, but causes warning about size * mismatch when pointer is bigger than chosen type (int, long, ...). */ return (space >= TLV_SPACE(0)) && (__be16_to_cpu(((struct tlv_desc *)tlv)->tlv_len) <= space); } static inline int TLV_CHECK(const void *tlv, __u16 space, __u16 exp_type) { return TLV_OK(tlv, space) && (__be16_to_cpu(((struct tlv_desc *)tlv)->tlv_type) == exp_type); } static inline int TLV_GET_LEN(struct tlv_desc *tlv) { return __be16_to_cpu(tlv->tlv_len); } static inline void TLV_SET_LEN(struct tlv_desc *tlv, __u16 len) { tlv->tlv_len = __cpu_to_be16(len); } static inline int TLV_CHECK_TYPE(struct tlv_desc *tlv, __u16 type) { return (__be16_to_cpu(tlv->tlv_type) == type); } static inline void TLV_SET_TYPE(struct tlv_desc *tlv, __u16 type) { tlv->tlv_type = __cpu_to_be16(type); } static inline int TLV_SET(void *tlv, __u16 type, void *data, __u16 len) { struct tlv_desc *tlv_ptr; int tlv_len; tlv_len = TLV_LENGTH(len); tlv_ptr = (struct tlv_desc *)tlv; tlv_ptr->tlv_type = __cpu_to_be16(type); tlv_ptr->tlv_len = __cpu_to_be16(tlv_len); if (len && data) { memcpy(TLV_DATA(tlv_ptr), data, len); memset((char *)TLV_DATA(tlv_ptr) + len, 0, TLV_SPACE(len) - tlv_len); } return TLV_SPACE(len); } /* * A TLV list descriptor simplifies processing of messages * containing multiple TLVs. 
*/ struct tlv_list_desc { struct tlv_desc *tlv_ptr; /* ptr to current TLV */ __u32 tlv_space; /* # bytes from curr TLV to list end */ }; static inline void TLV_LIST_INIT(struct tlv_list_desc *list, void *data, __u32 space) { list->tlv_ptr = (struct tlv_desc *)data; list->tlv_space = space; } static inline int TLV_LIST_EMPTY(struct tlv_list_desc *list) { return (list->tlv_space == 0); } static inline int TLV_LIST_CHECK(struct tlv_list_desc *list, __u16 exp_type) { return TLV_CHECK(list->tlv_ptr, list->tlv_space, exp_type); } static inline void *TLV_LIST_DATA(struct tlv_list_desc *list) { return TLV_DATA(list->tlv_ptr); } static inline void TLV_LIST_STEP(struct tlv_list_desc *list) { __u16 tlv_space = TLV_ALIGN(__be16_to_cpu(list->tlv_ptr->tlv_len)); list->tlv_ptr = (struct tlv_desc *)((char *)list->tlv_ptr + tlv_space); list->tlv_space -= tlv_space; } /* * Configuration messages exchanged via NETLINK_GENERIC use the following * family id, name, version and command. */ #define TIPC_GENL_NAME "TIPC" #define TIPC_GENL_VERSION 0x1 #define TIPC_GENL_CMD 0x1 /* * TIPC specific header used in NETLINK_GENERIC requests. */ struct tipc_genlmsghdr { __u32 dest; /* Destination address */ __u16 cmd; /* Command */ __u16 reserved; /* Unused */ }; #define TIPC_GENL_HDRLEN NLMSG_ALIGN(sizeof(struct tipc_genlmsghdr)) /* * Configuration messages exchanged via TIPC sockets use the TIPC configuration * message header, which is defined below. This structure is analogous * to the Netlink message header, but fields are stored in network byte order * and no padding is permitted between the header and the message data * that follows. */ struct tipc_cfg_msg_hdr { __be32 tcm_len; /* Message length (including header) */ __be16 tcm_type; /* Command type */ __be16 tcm_flags; /* Additional flags */ char tcm_reserved[8]; /* Unused */ }; #define TCM_F_REQUEST 0x1 /* Flag: Request message */ #define TCM_F_MORE 0x2 /* Flag: Message to be continued */ #define TCM_ALIGN(datalen) (((datalen)+3) & ~3) #define TCM_LENGTH(datalen) (sizeof(struct tipc_cfg_msg_hdr) + datalen) #define TCM_SPACE(datalen) (TCM_ALIGN(TCM_LENGTH(datalen))) #define TCM_DATA(tcm_hdr) ((void *)((char *)(tcm_hdr) + TCM_LENGTH(0))) static inline int TCM_SET(void *msg, __u16 cmd, __u16 flags, void *data, __u16 data_len) { struct tipc_cfg_msg_hdr *tcm_hdr; int msg_len; msg_len = TCM_LENGTH(data_len); tcm_hdr = (struct tipc_cfg_msg_hdr *)msg; tcm_hdr->tcm_len = __cpu_to_be32(msg_len); tcm_hdr->tcm_type = __cpu_to_be16(cmd); tcm_hdr->tcm_flags = __cpu_to_be16(flags); if (data_len && data) { memcpy(TCM_DATA(msg), data, data_len); memset((char *)TCM_DATA(msg) + data_len, 0, TCM_SPACE(data_len) - msg_len); } return TCM_SPACE(data_len); } #endif
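/*
 * Editor's illustration of the TLV size arithmetic defined above: the
 * descriptor is 4 bytes, tlv_len covers descriptor plus unpadded value, and
 * TLV_SPACE() rounds the total up to a 4-byte boundary, so a 5-byte value
 * ends up with 3 bytes of trailing padding. Standalone userspace demo; the
 * macros are re-declared locally (DEMO_ prefix) so no TIPC headers are needed.
 */
#include <stdio.h>

#define DEMO_TLV_DESC_SIZE  4u      /* sizeof(struct tlv_desc) */
#define DEMO_TLV_ALIGNTO    4u
#define DEMO_TLV_ALIGN(n)   (((n) + (DEMO_TLV_ALIGNTO - 1)) & ~(DEMO_TLV_ALIGNTO - 1))
#define DEMO_TLV_LENGTH(n)  (DEMO_TLV_DESC_SIZE + (n))
#define DEMO_TLV_SPACE(n)   DEMO_TLV_ALIGN(DEMO_TLV_LENGTH(n))

int main(void)
{
    unsigned int datalen = 5;   /* e.g. a 5-byte value */

    printf("TLV_LENGTH(5) = %u\n", DEMO_TLV_LENGTH(datalen));   /* 9  */
    printf("TLV_SPACE(5)  = %u\n", DEMO_TLV_SPACE(datalen));    /* 12 */
    return 0;
}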
// SPDX-License-Identifier: GPL-2.0-only

#include <linux/export.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
#include <linux/user_namespace.h>
#include <linux/proc_ns.h>
#include <linux/highuid.h>
#include <linux/cred.h>
#include <linux/securebits.h>
#include <linux/security.h>
#include <linux/keyctl.h>
#include <linux/key-type.h>
#include <keys/user-type.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/projid.h>
#include <linux/fs_struct.h>
#include <linux/bsearch.h>
#include <linux/sort.h>

static struct kmem_cache *user_ns_cachep __ro_after_init;
static DEFINE_MUTEX(userns_state_mutex);

static bool new_idmap_permitted(const struct file *file,
                                struct user_namespace *ns, int cap_setid,
                                struct uid_gid_map
*map); static void free_user_ns(struct work_struct *work); static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) { return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); } static void dec_user_namespaces(struct ucounts *ucounts) { return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); } static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { /* Start with the same capabilities as init but useless for doing * anything as the capabilities are bound to the new user namespace. */ cred->securebits = SECUREBITS_DEFAULT; cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); cred->request_key_auth = NULL; #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ cred->user_ns = user_ns; } static unsigned long enforced_nproc_rlimit(void) { unsigned long limit = RLIM_INFINITY; /* Is RLIMIT_NPROC currently enforced? */ if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) || (current_user_ns() != &init_user_ns)) limit = rlimit(RLIMIT_NPROC); return limit; } /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the * new namespace. * * This is called by copy_creds(), which will finish setting the target task's * credentials. */ int create_user_ns(struct cred *new) { struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; struct ucounts *ucounts; int ret, i; ret = -ENOSPC; if (parent_ns->level > 32) goto fail; ucounts = inc_user_namespaces(parent_ns, owner); if (!ucounts) goto fail; /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, * by verifying that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ ret = -EPERM; if (current_chrooted()) goto fail_dec; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. */ ret = -EPERM; if (!kuid_has_mapping(parent_ns, owner) || !kgid_has_mapping(parent_ns, group)) goto fail_dec; ret = security_create_user_ns(new); if (ret < 0) goto fail_dec; ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); ret = ns_alloc_inum(&ns->ns); if (ret) goto fail_free; ns->ns.ops = &userns_operations; refcount_set(&ns->ns.count, 1); /* Leave the new->user_ns reference with the new user namespace. 
*/ ns->parent = parent_ns; ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); for (i = 0; i < UCOUNT_COUNTS; i++) { ns->ucount_max[i] = INT_MAX; } set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); #ifdef CONFIG_KEYS INIT_LIST_HEAD(&ns->keyring_name_list); init_rwsem(&ns->keyring_sem); #endif ret = -ENOMEM; if (!setup_userns_sysctls(ns)) goto fail_keyring; set_cred_user_ns(new, ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif ns_free_inum(&ns->ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: dec_user_namespaces(ucounts); fail: return ret; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { struct cred *cred; int err = -ENOMEM; if (!(unshare_flags & CLONE_NEWUSER)) return 0; cred = prepare_creds(); if (cred) { err = create_user_ns(cred); if (err) put_cred(cred); else *new_cred = cred; } return err; } static void free_user_ns(struct work_struct *work) { struct user_namespace *parent, *ns = container_of(work, struct user_namespace, work); do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); } if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->uid_map.forward); kfree(ns->uid_map.reverse); } if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->projid_map.forward); kfree(ns->projid_map.reverse); } #if IS_ENABLED(CONFIG_BINFMT_MISC) kfree(ns->binfmt_misc); #endif retire_userns_sysctls(ns); key_free_user_ns(ns); ns_free_inum(&ns->ns); kmem_cache_free(user_ns_cachep, ns); dec_user_namespaces(ucounts); ns = parent; } while (refcount_dec_and_test(&parent->ns.count)); } void __put_user_ns(struct user_namespace *ns) { schedule_work(&ns->work); } EXPORT_SYMBOL(__put_user_ns); /* * struct idmap_key - holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ u32 count; }; /* * cmp_map_id - Function to be passed to bsearch() to find the requested * idmapping. Expects struct idmap_key to be passed via @k. */ static int cmp_map_id(const void *k, const void *e) { u32 first, last, id2; const struct idmap_key *key = k; const struct uid_gid_extent *el = e; id2 = key->id + key->count - 1; /* handle map_id_{down,up}() */ if (key->map_up) first = el->lower_first; else first = el->first; last = first + el->count - 1; if (key->id >= first && key->id <= last && (id2 >= first && id2 <= last)) return 0; if (key->id < first || id2 < first) return -1; return 1; } /* * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. 
*/ static struct uid_gid_extent * map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = false; key.count = count; key.id = id; return bsearch(&key, map->forward, extents, sizeof(struct uid_gid_extent), cmp_map_id); } /* * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_down_base(extents, map, id, count); else extent = map_id_range_down_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->first) + extent->lower_first; else id = (u32) -1; return id; } u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } /* * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_up_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } /* * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_up_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = true; key.count = count; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_up_base(extents, map, id, count); else extent = map_id_range_up_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->lower_first) + extent->first; else id = (u32) -1; return id; } u32 map_id_up(struct uid_gid_map *map, u32 id) { return map_id_range_up(map, id, 1); } /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in * @uid: User identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). 
*/ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) { /* Map the uid to a global kernel uid */ return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); } EXPORT_SYMBOL(make_kuid); /** * from_kuid - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * If @kuid has no mapping in @targ (uid_t)-1 is returned. */ uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->uid_map, __kuid_val(kuid)); } EXPORT_SYMBOL(from_kuid); /** * from_kuid_munged - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kuid from_kuid_munged never fails and always * returns a valid uid. This makes from_kuid_munged appropriate * for use in syscalls like stat and getuid where failing the * system call and failing to provide a valid uid are not an * options. * * If @kuid has no mapping in @targ overflowuid is returned. */ uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) { uid_t uid; uid = from_kuid(targ, kuid); if (uid == (uid_t) -1) uid = overflowuid; return uid; } EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. * * When there is no mapping defined for the user-namespace gid * pair INVALID_GID is returned. Callers are expected to test * for and handle INVALID_GID being returned. INVALID_GID may be * tested for using gid_valid(). */ kgid_t make_kgid(struct user_namespace *ns, gid_t gid) { /* Map the gid to a global kernel gid */ return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); } EXPORT_SYMBOL(make_kgid); /** * from_kgid - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * If @kgid has no mapping in @targ (gid_t)-1 is returned. */ gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) { /* Map the gid from a global kernel gid */ return map_id_up(&targ->gid_map, __kgid_val(kgid)); } EXPORT_SYMBOL(from_kgid); /** * from_kgid_munged - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kgid from_kgid_munged never fails and always * returns a valid gid. This makes from_kgid_munged appropriate * for use in syscalls like stat and getgid where failing the * system call and failing to provide a valid gid are not options. * * If @kgid has no mapping in @targ overflowgid is returned. 
*/ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) { gid_t gid; gid = from_kgid(targ, kgid); if (gid == (gid_t) -1) gid = overflowgid; return gid; } EXPORT_SYMBOL(from_kgid_munged); /** * make_kprojid - Map a user-namespace projid pair into a kprojid. * @ns: User namespace that the projid is in * @projid: Project identifier * * Maps a user-namespace projid pair into a kernel internal kprojid, * and returns that kprojid. * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) { /* Map the projid to a global kernel projid */ return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); } EXPORT_SYMBOL(make_kprojid); /** * from_kprojid - Create a projid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal project identifier to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * If @kprojid has no mapping in @targ (projid_t)-1 is returned. */ projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) { /* Map the projid from a global kernel projid */ return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); } EXPORT_SYMBOL(from_kprojid); /** * from_kprojid_munged - Create a projid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal projid to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kprojid from_kprojid_munged never fails and always * returns a valid projid. This makes from_kprojid_munged * appropriate for use in syscalls like stat where * failing the system call and failing to provide a valid projid are * not options. * * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. 
*/ projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) { projid_t projid; projid = from_kprojid(targ, kprojid); if (projid == (projid_t) -1) projid = OVERFLOW_PROJID; return projid; } EXPORT_SYMBOL(from_kprojid_munged); static int uid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; uid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int gid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; gid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int projid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; projid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { loff_t pos = *ppos; unsigned extents = map->nr_extents; smp_rmb(); if (pos >= extents) return NULL; if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return &map->extent[pos]; return &map->forward[pos]; } static void *uid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->uid_map); } static void *gid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->gid_map); } static void *projid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->projid_map); } static void *m_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return seq->op->start(seq, pos); } static void m_stop(struct seq_file *seq, void *v) { return; } const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, .show = projid_m_show, }; static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) { u32 upper_first, lower_first, upper_last, lower_last; unsigned idx; upper_first = extent->first; lower_first = extent->lower_first; upper_last = upper_first + extent->count - 1; lower_last = lower_first + extent->count - 1; for (idx = 0; idx < new_map->nr_extents; idx++) { u32 prev_upper_first, prev_lower_first; u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) prev = &new_map->extent[idx]; else prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; prev_upper_last = prev_upper_first + 
prev->count - 1; prev_lower_last = prev_lower_first + prev->count - 1; /* Does the upper range intersect a previous extent? */ if ((prev_upper_first <= upper_last) && (prev_upper_last >= upper_first)) return true; /* Does the lower range intersect a previous extent? */ if ((prev_lower_first <= lower_last) && (prev_lower_last >= lower_first)) return true; } return false; } /* * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. * Takes care to allocate a 4K block of memory if the number of mappings exceeds * UID_GID_MAP_MAX_BASE_EXTENTS. */ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) { struct uid_gid_extent *dest; if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { struct uid_gid_extent *forward; /* Allocate memory for 340 mappings. */ forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!forward) return -ENOMEM; /* Copy over memory. Only set up memory for the forward pointer. * Defer the memory setup for the reverse pointer. */ memcpy(forward, map->extent, map->nr_extents * sizeof(map->extent[0])); map->forward = forward; map->reverse = NULL; } if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) dest = &map->extent[map->nr_extents]; else dest = &map->forward[map->nr_extents]; *dest = *extent; map->nr_extents++; return 0; } /* cmp function to sort() forward mappings */ static int cmp_extents_forward(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->first < e2->first) return -1; if (e1->first > e2->first) return 1; return 0; } /* cmp function to sort() reverse mappings */ static int cmp_extents_reverse(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->lower_first < e2->lower_first) return -1; if (e1->lower_first > e2->lower_first) return 1; return 0; } /* * sort_idmaps - Sorts an array of idmap entries. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static int sort_idmaps(struct uid_gid_map *map) { if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return 0; /* Sort forward array. */ sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_forward, NULL); /* Only copy the memory from forward we actually need. */ map->reverse = kmemdup_array(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!map->reverse) return -ENOMEM; /* Sort reverse array. */ sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_reverse, NULL); return 0; } /** * verify_root_map() - check the uid 0 mapping * @file: idmapping file * @map_ns: user namespace of the target process * @new_map: requested idmap * * If a process requests mapping parent uid 0 into the new ns, verify that the * process writing the map had the CAP_SETFCAP capability as the target process * will be able to write fscaps that are valid in ancestor user namespaces. * * Return: true if the mapping is allowed, false if not. 
*/ static bool verify_root_map(const struct file *file, struct user_namespace *map_ns, struct uid_gid_map *new_map) { int idx; const struct user_namespace *file_ns = file->f_cred->user_ns; struct uid_gid_extent *extent0 = NULL; for (idx = 0; idx < new_map->nr_extents; idx++) { if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent0 = &new_map->extent[idx]; else extent0 = &new_map->forward[idx]; if (extent0->lower_first == 0) break; extent0 = NULL; } if (!extent0) return true; if (map_ns == file_ns) { /* The process unshared its ns and is writing to its own * /proc/self/uid_map. User already has full capabilites in * the new namespace. Verify that the parent had CAP_SETFCAP * when it unshared. * */ if (!file_ns->parent_could_setfcap) return false; } else { /* Process p1 is writing to uid_map of p2, who is in a child * user namespace to p1's. Verify that the opener of the map * file has CAP_SETFCAP against the parent of the new map * namespace */ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP)) return false; } return true; } static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, struct uid_gid_map *map, struct uid_gid_map *parent_map) { struct seq_file *seq = file->private_data; struct user_namespace *map_ns = seq->private; struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; char *kbuf, *pos, *next_line; ssize_t ret; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * * An id map fits within 1 cache line on most architectures. * * On read nothing needs to be done unless you are on an * architecture with a crazy cache coherency model like alpha. * * There is a one time data dependency between reading the * count of the extents and the values of the extents. The * desired behavior is to see the values of the extents that * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write * order and smp_rmb() is guaranteed that we don't have crazy * architectures returning stale data. */ mutex_lock(&userns_state_mutex); memset(&new_map, 0, sizeof(struct uid_gid_map)); ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) goto out; /* * Adjusting namespace settings requires capabilities on the target. 
*/ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN)) goto out; /* Parse the user data */ ret = -EINVAL; pos = kbuf; for (; pos; pos = next_line) { /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } pos = skip_spaces(pos); extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; /* Verify we have been given valid starting values */ if ((extent.first == (u32) -1) || (extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ if ((extent.first + extent.count) <= extent.first) goto out; if ((extent.lower_first + extent.count) <= extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ if (mappings_overlap(&new_map, &extent)) goto out; if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; ret = insert_extent(&new_map, &extent); if (ret < 0) goto out; ret = -EINVAL; } /* Be very certain the new map actually exists */ if (new_map.nr_extents == 0) goto out; ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map)) goto out; ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. */ for (idx = 0; idx < new_map.nr_extents; idx++) { struct uid_gid_extent *e; u32 lower_first; if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) e = &new_map.extent[idx]; else e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, e->lower_first, e->count); /* Fail if we can not map the specified extent to * the kernel global id space. */ if (lower_first == (u32) -1) goto out; e->lower_first = lower_first; } /* * If we want to use binary search for lookup, this clones the extent * array and sorts both copies. 
*/ ret = sort_idmaps(&new_map); if (ret < 0) goto out; /* Install the map */ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { memcpy(map->extent, new_map.extent, new_map.nr_extents * sizeof(new_map.extent[0])); } else { map->forward = new_map.forward; map->reverse = new_map.reverse; } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(new_map.forward); kfree(new_map.reverse); map->forward = NULL; map->reverse = NULL; map->nr_extents = 0; } mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; } ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; /* Anyone can set any valid project id no capability needed */ return map_write(file, buf, size, ppos, -1, &ns->projid_map, &ns->parent->projid_map); } static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { const struct cred *cred = file->f_cred; if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map)) return false; /* Don't allow mappings that would allow anything that wouldn't * be allowed without the establishment of unprivileged mappings. */ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, cred->euid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && gid_eq(gid, cred->egid)) return true; } } /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. * And the opener of the id file also has the appropriate capability. */ if (ns_capable(ns->parent, cap_setid) && file_ns_capable(file, ns->parent, cap_setid)) return true; return false; } int proc_setgroups_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; unsigned long userns_flags = READ_ONCE(ns->flags); seq_printf(seq, "%s\n", (userns_flags & USERNS_SETGROUPS_ALLOWED) ? 
"allow" : "deny"); return 0; } ssize_t proc_setgroups_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; char kbuf[8], *pos; bool setgroups_allowed; ssize_t ret; /* Only allow a very narrow range of strings to be written */ ret = -EINVAL; if ((*ppos != 0) || (count >= sizeof(kbuf))) goto out; /* What was written? */ ret = -EFAULT; if (copy_from_user(kbuf, buf, count)) goto out; kbuf[count] = '\0'; pos = kbuf; /* What is being requested? */ ret = -EINVAL; if (strncmp(pos, "allow", 5) == 0) { pos += 5; setgroups_allowed = true; } else if (strncmp(pos, "deny", 4) == 0) { pos += 4; setgroups_allowed = false; } else goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; ret = -EPERM; mutex_lock(&userns_state_mutex); if (setgroups_allowed) { /* Enabling setgroups after setgroups has been disabled * is not allowed. */ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) goto out_unlock; } else { /* Permanently disabling setgroups after setgroups has * been enabled by writing the gid_map is not allowed. */ if (ns->gid_map.nr_extents != 0) goto out_unlock; ns->flags &= ~USERNS_SETGROUPS_ALLOWED; } mutex_unlock(&userns_state_mutex); /* Report a successful write */ *ppos = count; ret = count; out: return ret; out_unlock: mutex_unlock(&userns_state_mutex); goto out; } bool userns_may_setgroups(const struct user_namespace *ns) { bool allowed; mutex_lock(&userns_state_mutex); /* It is not safe to use setgroups until a gid mapping in * the user namespace has been established. */ allowed = ns->gid_map.nr_extents != 0; /* Is setgroups allowed? */ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); mutex_unlock(&userns_state_mutex); return allowed; } /* * Returns true if @child is the same namespace or a descendant of * @ancestor. */ bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { const struct user_namespace *ns; for (ns = child; ns->level > ancestor->level; ns = ns->parent) ; return (ns == ancestor); } bool current_in_userns(const struct user_namespace *target_ns) { return in_userns(target_ns, current_user_ns()); } EXPORT_SYMBOL(current_in_userns); static inline struct user_namespace *to_user_ns(struct ns_common *ns) { return container_of(ns, struct user_namespace, ns); } static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; rcu_read_lock(); user_ns = get_user_ns(__task_cred(task)->user_ns); rcu_read_unlock(); return user_ns ? &user_ns->ns : NULL; } static void userns_put(struct ns_common *ns) { put_user_ns(to_user_ns(ns)); } static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; /* Don't allow gaining capabilities by reentering * the same user namespace. 
*/ if (user_ns == current_user_ns()) return -EINVAL; /* Tasks that share a thread group must share a user namespace */ if (!thread_group_empty(current)) return -EINVAL; if (current->fs->users != 1) return -EINVAL; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; cred = nsset_cred(nsset); if (!cred) return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); if (set_cred_ucounts(cred) < 0) return -EINVAL; return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) { struct user_namespace *my_user_ns = current_user_ns(); struct user_namespace *owner, *p; /* See if the owner is in the current user namespace */ owner = p = ns->ops->owner(ns); for (;;) { if (!p) return ERR_PTR(-EPERM); if (p == my_user_ns) break; p = p->parent; } return &get_user_ns(owner)->ns; } static struct user_namespace *userns_owner(struct ns_common *ns) { return to_user_ns(ns)->parent; } const struct proc_ns_operations userns_operations = { .name = "user", .type = CLONE_NEWUSER, .get = userns_get, .put = userns_put, .install = userns_install, .owner = userns_owner, .get_parent = ns_get_owner, }; static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); return 0; } subsys_initcall(user_namespaces_init);
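The id translation implemented by map_id_range_down()/map_id_range_up() above is plain offset arithmetic over extents of the form "first lower_first count", i.e. the lines written to /proc/<pid>/uid_map. The following userspace sketch mirrors the linear lookup of map_id_range_down_base(); the helper names and the example mapping are illustrative only, not kernel API.

/* Illustrative userspace sketch of the extent arithmetic; not kernel code. */
#include <stdint.h>
#include <stdio.h>

struct extent { uint32_t first, lower_first, count; };

/* Return the mapped id, or (uint32_t)-1 when no extent covers it. */
static uint32_t map_down(const struct extent *map, unsigned n, uint32_t id)
{
        unsigned i;

        for (i = 0; i < n; i++) {
                uint32_t first = map[i].first;
                uint32_t last = first + map[i].count - 1;

                if (id >= first && id <= last)
                        return (id - first) + map[i].lower_first;
        }
        return (uint32_t)-1;
}

int main(void)
{
        /* "0 100000 65536" in uid_map: uid 0 inside the ns is kuid 100000. */
        struct extent map[] = { { 0, 100000, 65536 } };

        printf("%u\n", map_down(map, 1, 0));     /* 100000 */
        printf("%u\n", map_down(map, 1, 1000));  /* 101000 */
        printf("%u\n", map_down(map, 1, 70000)); /* 4294967295: unmapped */
        return 0;
}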
// SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer MIDI-through client * Copyright (c) 1999-2000 by Takashi Iwai <tiwai@suse.de> */ #include <linux/init.h> #include <linux/slab.h> #include <linux/module.h> #include <sound/core.h> #include "seq_clientmgr.h" #include <sound/initval.h> #include <sound/asoundef.h> /* Sequencer MIDI-through client This gives a simple midi-through client. All the normal input events are redirected to the output port immediately. The routing can be done via the aconnect program in alsa-utils. Each client has a static client number 14 (= SNDRV_SEQ_CLIENT_DUMMY). If you want to auto-load this module, you may add the following alias in your /etc/conf.modules file. alias snd-seq-client-14 snd-seq-dummy The module is loaded on demand for client 14, or /proc/asound/seq/ is accessed. If you don't need this module to be loaded, alias snd-seq-client-14 as "off". This will help modprobe. The number of ports to be created can be specified via the module parameter "ports". For example, to create four ports, add the following option in a configuration file under /etc/modprobe.d/: options snd-seq-dummy ports=4 The module option "duplex=1" enables duplex operation for the port. In duplex mode, a pair of ports is created instead of a single port, and events are tunneled between the paired ports. For example, input to port A is sent to the output of its paired port B and vice versa. In duplex mode, each port has the DUPLEX capability. 
*/ MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>"); MODULE_DESCRIPTION("ALSA sequencer MIDI-through client"); MODULE_LICENSE("GPL"); MODULE_ALIAS("snd-seq-client-" __stringify(SNDRV_SEQ_CLIENT_DUMMY)); static int ports = 1; static bool duplex; module_param(ports, int, 0444); MODULE_PARM_DESC(ports, "number of ports to be created"); module_param(duplex, bool, 0444); MODULE_PARM_DESC(duplex, "create DUPLEX ports"); #if IS_ENABLED(CONFIG_SND_SEQ_UMP) static int ump; module_param(ump, int, 0444); MODULE_PARM_DESC(ump, "UMP conversion (0: no convert, 1: MIDI 1.0, 2: MIDI 2.0)"); #endif struct snd_seq_dummy_port { int client; int port; int duplex; int connect; }; static int my_client = -1; /* * event input callback - just redirect events to subscribers */ static int dummy_input(struct snd_seq_event *ev, int direct, void *private_data, int atomic, int hop) { struct snd_seq_dummy_port *p; struct snd_seq_event tmpev; p = private_data; if (ev->source.client == SNDRV_SEQ_CLIENT_SYSTEM || ev->type == SNDRV_SEQ_EVENT_KERNEL_ERROR) return 0; /* ignore system messages */ tmpev = *ev; if (p->duplex) tmpev.source.port = p->connect; else tmpev.source.port = p->port; tmpev.dest.client = SNDRV_SEQ_ADDRESS_SUBSCRIBERS; return snd_seq_kernel_client_dispatch(p->client, &tmpev, atomic, hop); } /* * free_private callback */ static void dummy_free(void *private_data) { kfree(private_data); } /* * create a port */ static struct snd_seq_dummy_port __init * create_port(int idx, int type) { struct snd_seq_port_info pinfo; struct snd_seq_port_callback pcb; struct snd_seq_dummy_port *rec; rec = kzalloc(sizeof(*rec), GFP_KERNEL); if (!rec) return NULL; rec->client = my_client; rec->duplex = duplex; rec->connect = 0; memset(&pinfo, 0, sizeof(pinfo)); pinfo.addr.client = my_client; if (duplex) sprintf(pinfo.name, "Midi Through Port-%d:%c", idx, (type ? 
'B' : 'A')); else sprintf(pinfo.name, "Midi Through Port-%d", idx); pinfo.capability = SNDRV_SEQ_PORT_CAP_READ | SNDRV_SEQ_PORT_CAP_SUBS_READ; pinfo.capability |= SNDRV_SEQ_PORT_CAP_WRITE | SNDRV_SEQ_PORT_CAP_SUBS_WRITE; if (duplex) pinfo.capability |= SNDRV_SEQ_PORT_CAP_DUPLEX; pinfo.direction = SNDRV_SEQ_PORT_DIR_BIDIRECTION; pinfo.type = SNDRV_SEQ_PORT_TYPE_MIDI_GENERIC | SNDRV_SEQ_PORT_TYPE_SOFTWARE | SNDRV_SEQ_PORT_TYPE_PORT; memset(&pcb, 0, sizeof(pcb)); pcb.owner = THIS_MODULE; pcb.event_input = dummy_input; pcb.private_free = dummy_free; pcb.private_data = rec; pinfo.kernel = &pcb; if (snd_seq_kernel_client_ctl(my_client, SNDRV_SEQ_IOCTL_CREATE_PORT, &pinfo) < 0) { kfree(rec); return NULL; } rec->port = pinfo.addr.port; return rec; } /* * register client and create ports */ static int __init register_client(void) { struct snd_seq_dummy_port *rec1, *rec2; #if IS_ENABLED(CONFIG_SND_SEQ_UMP) struct snd_seq_client *client; #endif int i; if (ports < 1) { pr_err("ALSA: seq_dummy: invalid number of ports %d\n", ports); return -EINVAL; } /* create client */ my_client = snd_seq_create_kernel_client(NULL, SNDRV_SEQ_CLIENT_DUMMY, "Midi Through"); if (my_client < 0) return my_client; #if IS_ENABLED(CONFIG_SND_SEQ_UMP) client = snd_seq_kernel_client_get(my_client); if (!client) return -EINVAL; switch (ump) { case 1: client->midi_version = SNDRV_SEQ_CLIENT_UMP_MIDI_1_0; break; case 2: client->midi_version = SNDRV_SEQ_CLIENT_UMP_MIDI_2_0; break; default: /* don't convert events but just pass-through */ client->filter = SNDRV_SEQ_FILTER_NO_CONVERT; break; } snd_seq_kernel_client_put(client); #endif /* create ports */ for (i = 0; i < ports; i++) { rec1 = create_port(i, 0); if (rec1 == NULL) { snd_seq_delete_kernel_client(my_client); return -ENOMEM; } if (duplex) { rec2 = create_port(i, 1); if (rec2 == NULL) { snd_seq_delete_kernel_client(my_client); return -ENOMEM; } rec1->connect = rec2->port; rec2->connect = rec1->port; } } return 0; } /* * delete client if exists */ static void __exit delete_client(void) { if (my_client >= 0) snd_seq_delete_kernel_client(my_client); } /* * Init part */ static int __init alsa_seq_dummy_init(void) { return register_client(); } static void __exit alsa_seq_dummy_exit(void) { delete_client(); } module_init(alsa_seq_dummy_init) module_exit(alsa_seq_dummy_exit)
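As a usage illustration only (not part of this driver), a small alsa-lib client can create an output port and subscribe it to the Midi Through client registered above. This sketch assumes alsa-lib's snd_seq API and that the through client is reachable at client 14, port 0, as described in the header comment.

/* Hedged userspace sketch: subscribe our port to Midi Through 14:0. */
#include <alsa/asoundlib.h>
#include <stdio.h>

int main(void)
{
        snd_seq_t *seq;
        int port;

        if (snd_seq_open(&seq, "default", SND_SEQ_OPEN_OUTPUT, 0) < 0) {
                fprintf(stderr, "cannot open sequencer\n");
                return 1;
        }
        snd_seq_set_client_name(seq, "through-demo");
        port = snd_seq_create_simple_port(seq, "out",
                        SND_SEQ_PORT_CAP_READ | SND_SEQ_PORT_CAP_SUBS_READ,
                        SND_SEQ_PORT_TYPE_MIDI_GENERIC |
                        SND_SEQ_PORT_TYPE_APPLICATION);
        if (port < 0 || snd_seq_connect_to(seq, port, 14, 0) < 0)
                fprintf(stderr, "cannot connect to Midi Through 14:0\n");
        snd_seq_close(seq);
        return 0;
}

The same subscription can be made from the shell with aconnect, e.g. "aconnect <client>:<port> 14:0".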
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2012 Nicira, Inc. */ #include <linux/netdevice.h> #include <net/genetlink.h> #include <net/netns/generic.h> #include "datapath.h" #include "vport-internal_dev.h" #include "vport-netdev.h" static void dp_detach_port_notify(struct vport *vport) { struct sk_buff *notify; struct datapath *dp; dp = vport->dp; notify = ovs_vport_cmd_build_info(vport, ovs_dp_get_net(dp), 0, 0, OVS_VPORT_CMD_DEL); ovs_dp_detach_port(vport); if (IS_ERR(notify)) { genl_set_err(&dp_vport_genl_family, ovs_dp_get_net(dp), 0, 0, PTR_ERR(notify)); return; } genlmsg_multicast_netns(&dp_vport_genl_family, ovs_dp_get_net(dp), notify, 0, 0, GFP_KERNEL); } void ovs_dp_notify_wq(struct work_struct *work) { struct ovs_net *ovs_net = container_of(work, struct ovs_net, dp_notify_work); struct datapath *dp; ovs_lock(); list_for_each_entry(dp, &ovs_net->dps, list_node) { int i; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { struct vport *vport; struct hlist_node *n; hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) { if (vport->ops->type == OVS_VPORT_TYPE_INTERNAL) continue; if (!(netif_is_ovs_port(vport->dev))) dp_detach_port_notify(vport); } } } ovs_unlock(); } static int dp_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct ovs_net *ovs_net; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vport *vport = NULL; if (!ovs_is_internal_dev(dev)) vport = ovs_netdev_get_vport(dev); if (!vport) return NOTIFY_DONE; if (event == NETDEV_UNREGISTER) { /* upper_dev_unlink and decrement promisc immediately */ ovs_netdev_detach_dev(vport); /* schedule vport destroy, dev_put and genl notification */ ovs_net = net_generic(dev_net(dev), ovs_net_id); queue_work(system_wq, &ovs_net->dp_notify_work); } return NOTIFY_DONE; } struct notifier_block ovs_dp_device_notifier = { .notifier_call = dp_device_event };
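dp_device_event() only detaches the netdev in the notifier itself and queues ovs_dp_notify_wq() to do the vport destroy and netlink notification later, under ovs_lock(). A minimal standalone sketch of that same register-a-netdevice-notifier-and-defer-the-work pattern is shown below; it is an illustrative example module, not OVS code, and all names in it are hypothetical.

/* Minimal sketch of a netdevice notifier that defers heavy work. */
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/workqueue.h>

static void demo_cleanup(struct work_struct *work)
{
        pr_info("deferred cleanup after NETDEV_UNREGISTER\n");
}
static DECLARE_WORK(demo_work, demo_cleanup);

static int demo_netdev_event(struct notifier_block *nb,
                             unsigned long event, void *ptr)
{
        struct net_device *dev = netdev_notifier_info_to_dev(ptr);

        if (event == NETDEV_UNREGISTER) {
                pr_info("%s is going away\n", dev->name);
                schedule_work(&demo_work);      /* do the heavy work later */
        }
        return NOTIFY_DONE;
}

static struct notifier_block demo_notifier = {
        .notifier_call = demo_netdev_event,
};

static int __init demo_init(void)
{
        return register_netdevice_notifier(&demo_notifier);
}

static void __exit demo_exit(void)
{
        unregister_netdevice_notifier(&demo_notifier);
        flush_work(&demo_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");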
// SPDX-License-Identifier: GPL-2.0 /* * linux/lib/kasprintf.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/stdarg.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> /* Simplified asprintf. */ char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap) { unsigned int first, second; char *p; va_list aq; va_copy(aq, ap); first = vsnprintf(NULL, 0, fmt, aq); va_end(aq); p = kmalloc_track_caller(first+1, gfp); if (!p) return NULL; second = vsnprintf(p, first+1, fmt, ap); WARN(first != second, "different return values (%u and %u) from vsnprintf(\"%s\", ...)", first, second, fmt); return p; } EXPORT_SYMBOL(kvasprintf); /* * If fmt contains no % (or is exactly %s), use kstrdup_const. If fmt * (or the sole vararg) points to rodata, we will then save a memory * allocation and string copy. In any case, the return value should be * freed using kfree_const(). */ const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list ap) { if (!strchr(fmt, '%')) return kstrdup_const(fmt, gfp); if (!strcmp(fmt, "%s")) return kstrdup_const(va_arg(ap, const char*), gfp); return kvasprintf(gfp, fmt, ap); } EXPORT_SYMBOL(kvasprintf_const); char *kasprintf(gfp_t gfp, const char *fmt, ...) { va_list ap; char *p; va_start(ap, fmt); p = kvasprintf(gfp, fmt, ap); va_end(ap); return p; } EXPORT_SYMBOL(kasprintf);
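kvasprintf() sizes its buffer with a first vsnprintf(NULL, 0, ...) pass over a copied va_list and then formats into an allocation of exactly that size. The same two-pass idiom in plain userspace C is sketched below; the helper name is illustrative, not the kernel function.

/* Userspace sketch of the two-pass vsnprintf sizing used by kvasprintf(). */
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

static char *xasprintf(const char *fmt, ...)
{
        va_list ap, aq;
        int len;
        char *p;

        va_start(ap, fmt);
        va_copy(aq, ap);
        len = vsnprintf(NULL, 0, fmt, aq);      /* first pass: measure */
        va_end(aq);

        p = len < 0 ? NULL : malloc((size_t)len + 1);
        if (p)
                vsnprintf(p, (size_t)len + 1, fmt, ap); /* second pass: fill */
        va_end(ap);
        return p;
}

int main(void)
{
        char *s = xasprintf("%s-%d", "demo", 42);

        if (s) {
                puts(s);        /* prints: demo-42 */
                free(s);
        }
        return 0;
}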
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/ioctl.c * * Copyright (C) 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) */ #include <linux/fs.h> #include <linux/capability.h> #include <linux/time.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/file.h> #include <linux/quotaops.h> #include <linux/random.h> #include <linux/uaccess.h> #include <linux/delay.h> #include <linux/iversion.h> #include <linux/fileattr.h> #include <linux/uuid.h> #include "ext4_jbd2.h" #include "ext4.h" #include <linux/fsmap.h> #include "fsmap.h" #include <trace/events/ext4.h> typedef void ext4_update_sb_callback(struct ext4_super_block *es, const void *arg); /* * Superblock modification callback function for changing file system * label */ static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg) { /* Sanity check, this should never happen */ BUILD_BUG_ON(sizeof(es->s_volume_name) < EXT4_LABEL_MAX); memcpy(es->s_volume_name, (char *)arg, EXT4_LABEL_MAX); } /* * Superblock modification callback function for changing file system * UUID. */ static void ext4_sb_setuuid(struct ext4_super_block *es, const void *arg) { memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE); } static int ext4_update_primary_sb(struct super_block *sb, handle_t *handle, ext4_update_sb_callback func, const void *arg) { int err = 0; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh = sbi->s_sbh; struct ext4_super_block *es = sbi->s_es; trace_ext4_update_sb(sb, bh->b_blocknr, 1); BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) goto out_err; lock_buffer(bh); func(es, arg); ext4_superblock_csum_set(sb); unlock_buffer(bh); if (buffer_write_io_error(bh) || !buffer_uptodate(bh)) { ext4_msg(sbi->s_sb, KERN_ERR, "previous I/O error to " "superblock detected"); clear_buffer_write_io_error(bh); set_buffer_uptodate(bh); } err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) goto out_err; err = sync_dirty_buffer(bh); out_err: ext4_std_error(sb, err); return err; } /* * Update one backup superblock in the group 'grp' using the callback * function 'func' and argument 'arg'. If the handle is NULL the * modification is not journalled. 
* * Returns: 0 when no modification was done (no superblock in the group) * 1 when the modification was successful * <0 on error */ static int ext4_update_backup_sb(struct super_block *sb, handle_t *handle, ext4_group_t grp, ext4_update_sb_callback func, const void *arg) { int err = 0; ext4_fsblk_t sb_block; struct buffer_head *bh; unsigned long offset = 0; struct ext4_super_block *es; if (!ext4_bg_has_super(sb, grp)) return 0; /* * For the group 0 there is always 1k padding, so we have * either adjust offset, or sb_block depending on blocksize */ if (grp == 0) { sb_block = 1 * EXT4_MIN_BLOCK_SIZE; offset = do_div(sb_block, sb->s_blocksize); } else { sb_block = ext4_group_first_block_no(sb, grp); offset = 0; } trace_ext4_update_sb(sb, sb_block, handle ? 1 : 0); bh = ext4_sb_bread(sb, sb_block, 0); if (IS_ERR(bh)) return PTR_ERR(bh); if (handle) { BUFFER_TRACE(bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) goto out_bh; } es = (struct ext4_super_block *) (bh->b_data + offset); lock_buffer(bh); if (ext4_has_feature_metadata_csum(sb) && es->s_checksum != ext4_superblock_csum(es)) { ext4_msg(sb, KERN_ERR, "Invalid checksum for backup " "superblock %llu", sb_block); unlock_buffer(bh); goto out_bh; } func(es, arg); if (ext4_has_feature_metadata_csum(sb)) es->s_checksum = ext4_superblock_csum(es); set_buffer_uptodate(bh); unlock_buffer(bh); if (handle) { err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) goto out_bh; } else { BUFFER_TRACE(bh, "marking dirty"); mark_buffer_dirty(bh); } err = sync_dirty_buffer(bh); out_bh: brelse(bh); ext4_std_error(sb, err); return (err) ? err : 1; } /* * Update primary and backup superblocks using the provided function * func and argument arg. * * Only the primary superblock and at most two backup superblock * modifications are journalled; the rest is modified without journal. * This is safe because e2fsck will re-write them if there is a problem, * and we're very unlikely to ever need more than two backups. */ static int ext4_update_superblocks_fn(struct super_block *sb, ext4_update_sb_callback func, const void *arg) { handle_t *handle; ext4_group_t ngroups; unsigned int three = 1; unsigned int five = 5; unsigned int seven = 7; int err = 0, ret, i; ext4_group_t grp, primary_grp; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * We can't update superblocks while the online resize is running */ if (test_and_set_bit_lock(EXT4_FLAGS_RESIZING, &sbi->s_ext4_flags)) { ext4_msg(sb, KERN_ERR, "Can't modify superblock while" "performing online resize"); return -EBUSY; } /* * We're only going to update primary superblock and two * backup superblocks in this transaction. */ handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 3); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out; } /* Update primary superblock */ err = ext4_update_primary_sb(sb, handle, func, arg); if (err) { ext4_msg(sb, KERN_ERR, "Failed to update primary " "superblock"); goto out_journal; } primary_grp = ext4_get_group_number(sb, sbi->s_sbh->b_blocknr); ngroups = ext4_get_groups_count(sb); /* * Update backup superblocks. 
We have to start from group 0 * because it might not be where the primary superblock is * if the fs is mounted with -o sb=<backup_sb_block> */ i = 0; grp = 0; while (grp < ngroups) { /* Skip primary superblock */ if (grp == primary_grp) goto next_grp; ret = ext4_update_backup_sb(sb, handle, grp, func, arg); if (ret < 0) { /* Ignore bad checksum; try to update next sb */ if (ret == -EFSBADCRC) goto next_grp; err = ret; goto out_journal; } i += ret; if (handle && i > 1) { /* * We're only journalling primary superblock and * two backup superblocks; the rest is not * journalled. */ err = ext4_journal_stop(handle); if (err) goto out; handle = NULL; } next_grp: grp = ext4_list_backups(sb, &three, &five, &seven); } out_journal: if (handle) { ret = ext4_journal_stop(handle); if (ret && !err) err = ret; } out: clear_bit_unlock(EXT4_FLAGS_RESIZING, &sbi->s_ext4_flags); smp_mb__after_atomic(); return err ? err : 0; } /* * Swap memory between @a and @b for @len bytes. * * @a: pointer to first memory area * @b: pointer to second memory area * @len: number of bytes to swap * */ static void memswap(void *a, void *b, size_t len) { unsigned char *ap, *bp; ap = (unsigned char *)a; bp = (unsigned char *)b; while (len-- > 0) { swap(*ap, *bp); ap++; bp++; } } /* * Swap i_data and associated attributes between @inode1 and @inode2. * This function is used for the primary swap between inode1 and inode2 * and also to revert this primary swap in case of errors. * * Therefore you have to make sure, that calling this method twice * will revert all changes. * * @inode1: pointer to first inode * @inode2: pointer to second inode */ static void swap_inode_data(struct inode *inode1, struct inode *inode2) { loff_t isize; struct ext4_inode_info *ei1; struct ext4_inode_info *ei2; unsigned long tmp; struct timespec64 ts1, ts2; ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); swap(inode1->i_version, inode2->i_version); ts1 = inode_get_atime(inode1); ts2 = inode_get_atime(inode2); inode_set_atime_to_ts(inode1, ts2); inode_set_atime_to_ts(inode2, ts1); ts1 = inode_get_mtime(inode1); ts2 = inode_get_mtime(inode2); inode_set_mtime_to_ts(inode1, ts2); inode_set_mtime_to_ts(inode2, ts1); memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP; ei1->i_flags = (ei2->i_flags & EXT4_FL_SHOULD_SWAP) | (ei1->i_flags & ~EXT4_FL_SHOULD_SWAP); ei2->i_flags = tmp | (ei2->i_flags & ~EXT4_FL_SHOULD_SWAP); swap(ei1->i_disksize, ei2->i_disksize); ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); isize = i_size_read(inode1); i_size_write(inode1, i_size_read(inode2)); i_size_write(inode2, isize); } void ext4_reset_inode_seed(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); __u32 csum; if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } /* * Swap the information from the given @inode and the inode * EXT4_BOOT_LOADER_INO. It will basically swap i_data and all other * important fields of the inodes. 
* * @sb: the super block of the filesystem * @idmap: idmap of the mount the inode was found from * @inode: the inode to swap with EXT4_BOOT_LOADER_INO * */ static long swap_inode_boot_loader(struct super_block *sb, struct mnt_idmap *idmap, struct inode *inode) { handle_t *handle; int err; struct inode *inode_bl; struct ext4_inode_info *ei_bl; qsize_t size, size_bl, diff; blkcnt_t blocks; unsigned short bytes; inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL | EXT4_IGET_BAD); if (IS_ERR(inode_bl)) return PTR_ERR(inode_bl); ei_bl = EXT4_I(inode_bl); /* Protect orig inodes against a truncate and make sure, * that only 1 swap_inode_boot_loader is running. */ lock_two_nondirectories(inode, inode_bl); if (inode->i_nlink != 1 || !S_ISREG(inode->i_mode) || IS_SWAPFILE(inode) || IS_ENCRYPTED(inode) || (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) || ext4_has_inline_data(inode)) { err = -EINVAL; goto journal_err_out; } if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || !inode_owner_or_capable(idmap, inode) || !capable(CAP_SYS_ADMIN)) { err = -EPERM; goto journal_err_out; } filemap_invalidate_lock(inode->i_mapping); err = filemap_write_and_wait(inode->i_mapping); if (err) goto err_out; err = filemap_write_and_wait(inode_bl->i_mapping); if (err) goto err_out; /* Wait for all existing dio workers */ inode_dio_wait(inode); inode_dio_wait(inode_bl); truncate_inode_pages(&inode->i_data, 0); truncate_inode_pages(&inode_bl->i_data, 0); handle = ext4_journal_start(inode_bl, EXT4_HT_MOVE_EXTENTS, 2); if (IS_ERR(handle)) { err = -EINVAL; goto err_out; } ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_SWAP_BOOT, handle); /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(inode, inode_bl); if (is_bad_inode(inode_bl) || !S_ISREG(inode_bl->i_mode)) { /* this inode has never been used as a BOOT_LOADER */ set_nlink(inode_bl, 1); i_uid_write(inode_bl, 0); i_gid_write(inode_bl, 0); inode_bl->i_flags = 0; ei_bl->i_flags = 0; inode_set_iversion(inode_bl, 1); i_size_write(inode_bl, 0); EXT4_I(inode_bl)->i_disksize = inode_bl->i_size; inode_bl->i_mode = S_IFREG; if (ext4_has_feature_extents(sb)) { ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS); ext4_ext_tree_init(handle, inode_bl); } else memset(ei_bl->i_data, 0, sizeof(ei_bl->i_data)); } err = dquot_initialize(inode); if (err) goto err_out1; size = (qsize_t)(inode->i_blocks) * (1 << 9) + inode->i_bytes; size_bl = (qsize_t)(inode_bl->i_blocks) * (1 << 9) + inode_bl->i_bytes; diff = size - size_bl; swap_inode_data(inode, inode_bl); inode_set_ctime_current(inode); inode_set_ctime_current(inode_bl); inode_inc_iversion(inode); inode->i_generation = get_random_u32(); inode_bl->i_generation = get_random_u32(); ext4_reset_inode_seed(inode); ext4_reset_inode_seed(inode_bl); ext4_discard_preallocations(inode); err = ext4_mark_inode_dirty(handle, inode); if (err < 0) { /* No need to update quota information. */ ext4_warning(inode->i_sb, "couldn't mark inode #%lu dirty (err %d)", inode->i_ino, err); /* Revert all changes: */ swap_inode_data(inode, inode_bl); ext4_mark_inode_dirty(handle, inode); goto err_out1; } blocks = inode_bl->i_blocks; bytes = inode_bl->i_bytes; inode_bl->i_blocks = inode->i_blocks; inode_bl->i_bytes = inode->i_bytes; err = ext4_mark_inode_dirty(handle, inode_bl); if (err < 0) { /* No need to update quota information. 
*/ ext4_warning(inode_bl->i_sb, "couldn't mark inode #%lu dirty (err %d)", inode_bl->i_ino, err); goto revert; } /* Bootloader inode should not be counted into quota information. */ if (diff > 0) dquot_free_space(inode, diff); else err = dquot_alloc_space(inode, -1 * diff); if (err < 0) { revert: /* Revert all changes: */ inode_bl->i_blocks = blocks; inode_bl->i_bytes = bytes; swap_inode_data(inode, inode_bl); ext4_mark_inode_dirty(handle, inode); ext4_mark_inode_dirty(handle, inode_bl); } err_out1: ext4_journal_stop(handle); ext4_double_up_write_data_sem(inode, inode_bl); err_out: filemap_invalidate_unlock(inode->i_mapping); journal_err_out: unlock_two_nondirectories(inode, inode_bl); iput(inode_bl); return err; } /* * If immutable is set and we are not clearing it, we're not allowed to change * anything else in the inode. Don't error out if we're only trying to set * immutable on an immutable file. */ static int ext4_ioctl_check_immutable(struct inode *inode, __u32 new_projid, unsigned int flags) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int oldflags = ei->i_flags; if (!(oldflags & EXT4_IMMUTABLE_FL) || !(flags & EXT4_IMMUTABLE_FL)) return 0; if ((oldflags & ~EXT4_IMMUTABLE_FL) != (flags & ~EXT4_IMMUTABLE_FL)) return -EPERM; if (ext4_has_feature_project(inode->i_sb) && __kprojid_val(ei->i_projid) != new_projid) return -EPERM; return 0; } static void ext4_dax_dontcache(struct inode *inode, unsigned int flags) { struct ext4_inode_info *ei = EXT4_I(inode); if (S_ISDIR(inode->i_mode)) return; if (test_opt2(inode->i_sb, DAX_NEVER) || test_opt(inode->i_sb, DAX_ALWAYS)) return; if ((ei->i_flags ^ flags) & EXT4_DAX_FL) d_mark_dontcache(inode); } static bool dax_compatible(struct inode *inode, unsigned int oldflags, unsigned int flags) { /* Allow the DAX flag to be changed on inline directories */ if (S_ISDIR(inode->i_mode)) { flags &= ~EXT4_INLINE_DATA_FL; oldflags &= ~EXT4_INLINE_DATA_FL; } if (flags & EXT4_DAX_FL) { if ((oldflags & EXT4_DAX_MUT_EXCL) || ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS)) { return false; } } if ((flags & EXT4_DAX_MUT_EXCL) && (oldflags & EXT4_DAX_FL)) return false; return true; } static int ext4_ioctl_setflags(struct inode *inode, unsigned int flags) { struct ext4_inode_info *ei = EXT4_I(inode); handle_t *handle = NULL; int err = -EPERM, migrate = 0; struct ext4_iloc iloc; unsigned int oldflags, mask, i; struct super_block *sb = inode->i_sb; /* Is it quota file? Do not allow user to mess with it */ if (ext4_is_quota_file(inode)) goto flags_out; oldflags = ei->i_flags; /* * The JOURNAL_DATA flag can only be changed by * the relevant capability. */ if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { if (!capable(CAP_SYS_RESOURCE)) goto flags_out; } if (!dax_compatible(inode, oldflags, flags)) { err = -EOPNOTSUPP; goto flags_out; } if ((flags ^ oldflags) & EXT4_EXTENTS_FL) migrate = 1; if ((flags ^ oldflags) & EXT4_CASEFOLD_FL) { if (!ext4_has_feature_casefold(sb)) { err = -EOPNOTSUPP; goto flags_out; } if (!S_ISDIR(inode->i_mode)) { err = -ENOTDIR; goto flags_out; } if (!ext4_empty_dir(inode)) { err = -ENOTEMPTY; goto flags_out; } } /* * Wait for all pending directio and then flush all the dirty pages * for this file. The flush marks all the pages readonly, so any * subsequent attempt to write to the file (particularly mmap pages) * will come through the filesystem and fail. 
*/ if (S_ISREG(inode->i_mode) && !IS_IMMUTABLE(inode) && (flags & EXT4_IMMUTABLE_FL)) { inode_dio_wait(inode); err = filemap_write_and_wait(inode->i_mapping); if (err) goto flags_out; } handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto flags_out; } if (IS_SYNC(inode)) ext4_handle_sync(handle); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto flags_err; ext4_dax_dontcache(inode, flags); for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { if (!(mask & EXT4_FL_USER_MODIFIABLE)) continue; /* These flags get special treatment later */ if (mask == EXT4_JOURNAL_DATA_FL || mask == EXT4_EXTENTS_FL) continue; if (mask & flags) ext4_set_inode_flag(inode, i); else ext4_clear_inode_flag(inode, i); } ext4_set_inode_flags(inode, false); inode_set_ctime_current(inode); inode_inc_iversion(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); flags_err: ext4_journal_stop(handle); if (err) goto flags_out; if ((flags ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { /* * Changes to the journaling mode can cause unsafe changes to * S_DAX if the inode is DAX */ if (IS_DAX(inode)) { err = -EBUSY; goto flags_out; } err = ext4_change_inode_journal_flag(inode, flags & EXT4_JOURNAL_DATA_FL); if (err) goto flags_out; } if (migrate) { if (flags & EXT4_EXTENTS_FL) err = ext4_ext_migrate(inode); else err = ext4_ind_migrate(inode); } flags_out: return err; } #ifdef CONFIG_QUOTA static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) { struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); int err, rc; handle_t *handle; kprojid_t kprojid; struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct dquot *transfer_to[MAXQUOTAS] = { }; if (!ext4_has_feature_project(sb)) { if (projid != EXT4_DEF_PROJID) return -EOPNOTSUPP; else return 0; } if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE) return -EOPNOTSUPP; kprojid = make_kprojid(&init_user_ns, (projid_t)projid); if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) return 0; err = -EPERM; /* Is it quota file? Do not allow user to mess with it */ if (ext4_is_quota_file(inode)) return err; err = dquot_initialize(inode); if (err) return err; err = ext4_get_inode_loc(inode, &iloc); if (err) return err; raw_inode = ext4_raw_inode(&iloc); if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { err = ext4_expand_extra_isize(inode, EXT4_SB(sb)->s_want_extra_isize, &iloc); if (err) return err; } else { brelse(iloc.bh); } handle = ext4_journal_start(inode, EXT4_HT_QUOTA, EXT4_QUOTA_INIT_BLOCKS(sb) + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); if (IS_ERR(handle)) return PTR_ERR(handle); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto out_stop; transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); if (!IS_ERR(transfer_to[PRJQUOTA])) { /* __dquot_transfer() calls back ext4_get_inode_usage() which * counts xattr inode references. 
*/ down_read(&EXT4_I(inode)->xattr_sem); err = __dquot_transfer(inode, transfer_to); up_read(&EXT4_I(inode)->xattr_sem); dqput(transfer_to[PRJQUOTA]); if (err) goto out_dirty; } EXT4_I(inode)->i_projid = kprojid; inode_set_ctime_current(inode); inode_inc_iversion(inode); out_dirty: rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; out_stop: ext4_journal_stop(handle); return err; } #else static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) { if (projid != EXT4_DEF_PROJID) return -EOPNOTSUPP; return 0; } #endif int ext4_force_shutdown(struct super_block *sb, u32 flags) { struct ext4_sb_info *sbi = EXT4_SB(sb); int ret; if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH) return -EINVAL; if (ext4_forced_shutdown(sb)) return 0; ext4_msg(sb, KERN_ALERT, "shut down requested (%d)", flags); trace_ext4_shutdown(sb, flags); switch (flags) { case EXT4_GOING_FLAGS_DEFAULT: ret = bdev_freeze(sb->s_bdev); if (ret) return ret; set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); bdev_thaw(sb->s_bdev); break; case EXT4_GOING_FLAGS_LOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) { (void) ext4_force_commit(sb); jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN); } break; case EXT4_GOING_FLAGS_NOLOGFLUSH: set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); if (sbi->s_journal && !is_journal_aborted(sbi->s_journal)) jbd2_journal_abort(sbi->s_journal, -ESHUTDOWN); break; default: return -EINVAL; } clear_opt(sb, DISCARD); return 0; } static int ext4_ioctl_shutdown(struct super_block *sb, unsigned long arg) { u32 flags; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(flags, (__u32 __user *)arg)) return -EFAULT; return ext4_force_shutdown(sb, flags); } struct getfsmap_info { struct super_block *gi_sb; struct fsmap_head __user *gi_data; unsigned int gi_idx; __u32 gi_last_flags; }; static int ext4_getfsmap_format(struct ext4_fsmap *xfm, void *priv) { struct getfsmap_info *info = priv; struct fsmap fm; trace_ext4_getfsmap_mapping(info->gi_sb, xfm); info->gi_last_flags = xfm->fmr_flags; ext4_fsmap_from_internal(info->gi_sb, &fm, xfm); if (copy_to_user(&info->gi_data->fmh_recs[info->gi_idx++], &fm, sizeof(struct fsmap))) return -EFAULT; return 0; } static int ext4_ioc_getfsmap(struct super_block *sb, struct fsmap_head __user *arg) { struct getfsmap_info info = { NULL }; struct ext4_fsmap_head xhead = {0}; struct fsmap_head head; bool aborted = false; int error; if (copy_from_user(&head, arg, sizeof(struct fsmap_head))) return -EFAULT; if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) || memchr_inv(head.fmh_keys[0].fmr_reserved, 0, sizeof(head.fmh_keys[0].fmr_reserved)) || memchr_inv(head.fmh_keys[1].fmr_reserved, 0, sizeof(head.fmh_keys[1].fmr_reserved))) return -EINVAL; /* * ext4 doesn't report file extents at all, so the only valid * file offsets are the magic ones (all zeroes or all ones). 
*/ if (head.fmh_keys[0].fmr_offset || (head.fmh_keys[1].fmr_offset != 0 && head.fmh_keys[1].fmr_offset != -1ULL)) return -EINVAL; xhead.fmh_iflags = head.fmh_iflags; xhead.fmh_count = head.fmh_count; ext4_fsmap_to_internal(sb, &xhead.fmh_keys[0], &head.fmh_keys[0]); ext4_fsmap_to_internal(sb, &xhead.fmh_keys[1], &head.fmh_keys[1]); trace_ext4_getfsmap_low_key(sb, &xhead.fmh_keys[0]); trace_ext4_getfsmap_high_key(sb, &xhead.fmh_keys[1]); info.gi_sb = sb; info.gi_data = arg; error = ext4_getfsmap(sb, &xhead, ext4_getfsmap_format, &info); if (error == EXT4_QUERY_RANGE_ABORT) aborted = true; else if (error) return error; /* If we didn't abort, set the "last" flag in the last fmx */ if (!aborted && info.gi_idx) { info.gi_last_flags |= FMR_OF_LAST; if (copy_to_user(&info.gi_data->fmh_recs[info.gi_idx - 1].fmr_flags, &info.gi_last_flags, sizeof(info.gi_last_flags))) return -EFAULT; } /* copy back header */ head.fmh_entries = xhead.fmh_entries; head.fmh_oflags = xhead.fmh_oflags; if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) return -EFAULT; return 0; } static long ext4_ioctl_group_add(struct file *file, struct ext4_new_group_data *input) { struct super_block *sb = file_inode(file)->i_sb; int err, err2=0; err = ext4_resize_begin(sb); if (err) return err; if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online resizing not supported with bigalloc"); err = -EOPNOTSUPP; goto group_add_out; } err = mnt_want_write_file(file); if (err) goto group_add_out; err = ext4_group_add(sb, input); if (EXT4_SB(sb)->s_journal) { jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } if (err == 0) err = err2; mnt_drop_write_file(file); if (!err && ext4_has_group_desc_csum(sb) && test_opt(sb, INIT_INODE_TABLE)) err = ext4_register_li_request(sb, input->group); group_add_out: err2 = ext4_resize_end(sb, false); if (err == 0) err = err2; return err; } int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); struct ext4_inode_info *ei = EXT4_I(inode); u32 flags = ei->i_flags & EXT4_FL_USER_VISIBLE; if (S_ISREG(inode->i_mode)) flags &= ~FS_PROJINHERIT_FL; fileattr_fill_flags(fa, flags); if (ext4_has_feature_project(inode->i_sb)) fa->fsx_projid = from_kprojid(&init_user_ns, ei->i_projid); return 0; } int ext4_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa) { struct inode *inode = d_inode(dentry); u32 flags = fa->flags; int err = -EOPNOTSUPP; if (flags & ~EXT4_FL_USER_VISIBLE) goto out; /* * chattr(1) grabs flags via GETFLAGS, modifies the result and * passes that to SETFLAGS. So we cannot easily make SETFLAGS * more restrictive than just silently masking off visible but * not settable flags as we always did. */ flags &= EXT4_FL_USER_MODIFIABLE; if (ext4_mask_flags(inode->i_mode, flags) != flags) goto out; err = ext4_ioctl_check_immutable(inode, fa->fsx_projid, flags); if (err) goto out; err = ext4_ioctl_setflags(inode, flags); if (err) goto out; err = ext4_ioctl_setproject(inode, fa->fsx_projid); out: return err; } /* So that the fiemap access checks can't overflow on 32 bit machines. 
*/ #define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) { struct fiemap fiemap; struct fiemap __user *ufiemap = (struct fiemap __user *) arg; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = file_inode(filp); int error; if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) return -EFAULT; if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) return -EINVAL; fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) error = -EFAULT; return error; } static int ext4_ioctl_checkpoint(struct file *filp, unsigned long arg) { int err = 0; __u32 flags = 0; unsigned int flush_flags = 0; struct super_block *sb = file_inode(filp)->i_sb; if (copy_from_user(&flags, (__u32 __user *)arg, sizeof(__u32))) return -EFAULT; if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* check for invalid bits set */ if ((flags & ~EXT4_IOC_CHECKPOINT_FLAG_VALID) || ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && (flags & JBD2_JOURNAL_FLUSH_ZEROOUT))) return -EINVAL; if (!EXT4_SB(sb)->s_journal) return -ENODEV; if ((flags & JBD2_JOURNAL_FLUSH_DISCARD) && !bdev_max_discard_sectors(EXT4_SB(sb)->s_journal->j_dev)) return -EOPNOTSUPP; if (flags & EXT4_IOC_CHECKPOINT_FLAG_DRY_RUN) return 0; if (flags & EXT4_IOC_CHECKPOINT_FLAG_DISCARD) flush_flags |= JBD2_JOURNAL_FLUSH_DISCARD; if (flags & EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT) { flush_flags |= JBD2_JOURNAL_FLUSH_ZEROOUT; pr_info_ratelimited("warning: checkpointing journal with EXT4_IOC_CHECKPOINT_FLAG_ZEROOUT can be slow"); } jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, flush_flags); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); return err; } static int ext4_ioctl_setlabel(struct file *filp, const char __user *user_label) { size_t len; int ret = 0; char new_label[EXT4_LABEL_MAX + 1]; struct super_block *sb = file_inode(filp)->i_sb; if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* * Copy the maximum length allowed for ext4 label with one more to * find the required terminating null byte in order to test the * label length. The on disk label doesn't need to be null terminated. */ if (copy_from_user(new_label, user_label, EXT4_LABEL_MAX + 1)) return -EFAULT; len = strnlen(new_label, EXT4_LABEL_MAX + 1); if (len > EXT4_LABEL_MAX) return -EINVAL; /* * Clear the buffer after the new label */ memset(new_label + len, 0, EXT4_LABEL_MAX - len); ret = mnt_want_write_file(filp); if (ret) return ret; ret = ext4_update_superblocks_fn(sb, ext4_sb_setlabel, new_label); mnt_drop_write_file(filp); return ret; } static int ext4_ioctl_getlabel(struct ext4_sb_info *sbi, char __user *user_label) { char label[EXT4_LABEL_MAX + 1]; /* * EXT4_LABEL_MAX must always be smaller than FSLABEL_MAX because * FSLABEL_MAX must include terminating null byte, while s_volume_name * does not have to. 
*/ BUILD_BUG_ON(EXT4_LABEL_MAX >= FSLABEL_MAX); lock_buffer(sbi->s_sbh); memtostr_pad(label, sbi->s_es->s_volume_name); unlock_buffer(sbi->s_sbh); if (copy_to_user(user_label, label, sizeof(label))) return -EFAULT; return 0; } static int ext4_ioctl_getuuid(struct ext4_sb_info *sbi, struct fsuuid __user *ufsuuid) { struct fsuuid fsuuid; __u8 uuid[UUID_SIZE]; if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid))) return -EFAULT; if (fsuuid.fsu_len == 0) { fsuuid.fsu_len = UUID_SIZE; if (copy_to_user(&ufsuuid->fsu_len, &fsuuid.fsu_len, sizeof(fsuuid.fsu_len))) return -EFAULT; return 0; } if (fsuuid.fsu_len < UUID_SIZE || fsuuid.fsu_flags != 0) return -EINVAL; lock_buffer(sbi->s_sbh); memcpy(uuid, sbi->s_es->s_uuid, UUID_SIZE); unlock_buffer(sbi->s_sbh); fsuuid.fsu_len = UUID_SIZE; if (copy_to_user(ufsuuid, &fsuuid, sizeof(fsuuid)) || copy_to_user(&ufsuuid->fsu_uuid[0], uuid, UUID_SIZE)) return -EFAULT; return 0; } static int ext4_ioctl_setuuid(struct file *filp, const struct fsuuid __user *ufsuuid) { int ret = 0; struct super_block *sb = file_inode(filp)->i_sb; struct fsuuid fsuuid; __u8 uuid[UUID_SIZE]; if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* * If any checksums (group descriptors or metadata) are being used * then the checksum seed feature is required to change the UUID. */ if (((ext4_has_feature_gdt_csum(sb) || ext4_has_feature_metadata_csum(sb)) && !ext4_has_feature_csum_seed(sb)) || ext4_has_feature_stable_inodes(sb)) return -EOPNOTSUPP; if (copy_from_user(&fsuuid, ufsuuid, sizeof(fsuuid))) return -EFAULT; if (fsuuid.fsu_len != UUID_SIZE || fsuuid.fsu_flags != 0) return -EINVAL; if (copy_from_user(uuid, &ufsuuid->fsu_uuid[0], UUID_SIZE)) return -EFAULT; ret = mnt_want_write_file(filp); if (ret) return ret; ret = ext4_update_superblocks_fn(sb, ext4_sb_setuuid, &uuid); mnt_drop_write_file(filp); return ret; } static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; struct mnt_idmap *idmap = file_mnt_idmap(filp); ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); switch (cmd) { case FS_IOC_GETFSMAP: return ext4_ioc_getfsmap(sb, (void __user *)arg); case EXT4_IOC_GETVERSION: case EXT4_IOC_GETVERSION_OLD: return put_user(inode->i_generation, (int __user *) arg); case EXT4_IOC_SETVERSION: case EXT4_IOC_SETVERSION_OLD: { handle_t *handle; struct ext4_iloc iloc; __u32 generation; int err; if (!inode_owner_or_capable(idmap, inode)) return -EPERM; if (ext4_has_feature_metadata_csum(inode->i_sb)) { ext4_warning(sb, "Setting inode version is not " "supported with metadata_csum enabled."); return -ENOTTY; } err = mnt_want_write_file(filp); if (err) return err; if (get_user(generation, (int __user *) arg)) { err = -EFAULT; goto setversion_out; } inode_lock(inode); handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto unlock_out; } err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { inode_set_ctime_current(inode); inode_inc_iversion(inode); inode->i_generation = generation; err = ext4_mark_iloc_dirty(handle, inode, &iloc); } ext4_journal_stop(handle); unlock_out: inode_unlock(inode); setversion_out: mnt_drop_write_file(filp); return err; } case EXT4_IOC_GROUP_EXTEND: { ext4_fsblk_t n_blocks_count; int err, err2=0; err = ext4_resize_begin(sb); if (err) return err; if (get_user(n_blocks_count, (__u32 __user *)arg)) { err = -EFAULT; goto group_extend_out; } if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online resizing 
not supported with bigalloc"); err = -EOPNOTSUPP; goto group_extend_out; } err = mnt_want_write_file(filp); if (err) goto group_extend_out; err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); if (EXT4_SB(sb)->s_journal) { jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } if (err == 0) err = err2; mnt_drop_write_file(filp); group_extend_out: err2 = ext4_resize_end(sb, false); if (err == 0) err = err2; return err; } case EXT4_IOC_MOVE_EXT: { struct move_extent me; int err; if (!(filp->f_mode & FMODE_READ) || !(filp->f_mode & FMODE_WRITE)) return -EBADF; if (copy_from_user(&me, (struct move_extent __user *)arg, sizeof(me))) return -EFAULT; me.moved_len = 0; CLASS(fd, donor)(me.donor_fd); if (fd_empty(donor)) return -EBADF; if (!(fd_file(donor)->f_mode & FMODE_WRITE)) return -EBADF; if (ext4_has_feature_bigalloc(sb)) { ext4_msg(sb, KERN_ERR, "Online defrag not supported with bigalloc"); return -EOPNOTSUPP; } else if (IS_DAX(inode)) { ext4_msg(sb, KERN_ERR, "Online defrag not supported with DAX"); return -EOPNOTSUPP; } err = mnt_want_write_file(filp); if (err) return err; err = ext4_move_extents(filp, fd_file(donor), me.orig_start, me.donor_start, me.len, &me.moved_len); mnt_drop_write_file(filp); if (copy_to_user((struct move_extent __user *)arg, &me, sizeof(me))) err = -EFAULT; return err; } case EXT4_IOC_GROUP_ADD: { struct ext4_new_group_data input; if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, sizeof(input))) return -EFAULT; return ext4_ioctl_group_add(filp, &input); } case EXT4_IOC_MIGRATE: { int err; if (!inode_owner_or_capable(idmap, inode)) return -EACCES; err = mnt_want_write_file(filp); if (err) return err; /* * inode_mutex prevent write and truncate on the file. * Read still goes through. We take i_data_sem in * ext4_ext_swap_inode_data before we switch the * inode format to prevent read. 
*/ inode_lock((inode)); err = ext4_ext_migrate(inode); inode_unlock((inode)); mnt_drop_write_file(filp); return err; } case EXT4_IOC_ALLOC_DA_BLKS: { int err; if (!inode_owner_or_capable(idmap, inode)) return -EACCES; err = mnt_want_write_file(filp); if (err) return err; err = ext4_alloc_da_blocks(inode); mnt_drop_write_file(filp); return err; } case EXT4_IOC_SWAP_BOOT: { int err; if (!(filp->f_mode & FMODE_WRITE)) return -EBADF; err = mnt_want_write_file(filp); if (err) return err; err = swap_inode_boot_loader(sb, idmap, inode); mnt_drop_write_file(filp); return err; } case EXT4_IOC_RESIZE_FS: { ext4_fsblk_t n_blocks_count; int err = 0, err2 = 0; ext4_group_t o_group = EXT4_SB(sb)->s_groups_count; if (copy_from_user(&n_blocks_count, (__u64 __user *)arg, sizeof(__u64))) { return -EFAULT; } err = ext4_resize_begin(sb); if (err) return err; err = mnt_want_write_file(filp); if (err) goto resizefs_out; err = ext4_resize_fs(sb, n_blocks_count); if (EXT4_SB(sb)->s_journal) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE, NULL); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } if (err == 0) err = err2; mnt_drop_write_file(filp); if (!err && (o_group < EXT4_SB(sb)->s_groups_count) && ext4_has_group_desc_csum(sb) && test_opt(sb, INIT_INODE_TABLE)) err = ext4_register_li_request(sb, o_group); resizefs_out: err2 = ext4_resize_end(sb, true); if (err == 0) err = err2; return err; } case FITRIM: { struct fstrim_range range; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!bdev_max_discard_sectors(sb->s_bdev)) return -EOPNOTSUPP; /* * We haven't replayed the journal, so we cannot use our * block-bitmap-guided storage zapping commands. */ if (test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) return -EROFS; if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) return -EFAULT; ret = ext4_trim_fs(sb, &range); if (ret < 0) return ret; if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; return 0; } case EXT4_IOC_PRECACHE_EXTENTS: { int ret; inode_lock_shared(inode); ret = ext4_ext_precache(inode); inode_unlock_shared(inode); return ret; } case FS_IOC_SET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); case FS_IOC_GET_ENCRYPTION_PWSALT: return ext4_ioctl_get_encryption_pwsalt(filp, (void __user *)arg); case FS_IOC_GET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_get_policy(filp, (void __user *)arg); case FS_IOC_GET_ENCRYPTION_POLICY_EX: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_get_policy_ex(filp, (void __user *)arg); case FS_IOC_ADD_ENCRYPTION_KEY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_add_key(filp, (void __user *)arg); case FS_IOC_REMOVE_ENCRYPTION_KEY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_remove_key(filp, (void __user *)arg); case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_remove_key_all_users(filp, (void __user *)arg); case FS_IOC_GET_ENCRYPTION_KEY_STATUS: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_get_key_status(filp, (void __user *)arg); case FS_IOC_GET_ENCRYPTION_NONCE: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; return fscrypt_ioctl_get_nonce(filp, 
(void __user *)arg); case EXT4_IOC_CLEAR_ES_CACHE: { if (!inode_owner_or_capable(idmap, inode)) return -EACCES; ext4_clear_inode_es(inode); return 0; } case EXT4_IOC_GETSTATE: { __u32 state = 0; if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED)) state |= EXT4_STATE_FLAG_EXT_PRECACHED; if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) state |= EXT4_STATE_FLAG_NEW; if (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) state |= EXT4_STATE_FLAG_NEWENTRY; if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) state |= EXT4_STATE_FLAG_DA_ALLOC_CLOSE; return put_user(state, (__u32 __user *) arg); } case EXT4_IOC_GET_ES_CACHE: return ext4_ioctl_get_es_cache(filp, arg); case EXT4_IOC_SHUTDOWN: return ext4_ioctl_shutdown(sb, arg); case FS_IOC_ENABLE_VERITY: if (!ext4_has_feature_verity(sb)) return -EOPNOTSUPP; return fsverity_ioctl_enable(filp, (const void __user *)arg); case FS_IOC_MEASURE_VERITY: if (!ext4_has_feature_verity(sb)) return -EOPNOTSUPP; return fsverity_ioctl_measure(filp, (void __user *)arg); case FS_IOC_READ_VERITY_METADATA: if (!ext4_has_feature_verity(sb)) return -EOPNOTSUPP; return fsverity_ioctl_read_metadata(filp, (const void __user *)arg); case EXT4_IOC_CHECKPOINT: return ext4_ioctl_checkpoint(filp, arg); case FS_IOC_GETFSLABEL: return ext4_ioctl_getlabel(EXT4_SB(sb), (void __user *)arg); case FS_IOC_SETFSLABEL: return ext4_ioctl_setlabel(filp, (const void __user *)arg); case EXT4_IOC_GETFSUUID: return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg); case EXT4_IOC_SETFSUUID: return ext4_ioctl_setuuid(filp, (const void __user *)arg); default: return -ENOTTY; } } long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { return __ext4_ioctl(filp, cmd, arg); } #ifdef CONFIG_COMPAT long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { case EXT4_IOC32_GETVERSION: cmd = EXT4_IOC_GETVERSION; break; case EXT4_IOC32_SETVERSION: cmd = EXT4_IOC_SETVERSION; break; case EXT4_IOC32_GROUP_EXTEND: cmd = EXT4_IOC_GROUP_EXTEND; break; case EXT4_IOC32_GETVERSION_OLD: cmd = EXT4_IOC_GETVERSION_OLD; break; case EXT4_IOC32_SETVERSION_OLD: cmd = EXT4_IOC_SETVERSION_OLD; break; case EXT4_IOC32_GETRSVSZ: cmd = EXT4_IOC_GETRSVSZ; break; case EXT4_IOC32_SETRSVSZ: cmd = EXT4_IOC_SETRSVSZ; break; case EXT4_IOC32_GROUP_ADD: { struct compat_ext4_new_group_input __user *uinput; struct ext4_new_group_data input; int err; uinput = compat_ptr(arg); err = get_user(input.group, &uinput->group); err |= get_user(input.block_bitmap, &uinput->block_bitmap); err |= get_user(input.inode_bitmap, &uinput->inode_bitmap); err |= get_user(input.inode_table, &uinput->inode_table); err |= get_user(input.blocks_count, &uinput->blocks_count); err |= get_user(input.reserved_blocks, &uinput->reserved_blocks); if (err) return -EFAULT; return ext4_ioctl_group_add(file, &input); } case EXT4_IOC_MOVE_EXT: case EXT4_IOC_RESIZE_FS: case FITRIM: case EXT4_IOC_PRECACHE_EXTENTS: case FS_IOC_SET_ENCRYPTION_POLICY: case FS_IOC_GET_ENCRYPTION_PWSALT: case FS_IOC_GET_ENCRYPTION_POLICY: case FS_IOC_GET_ENCRYPTION_POLICY_EX: case FS_IOC_ADD_ENCRYPTION_KEY: case FS_IOC_REMOVE_ENCRYPTION_KEY: case FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS: case FS_IOC_GET_ENCRYPTION_KEY_STATUS: case FS_IOC_GET_ENCRYPTION_NONCE: case EXT4_IOC_SHUTDOWN: case FS_IOC_GETFSMAP: case FS_IOC_ENABLE_VERITY: case FS_IOC_MEASURE_VERITY: case FS_IOC_READ_VERITY_METADATA: case EXT4_IOC_CLEAR_ES_CACHE: case EXT4_IOC_GETSTATE: case 
EXT4_IOC_GET_ES_CACHE: case EXT4_IOC_CHECKPOINT: case FS_IOC_GETFSLABEL: case FS_IOC_SETFSLABEL: case EXT4_IOC_GETFSUUID: case EXT4_IOC_SETFSUUID: break; default: return -ENOIOCTLCMD; } return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif static void set_overhead(struct ext4_super_block *es, const void *arg) { es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg)); } int ext4_update_overhead(struct super_block *sb, bool force) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (ext4_emergency_state(sb) || sb_rdonly(sb)) return 0; if (!force && (sbi->s_overhead == 0 || sbi->s_overhead == le32_to_cpu(sbi->s_es->s_overhead_clusters))) return 0; return ext4_update_superblocks_fn(sb, set_overhead, &sbi->s_overhead); }
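That closes fs/ext4/ioctl.c. As a hedged illustration only (not part of the kernel sources above), the short userspace sketch below shows how the FS_IOC_GETFSLABEL command handled by ext4_ioctl_getlabel() is typically issued; the path argument and the error handling are assumptions made for the example.

/* Hedged userspace sketch: query the filesystem label via FS_IOC_GETFSLABEL. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>		/* FS_IOC_GETFSLABEL, FSLABEL_MAX */

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/";	/* any file on the ext4 mount (assumption) */
	char label[FSLABEL_MAX] = "";
	int fd = open(path, O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* ext4_ioctl_getlabel() copies the on-disk label, NUL-padded, into the buffer. */
	if (ioctl(fd, FS_IOC_GETFSLABEL, label) < 0) {
		perror("ioctl(FS_IOC_GETFSLABEL)");
		close(fd);
		return 1;
	}
	printf("filesystem label: \"%s\"\n", label);
	close(fd);
	return 0;
}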
4 4 4 3 1 4 4 4 4 4 1 2 3 3 3 3 1 11 11 11 11 11 11 8 3 1 3 7 30 7 7 7 7 3 33 33 3 33 33 29 29 4 4 4 3 38 33 5 30 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 // SPDX-License-Identifier: GPL-2.0-only #include <linux/net_tstamp.h> #include <linux/phy.h> #include <linux/phy_link_topology.h> #include <linux/ptp_clock_kernel.h> #include <net/netdev_lock.h> #include "netlink.h" #include "common.h" #include "bitset.h" #include "ts.h" struct tsinfo_req_info { struct ethnl_req_info base; struct hwtstamp_provider_desc hwprov_desc; }; struct tsinfo_reply_data { struct ethnl_reply_data base; struct kernel_ethtool_ts_info ts_info; struct ethtool_ts_stats stats; }; #define TSINFO_REQINFO(__req_base) \ container_of(__req_base, struct tsinfo_req_info, base) #define TSINFO_REPDATA(__reply_base) \ container_of(__reply_base, struct tsinfo_reply_data, base) #define ETHTOOL_TS_STAT_CNT \ (__ETHTOOL_A_TS_STAT_CNT - (ETHTOOL_A_TS_STAT_UNSPEC + 1)) const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1] = { [ETHTOOL_A_TSINFO_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), [ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER] = NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy), }; int ts_parse_hwtst_provider(const struct nlattr *nest, struct hwtstamp_provider_desc *hwprov_desc, struct netlink_ext_ack *extack, bool *mod) { struct nlattr *tb[ARRAY_SIZE(ethnl_ts_hwtst_prov_policy)]; int ret; ret = nla_parse_nested(tb, 
ARRAY_SIZE(ethnl_ts_hwtst_prov_policy) - 1, nest, ethnl_ts_hwtst_prov_policy, extack); if (ret < 0) return ret; if (NL_REQ_ATTR_CHECK(extack, nest, tb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX) || NL_REQ_ATTR_CHECK(extack, nest, tb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER)) return -EINVAL; ethnl_update_u32(&hwprov_desc->index, tb[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX], mod); ethnl_update_u32(&hwprov_desc->qualifier, tb[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER], mod); return 0; } static int tsinfo_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb, struct netlink_ext_ack *extack) { struct tsinfo_req_info *req = TSINFO_REQINFO(req_base); bool mod = false; req->hwprov_desc.index = -1; if (!tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER]) return 0; return ts_parse_hwtst_provider(tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER], &req->hwprov_desc, extack, &mod); } static int tsinfo_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base); struct tsinfo_req_info *req = TSINFO_REQINFO(req_base); struct net_device *dev = reply_base->dev; int ret; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; if (req->hwprov_desc.index != -1) { ret = ethtool_get_ts_info_by_phc(dev, &data->ts_info, &req->hwprov_desc); ethnl_ops_complete(dev); return ret; } if (req_base->flags & ETHTOOL_FLAG_STATS) { ethtool_stats_init((u64 *)&data->stats, sizeof(data->stats) / sizeof(u64)); if (dev->ethtool_ops->get_ts_stats) dev->ethtool_ops->get_ts_stats(dev, &data->stats); } ret = __ethtool_get_ts_info(dev, &data->ts_info); ethnl_ops_complete(dev); return ret; } static int tsinfo_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct kernel_ethtool_ts_info *ts_info = &data->ts_info; int len = 0; int ret; BUILD_BUG_ON(__SOF_TIMESTAMPING_CNT > 32); BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32); BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32); if (ts_info->so_timestamping) { ret = ethnl_bitset32_size(&ts_info->so_timestamping, NULL, __SOF_TIMESTAMPING_CNT, sof_timestamping_names, compact); if (ret < 0) return ret; len += ret; /* _TSINFO_TIMESTAMPING */ } if (ts_info->tx_types) { ret = ethnl_bitset32_size(&ts_info->tx_types, NULL, __HWTSTAMP_TX_CNT, ts_tx_type_names, compact); if (ret < 0) return ret; len += ret; /* _TSINFO_TX_TYPES */ } if (ts_info->rx_filters) { ret = ethnl_bitset32_size(&ts_info->rx_filters, NULL, __HWTSTAMP_FILTER_CNT, ts_rx_filter_names, compact); if (ret < 0) return ret; len += ret; /* _TSINFO_RX_FILTERS */ } if (ts_info->phc_index >= 0) { len += nla_total_size(sizeof(u32)); /* _TSINFO_PHC_INDEX */ /* _TSINFO_HWTSTAMP_PROVIDER */ len += nla_total_size(0) + 2 * nla_total_size(sizeof(u32)); } if (ts_info->phc_source) { len += nla_total_size(sizeof(u32)); /* _TSINFO_HWTSTAMP_SOURCE */ if (ts_info->phc_phyindex) /* _TSINFO_HWTSTAMP_PHYINDEX */ len += nla_total_size(sizeof(u32)); } if (req_base->flags & ETHTOOL_FLAG_STATS) len += nla_total_size(0) + /* _TSINFO_STATS */ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_TS_STAT_CNT; return len; } static int tsinfo_put_stat(struct sk_buff *skb, u64 val, u16 attrtype) { if (val == ETHTOOL_STAT_NOT_SET) return 0; if (nla_put_uint(skb, attrtype, val)) return -EMSGSIZE; return 0; } static int tsinfo_put_stats(struct sk_buff *skb, const struct ethtool_ts_stats *stats) { struct nlattr *nest; nest 
= nla_nest_start(skb, ETHTOOL_A_TSINFO_STATS); if (!nest) return -EMSGSIZE; if (tsinfo_put_stat(skb, stats->tx_stats.pkts, ETHTOOL_A_TS_STAT_TX_PKTS) || tsinfo_put_stat(skb, stats->tx_stats.onestep_pkts_unconfirmed, ETHTOOL_A_TS_STAT_TX_ONESTEP_PKTS_UNCONFIRMED) || tsinfo_put_stat(skb, stats->tx_stats.lost, ETHTOOL_A_TS_STAT_TX_LOST) || tsinfo_put_stat(skb, stats->tx_stats.err, ETHTOOL_A_TS_STAT_TX_ERR)) goto err_cancel; nla_nest_end(skb, nest); return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int tsinfo_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base); bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct kernel_ethtool_ts_info *ts_info = &data->ts_info; int ret; if (ts_info->so_timestamping) { ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_TIMESTAMPING, &ts_info->so_timestamping, NULL, __SOF_TIMESTAMPING_CNT, sof_timestamping_names, compact); if (ret < 0) return ret; } if (ts_info->tx_types) { ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_TX_TYPES, &ts_info->tx_types, NULL, __HWTSTAMP_TX_CNT, ts_tx_type_names, compact); if (ret < 0) return ret; } if (ts_info->rx_filters) { ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSINFO_RX_FILTERS, &ts_info->rx_filters, NULL, __HWTSTAMP_FILTER_CNT, ts_rx_filter_names, compact); if (ret < 0) return ret; } if (ts_info->phc_index >= 0) { struct nlattr *nest; ret = nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX, ts_info->phc_index); if (ret) return -EMSGSIZE; nest = nla_nest_start(skb, ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX, ts_info->phc_index) || nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER, ts_info->phc_qualifier)) { nla_nest_cancel(skb, nest); return -EMSGSIZE; } nla_nest_end(skb, nest); } if (ts_info->phc_source) { if (nla_put_u32(skb, ETHTOOL_A_TSINFO_HWTSTAMP_SOURCE, ts_info->phc_source)) return -EMSGSIZE; if (ts_info->phc_phyindex && nla_put_u32(skb, ETHTOOL_A_TSINFO_HWTSTAMP_PHYINDEX, ts_info->phc_phyindex)) return -EMSGSIZE; } if (req_base->flags & ETHTOOL_FLAG_STATS && tsinfo_put_stats(skb, &data->stats)) return -EMSGSIZE; return 0; } struct ethnl_tsinfo_dump_ctx { struct tsinfo_req_info *req_info; struct tsinfo_reply_data *reply_data; unsigned long pos_ifindex; bool netdev_dump_done; unsigned long pos_phyindex; enum hwtstamp_provider_qualifier pos_phcqualifier; }; static void *ethnl_tsinfo_prepare_dump(struct sk_buff *skb, struct net_device *dev, struct tsinfo_reply_data *reply_data, struct netlink_callback *cb) { struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; void *ehdr = NULL; ehdr = ethnl_dump_put(skb, cb, ETHTOOL_MSG_TSINFO_GET_REPLY); if (!ehdr) return ERR_PTR(-EMSGSIZE); reply_data = ctx->reply_data; memset(reply_data, 0, sizeof(*reply_data)); reply_data->base.dev = dev; reply_data->ts_info.cmd = ETHTOOL_GET_TS_INFO; reply_data->ts_info.phc_index = -1; return ehdr; } static int ethnl_tsinfo_end_dump(struct sk_buff *skb, struct net_device *dev, struct tsinfo_req_info *req_info, struct tsinfo_reply_data *reply_data, void *ehdr) { int ret; reply_data->ts_info.so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE; ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TSINFO_HEADER); if (ret < 0) return ret; ret = tsinfo_fill_reply(skb, &req_info->base, &reply_data->base); if (ret < 0) return ret; reply_data->base.dev = NULL; genlmsg_end(skb, ehdr); 
return ret; } static int ethnl_tsinfo_dump_one_phydev(struct sk_buff *skb, struct net_device *dev, struct phy_device *phydev, struct netlink_callback *cb) { struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; struct tsinfo_reply_data *reply_data; struct tsinfo_req_info *req_info; void *ehdr = NULL; int ret = 0; if (!phy_has_tsinfo(phydev)) return -EOPNOTSUPP; reply_data = ctx->reply_data; req_info = ctx->req_info; ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb); if (IS_ERR(ehdr)) return PTR_ERR(ehdr); ret = phy_ts_info(phydev, &reply_data->ts_info); if (ret < 0) goto err; if (reply_data->ts_info.phc_index >= 0) { reply_data->ts_info.phc_source = HWTSTAMP_SOURCE_PHYLIB; reply_data->ts_info.phc_phyindex = phydev->phyindex; } ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data, ehdr); if (ret < 0) goto err; return ret; err: genlmsg_cancel(skb, ehdr); return ret; } static int ethnl_tsinfo_dump_one_netdev(struct sk_buff *skb, struct net_device *dev, struct netlink_callback *cb) { struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; const struct ethtool_ops *ops = dev->ethtool_ops; struct tsinfo_reply_data *reply_data; struct tsinfo_req_info *req_info; void *ehdr = NULL; int ret = 0; if (!ops->get_ts_info) return -EOPNOTSUPP; reply_data = ctx->reply_data; req_info = ctx->req_info; for (; ctx->pos_phcqualifier < HWTSTAMP_PROVIDER_QUALIFIER_CNT; ctx->pos_phcqualifier++) { if (!net_support_hwtstamp_qualifier(dev, ctx->pos_phcqualifier)) continue; ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb); if (IS_ERR(ehdr)) { ret = PTR_ERR(ehdr); goto err; } reply_data->ts_info.phc_qualifier = ctx->pos_phcqualifier; ret = ops->get_ts_info(dev, &reply_data->ts_info); if (ret < 0) goto err; if (reply_data->ts_info.phc_index >= 0) reply_data->ts_info.phc_source = HWTSTAMP_SOURCE_NETDEV; ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data, ehdr); if (ret < 0) goto err; } return ret; err: genlmsg_cancel(skb, ehdr); return ret; } static int ethnl_tsinfo_dump_one_net_topo(struct sk_buff *skb, struct net_device *dev, struct netlink_callback *cb) { struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; struct phy_device_node *pdn; int ret = 0; if (!ctx->netdev_dump_done) { ret = ethnl_tsinfo_dump_one_netdev(skb, dev, cb); if (ret < 0 && ret != -EOPNOTSUPP) return ret; ctx->netdev_dump_done = true; } if (!dev->link_topo) { if (phy_has_tsinfo(dev->phydev)) { ret = ethnl_tsinfo_dump_one_phydev(skb, dev, dev->phydev, cb); if (ret < 0 && ret != -EOPNOTSUPP) return ret; } return 0; } xa_for_each_start(&dev->link_topo->phys, ctx->pos_phyindex, pdn, ctx->pos_phyindex) { if (phy_has_tsinfo(pdn->phy)) { ret = ethnl_tsinfo_dump_one_phydev(skb, dev, pdn->phy, cb); if (ret < 0 && ret != -EOPNOTSUPP) return ret; } } return ret; } int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); struct net_device *dev; int ret = 0; rtnl_lock(); if (ctx->req_info->base.dev) { dev = ctx->req_info->base.dev; netdev_lock_ops(dev); ret = ethnl_tsinfo_dump_one_net_topo(skb, dev, cb); netdev_unlock_ops(dev); } else { for_each_netdev_dump(net, dev, ctx->pos_ifindex) { netdev_lock_ops(dev); ret = ethnl_tsinfo_dump_one_net_topo(skb, dev, cb); netdev_unlock_ops(dev); if (ret < 0 && ret != -EOPNOTSUPP) break; ctx->pos_phyindex = 0; ctx->netdev_dump_done = false; ctx->pos_phcqualifier = HWTSTAMP_PROVIDER_QUALIFIER_PRECISE; } } rtnl_unlock(); return ret; } int ethnl_tsinfo_start(struct netlink_callback *cb) { const 
struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; struct nlattr **tb = info->info.attrs; struct tsinfo_reply_data *reply_data; struct tsinfo_req_info *req_info; int ret; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); req_info = kzalloc(sizeof(*req_info), GFP_KERNEL); if (!req_info) return -ENOMEM; reply_data = kzalloc(sizeof(*reply_data), GFP_KERNEL); if (!reply_data) { ret = -ENOMEM; goto free_req_info; } ret = ethnl_parse_header_dev_get(&req_info->base, tb[ETHTOOL_A_TSINFO_HEADER], sock_net(cb->skb->sk), cb->extack, false); if (ret < 0) goto free_reply_data; ctx->req_info = req_info; ctx->reply_data = reply_data; ctx->pos_ifindex = 0; ctx->pos_phyindex = 0; ctx->netdev_dump_done = false; ctx->pos_phcqualifier = HWTSTAMP_PROVIDER_QUALIFIER_PRECISE; return 0; free_reply_data: kfree(reply_data); free_req_info: kfree(req_info); return ret; } int ethnl_tsinfo_done(struct netlink_callback *cb) { struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx; struct tsinfo_req_info *req_info = ctx->req_info; ethnl_parse_header_dev_put(&req_info->base); kfree(ctx->reply_data); kfree(ctx->req_info); return 0; } const struct ethnl_request_ops ethnl_tsinfo_request_ops = { .request_cmd = ETHTOOL_MSG_TSINFO_GET, .reply_cmd = ETHTOOL_MSG_TSINFO_GET_REPLY, .hdr_attr = ETHTOOL_A_TSINFO_HEADER, .req_info_size = sizeof(struct tsinfo_req_info), .reply_data_size = sizeof(struct tsinfo_reply_data), .parse_request = tsinfo_parse_request, .prepare_data = tsinfo_prepare_data, .reply_size = tsinfo_reply_size, .fill_reply = tsinfo_fill_reply, };
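That closes the ethtool tsinfo code. ethnl_tsinfo_dumpit() relies on a dump context kept in cb->ctx (pos_ifindex, pos_phyindex, pos_phcqualifier) so a dump that fills one message can resume where it stopped on the next callback invocation. The standalone sketch below is a hedged, simplified model of that resumable-iteration pattern; every name in it (demo_ctx, demo_dump, N_ITEMS, PER_CALL) is invented for illustration and is not a kernel API.

/* Hedged sketch of a resumable dump driven by a persistent position cursor. */
#include <stdio.h>

#define N_ITEMS  10	/* stand-in for "all devices/PHYs to report" */
#define PER_CALL  3	/* stand-in for "entries that fit in one netlink message" */

struct demo_ctx {
	unsigned long pos;	/* plays the role of pos_ifindex / pos_phyindex */
};

/* One dump pass: emit at most PER_CALL items, remember where we stopped. */
static int demo_dump(struct demo_ctx *ctx)
{
	int emitted = 0;

	for (; ctx->pos < N_ITEMS && emitted < PER_CALL; ctx->pos++, emitted++)
		printf("  item %lu\n", ctx->pos);

	return ctx->pos < N_ITEMS;	/* non-zero: more data, call again */
}

int main(void)
{
	struct demo_ctx ctx = { 0 };	/* the kernel keeps this state in cb->ctx */
	int call = 0;

	while (demo_dump(&ctx))
		printf("-- message %d full, dump will resume --\n", call++);

	return 0;
}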
4 2 886 613 320 589 611 599 390 140 399 362 888 974 19 18 18 978 1 962 19 980 19 18 979 1 17 5 5 64 2 47 6 1 1 4 488 451 145 93 488 488 725 1061 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2017 * * This file is part of the SCTP kernel implementation * * These functions manipulate sctp stream queue/scheduling. * * Please send any bug reports or fixes you make to the * email addresched(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> */ #include <linux/list.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> /* First Come First Serve (a.k.a. FIFO) * RFC DRAFT ndata Section 3.1 */ static int sctp_sched_fcfs_set(struct sctp_stream *stream, __u16 sid, __u16 value, gfp_t gfp) { return 0; } static int sctp_sched_fcfs_get(struct sctp_stream *stream, __u16 sid, __u16 *value) { *value = 0; return 0; } static int sctp_sched_fcfs_init(struct sctp_stream *stream) { return 0; } static int sctp_sched_fcfs_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { return 0; } static void sctp_sched_fcfs_free_sid(struct sctp_stream *stream, __u16 sid) { } static void sctp_sched_fcfs_enqueue(struct sctp_outq *q, struct sctp_datamsg *msg) { } static struct sctp_chunk *sctp_sched_fcfs_dequeue(struct sctp_outq *q) { struct sctp_stream *stream = &q->asoc->stream; struct sctp_chunk *ch = NULL; struct list_head *entry; if (list_empty(&q->out_chunk_list)) goto out; if (stream->out_curr) { ch = list_entry(stream->out_curr->ext->outq.next, struct sctp_chunk, stream_list); } else { entry = q->out_chunk_list.next; ch = list_entry(entry, struct sctp_chunk, list); } sctp_sched_dequeue_common(q, ch); out: return ch; } static void sctp_sched_fcfs_dequeue_done(struct sctp_outq *q, struct sctp_chunk *chunk) { } static void sctp_sched_fcfs_sched_all(struct sctp_stream *stream) { } static void sctp_sched_fcfs_unsched_all(struct sctp_stream *stream) { } static struct sctp_sched_ops sctp_sched_fcfs = { .set = sctp_sched_fcfs_set, .get = sctp_sched_fcfs_get, .init = sctp_sched_fcfs_init, .init_sid = sctp_sched_fcfs_init_sid, .free_sid = sctp_sched_fcfs_free_sid, .enqueue = sctp_sched_fcfs_enqueue, .dequeue = sctp_sched_fcfs_dequeue, .dequeue_done = sctp_sched_fcfs_dequeue_done, .sched_all = sctp_sched_fcfs_sched_all, .unsched_all = sctp_sched_fcfs_unsched_all, }; static void sctp_sched_ops_fcfs_init(void) { sctp_sched_ops_register(SCTP_SS_FCFS, 
&sctp_sched_fcfs); } /* API to other parts of the stack */ static struct sctp_sched_ops *sctp_sched_ops[SCTP_SS_MAX + 1]; void sctp_sched_ops_register(enum sctp_sched_type sched, struct sctp_sched_ops *sched_ops) { sctp_sched_ops[sched] = sched_ops; } void sctp_sched_ops_init(void) { sctp_sched_ops_fcfs_init(); sctp_sched_ops_prio_init(); sctp_sched_ops_rr_init(); sctp_sched_ops_fc_init(); sctp_sched_ops_wfq_init(); } static void sctp_sched_free_sched(struct sctp_stream *stream) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); struct sctp_stream_out_ext *soute; int i; sched->unsched_all(stream); for (i = 0; i < stream->outcnt; i++) { soute = SCTP_SO(stream, i)->ext; if (!soute) continue; sched->free_sid(stream, i); /* Give the next scheduler a clean slate. */ memset_after(soute, 0, outq); } } int sctp_sched_set_sched(struct sctp_association *asoc, enum sctp_sched_type sched) { struct sctp_sched_ops *old = asoc->outqueue.sched; struct sctp_datamsg *msg = NULL; struct sctp_sched_ops *n; struct sctp_chunk *ch; int i, ret = 0; if (sched > SCTP_SS_MAX) return -EINVAL; n = sctp_sched_ops[sched]; if (old == n) return ret; if (old) sctp_sched_free_sched(&asoc->stream); asoc->outqueue.sched = n; n->init(&asoc->stream); for (i = 0; i < asoc->stream.outcnt; i++) { if (!SCTP_SO(&asoc->stream, i)->ext) continue; ret = n->init_sid(&asoc->stream, i, GFP_ATOMIC); if (ret) goto err; } /* We have to requeue all chunks already queued. */ list_for_each_entry(ch, &asoc->outqueue.out_chunk_list, list) { if (ch->msg == msg) continue; msg = ch->msg; n->enqueue(&asoc->outqueue, msg); } return ret; err: sctp_sched_free_sched(&asoc->stream); asoc->outqueue.sched = &sctp_sched_fcfs; /* Always safe */ return ret; } int sctp_sched_get_sched(struct sctp_association *asoc) { int i; for (i = 0; i <= SCTP_SS_MAX; i++) if (asoc->outqueue.sched == sctp_sched_ops[i]) return i; return 0; } int sctp_sched_set_value(struct sctp_association *asoc, __u16 sid, __u16 value, gfp_t gfp) { if (sid >= asoc->stream.outcnt) return -EINVAL; if (!SCTP_SO(&asoc->stream, sid)->ext) { int ret; ret = sctp_stream_init_ext(&asoc->stream, sid); if (ret) return ret; } return asoc->outqueue.sched->set(&asoc->stream, sid, value, gfp); } int sctp_sched_get_value(struct sctp_association *asoc, __u16 sid, __u16 *value) { if (sid >= asoc->stream.outcnt) return -EINVAL; if (!SCTP_SO(&asoc->stream, sid)->ext) return 0; return asoc->outqueue.sched->get(&asoc->stream, sid, value); } void sctp_sched_dequeue_done(struct sctp_outq *q, struct sctp_chunk *ch) { if (!list_is_last(&ch->frag_list, &ch->msg->chunks) && !q->asoc->peer.intl_capable) { struct sctp_stream_out *sout; __u16 sid; /* datamsg is not finish, so save it as current one, * in case application switch scheduler or a higher * priority stream comes in. 
*/ sid = sctp_chunk_stream_no(ch); sout = SCTP_SO(&q->asoc->stream, sid); q->asoc->stream.out_curr = sout; return; } q->asoc->stream.out_curr = NULL; q->sched->dequeue_done(q, ch); } /* Auxiliary functions for the schedulers */ void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch) { list_del_init(&ch->list); list_del_init(&ch->stream_list); q->out_qlen -= ch->skb->len; } int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); struct sctp_stream_out_ext *ext = SCTP_SO(stream, sid)->ext; INIT_LIST_HEAD(&ext->outq); return sched->init_sid(stream, sid, gfp); } struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream) { struct sctp_association *asoc; asoc = container_of(stream, struct sctp_association, stream); return asoc->outqueue.sched; }
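That closes the SCTP stream scheduler glue. The file's central idea is a table of scheduler callbacks indexed by enum sctp_sched_type, filled by sctp_sched_ops_register() and switched at runtime by sctp_sched_set_sched(). The self-contained sketch below is a hedged model of that ops-table pattern; every demo_* name is invented for illustration and none of it is SCTP code.

/* Hedged sketch: pluggable schedulers registered in a table and dispatched by enum. */
#include <stdio.h>

enum demo_sched_type { DEMO_SS_FCFS, DEMO_SS_PRIO, DEMO_SS_MAX = DEMO_SS_PRIO };

struct demo_sched_ops {
	const char *name;
	void (*enqueue)(int item);
};

static void fcfs_enqueue(int item) { printf("fcfs: queued %d\n", item); }
static void prio_enqueue(int item) { printf("prio: queued %d\n", item); }

static struct demo_sched_ops fcfs_ops = { "fcfs", fcfs_enqueue };
static struct demo_sched_ops prio_ops = { "prio", prio_enqueue };

/* Table indexed by scheduler type, like the static sctp_sched_ops[] array above. */
static struct demo_sched_ops *demo_sched_ops[DEMO_SS_MAX + 1];

static void demo_sched_register(enum demo_sched_type t, struct demo_sched_ops *ops)
{
	demo_sched_ops[t] = ops;
}

int main(void)
{
	enum demo_sched_type active = DEMO_SS_FCFS;

	demo_sched_register(DEMO_SS_FCFS, &fcfs_ops);
	demo_sched_register(DEMO_SS_PRIO, &prio_ops);

	demo_sched_ops[active]->enqueue(1);	/* dispatch through the table */
	active = DEMO_SS_PRIO;			/* switch schedulers, like sctp_sched_set_sched() */
	demo_sched_ops[active]->enqueue(2);
	return 0;
}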
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * include/linux/eventfd.h
 *
 * Copyright (C) 2007 Davide Libenzi <davidel@xmailserver.org>
 *
 */

#ifndef _LINUX_EVENTFD_H
#define _LINUX_EVENTFD_H

#include <linux/wait.h>
#include <linux/err.h>
#include <linux/percpu-defs.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <uapi/linux/eventfd.h>

/*
 * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining
 * new flags, since they might collide with O_* ones. We want
 * to re-use O_* flags that couldn't possibly have a meaning
 * from eventfd, in order to leave a free define-space for
 * shared O_* flags.
 */
#define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
#define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)

struct eventfd_ctx;
struct file;

#ifdef CONFIG_EVENTFD

void eventfd_ctx_put(struct eventfd_ctx *ctx);
struct file *eventfd_fget(int fd);
struct eventfd_ctx *eventfd_ctx_fdget(int fd);
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask);
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
				  __u64 *cnt);
void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);

static inline bool eventfd_signal_allowed(void)
{
	return !current->in_eventfd;
}

#else /* CONFIG_EVENTFD */

/*
 * Ugly ugly ugly error layer to support modules that uses eventfd but
 * pretend to work in !CONFIG_EVENTFD configurations. Namely, AIO.
 */

static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
	return ERR_PTR(-ENOSYS);
}

static inline void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
{
}

static inline void eventfd_ctx_put(struct eventfd_ctx *ctx)
{
}

static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx,
						wait_queue_entry_t *wait, __u64 *cnt)
{
	return -ENOSYS;
}

static inline bool eventfd_signal_allowed(void)
{
	return true;
}

static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
}

#endif

static inline void eventfd_signal(struct eventfd_ctx *ctx)
{
	eventfd_signal_mask(ctx, 0);
}

#endif /* _LINUX_EVENTFD_H */
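The header above declares the in-kernel eventfd interface; the counter semantics it backs are easiest to see from user space. The sketch below is a hedged userspace companion using the glibc wrappers from <sys/eventfd.h> (eventfd(), eventfd_write(), eventfd_read()), not the in-kernel API declared above: writes add to the 64-bit counter, and a read returns the accumulated total and resets it to zero unless EFD_SEMAPHORE is set.

/* Hedged userspace sketch of eventfd counter semantics. */
#include <stdio.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	eventfd_t val;
	int efd = eventfd(0, EFD_CLOEXEC);	/* counter starts at 0 */

	if (efd < 0) {
		perror("eventfd");
		return 1;
	}
	eventfd_write(efd, 3);			/* counter += 3 */
	eventfd_write(efd, 4);			/* counter += 4 */
	eventfd_read(efd, &val);		/* returns 7 and resets the counter */
	printf("read back %llu\n", (unsigned long long)val);
	close(efd);
	return 0;
}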
239 238 239 239 379 1 5 4 1 4 4 5 1 1 5 3 5 4 5 1 1 1 1 12 1 2 1 6 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 // SPDX-License-Identifier: GPL-2.0-only // Copyright (c) 2020, Nikolay Aleksandrov <nikolay@cumulusnetworks.com> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <net/ip_tunnels.h> #include "br_private.h" #include "br_private_tunnel.h" static bool __vlan_tun_put(struct sk_buff *skb, const struct net_bridge_vlan *v) { __be32 tid = tunnel_id_to_key32(v->tinfo.tunnel_id); struct nlattr *nest; if (!v->tinfo.tunnel_dst) return true; nest = nla_nest_start(skb, BRIDGE_VLANDB_ENTRY_TUNNEL_INFO); if (!nest) return false; if (nla_put_u32(skb, 
BRIDGE_VLANDB_TINFO_ID, be32_to_cpu(tid))) { nla_nest_cancel(skb, nest); return false; } nla_nest_end(skb, nest); return true; } static bool __vlan_tun_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end) { return (!v_curr->tinfo.tunnel_dst && !range_end->tinfo.tunnel_dst) || vlan_tunid_inrange(v_curr, range_end); } /* check if the options' state of v_curr allow it to enter the range */ bool br_vlan_opts_eq_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *range_end) { u8 range_mc_rtr = br_vlan_multicast_router(range_end); u8 curr_mc_rtr = br_vlan_multicast_router(v_curr); return v_curr->state == range_end->state && __vlan_tun_can_enter_range(v_curr, range_end) && curr_mc_rtr == range_mc_rtr; } bool br_vlan_opts_fill(struct sk_buff *skb, const struct net_bridge_vlan *v, const struct net_bridge_port *p) { if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_STATE, br_vlan_get_state(v)) || !__vlan_tun_put(skb, v) || nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS, !!(v->priv_flags & BR_VLFLAG_NEIGH_SUPPRESS_ENABLED))) return false; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (nla_put_u8(skb, BRIDGE_VLANDB_ENTRY_MCAST_ROUTER, br_vlan_multicast_router(v))) return false; if (p && !br_multicast_port_ctx_vlan_disabled(&v->port_mcast_ctx) && (nla_put_u32(skb, BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS, br_multicast_ngroups_get(&v->port_mcast_ctx)) || nla_put_u32(skb, BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS, br_multicast_ngroups_get_max(&v->port_mcast_ctx)))) return false; #endif return true; } size_t br_vlan_opts_nl_size(void) { return nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_STATE */ + nla_total_size(0) /* BRIDGE_VLANDB_ENTRY_TUNNEL_INFO */ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_TINFO_ID */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_MCAST_ROUTER */ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_N_GROUPS */ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS */ #endif + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS */ + 0; } static int br_vlan_modify_state(struct net_bridge_vlan_group *vg, struct net_bridge_vlan *v, u8 state, bool *changed, struct netlink_ext_ack *extack) { struct net_bridge *br; ASSERT_RTNL(); if (state > BR_STATE_BLOCKING) { NL_SET_ERR_MSG_MOD(extack, "Invalid vlan state"); return -EINVAL; } if (br_vlan_is_brentry(v)) br = v->br; else br = v->port->br; if (br->stp_enabled == BR_KERNEL_STP) { NL_SET_ERR_MSG_MOD(extack, "Can't modify vlan state when using kernel STP"); return -EBUSY; } if (br_opt_get(br, BROPT_MST_ENABLED)) { NL_SET_ERR_MSG_MOD(extack, "Can't modify vlan state directly when MST is enabled"); return -EBUSY; } if (v->state == state) return 0; if (v->vid == br_get_pvid(vg)) br_vlan_set_pvid_state(vg, state); br_vlan_set_state(v, state); *changed = true; return 0; } static const struct nla_policy br_vlandb_tinfo_pol[BRIDGE_VLANDB_TINFO_MAX + 1] = { [BRIDGE_VLANDB_TINFO_ID] = { .type = NLA_U32 }, [BRIDGE_VLANDB_TINFO_CMD] = { .type = NLA_U32 }, }; static int br_vlan_modify_tunnel(const struct net_bridge_port *p, struct net_bridge_vlan *v, struct nlattr **tb, bool *changed, struct netlink_ext_ack *extack) { struct nlattr *tun_tb[BRIDGE_VLANDB_TINFO_MAX + 1], *attr; struct bridge_vlan_info *vinfo; u32 tun_id = 0; int cmd, err; if (!p) { NL_SET_ERR_MSG_MOD(extack, "Can't modify tunnel mapping of non-port vlans"); return -EINVAL; } if (!(p->flags & BR_VLAN_TUNNEL)) { NL_SET_ERR_MSG_MOD(extack, "Port doesn't have tunnel flag 
set"); return -EINVAL; } attr = tb[BRIDGE_VLANDB_ENTRY_TUNNEL_INFO]; err = nla_parse_nested(tun_tb, BRIDGE_VLANDB_TINFO_MAX, attr, br_vlandb_tinfo_pol, extack); if (err) return err; if (!tun_tb[BRIDGE_VLANDB_TINFO_CMD]) { NL_SET_ERR_MSG_MOD(extack, "Missing tunnel command attribute"); return -ENOENT; } cmd = nla_get_u32(tun_tb[BRIDGE_VLANDB_TINFO_CMD]); switch (cmd) { case RTM_SETLINK: if (!tun_tb[BRIDGE_VLANDB_TINFO_ID]) { NL_SET_ERR_MSG_MOD(extack, "Missing tunnel id attribute"); return -ENOENT; } /* when working on vlan ranges this is the starting tunnel id */ tun_id = nla_get_u32(tun_tb[BRIDGE_VLANDB_TINFO_ID]); /* vlan info attr is guaranteed by br_vlan_rtm_process_one */ vinfo = nla_data(tb[BRIDGE_VLANDB_ENTRY_INFO]); /* tunnel ids are mapped to each vlan in increasing order, * the starting vlan is in BRIDGE_VLANDB_ENTRY_INFO and v is the * current vlan, so we compute: tun_id + v - vinfo->vid */ tun_id += v->vid - vinfo->vid; break; case RTM_DELLINK: break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported tunnel command"); return -EINVAL; } return br_vlan_tunnel_info(p, cmd, v->vid, tun_id, changed); } static int br_vlan_process_one_opts(const struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_vlan_group *vg, struct net_bridge_vlan *v, struct nlattr **tb, bool *changed, struct netlink_ext_ack *extack) { int err; *changed = false; if (tb[BRIDGE_VLANDB_ENTRY_STATE]) { u8 state = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_STATE]); err = br_vlan_modify_state(vg, v, state, changed, extack); if (err) return err; } if (tb[BRIDGE_VLANDB_ENTRY_TUNNEL_INFO]) { err = br_vlan_modify_tunnel(p, v, tb, changed, extack); if (err) return err; } #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (tb[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]) { u8 val; val = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_MCAST_ROUTER]); err = br_multicast_set_vlan_router(v, val); if (err) return err; *changed = true; } if (tb[BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS]) { u32 val; if (!p) { NL_SET_ERR_MSG_MOD(extack, "Can't set mcast_max_groups for non-port vlans"); return -EINVAL; } if (br_multicast_port_ctx_vlan_disabled(&v->port_mcast_ctx)) { NL_SET_ERR_MSG_MOD(extack, "Multicast snooping disabled on this VLAN"); return -EINVAL; } val = nla_get_u32(tb[BRIDGE_VLANDB_ENTRY_MCAST_MAX_GROUPS]); br_multicast_ngroups_set_max(&v->port_mcast_ctx, val); *changed = true; } #endif if (tb[BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS]) { bool enabled = v->priv_flags & BR_VLFLAG_NEIGH_SUPPRESS_ENABLED; bool val = nla_get_u8(tb[BRIDGE_VLANDB_ENTRY_NEIGH_SUPPRESS]); if (!p) { NL_SET_ERR_MSG_MOD(extack, "Can't set neigh_suppress for non-port vlans"); return -EINVAL; } if (val != enabled) { v->priv_flags ^= BR_VLFLAG_NEIGH_SUPPRESS_ENABLED; *changed = true; } } return 0; } int br_vlan_process_options(const struct net_bridge *br, const struct net_bridge_port *p, struct net_bridge_vlan *range_start, struct net_bridge_vlan *range_end, struct nlattr **tb, struct netlink_ext_ack *extack) { struct net_bridge_vlan *v, *curr_start = NULL, *curr_end = NULL; struct net_bridge_vlan_group *vg; int vid, err = 0; u16 pvid; if (p) vg = nbp_vlan_group(p); else vg = br_vlan_group(br); if (!range_start || !br_vlan_should_use(range_start)) { NL_SET_ERR_MSG_MOD(extack, "Vlan range start doesn't exist, can't process options"); return -ENOENT; } if (!range_end || !br_vlan_should_use(range_end)) { NL_SET_ERR_MSG_MOD(extack, "Vlan range end doesn't exist, can't process options"); return -ENOENT; } pvid = br_get_pvid(vg); for (vid = range_start->vid; vid <= range_end->vid; vid++) { bool 
changed = false; v = br_vlan_find(vg, vid); if (!v || !br_vlan_should_use(v)) { NL_SET_ERR_MSG_MOD(extack, "Vlan in range doesn't exist, can't process options"); err = -ENOENT; break; } err = br_vlan_process_one_opts(br, p, vg, v, tb, &changed, extack); if (err) break; if (changed) { /* vlan options changed, check for range */ if (!curr_start) { curr_start = v; curr_end = v; continue; } if (v->vid == pvid || !br_vlan_can_enter_range(v, curr_end)) { br_vlan_notify(br, p, curr_start->vid, curr_end->vid, RTM_NEWVLAN); curr_start = v; } curr_end = v; } else { /* nothing changed and nothing to notify yet */ if (!curr_start) continue; br_vlan_notify(br, p, curr_start->vid, curr_end->vid, RTM_NEWVLAN); curr_start = NULL; curr_end = NULL; } } if (curr_start) br_vlan_notify(br, p, curr_start->vid, curr_end->vid, RTM_NEWVLAN); return err; } bool br_vlan_global_opts_can_enter_range(const struct net_bridge_vlan *v_curr, const struct net_bridge_vlan *r_end) { return v_curr->vid - r_end->vid == 1 && v_curr->msti == r_end->msti && ((v_curr->priv_flags ^ r_end->priv_flags) & BR_VLFLAG_GLOBAL_MCAST_ENABLED) == 0 && br_multicast_ctx_options_equal(&v_curr->br_mcast_ctx, &r_end->br_mcast_ctx); } bool br_vlan_global_opts_fill(struct sk_buff *skb, u16 vid, u16 vid_range, const struct net_bridge_vlan *v_opts) { struct nlattr *nest2 __maybe_unused; u64 clockval __maybe_unused; struct nlattr *nest; nest = nla_nest_start(skb, BRIDGE_VLANDB_GLOBAL_OPTIONS); if (!nest) return false; if (nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_ID, vid)) goto out_err; if (vid_range && vid < vid_range && nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_RANGE, vid_range)) goto out_err; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING, !!(v_opts->priv_flags & BR_VLFLAG_GLOBAL_MCAST_ENABLED)) || nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION, v_opts->br_mcast_ctx.multicast_igmp_version) || nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT, v_opts->br_mcast_ctx.multicast_last_member_count) || nla_put_u32(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT, v_opts->br_mcast_ctx.multicast_startup_query_count) || nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER, v_opts->br_mcast_ctx.multicast_querier) || br_multicast_dump_querier_state(skb, &v_opts->br_mcast_ctx, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE)) goto out_err; clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_last_member_interval); if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL, clockval, BRIDGE_VLANDB_GOPTS_PAD)) goto out_err; clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_membership_interval); if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL, clockval, BRIDGE_VLANDB_GOPTS_PAD)) goto out_err; clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_querier_interval); if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL, clockval, BRIDGE_VLANDB_GOPTS_PAD)) goto out_err; clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_query_interval); if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL, clockval, BRIDGE_VLANDB_GOPTS_PAD)) goto out_err; clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_query_response_interval); if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL, clockval, BRIDGE_VLANDB_GOPTS_PAD)) goto out_err; clockval = jiffies_to_clock_t(v_opts->br_mcast_ctx.multicast_startup_query_interval); if (nla_put_u64_64bit(skb, BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL, clockval, 
BRIDGE_VLANDB_GOPTS_PAD)) goto out_err; if (br_rports_have_mc_router(&v_opts->br_mcast_ctx)) { nest2 = nla_nest_start(skb, BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS); if (!nest2) goto out_err; rcu_read_lock(); if (br_rports_fill_info(skb, &v_opts->br_mcast_ctx)) { rcu_read_unlock(); nla_nest_cancel(skb, nest2); goto out_err; } rcu_read_unlock(); nla_nest_end(skb, nest2); } #if IS_ENABLED(CONFIG_IPV6) if (nla_put_u8(skb, BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION, v_opts->br_mcast_ctx.multicast_mld_version)) goto out_err; #endif #endif if (nla_put_u16(skb, BRIDGE_VLANDB_GOPTS_MSTI, v_opts->msti)) goto out_err; nla_nest_end(skb, nest); return true; out_err: nla_nest_cancel(skb, nest); return false; } static size_t rtnl_vlan_global_opts_nlmsg_size(const struct net_bridge_vlan *v) { return NLMSG_ALIGN(sizeof(struct br_vlan_msg)) + nla_total_size(0) /* BRIDGE_VLANDB_GLOBAL_OPTIONS */ + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_GOPTS_ID */ #ifdef CONFIG_BRIDGE_IGMP_SNOOPING + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING */ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION */ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION */ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT */ + nla_total_size(sizeof(u32)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL */ + nla_total_size(sizeof(u64)) /* BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL */ + nla_total_size(sizeof(u8)) /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER */ + br_multicast_querier_state_size() /* BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_STATE */ + nla_total_size(0) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS */ + br_rports_size(&v->br_mcast_ctx) /* BRIDGE_VLANDB_GOPTS_MCAST_ROUTER_PORTS */ #endif + nla_total_size(sizeof(u16)) /* BRIDGE_VLANDB_GOPTS_MSTI */ + nla_total_size(sizeof(u16)); /* BRIDGE_VLANDB_GOPTS_RANGE */ } static void br_vlan_global_opts_notify(const struct net_bridge *br, u16 vid, u16 vid_range) { struct net_bridge_vlan *v; struct br_vlan_msg *bvm; struct nlmsghdr *nlh; struct sk_buff *skb; int err = -ENOBUFS; /* right now notifications are done only with rtnl held */ ASSERT_RTNL(); /* need to find the vlan due to flags/options */ v = br_vlan_find(br_vlan_group(br), vid); if (!v) return; skb = nlmsg_new(rtnl_vlan_global_opts_nlmsg_size(v), GFP_KERNEL); if (!skb) goto out_err; err = -EMSGSIZE; nlh = nlmsg_put(skb, 0, 0, RTM_NEWVLAN, sizeof(*bvm), 0); if (!nlh) goto out_err; bvm = nlmsg_data(nlh); memset(bvm, 0, sizeof(*bvm)); bvm->family = AF_BRIDGE; bvm->ifindex = br->dev->ifindex; if (!br_vlan_global_opts_fill(skb, vid, vid_range, v)) goto out_err; nlmsg_end(skb, nlh); rtnl_notify(skb, dev_net(br->dev), 0, RTNLGRP_BRVLAN, NULL, GFP_KERNEL); return; out_err: rtnl_set_sk_err(dev_net(br->dev), RTNLGRP_BRVLAN, err); kfree_skb(skb); } static int br_vlan_process_global_one_opts(const struct net_bridge *br, struct net_bridge_vlan_group *vg, struct net_bridge_vlan *v, struct nlattr **tb, bool *changed, struct netlink_ext_ack *extack) { int err __maybe_unused; *changed = false; #ifdef CONFIG_BRIDGE_IGMP_SNOOPING if (tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]) { u8 mc_snooping; mc_snooping = 
nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING]); if (br_multicast_toggle_global_vlan(v, !!mc_snooping)) *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]) { u8 ver; ver = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION]); err = br_multicast_set_igmp_version(&v->br_mcast_ctx, ver); if (err) return err; *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]) { u32 cnt; cnt = nla_get_u32(tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT]); v->br_mcast_ctx.multicast_last_member_count = cnt; *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]) { u32 cnt; cnt = nla_get_u32(tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT]); v->br_mcast_ctx.multicast_startup_query_count = cnt; *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]) { u64 val; val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL]); v->br_mcast_ctx.multicast_last_member_interval = clock_t_to_jiffies(val); *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]) { u64 val; val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL]); v->br_mcast_ctx.multicast_membership_interval = clock_t_to_jiffies(val); *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL]) { u64 val; val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL]); v->br_mcast_ctx.multicast_querier_interval = clock_t_to_jiffies(val); *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]) { u64 val; val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL]); br_multicast_set_query_intvl(&v->br_mcast_ctx, val); *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL]) { u64 val; val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL]); v->br_mcast_ctx.multicast_query_response_interval = clock_t_to_jiffies(val); *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL]) { u64 val; val = nla_get_u64(tb[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL]); br_multicast_set_startup_query_intvl(&v->br_mcast_ctx, val); *changed = true; } if (tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER]) { u8 val; val = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_QUERIER]); err = br_multicast_set_querier(&v->br_mcast_ctx, val); if (err) return err; *changed = true; } #if IS_ENABLED(CONFIG_IPV6) if (tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]) { u8 ver; ver = nla_get_u8(tb[BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION]); err = br_multicast_set_mld_version(&v->br_mcast_ctx, ver); if (err) return err; *changed = true; } #endif #endif if (tb[BRIDGE_VLANDB_GOPTS_MSTI]) { u16 msti; msti = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_MSTI]); err = br_mst_vlan_set_msti(v, msti); if (err) return err; *changed = true; } return 0; } static const struct nla_policy br_vlan_db_gpol[BRIDGE_VLANDB_GOPTS_MAX + 1] = { [BRIDGE_VLANDB_GOPTS_ID] = { .type = NLA_U16 }, [BRIDGE_VLANDB_GOPTS_RANGE] = { .type = NLA_U16 }, [BRIDGE_VLANDB_GOPTS_MCAST_SNOOPING] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_MLD_VERSION] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_CNT] = { .type = NLA_U32 }, [BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_CNT] = { .type = NLA_U32 }, [BRIDGE_VLANDB_GOPTS_MCAST_LAST_MEMBER_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_MEMBERSHIP_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_QUERIER_INTVL] = { .type = NLA_U64 }, 
[BRIDGE_VLANDB_GOPTS_MCAST_STARTUP_QUERY_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MCAST_QUERY_RESPONSE_INTVL] = { .type = NLA_U64 }, [BRIDGE_VLANDB_GOPTS_MSTI] = NLA_POLICY_MAX(NLA_U16, VLAN_N_VID - 1), }; int br_vlan_rtm_process_global_options(struct net_device *dev, const struct nlattr *attr, int cmd, struct netlink_ext_ack *extack) { struct net_bridge_vlan *v, *curr_start = NULL, *curr_end = NULL; struct nlattr *tb[BRIDGE_VLANDB_GOPTS_MAX + 1]; struct net_bridge_vlan_group *vg; u16 vid, vid_range = 0; struct net_bridge *br; int err = 0; if (cmd != RTM_NEWVLAN) { NL_SET_ERR_MSG_MOD(extack, "Global vlan options support only set operation"); return -EINVAL; } if (!netif_is_bridge_master(dev)) { NL_SET_ERR_MSG_MOD(extack, "Global vlan options can only be set on bridge device"); return -EINVAL; } br = netdev_priv(dev); vg = br_vlan_group(br); if (WARN_ON(!vg)) return -ENODEV; err = nla_parse_nested(tb, BRIDGE_VLANDB_GOPTS_MAX, attr, br_vlan_db_gpol, extack); if (err) return err; if (!tb[BRIDGE_VLANDB_GOPTS_ID]) { NL_SET_ERR_MSG_MOD(extack, "Missing vlan entry id"); return -EINVAL; } vid = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_ID]); if (!br_vlan_valid_id(vid, extack)) return -EINVAL; if (tb[BRIDGE_VLANDB_GOPTS_RANGE]) { vid_range = nla_get_u16(tb[BRIDGE_VLANDB_GOPTS_RANGE]); if (!br_vlan_valid_id(vid_range, extack)) return -EINVAL; if (vid >= vid_range) { NL_SET_ERR_MSG_MOD(extack, "End vlan id is less than or equal to start vlan id"); return -EINVAL; } } else { vid_range = vid; } for (; vid <= vid_range; vid++) { bool changed = false; v = br_vlan_find(vg, vid); if (!v) { NL_SET_ERR_MSG_MOD(extack, "Vlan in range doesn't exist, can't process global options"); err = -ENOENT; break; } err = br_vlan_process_global_one_opts(br, vg, v, tb, &changed, extack); if (err) break; if (changed) { /* vlan options changed, check for range */ if (!curr_start) { curr_start = v; curr_end = v; continue; } if (!br_vlan_global_opts_can_enter_range(v, curr_end)) { br_vlan_global_opts_notify(br, curr_start->vid, curr_end->vid); curr_start = v; } curr_end = v; } else { /* nothing changed and nothing to notify yet */ if (!curr_start) continue; br_vlan_global_opts_notify(br, curr_start->vid, curr_end->vid); curr_start = NULL; curr_end = NULL; } } if (curr_start) br_vlan_global_opts_notify(br, curr_start->vid, curr_end->vid); return err; }
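The tunnel-mapping comment in br_vlan_modify_tunnel() above ("tun_id + v - vinfo->vid") is easy to misread. The standalone sketch below, using made-up vlan and tunnel ids, simply works through that arithmetic for a range request; it is not part of the bridge code.

// Illustrative only: consecutive tunnel ids are assigned across a vlan range,
// starting from the id carried in BRIDGE_VLANDB_TINFO_ID. All values here are
// hypothetical.
#include <stdio.h>

int main(void)
{
	unsigned int range_start_vid = 100;	/* hypothetical starting vlan id */
	unsigned int base_tun_id = 1000;	/* hypothetical starting tunnel id */

	for (unsigned int vid = range_start_vid; vid <= range_start_vid + 3; vid++) {
		/* same arithmetic as the kernel: base id + (current vid - starting vid) */
		unsigned int tun_id = base_tun_id + (vid - range_start_vid);
		printf("vlan %u -> tunnel id %u\n", vid, tun_id);
	}
	return 0;
}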
// SPDX-License-Identifier: GPL-2.0
/*
 * ext4.h
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 * from
 *
 * linux/include/linux/minix_fs.h
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 */

#ifndef _EXT4_H
#define _EXT4_H

#include <linux/refcount.h>
#include <linux/types.h>
#include <linux/blkdev.h>
#include <linux/magic.h>
#include <linux/jbd2.h>
#include <linux/quota.h>
#include <linux/rwsem.h>
#include <linux/rbtree.h>
#include <linux/seqlock.h>
#include <linux/mutex.h>
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/blockgroup_lock.h>
#include <linux/percpu_counter.h>
#include <linux/ratelimit.h>
#include <linux/crc32c.h>
#include <linux/falloc.h>
#include <linux/percpu-rwsem.h>
#include <linux/fiemap.h>
#ifdef __KERNEL__
#include <linux/compat.h>
#endif
#include <uapi/linux/ext4.h>

#include <linux/fscrypt.h>
#include <linux/fsverity.h>

#include <linux/compiler.h>

/*
 * The fourth extended filesystem constants/structures
 */

/*
 * with AGGRESSIVE_CHECK allocator runs consistency checks over
 * structures. these checks slow things down a lot
 */
#define AGGRESSIVE_CHECK__

/*
 * with DOUBLE_CHECK defined mballoc creates persistent in-core
 * bitmaps, maintains and uses them to check for double allocations
 */
#define DOUBLE_CHECK__

/*
 * Define EXT4FS_DEBUG to produce debug messages
 */
#undef EXT4FS_DEBUG

/*
 * Debug code
 */
#ifdef EXT4FS_DEBUG
#define ext4_debug(f, a...) \
	do { \
		printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
			__FILE__, __LINE__, __func__); \
		printk(KERN_DEBUG f, ## a); \
	} while (0)
#else
#define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__)
#endif

/*
 * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c
 */
#define EXT_DEBUG__

/*
 * Dynamic printk for controlled extents debugging.
 */
#ifdef CONFIG_EXT4_DEBUG
#define ext_debug(ino, fmt, ...) \
	pr_debug("[%s/%d] EXT4-fs (%s): ino %lu: (%s, %d): %s:" fmt, \
		 current->comm, task_pid_nr(current), \
		 ino->i_sb->s_id, ino->i_ino, __FILE__, __LINE__, \
		 __func__, ##__VA_ARGS__)
#else
#define ext_debug(ino, fmt, ...)
no_printk(fmt, ##__VA_ARGS__) #endif #define ASSERT(assert) \ do { \ if (unlikely(!(assert))) { \ printk(KERN_EMERG \ "Assertion failure in %s() at %s:%d: '%s'\n", \ __func__, __FILE__, __LINE__, #assert); \ BUG(); \ } \ } while (0) /* data type for block offset of block group */ typedef int ext4_grpblk_t; /* data type for filesystem-wide blocks number */ typedef unsigned long long ext4_fsblk_t; /* data type for file logical block number */ typedef __u32 ext4_lblk_t; /* data type for block group number */ typedef unsigned int ext4_group_t; enum SHIFT_DIRECTION { SHIFT_LEFT = 0, SHIFT_RIGHT, }; /* * For each criteria, mballoc has slightly different way of finding * the required blocks nad usually, higher the criteria the slower the * allocation. We start at lower criterias and keep falling back to * higher ones if we are not able to find any blocks. Lower (earlier) * criteria are faster. */ enum criteria { /* * Used when number of blocks needed is a power of 2. This * doesn't trigger any disk IO except prefetch and is the * fastest criteria. */ CR_POWER2_ALIGNED, /* * Tries to lookup in-memory data structures to find the most * suitable group that satisfies goal request. No disk IO * except block prefetch. */ CR_GOAL_LEN_FAST, /* * Same as CR_GOAL_LEN_FAST but is allowed to reduce the goal * length to the best available length for faster allocation. */ CR_BEST_AVAIL_LEN, /* * Reads each block group sequentially, performing disk IO if * necessary, to find suitable block group. Tries to * allocate goal length but might trim the request if nothing * is found after enough tries. */ CR_GOAL_LEN_SLOW, /* * Finds the first free set of blocks and allocates * those. This is only used in rare cases when * CR_GOAL_LEN_SLOW also fails to allocate anything. */ CR_ANY_FREE, /* * Number of criterias defined. */ EXT4_MB_NUM_CRS }; /* * Flags used in mballoc's allocation_context flags field. * * Also used to show what's going on for debugging purposes when the * flag field is exported via the traceport interface */ /* prefer goal again. length */ #define EXT4_MB_HINT_MERGE 0x0001 /* first blocks in the file */ #define EXT4_MB_HINT_FIRST 0x0008 /* data is being allocated */ #define EXT4_MB_HINT_DATA 0x0020 /* don't preallocate (for tails) */ #define EXT4_MB_HINT_NOPREALLOC 0x0040 /* allocate for locality group */ #define EXT4_MB_HINT_GROUP_ALLOC 0x0080 /* allocate goal blocks or none */ #define EXT4_MB_HINT_GOAL_ONLY 0x0100 /* goal is meaningful */ #define EXT4_MB_HINT_TRY_GOAL 0x0200 /* blocks already pre-reserved by delayed allocation */ #define EXT4_MB_DELALLOC_RESERVED 0x0400 /* We are doing stream allocation */ #define EXT4_MB_STREAM_ALLOC 0x0800 /* Use reserved root blocks if needed */ #define EXT4_MB_USE_ROOT_BLOCKS 0x1000 /* Use blocks from reserved pool */ #define EXT4_MB_USE_RESERVED 0x2000 /* Do strict check for free blocks while retrying block allocation */ #define EXT4_MB_STRICT_CHECK 0x4000 struct ext4_allocation_request { /* target inode for block we're allocating */ struct inode *inode; /* how many blocks we want to allocate */ unsigned int len; /* logical block in target inode */ ext4_lblk_t logical; /* the closest logical allocated block to the left */ ext4_lblk_t lleft; /* the closest logical allocated block to the right */ ext4_lblk_t lright; /* phys. target (a hint) */ ext4_fsblk_t goal; /* phys. block for the closest logical allocated block to the left */ ext4_fsblk_t pleft; /* phys. block for the closest logical allocated block to the right */ ext4_fsblk_t pright; /* flags. 
see above EXT4_MB_HINT_* */ unsigned int flags; }; /* * Logical to physical block mapping, used by ext4_map_blocks() * * This structure is used to pass requests into ext4_map_blocks() as * well as to store the information returned by ext4_map_blocks(). It * takes less room on the stack than a struct buffer_head. */ #define EXT4_MAP_NEW BIT(BH_New) #define EXT4_MAP_MAPPED BIT(BH_Mapped) #define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) #define EXT4_MAP_BOUNDARY BIT(BH_Boundary) #define EXT4_MAP_DELAYED BIT(BH_Delay) /* * This is for use in ext4_map_query_blocks() for a special case where we can * have a physically and logically contiguous blocks split across two leaf * nodes instead of a single extent. This is required in case of atomic writes * to know whether the returned extent is last in leaf. If yes, then lookup for * next in leaf block in ext4_map_query_blocks_next_in_leaf(). * - This is never going to be added to any buffer head state. * - We use the next available bit after BH_BITMAP_UPTODATE. */ #define EXT4_MAP_QUERY_LAST_IN_LEAF BIT(BH_BITMAP_UPTODATE + 1) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ EXT4_MAP_DELAYED | EXT4_MAP_QUERY_LAST_IN_LEAF) struct ext4_map_blocks { ext4_fsblk_t m_pblk; ext4_lblk_t m_lblk; unsigned int m_len; unsigned int m_flags; }; /* * Block validity checking, system zone rbtree. */ struct ext4_system_blocks { struct rb_root root; struct rcu_head rcu; }; /* * Flags for ext4_io_end->flags */ #define EXT4_IO_END_UNWRITTEN 0x0001 #define EXT4_IO_END_FAILED 0x0002 #define EXT4_IO_END_DEFER_COMPLETION (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_FAILED) struct ext4_io_end_vec { struct list_head list; /* list of io_end_vec */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ }; /* * For converting unwritten extents on a work queue. 'handle' is used for * buffered writeback. 
*/ typedef struct ext4_io_end { struct list_head list; /* per-file finished IO list */ handle_t *handle; /* handle reserved for extent * conversion */ struct inode *inode; /* file being written to */ struct bio *bio; /* Linked list of completed * bios covering the extent */ unsigned int flag; /* unwritten or not */ refcount_t count; /* reference counter */ struct list_head list_vec; /* list of ext4_io_end_vec */ } ext4_io_end_t; struct ext4_io_submit { struct writeback_control *io_wbc; struct bio *io_bio; ext4_io_end_t *io_end; sector_t io_next_block; }; /* * Special inodes numbers */ #define EXT4_BAD_INO 1 /* Bad blocks inode */ #define EXT4_ROOT_INO 2 /* Root inode */ #define EXT4_USR_QUOTA_INO 3 /* User quota inode */ #define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ #define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ #define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ #define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ #define EXT4_JOURNAL_INO 8 /* Journal inode */ /* First non-reserved inode for old ext4 filesystems */ #define EXT4_GOOD_OLD_FIRST_INO 11 /* * Maximal count of links to a file */ #define EXT4_LINK_MAX 65000 /* * Macro-instructions used to manage several block sizes */ #define EXT4_MIN_BLOCK_SIZE 1024 #define EXT4_MAX_BLOCK_SIZE 65536 #define EXT4_MIN_BLOCK_LOG_SIZE 10 #define EXT4_MAX_BLOCK_LOG_SIZE 16 #define EXT4_MAX_CLUSTER_LOG_SIZE 30 #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) #else # define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) #endif #define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) #define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ EXT4_SB(s)->s_cluster_bits) #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) # define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) #else # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) #endif #ifdef __KERNEL__ #define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) #define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) #define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) #else #define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ EXT4_GOOD_OLD_INODE_SIZE : \ (s)->s_inode_size) #define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? 
\ EXT4_GOOD_OLD_FIRST_INO : \ (s)->s_first_ino) #endif #define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) #define EXT4_MAX_BLOCKS(size, offset, blkbits) \ ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ blkbits)) #define EXT4_B_TO_LBLK(inode, offset) \ (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits) /* Translate a block number to a cluster number */ #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) /* Translate a cluster number to a block number */ #define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) /* Translate # of blks to # of clusters */ #define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ (sbi)->s_cluster_bits) /* Mask out the low bits to get the starting block of the cluster */ #define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) #define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) /* Fill in the low bits to get the last block of the cluster */ #define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \ ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1)) /* Get the cluster offset */ #define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) #define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) /* * Structure of a blocks group descriptor */ struct ext4_group_desc { __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ __le32 bg_inode_table_lo; /* Inodes table block */ __le16 bg_free_blocks_count_lo;/* Free blocks count */ __le16 bg_free_inodes_count_lo;/* Free inodes count */ __le16 bg_used_dirs_count_lo; /* Directories count */ __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ __le16 bg_itable_unused_lo; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ __le32 bg_inode_table_hi; /* Inodes table block MSB */ __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ __le16 bg_used_dirs_count_hi; /* Directories count MSB */ __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ __u32 bg_reserved; }; #define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ sizeof(__le16)) #define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ sizeof(__le16)) /* * Structure of a flex block group info */ struct flex_groups { atomic64_t free_clusters; atomic_t free_inodes; atomic_t used_dirs; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ /* * Macro-instructions used to manage group descriptors */ #define EXT4_MIN_DESC_SIZE 32 #define EXT4_MIN_DESC_SIZE_64BIT 64 #define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE #define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) #ifdef 
__KERNEL__ # define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) # define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) # define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) # define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) # define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) #else # define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) # define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) # define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) #endif /* * Constants relative to the data blocks */ #define EXT4_NDIR_BLOCKS 12 #define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS #define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) #define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) #define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) /* * Inode flags */ #define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ #define EXT4_UNRM_FL 0x00000002 /* Undelete */ #define EXT4_COMPR_FL 0x00000004 /* Compress file */ #define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ #define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ #define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ #define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ #define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ /* Reserved for compression usage... */ #define EXT4_DIRTY_FL 0x00000100 #define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ #define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ /* nb: was previously EXT2_ECOMPR_FL */ #define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */ /* End compression flags --- maybe not all used */ #define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ #define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ #define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ #define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ #define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ /* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */ #define EXT4_DAX_FL 0x02000000 /* Inode is DAX */ #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ /* User modifiable flags */ #define EXT4_FL_USER_MODIFIABLE (EXT4_SECRM_FL | \ EXT4_UNRM_FL | \ EXT4_COMPR_FL | \ EXT4_SYNC_FL | \ EXT4_IMMUTABLE_FL | \ EXT4_APPEND_FL | \ EXT4_NODUMP_FL | \ EXT4_NOATIME_FL | \ EXT4_JOURNAL_DATA_FL | \ EXT4_NOTAIL_FL | \ EXT4_DIRSYNC_FL | \ EXT4_TOPDIR_FL | \ EXT4_EXTENTS_FL | \ 0x00400000 /* EXT4_EOFBLOCKS_FL */ | \ EXT4_DAX_FL | \ EXT4_PROJINHERIT_FL | \ EXT4_CASEFOLD_FL) /* User visible flags */ #define EXT4_FL_USER_VISIBLE (EXT4_FL_USER_MODIFIABLE | \ EXT4_DIRTY_FL | \ EXT4_COMPRBLK_FL | \ EXT4_NOCOMPR_FL | \ EXT4_ENCRYPT_FL | \ EXT4_INDEX_FL | \ EXT4_VERITY_FL | \ EXT4_INLINE_DATA_FL) /* Flags that should be inherited by new inodes from their parent. 
*/ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\ EXT4_DAX_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\ EXT4_PROJINHERIT_FL)) /* Flags that are appropriate for non-directories/regular files. */ #define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) /* The only flags that should be swapped */ #define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) /* Flags which are mutually exclusive to DAX */ #define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\ EXT4_JOURNAL_DATA_FL | EXT4_INLINE_DATA_FL) /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) { if (S_ISDIR(mode)) return flags; else if (S_ISREG(mode)) return flags & EXT4_REG_FLMASK; else return flags & EXT4_OTHER_FLMASK; } /* * Inode flags used for atomic set/get */ enum { EXT4_INODE_SECRM = 0, /* Secure deletion */ EXT4_INODE_UNRM = 1, /* Undelete */ EXT4_INODE_COMPR = 2, /* Compress file */ EXT4_INODE_SYNC = 3, /* Synchronous updates */ EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ EXT4_INODE_APPEND = 5, /* writes to file may only append */ EXT4_INODE_NODUMP = 6, /* do not dump file */ EXT4_INODE_NOATIME = 7, /* do not update atime */ /* Reserved for compression usage... */ EXT4_INODE_DIRTY = 8, EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ EXT4_INODE_NOCOMPR = 10, /* Don't compress */ EXT4_INODE_ENCRYPT = 11, /* Encrypted file */ /* End compression flags --- maybe not all used */ EXT4_INODE_INDEX = 12, /* hash-indexed directory */ EXT4_INODE_IMAGIC = 13, /* AFS directory */ EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ EXT4_INODE_VERITY = 20, /* Verity protected inode */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ /* 22 was formerly EXT4_INODE_EOFBLOCKS */ EXT4_INODE_DAX = 25, /* Inode is DAX */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ EXT4_INODE_CASEFOLD = 30, /* Casefolded directory */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; /* * Since it's pretty easy to mix up bit numbers and hex values, we use a * build-time check to make sure that EXT4_XXX_FL is consistent with respect to * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost * any extra space in the compiled kernel image, otherwise, the build will fail. * It's important that these values are the same, since we are using * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk * values found in ext2, ext3 and ext4 filesystems, and of course the values * defined in e2fsprogs. * * It's not paranoia if the Murphy's Law really *is* out to get you. 
:-) */ #define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG)) #define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) static inline void ext4_check_flag_values(void) { CHECK_FLAG_VALUE(SECRM); CHECK_FLAG_VALUE(UNRM); CHECK_FLAG_VALUE(COMPR); CHECK_FLAG_VALUE(SYNC); CHECK_FLAG_VALUE(IMMUTABLE); CHECK_FLAG_VALUE(APPEND); CHECK_FLAG_VALUE(NODUMP); CHECK_FLAG_VALUE(NOATIME); CHECK_FLAG_VALUE(DIRTY); CHECK_FLAG_VALUE(COMPRBLK); CHECK_FLAG_VALUE(NOCOMPR); CHECK_FLAG_VALUE(ENCRYPT); CHECK_FLAG_VALUE(INDEX); CHECK_FLAG_VALUE(IMAGIC); CHECK_FLAG_VALUE(JOURNAL_DATA); CHECK_FLAG_VALUE(NOTAIL); CHECK_FLAG_VALUE(DIRSYNC); CHECK_FLAG_VALUE(TOPDIR); CHECK_FLAG_VALUE(HUGE_FILE); CHECK_FLAG_VALUE(EXTENTS); CHECK_FLAG_VALUE(VERITY); CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(INLINE_DATA); CHECK_FLAG_VALUE(PROJINHERIT); CHECK_FLAG_VALUE(CASEFOLD); CHECK_FLAG_VALUE(RESERVED); } #if defined(__KERNEL__) && defined(CONFIG_COMPAT) struct compat_ext4_new_group_input { u32 group; compat_u64 block_bitmap; compat_u64 inode_bitmap; compat_u64 inode_table; u32 blocks_count; u16 reserved_blocks; u16 unused; }; #endif /* The struct ext4_new_group_input in kernel space, with free_blocks_count */ struct ext4_new_group_data { __u32 group; __u64 block_bitmap; __u64 inode_bitmap; __u64 inode_table; __u32 blocks_count; __u16 reserved_blocks; __u16 mdata_blocks; __u32 free_clusters_count; }; /* Indexes used to index group tables in ext4_new_group_data */ enum { BLOCK_BITMAP = 0, /* block bitmap */ INODE_BITMAP, /* inode bitmap */ INODE_TABLE, /* inode tables */ GROUP_TABLE_COUNT, }; /* * Flags used by ext4_map_blocks() */ /* Allocate any needed blocks and/or convert an unwritten extent to be an initialized ext4 */ #define EXT4_GET_BLOCKS_CREATE 0x0001 /* Request the creation of an unwritten extent */ #define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 #define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ EXT4_GET_BLOCKS_CREATE) /* Caller is from the delayed allocation writeout path * finally doing the actual allocation of delayed blocks */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 /* caller is from the direct IO path, request to creation of an unwritten extents if not allocated, split the unwritten extent if blocks has been preallocated already*/ #define EXT4_GET_BLOCKS_PRE_IO 0x0008 #define EXT4_GET_BLOCKS_CONVERT 0x0010 #define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) /* Eventual metadata allocation (due to growing extent tree) * should not fail, so try to use reserved blocks for that.*/ #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 /* Don't normalize allocation size (used for fallocate) */ #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 /* Convert written extents to unwritten */ #define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 /* Write zeros to newly created written extents */ #define EXT4_GET_BLOCKS_ZERO 0x0200 #define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ EXT4_GET_BLOCKS_ZERO) /* Caller is in the context of data submission, such as writeback, * fsync, etc. Especially, in the generic writeback path, caller will * submit data before dropping transaction handle. This allows jbd2 * to avoid submitting data before commit. 
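 *
 * Callers OR the individual EXT4_GET_BLOCKS_* bits together into the
 * "flags" argument of ext4_map_blocks().  The sketch below is roughly what
 * a delalloc writeout that wants unwritten extents and must not fail on
 * metadata allocation would pass; the exact combination varies per caller,
 * and this illustrative helper is not used anywhere else in ext4.
 */
static inline int ext4_example_writeout_map_flags(void)
{
        return EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |
               EXT4_GET_BLOCKS_DELALLOC_RESERVE |
               EXT4_GET_BLOCKS_METADATA_NOFAIL;
}

/*
 * Caller is in the context of data submission, such as writeback or fsync
 * (see the note above):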
*/ #define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 /* Convert extent to initialized after IO complete */ #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT |\ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |\ EXT4_GET_BLOCKS_IO_SUBMIT) /* Caller is in the atomic contex, find extent if it has been cached */ #define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800 /* * Atomic write caller needs this to query in the slow path of mixed mapping * case, when a contiguous extent can be split across two adjacent leaf nodes. * Look EXT4_MAP_QUERY_LAST_IN_LEAF. */ #define EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF 0x1000 /* * The bit position of these flags must not overlap with any of the * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), * read_extent_tree_block(), ext4_split_extent_at(), * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be * caching the extents when reading from the extent tree while a * truncate or punch hole operation is in progress. */ #define EXT4_EX_NOCACHE 0x40000000 #define EXT4_EX_FORCE_CACHE 0x20000000 #define EXT4_EX_NOFAIL 0x10000000 /* * ext4_map_query_blocks() uses this filter mask to filter the flags needed to * pass while lookup/querying of on disk extent tree. */ #define EXT4_EX_QUERY_FILTER (EXT4_EX_NOCACHE | EXT4_EX_FORCE_CACHE |\ EXT4_EX_NOFAIL |\ EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) /* * Flags used by ext4_free_blocks */ #define EXT4_FREE_BLOCKS_METADATA 0x0001 #define EXT4_FREE_BLOCKS_FORGET 0x0002 #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 #define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation */ #define EXT4_IOC32_GETVERSION _IOR('f', 3, int) #define EXT4_IOC32_SETVERSION _IOW('f', 4, int) #define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) #define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) #define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) #define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif /* Max physical block we can address w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF /* Max logical block we can support */ #define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFE /* * Structure of an inode on the disk */ struct ext4_inode { __le16 i_mode; /* File mode */ __le16 i_uid; /* Low 16 bits of Owner Uid */ __le32 i_size_lo; /* Size in bytes */ __le32 i_atime; /* Access time */ __le32 i_ctime; /* Inode Change time */ __le32 i_mtime; /* Modification time */ __le32 i_dtime; /* Deletion Time */ __le16 i_gid; /* Low 16 bits of Group Id */ __le16 i_links_count; /* Links count */ __le32 i_blocks_lo; /* Blocks count */ __le32 i_flags; /* File flags */ union { struct { __le32 l_i_version; } linux1; struct { __u32 h_i_translator; } hurd1; struct { __u32 m_i_reserved1; } masix1; } osd1; /* OS dependent 1 */ __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ __le32 i_generation; /* File version (for NFS) */ __le32 i_file_acl_lo; /* File ACL */ __le32 i_size_high; __le32 i_obso_faddr; /* Obsoleted fragment address */ union { struct { __le16 l_i_blocks_high; /* were l_i_reserved1 */ __le16 l_i_file_acl_high; __le16 l_i_uid_high; /* these 2 fields */ __le16 l_i_gid_high; /* were reserved2[0] */ __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ __le16 
l_i_reserved; } linux2; struct { __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ __u16 h_i_mode_high; __u16 h_i_uid_high; __u16 h_i_gid_high; __u32 h_i_author; } hurd2; struct { __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ __le16 m_i_file_acl_high; __u32 m_i_reserved2[2]; } masix2; } osd2; /* OS dependent 2 */ __le16 i_extra_isize; __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ __le32 i_crtime; /* File Creation time */ __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ __le32 i_version_hi; /* high 32 bits for 64-bit version */ __le32 i_projid; /* Project ID */ }; #define EXT4_EPOCH_BITS 2 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) #define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) /* * Extended fields will fit into an inode if the filesystem was formatted * with large inodes (-I 256 or larger) and there are not currently any EAs * consuming all of the available space. For new inodes we always reserve * enough space for the kernel's known extended fields, but for inodes * created with an old kernel this might not have been the case. None of * the extended inode fields is critical for correct filesystem operation. * This macro checks if a certain field fits in the inode. Note that * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize */ #define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ ((offsetof(typeof(*ext4_inode), field) + \ sizeof((ext4_inode)->field)) \ <= (EXT4_GOOD_OLD_INODE_SIZE + \ (einode)->i_extra_isize)) \ /* * We use an encoding that preserves the times for extra epoch "00": * * extra msb of adjust for signed * epoch 32-bit 32-bit tv_sec to * bits time decoded 64-bit tv_sec 64-bit tv_sec valid time range * 0 0 1 -0x80000000..-0x00000001 0x000000000 1901-12-13..1969-12-31 * 0 0 0 0x000000000..0x07fffffff 0x000000000 1970-01-01..2038-01-19 * 0 1 1 0x080000000..0x0ffffffff 0x100000000 2038-01-19..2106-02-07 * 0 1 0 0x100000000..0x17fffffff 0x100000000 2106-02-07..2174-02-25 * 1 0 1 0x180000000..0x1ffffffff 0x200000000 2174-02-25..2242-03-16 * 1 0 0 0x200000000..0x27fffffff 0x200000000 2242-03-16..2310-04-04 * 1 1 1 0x280000000..0x2ffffffff 0x300000000 2310-04-04..2378-04-22 * 1 1 0 0x300000000..0x37fffffff 0x300000000 2378-04-22..2446-05-10 * * Note that previous versions of the kernel on 64-bit systems would * incorrectly use extra epoch bits 1,1 for dates between 1901 and * 1970. e2fsck will correct this, assuming that it is run on the * affected filesystem before 2242. 
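 *
 * A minimal worked example of the encoding (the helper name is editorial
 * and not used anywhere else in ext4): for tv_sec = 0x100000000, i.e. just
 * past the signed 32-bit limit (2106-02-07), the extra epoch bits are 01,
 * the on-disk base field keeps the low 32 bits, and the nanoseconds sit
 * above the epoch bits in the extra word.
 */
static inline __le32 ext4_example_extra_time_word(void)
{
        struct timespec64 ts = { .tv_sec = 0x100000000LL, .tv_nsec = 1 };
        u32 epoch = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK;

        /* epoch == 1 here, i.e. the "0 1 x" rows of the table above. */
        return cpu_to_le32(epoch | (ts.tv_nsec << EXT4_EPOCH_BITS));
}

/*
 * Helpers that implement the encoding described above: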
*/ static inline __le32 ext4_encode_extra_time(struct timespec64 ts) { u32 extra = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK; return cpu_to_le32(extra | (ts.tv_nsec << EXT4_EPOCH_BITS)); } static inline struct timespec64 ext4_decode_extra_time(__le32 base, __le32 extra) { struct timespec64 ts = { .tv_sec = (signed)le32_to_cpu(base) }; if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) ts.tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; ts.tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; return ts; } #define EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, ts) \ do { \ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ (raw_inode)->xtime = cpu_to_le32((ts).tv_sec); \ (raw_inode)->xtime ## _extra = ext4_encode_extra_time(ts); \ } else \ (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \ } while (0) #define EXT4_INODE_SET_ATIME(inode, raw_inode) \ EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode)) #define EXT4_INODE_SET_MTIME(inode, raw_inode) \ EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode)) #define EXT4_INODE_SET_CTIME(inode, raw_inode) \ EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode)) #define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ EXT4_INODE_SET_XTIME_VAL(xtime, &((einode)->vfs_inode), \ raw_inode, (einode)->xtime) #define EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode) \ (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra) ? \ ext4_decode_extra_time((raw_inode)->xtime, \ (raw_inode)->xtime ## _extra) : \ (struct timespec64) { \ .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \ }) #define EXT4_INODE_GET_ATIME(inode, raw_inode) \ do { \ inode_set_atime_to_ts(inode, \ EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode)); \ } while (0) #define EXT4_INODE_GET_MTIME(inode, raw_inode) \ do { \ inode_set_mtime_to_ts(inode, \ EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode)); \ } while (0) #define EXT4_INODE_GET_CTIME(inode, raw_inode) \ do { \ inode_set_ctime_to_ts(inode, \ EXT4_INODE_GET_XTIME_VAL(i_ctime, inode, raw_inode)); \ } while (0) #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ do { \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ (einode)->xtime = \ EXT4_INODE_GET_XTIME_VAL(xtime, &(einode->vfs_inode), \ raw_inode); \ else \ (einode)->xtime = (struct timespec64){0, 0}; \ } while (0) #define i_disk_version osd1.linux1.l_i_version #if defined(__KERNEL__) || defined(__linux__) #define i_reserved1 osd1.linux1.l_i_reserved1 #define i_file_acl_high osd2.linux2.l_i_file_acl_high #define i_blocks_high osd2.linux2.l_i_blocks_high #define i_uid_low i_uid #define i_gid_low i_gid #define i_uid_high osd2.linux2.l_i_uid_high #define i_gid_high osd2.linux2.l_i_gid_high #define i_checksum_lo osd2.linux2.l_i_checksum_lo #elif defined(__GNU__) #define i_translator osd1.hurd1.h_i_translator #define i_uid_high osd2.hurd2.h_i_uid_high #define i_gid_high osd2.hurd2.h_i_gid_high #define i_author osd2.hurd2.h_i_author #elif defined(__masix__) #define i_reserved1 osd1.masix1.m_i_reserved1 #define i_file_acl_high osd2.masix2.m_i_file_acl_high #define i_reserved2 osd2.masix2.m_i_reserved2 #endif /* defined(__KERNEL__) || defined(__linux__) */ #include "extents_status.h" #include "fast_commit.h" /* * Lock subclasses for i_data_sem in the ext4_inode_info structure. 
* * These are needed to avoid lockdep false positives when we need to * allocate blocks to the quota inode during ext4_map_blocks(), while * holding i_data_sem for a normal (non-quota) inode. Since we don't * do quota tracking for the quota inode, this avoids deadlock (as * well as infinite recursion, since it isn't turtles all the way * down...) * * I_DATA_SEM_NORMAL - Used for most inodes * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode * where the second inode has larger inode number * than the first * I_DATA_SEM_QUOTA - Used for quota inodes only * I_DATA_SEM_EA - Used for ea_inodes only */ enum { I_DATA_SEM_NORMAL = 0, I_DATA_SEM_OTHER, I_DATA_SEM_QUOTA, I_DATA_SEM_EA }; /* * fourth extended file system inode data in memory */ struct ext4_inode_info { __le32 i_data[15]; /* unconverted */ __u32 i_dtime; ext4_fsblk_t i_file_acl; /* * i_block_group is the number of the block group which contains * this file's inode. Constant across the lifetime of the inode, * it is used for making block allocation decisions - we try to * place a file's data blocks near its inode block, and new inodes * near to their parent directory's inode. */ ext4_group_t i_block_group; ext4_lblk_t i_dir_start_lookup; #if (BITS_PER_LONG < 64) unsigned long i_state_flags; /* Dynamic state flags */ #endif unsigned long i_flags; /* * Extended attributes can be read independently of the main file * data. Taking i_rwsem even when reading would cause contention * between readers of EAs and writers of regular file data, so * instead we synchronize on xattr_sem when reading or changing * EAs. */ struct rw_semaphore xattr_sem; /* * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise * i_orphan is used. */ union { struct list_head i_orphan; /* unlinked but open inodes */ unsigned int i_orphan_idx; /* Index in orphan file */ }; /* Fast commit related info */ /* For tracking dentry create updates */ struct list_head i_fc_dilist; struct list_head i_fc_list; /* * inodes that need fast commit * protected by sbi->s_fc_lock. */ /* Start of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_start; /* End of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_len; spinlock_t i_raw_lock; /* protects updates to the raw inode */ /* Fast commit wait queue for this inode */ wait_queue_head_t i_fc_wait; /* * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len * and inode's EXT4_FC_STATE_COMMITTING state bit. */ spinlock_t i_fc_lock; /* * i_disksize keeps track of what the inode size is ON DISK, not * in memory. During truncate, i_size is set to the new size by * the VFS prior to calling ext4_truncate(), but the filesystem won't * set i_disksize to 0 until the truncate is actually under way. * * The intent is that i_disksize always represents the blocks which * are used by this file. This allows recovery to restart truncate * on orphans if we crash during truncate. We actually write i_disksize * into the on-disk inode when writing inodes out, instead of i_size. * * The only time when i_disksize and i_size may be different is when * a truncate is in progress. The only things which change i_disksize * are ext4_get_block (growth) and ext4_truncate (shrinkth). */ loff_t i_disksize; /* * i_data_sem is for serialising ext4_truncate() against * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's * data tree are chopped off during truncate. 
We can't do that in * ext4 because whenever we perform intermediate commits during * truncate, the inode and all the metadata blocks *must* be in a * consistent state which allows truncation of the orphans to restart * during recovery. Hence we must fix the get_block-vs-truncate race * by other means, so we have i_data_sem. */ struct rw_semaphore i_data_sem; struct inode vfs_inode; struct jbd2_inode *jinode; /* * File creation time. Its function is same as that of * struct timespec64 i_{a,c,m}time in the generic inode. */ struct timespec64 i_crtime; /* mballoc */ atomic_t i_prealloc_active; /* allocation reservation info for delalloc */ /* In case of bigalloc, this refer to clusters rather than blocks */ unsigned int i_reserved_data_blocks; struct rb_root i_prealloc_node; rwlock_t i_prealloc_lock; /* extents status tree */ struct ext4_es_tree i_es_tree; rwlock_t i_es_lock; struct list_head i_es_list; unsigned int i_es_all_nr; /* protected by i_es_lock */ unsigned int i_es_shk_nr; /* protected by i_es_lock */ ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for extents to shrink. Protected by i_es_lock */ /* ialloc */ ext4_group_t i_last_alloc_group; /* pending cluster reservations for bigalloc file systems */ struct ext4_pending_tree i_pending_tree; /* on-disk additional length */ __u16 i_extra_isize; /* Indicate the inline data space. */ u16 i_inline_off; u16 i_inline_size; #ifdef CONFIG_QUOTA /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; #endif spinlock_t i_block_reservation_lock; /* Lock protecting lists below */ spinlock_t i_completed_io_lock; /* * Completed IOs that need unwritten extents handling and have * transaction reserved */ struct list_head i_rsv_conversion_list; struct work_struct i_rsv_conversion_work; /* * Transactions that contain inode's metadata needed to complete * fsync and fdatasync, respectively. */ tid_t i_sync_tid; tid_t i_datasync_tid; #ifdef CONFIG_QUOTA struct dquot __rcu *i_dquot[MAXQUOTAS]; #endif /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ __u32 i_csum_seed; kprojid_t i_projid; }; /* * File system states */ #define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ #define EXT4_ERROR_FS 0x0002 /* Errors detected */ #define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ #define EXT4_FC_REPLAY 0x0020 /* Fast commit replay ongoing */ /* * Misc. 
filesystem flags */ #define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ #define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ #define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ /* * Mount flags set via mount options or defaults */ #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ #define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ #define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ #define EXT4_MOUNT_ERRORS_MASK 0x00070 #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ #ifdef CONFIG_FS_DAX #define EXT4_MOUNT_DAX_ALWAYS 0x00200 /* Direct Access */ #else #define EXT4_MOUNT_DAX_ALWAYS 0 #endif #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ #define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ #define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ #define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ #define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ #define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ #define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ #define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota, * enable enforcement for hidden * quota files */ #define EXT4_MOUNT_GRPQUOTA 0x100000 /* "old" group quota, enable * enforcement for hidden quota * files */ #define EXT4_MOUNT_PRJQUOTA 0x200000 /* Enable project quota * enforcement */ #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ #define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000 #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ /* * Mount flags set either automatically (could not be set by mount option) * based on per file system feature or property or in special cases such as * distinguishing between explicit mount option definition and default. 
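 *
 * (An aside on the option word above: most EXT4_MOUNT_* values are
 * independent bits, but the data journalling mode is a two-bit field.
 * EXT4_MOUNT_DATA_FLAGS masks it out and the three mode values live inside
 * it.  The test below is a minimal illustrative sketch that takes the raw
 * option word; it is not used anywhere else in ext4.)
 */
static inline bool ext4_example_is_ordered_data_mode(unsigned int s_mount_opt)
{
        return (s_mount_opt & EXT4_MOUNT_DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA;
}

/*
 * Mount flags set either automatically or to record special cases, as
 * described above: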
*/ #define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly specified delalloc */ #define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group size of blocksize * 8 blocks */ #define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated file systems */ #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly specified journal checksum */ #define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */ #define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */ #define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */ #define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group * scanning in mballoc */ #define EXT4_MOUNT2_ABORT 0x00000100 /* Abort filesystem */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt #define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ EXT4_MOUNT_##opt #define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ EXT4_MOUNT_##opt) #define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ ~EXT4_MOUNT2_##opt #define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ EXT4_MOUNT2_##opt #define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ EXT4_MOUNT2_##opt) #define ext4_test_and_set_bit __test_and_set_bit_le #define ext4_set_bit __set_bit_le #define ext4_test_and_clear_bit __test_and_clear_bit_le #define ext4_clear_bit __clear_bit_le #define ext4_test_bit test_bit_le #define ext4_find_next_zero_bit find_next_zero_bit_le #define ext4_find_next_bit find_next_bit_le extern void mb_set_bits(void *bm, int cur, int len); /* * Maximal mount counts between two filesystem checks */ #define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ #define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ /* * Behaviour when detecting errors */ #define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ #define EXT4_ERRORS_RO 2 /* Remount fs read-only */ #define EXT4_ERRORS_PANIC 3 /* Panic */ #define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE /* Metadata checksum algorithm codes */ #define EXT4_CRC32C_CHKSUM 1 #define EXT4_LABEL_MAX 16 /* * Structure of the super block */ struct ext4_super_block { /*00*/ __le32 s_inodes_count; /* Inodes count */ __le32 s_blocks_count_lo; /* Blocks count */ __le32 s_r_blocks_count_lo; /* Reserved blocks count */ __le32 s_free_blocks_count_lo; /* Free blocks count */ /*10*/ __le32 s_free_inodes_count; /* Free inodes count */ __le32 s_first_data_block; /* First Data Block */ __le32 s_log_block_size; /* Block size */ __le32 s_log_cluster_size; /* Allocation cluster size */ /*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ __le32 s_clusters_per_group; /* # Clusters per group */ __le32 s_inodes_per_group; /* # Inodes per group */ __le32 s_mtime; /* Mount time */ /*30*/ __le32 s_wtime; /* Write time */ __le16 s_mnt_count; /* Mount count */ __le16 s_max_mnt_count; /* Maximal mount count */ __le16 s_magic; /* Magic signature */ __le16 s_state; /* File system state */ __le16 s_errors; /* Behaviour when detecting errors */ __le16 s_minor_rev_level; /* minor revision level */ /*40*/ __le32 s_lastcheck; /* time of last check */ __le32 s_checkinterval; /* max. time between checks */ __le32 s_creator_os; /* OS */ __le32 s_rev_level; /* Revision level */ /*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ __le16 s_def_resgid; /* Default gid for reserved blocks */ /* * These fields are for EXT4_DYNAMIC_REV superblocks only. 
* * Note: the difference between the compatible feature set and * the incompatible feature set is that if there is a bit set * in the incompatible feature set that the kernel doesn't * know about, it should refuse to mount the filesystem. * * e2fsck's requirements are more strict; if it doesn't know * about a feature in either the compatible or incompatible * feature set, it must abort and not try to meddle with * things it doesn't understand... */ __le32 s_first_ino; /* First non-reserved inode */ __le16 s_inode_size; /* size of inode structure */ __le16 s_block_group_nr; /* block group # of this superblock */ __le32 s_feature_compat; /* compatible feature set */ /*60*/ __le32 s_feature_incompat; /* incompatible feature set */ __le32 s_feature_ro_compat; /* readonly-compatible feature set */ /*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ /*78*/ char s_volume_name[EXT4_LABEL_MAX] __nonstring; /* volume name */ /*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */ /*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ /* * Performance hints. Directory preallocation should only * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. */ __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ /* * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. */ /*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ /*E0*/ __le32 s_journal_inum; /* inode number of journal file */ __le32 s_journal_dev; /* device number of journal file */ __le32 s_last_orphan; /* start of list of inodes to delete */ __le32 s_hash_seed[4]; /* HTREE hash seed */ __u8 s_def_hash_version; /* Default hash version to use */ __u8 s_jnl_backup_type; __le16 s_desc_size; /* size of group descriptor */ /*100*/ __le32 s_default_mount_opts; __le32 s_first_meta_bg; /* First metablock block group */ __le32 s_mkfs_time; /* When the filesystem was created */ __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ /* 64bit support valid if EXT4_FEATURE_INCOMPAT_64BIT */ /*150*/ __le32 s_blocks_count_hi; /* Blocks count */ __le32 s_r_blocks_count_hi; /* Reserved blocks count */ __le32 s_free_blocks_count_hi; /* Free blocks count */ __le16 s_min_extra_isize; /* All inodes have at least # bytes */ __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ __le32 s_flags; /* Miscellaneous flags */ __le16 s_raid_stride; /* RAID stride */ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ __u8 s_checksum_type; /* metadata checksum algorithm used */ __u8 s_encryption_level; /* versioning level for encryption */ __u8 s_reserved_pad; /* Padding to next 32bits */ __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ __le32 s_snapshot_inum; /* Inode number of active snapshot */ __le32 s_snapshot_id; /* sequential ID of active snapshot */ __le64 s_snapshot_r_blocks_count; /* reserved blocks for active snapshot's future use */ __le32 s_snapshot_list; /* inode number of the head of the on-disk snapshot list */ #define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) __le32 s_error_count; /* number of fs errors */ __le32 s_first_error_time; /* first time an error happened */ __le32 s_first_error_ino; /* inode involved in 
first error */ __le64 s_first_error_block; /* block involved of first error */ __u8 s_first_error_func[32] __nonstring; /* function where the error happened */ __le32 s_first_error_line; /* line number where error happened */ __le32 s_last_error_time; /* most recent time of an error */ __le32 s_last_error_ino; /* inode involved in last error */ __le32 s_last_error_line; /* line number where error happened */ __le64 s_last_error_block; /* block involved of last error */ __u8 s_last_error_func[32] __nonstring; /* function where the error happened */ #define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) __u8 s_mount_opts[64]; __le32 s_usr_quota_inum; /* inode for tracking user quota */ __le32 s_grp_quota_inum; /* inode for tracking group quota */ __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ __le32 s_lpf_ino; /* Location of the lost+found inode */ __le32 s_prj_quota_inum; /* inode for tracking project quota */ __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ __u8 s_wtime_hi; __u8 s_mtime_hi; __u8 s_mkfs_time_hi; __u8 s_lastcheck_hi; __u8 s_first_error_time_hi; __u8 s_last_error_time_hi; __u8 s_first_error_errcode; __u8 s_last_error_errcode; __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ __le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */ __le32 s_reserved[94]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) #ifdef __KERNEL__ /* Number of quota types we support */ #define EXT4_MAXQUOTAS 3 #define EXT4_ENC_UTF8_12_1 1 /* Types of ext4 journal triggers */ enum ext4_journal_trigger_type { EXT4_JTR_ORPHAN_FILE, EXT4_JTR_NONE /* This must be the last entry for indexing to work! */ }; #define EXT4_JOURNAL_TRIGGER_COUNT EXT4_JTR_NONE struct ext4_journal_trigger { struct jbd2_buffer_trigger_type tr_triggers; struct super_block *sb; }; static inline struct ext4_journal_trigger *EXT4_TRIGGER( struct jbd2_buffer_trigger_type *trigger) { return container_of(trigger, struct ext4_journal_trigger, tr_triggers); } #define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04 /* Structure at the tail of orphan block */ struct ext4_orphan_block_tail { __le32 ob_magic; __le32 ob_checksum; }; static inline int ext4_inodes_per_orphan_block(struct super_block *sb) { return (sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)) / sizeof(u32); } struct ext4_orphan_block { atomic_t ob_free_entries; /* Number of free orphan entries in block */ struct buffer_head *ob_bh; /* Buffer for orphan block */ }; /* * Info about orphan file. 
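 *
 * For reference, ext4_inodes_per_orphan_block() above works out to
 * (blocksize - 8) / 4: with 4096-byte blocks each orphan block carries
 * 1022 inode numbers plus the 8-byte tail.  A minimal illustrative helper
 * built on it (not used anywhere else in ext4):
 */
static inline int ext4_example_orphan_blocks_needed(struct super_block *sb,
                                                    int nr_orphans)
{
        /* e.g. with 4096-byte blocks: 1022 slots per orphan block */
        return DIV_ROUND_UP(nr_orphans, ext4_inodes_per_orphan_block(sb));
}

/*
 * Info about the orphan file as a whole: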
*/ struct ext4_orphan_info { int of_blocks; /* Number of orphan blocks in a file */ __u32 of_csum_seed; /* Checksum seed for orphan file */ struct ext4_orphan_block *of_binfo; /* Array with info about orphan * file blocks */ }; /* * fourth extended-fs super-block data in memory */ struct ext4_sb_info { unsigned long s_desc_size; /* Size of a group descriptor in bytes */ unsigned long s_inodes_per_block;/* Number of inodes per block */ unsigned long s_blocks_per_group;/* Number of blocks in a group */ unsigned long s_clusters_per_group; /* Number of clusters in a group */ unsigned long s_inodes_per_group;/* Number of inodes in a group */ unsigned long s_itb_per_group; /* Number of inode table blocks per group */ unsigned long s_gdb_count; /* Number of group descriptor blocks */ unsigned long s_desc_per_block; /* Number of group descriptors per block */ ext4_group_t s_groups_count; /* Number of groups in the fs */ ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ unsigned long s_overhead; /* # of fs overhead clusters */ unsigned int s_cluster_ratio; /* Number of blocks per cluster */ unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ struct buffer_head * s_sbh; /* Buffer containing the super block */ struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ /* Array of bh's for the block group descriptors */ struct buffer_head * __rcu *s_group_desc; unsigned int s_mount_opt; unsigned int s_mount_opt2; unsigned long s_mount_flags; unsigned int s_def_mount_opt; unsigned int s_def_mount_opt2; ext4_fsblk_t s_sb_block; atomic64_t s_resv_clusters; kuid_t s_resuid; kgid_t s_resgid; unsigned short s_mount_state; unsigned short s_pad; int s_addr_per_block_bits; int s_desc_per_block_bits; int s_inode_size; int s_first_ino; unsigned int s_inode_readahead_blks; unsigned int s_inode_goal; u32 s_hash_seed[4]; int s_def_hash_version; int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */ struct percpu_counter s_freeclusters_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; struct percpu_counter s_dirtyclusters_counter; struct percpu_counter s_sra_exceeded_retry_limit; struct blockgroup_lock *s_blockgroup_lock; struct proc_dir_entry *s_proc; struct kobject s_kobj; struct completion s_kobj_unregister; struct super_block *s_sb; struct buffer_head *s_mmp_bh; /* Journaling */ struct journal_s *s_journal; unsigned long s_ext4_flags; /* Ext4 superblock flags */ struct mutex s_orphan_lock; /* Protects on disk list changes */ struct list_head s_orphan; /* List of orphaned inodes in on disk list */ struct ext4_orphan_info s_orphan_info; unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; struct file *s_journal_bdev_file; #ifdef CONFIG_QUOTA /* Names of quota files with journalled quota */ char __rcu *s_qf_names[EXT4_MAXQUOTAS]; int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ struct ext4_system_blocks __rcu *s_system_blks; #ifdef EXTENTS_STATS /* ext4 extents stats */ unsigned long s_ext_min; unsigned long s_ext_max; unsigned long s_depth_max; spinlock_t s_ext_stats_lock; unsigned long s_ext_blocks; unsigned long s_ext_extents; #endif /* for buddy allocator */ struct ext4_group_info ** __rcu *s_group_info; struct inode *s_buddy_cache; spinlock_t s_md_lock; unsigned short *s_mb_offsets; unsigned int *s_mb_maxs; unsigned int s_group_info_size; atomic_t 
s_mb_free_pending; struct list_head s_freed_data_list[2]; /* List of blocks to be freed after commit completed */ struct list_head s_discard_list; struct work_struct s_discard_work; atomic_t s_retry_alloc_pending; struct xarray *s_mb_avg_fragment_size; struct xarray *s_mb_largest_free_orders; /* tunables */ unsigned long s_stripe; unsigned int s_mb_max_linear_groups; unsigned int s_mb_stream_request; unsigned int s_mb_max_to_scan; unsigned int s_mb_min_to_scan; unsigned int s_mb_stats; unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; unsigned int s_max_dir_size_kb; unsigned int s_mb_prefetch; unsigned int s_mb_prefetch_limit; unsigned int s_mb_best_avail_max_trim_order; unsigned int s_sb_update_sec; unsigned int s_sb_update_kb; /* where last allocation was done - for stream allocation */ ext4_group_t *s_mb_last_groups; unsigned int s_mb_nr_global_goals; /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ atomic_t s_bal_success; /* we found long enough chunks */ atomic_t s_bal_allocated; /* in blocks */ atomic_t s_bal_ex_scanned; /* total extents scanned */ atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */ atomic_t s_bal_groups_scanned; /* number of groups scanned */ atomic_t s_bal_goals; /* goal hits */ atomic_t s_bal_stream_goals; /* stream allocation global goal hits */ atomic_t s_bal_len_goals; /* len goal hits */ atomic_t s_bal_breaks; /* too long searches */ atomic_t s_bal_2orders; /* 2^order hits */ atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */ atomic_t s_mb_buddies_generated; /* number of buddies generated */ atomic64_t s_mb_generation_time; atomic_t s_mb_lost_chunks; atomic_t s_mb_preallocated; atomic_t s_mb_discarded; atomic_t s_lock_busy; /* locality groups */ struct ext4_locality_group __percpu *s_locality_groups; /* for write statistics */ unsigned long s_sectors_written_start; u64 s_kbytes_written; /* the size of zero-out chunk */ unsigned int s_extent_max_zeroout_kb; unsigned int s_log_groups_per_flex; struct flex_groups * __rcu *s_flex_groups; ext4_group_t s_flex_groups_allocated; /* workqueue for reserved extent conversions (buffered io) */ struct workqueue_struct *rsv_conversion_wq; /* timer for periodic error stats printing */ struct timer_list s_err_report; /* Lazy inode table initialization info */ struct ext4_li_request *s_li_request; /* Wait multiplier for lazy initialization thread */ unsigned int s_li_wait_mult; /* Kernel thread for multiple mount protection */ struct task_struct *s_mmp_tsk; /* record the last minlen when FITRIM is called. */ unsigned long s_last_trim_minblks; /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_csum_seed; /* Reclaim extents from extent status tree */ struct shrinker *s_es_shrinker; struct list_head s_es_list; /* List of inodes with reclaimable extents */ long s_es_nr_inode; struct ext4_es_stats s_es_stats; struct mb_cache *s_ea_block_cache; struct mb_cache *s_ea_inode_cache; spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Journal triggers for checksum computation */ struct ext4_journal_trigger s_journal_triggers[EXT4_JOURNAL_TRIGGER_COUNT]; /* Ratelimit ext4 messages. 
*/ struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; atomic_t s_warning_count; atomic_t s_msg_count; /* Encryption policy for '-o test_dummy_encryption' */ struct fscrypt_dummy_policy s_dummy_enc_policy; /* * Barrier between writepages ops and changing any inode's JOURNAL_DATA * or EXTENTS flag or between writepages ops and changing DELALLOC or * DIOREAD_NOLOCK mount options on remount. */ struct percpu_rw_semaphore s_writepages_rwsem; struct dax_device *s_daxdev; u64 s_dax_part_off; #ifdef CONFIG_EXT4_DEBUG unsigned long s_simulate_fail; #endif /* Record the errseq of the backing block device */ errseq_t s_bdev_wb_err; spinlock_t s_bdev_wb_lock; /* Information about errors that happened during this mount */ spinlock_t s_error_lock; int s_add_error_count; int s_first_error_code; __u32 s_first_error_line; __u32 s_first_error_ino; __u64 s_first_error_block; const char *s_first_error_func; time64_t s_first_error_time; int s_last_error_code; __u32 s_last_error_line; __u32 s_last_error_ino; __u64 s_last_error_block; const char *s_last_error_func; time64_t s_last_error_time; /* * If we are in a context where we cannot update the on-disk * superblock, we queue the work here. This is used to update * the error information in the superblock, and for periodic * updates of the superblock called from the commit callback * function. */ struct work_struct s_sb_upd_work; /* Atomic write unit values in bytes */ unsigned int s_awu_min; unsigned int s_awu_max; /* Ext4 fast commit sub transaction ID */ atomic_t s_fc_subtid; /* * After commit starts, the main queue gets locked, and the further * updates get added in the staging queue. */ #define FC_Q_MAIN 0 #define FC_Q_STAGING 1 struct list_head s_fc_q[2]; /* Inodes staged for fast commit * that have data changes in them. */ struct list_head s_fc_dentry_q[2]; /* directory entry updates */ unsigned int s_fc_bytes; /* * Main fast commit lock. This lock protects accesses to the * following fields: * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh. 
*/ struct mutex s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; tid_t s_fc_ineligible_tid; #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif struct ext4_fc_replay_state s_fc_replay_state; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) { return sb->s_fs_info; } static inline struct ext4_inode_info *EXT4_I(struct inode *inode) { return container_of(inode, struct ext4_inode_info, vfs_inode); } static inline int ext4_writepages_down_read(struct super_block *sb) { percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); return memalloc_nofs_save(); } static inline void ext4_writepages_up_read(struct super_block *sb, int ctx) { memalloc_nofs_restore(ctx); percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); } static inline int ext4_writepages_down_write(struct super_block *sb) { percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem); return memalloc_nofs_save(); } static inline void ext4_writepages_up_write(struct super_block *sb, int ctx) { memalloc_nofs_restore(ctx); percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem); } static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || (ino >= EXT4_FIRST_INO(sb) && ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } /* * Returns: sbi->field[index] * Used to access an array element from the following sbi fields which require * rcu protection to avoid dereferencing an invalid pointer due to reassignment * - s_group_desc * - s_group_info * - s_flex_group */ #define sbi_array_rcu_deref(sbi, field, index) \ ({ \ typeof(*((sbi)->field)) _v; \ rcu_read_lock(); \ _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index]; \ rcu_read_unlock(); \ _v; \ }) /* * run-time mount flags */ enum { EXT4_MF_MNTDIR_SAMPLED, EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ EXT4_MF_JOURNAL_DESTROY /* Journal is in process of destroying */ }; static inline void ext4_set_mount_flag(struct super_block *sb, int bit) { set_bit(bit, &EXT4_SB(sb)->s_mount_flags); } static inline void ext4_clear_mount_flag(struct super_block *sb, int bit) { clear_bit(bit, &EXT4_SB(sb)->s_mount_flags); } static inline int ext4_test_mount_flag(struct super_block *sb, int bit) { return test_bit(bit, &EXT4_SB(sb)->s_mount_flags); } /* * Simulate_fail codes */ #define EXT4_SIM_BBITMAP_EIO 1 #define EXT4_SIM_BBITMAP_CRC 2 #define EXT4_SIM_IBITMAP_EIO 3 #define EXT4_SIM_IBITMAP_CRC 4 #define EXT4_SIM_INODE_EIO 5 #define EXT4_SIM_INODE_CRC 6 #define EXT4_SIM_DIRBLOCK_EIO 7 #define EXT4_SIM_DIRBLOCK_CRC 8 static inline bool ext4_simulate_fail(struct super_block *sb, unsigned long code) { #ifdef CONFIG_EXT4_DEBUG struct ext4_sb_info *sbi = EXT4_SB(sb); if (unlikely(sbi->s_simulate_fail == code)) { sbi->s_simulate_fail = 0; return true; } #endif return false; } /* * Error number codes for s_{first,last}_error_errno * * Linux errno numbers are architecture specific, so we need to translate * them into something which is architecture independent. We don't define * codes for all errno's; just the ones which are most likely to be the cause * of an ext4_error() call. 
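 *
 * (Aside: sbi_array_rcu_deref() above is how the per-group arrays that can
 * be reallocated during online resize are read.  The sketch below is a
 * minimal illustrative reader, assuming the flex_groups counters declared
 * earlier in this file; the helper is not used anywhere else in ext4.)
 */
static inline u64 ext4_example_flex_free_clusters(struct super_block *sb,
                                                  ext4_group_t flex_group)
{
        struct flex_groups *fg;

        /* One RCU-protected snapshot of the per-flex-group counter array. */
        fg = sbi_array_rcu_deref(EXT4_SB(sb), s_flex_groups, flex_group);
        return atomic64_read(&fg->free_clusters);
}

/*
 * Error number codes for s_{first,last}_error_errno (see above):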
*/ #define EXT4_ERR_UNKNOWN 1 #define EXT4_ERR_EIO 2 #define EXT4_ERR_ENOMEM 3 #define EXT4_ERR_EFSBADCRC 4 #define EXT4_ERR_EFSCORRUPTED 5 #define EXT4_ERR_ENOSPC 6 #define EXT4_ERR_ENOKEY 7 #define EXT4_ERR_EROFS 8 #define EXT4_ERR_EFBIG 9 #define EXT4_ERR_EEXIST 10 #define EXT4_ERR_ERANGE 11 #define EXT4_ERR_EOVERFLOW 12 #define EXT4_ERR_EBUSY 13 #define EXT4_ERR_ENOTDIR 14 #define EXT4_ERR_ENOTEMPTY 15 #define EXT4_ERR_ESHUTDOWN 16 #define EXT4_ERR_EFAULT 17 /* * Inode dynamic state flags */ enum { EXT4_STATE_NEW, /* inode is newly created */ EXT4_STATE_XATTR, /* has in-inode xattrs */ EXT4_STATE_NO_EXPAND, /* No space for expansion */ EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ EXT4_STATE_NEWENTRY, /* File just added to dir */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */ EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ { \ return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } \ static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ { \ set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } \ static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ { \ clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } /* Add these declarations here only so that these functions can be * found by name. Otherwise, they are very hard to locate. */ static inline int ext4_test_inode_flag(struct inode *inode, int bit); static inline void ext4_set_inode_flag(struct inode *inode, int bit); static inline void ext4_clear_inode_flag(struct inode *inode, int bit); EXT4_INODE_BIT_FNS(flag, flags, 0) /* Add these declarations here only so that these functions can be * found by name. Otherwise, they are very hard to locate. */ static inline int ext4_test_inode_state(struct inode *inode, int bit); static inline void ext4_set_inode_state(struct inode *inode, int bit); static inline void ext4_clear_inode_state(struct inode *inode, int bit); #if (BITS_PER_LONG < 64) EXT4_INODE_BIT_FNS(state, state_flags, 0) static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) { (ei)->i_state_flags = 0; } #else EXT4_INODE_BIT_FNS(state, flags, 32) static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) { /* We depend on the fact that callers will set i_flags */ } #endif #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test * macros from user land. 
*/ #define EXT4_SB(sb) (sb) #endif static inline bool ext4_verity_in_progress(struct inode *inode) { return IS_ENABLED(CONFIG_FS_VERITY) && ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); } #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime /* * Codes for operating systems */ #define EXT4_OS_LINUX 0 #define EXT4_OS_HURD 1 #define EXT4_OS_MASIX 2 #define EXT4_OS_FREEBSD 3 #define EXT4_OS_LITES 4 /* * Revision levels */ #define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ #define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ #define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV #define EXT4_GOOD_OLD_INODE_SIZE 128 #define EXT4_EXTRA_TIMESTAMP_MAX (((s64)1 << 34) - 1 + S32_MIN) #define EXT4_NON_EXTRA_TIMESTAMP_MAX S32_MAX #define EXT4_TIMESTAMP_MIN S32_MIN /* * Feature set definitions */ #define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 #define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 #define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 #define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 #define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 #define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 #define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 /* * The reason why "FAST_COMMIT" is a compat feature is that, FS becomes * incompatible only if fast commit blocks are present in the FS. Since we * clear the journal (and thus the fast commit blocks), we don't mark FS as * incompatible. We also have a JBD2 incompat feature, which gets set when * there are fast commit blocks present in the journal. */ #define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400 #define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800 #define EXT4_FEATURE_COMPAT_ORPHAN_FILE 0x1000 /* Orphan file exists */ #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 #define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 #define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 #define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 /* * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When * METADATA_CSUM is set, group descriptor checksums use the same algorithm as * all other data structures' checksums. However, the METADATA_CSUM and * GDT_CSUM bits are mutually exclusive. 
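 *
 * (Aside: each feature word is a little-endian bitmask in the on-disk
 * superblock, so a raw test looks like the sketch below.  The
 * EXT4_FEATURE_*_FUNCS generator macros further down expand to exactly
 * this shape plus the matching set/clear helpers.  Illustrative only; not
 * used anywhere else in ext4.)
 */
static inline bool ext4_example_has_journal_raw(struct ext4_super_block *es)
{
        return (es->s_feature_compat &
                cpu_to_le32(EXT4_FEATURE_COMPAT_HAS_JOURNAL)) != 0;
}

/*
 * Remaining read-only compatible feature flags: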
*/ #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 #define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 #define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 #define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000 #define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT 0x10000 /* Orphan file may be non-empty */ #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 #define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ #define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ #define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 #define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ #define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 #define EXT4_FEATURE_INCOMPAT_MMP 0x0100 #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ #define EXT4_FEATURE_INCOMPAT_CSUM_SEED 0x2000 #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ #define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 #define EXT4_FEATURE_INCOMPAT_CASEFOLD 0x20000 extern void ext4_update_dynamic_rev(struct super_block *sb); #define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_compat & \ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_compat |= \ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ } \ static inline void ext4_clear_feature_##name(struct super_block *sb) \ { \ EXT4_SB(sb)->s_es->s_feature_compat &= \ ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ } #define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ } \ static inline void ext4_clear_feature_##name(struct super_block *sb) \ { \ EXT4_SB(sb)->s_es->s_feature_ro_compat &= \ ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ } #define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_incompat |= \ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ } \ static inline void ext4_clear_feature_##name(struct super_block *sb) \ { \ EXT4_SB(sb)->s_es->s_feature_incompat &= \ ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ } EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc, DIR_PREALLOC) EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes, IMAGIC_INODES) EXT4_FEATURE_COMPAT_FUNCS(journal, HAS_JOURNAL) EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR) EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE) EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT) EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES) EXT4_FEATURE_COMPAT_FUNCS(orphan_file, 
ORPHAN_FILE) EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir, BTREE_DIR) EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file, HUGE_FILE) EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum, GDT_CSUM) EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink, DIR_NLINK) EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize, EXTRA_ISIZE) EXT4_FEATURE_RO_COMPAT_FUNCS(quota, QUOTA) EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC) EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY) EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present, ORPHAN_PRESENT) EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER) EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev, JOURNAL_DEV) EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg, META_BG) EXT4_FEATURE_INCOMPAT_FUNCS(extents, EXTENTS) EXT4_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) EXT4_FEATURE_INCOMPAT_FUNCS(mmp, MMP) EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg, FLEX_BG) EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode, EA_INODE) EXT4_FEATURE_INCOMPAT_FUNCS(dirdata, DIRDATA) EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED) EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR) EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA) EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_META_BG) #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR) #define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_RECOVER| \ EXT4_FEATURE_INCOMPAT_META_BG) #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR) #define EXT4_FEATURE_COMPAT_SUPP (EXT4_FEATURE_COMPAT_EXT_ATTR| \ EXT4_FEATURE_COMPAT_ORPHAN_FILE) #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_RECOVER| \ EXT4_FEATURE_INCOMPAT_META_BG| \ EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ EXT4_FEATURE_INCOMPAT_EA_INODE| \ EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ EXT4_FEATURE_INCOMPAT_ENCRYPT | \ EXT4_FEATURE_INCOMPAT_CASEFOLD | \ EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ EXT4_FEATURE_INCOMPAT_LARGEDIR) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ EXT4_FEATURE_RO_COMPAT_QUOTA |\ EXT4_FEATURE_RO_COMPAT_PROJECT |\ EXT4_FEATURE_RO_COMPAT_VERITY |\ EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT) #define EXTN_FEATURE_FUNCS(ver) \ static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_compat & \ cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \ } \ static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct 
super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \ } \ static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \ } EXTN_FEATURE_FUNCS(2) EXTN_FEATURE_FUNCS(3) EXTN_FEATURE_FUNCS(4) static inline bool ext4_has_compat_features(struct super_block *sb) { return (EXT4_SB(sb)->s_es->s_feature_compat != 0); } static inline bool ext4_has_ro_compat_features(struct super_block *sb) { return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0); } static inline bool ext4_has_incompat_features(struct super_block *sb) { return (EXT4_SB(sb)->s_es->s_feature_incompat != 0); } extern int ext4_feature_set_ok(struct super_block *sb, int readonly); /* * Superblock flags */ enum { EXT4_FLAGS_RESIZING, /* Avoid superblock update and resize race */ EXT4_FLAGS_SHUTDOWN, /* Prevent access to the file system */ EXT4_FLAGS_BDEV_IS_DAX, /* Current block device support DAX */ EXT4_FLAGS_EMERGENCY_RO,/* Emergency read-only due to fs errors */ }; static inline int ext4_forced_shutdown(struct super_block *sb) { return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); } static inline int ext4_emergency_ro(struct super_block *sb) { return test_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); } static inline int ext4_emergency_state(struct super_block *sb) { if (unlikely(ext4_forced_shutdown(sb))) return -EIO; if (unlikely(ext4_emergency_ro(sb))) return -EROFS; return 0; } /* * Default values for user and/or group using reserved blocks */ #define EXT4_DEF_RESUID 0 #define EXT4_DEF_RESGID 0 /* * Default project ID */ #define EXT4_DEF_PROJID 0 #define EXT4_DEF_INODE_READAHEAD_BLKS 32 /* * Default mount options */ #define EXT4_DEFM_DEBUG 0x0001 #define EXT4_DEFM_BSDGROUPS 0x0002 #define EXT4_DEFM_XATTR_USER 0x0004 #define EXT4_DEFM_ACL 0x0008 #define EXT4_DEFM_UID16 0x0010 #define EXT4_DEFM_JMODE 0x0060 #define EXT4_DEFM_JMODE_DATA 0x0020 #define EXT4_DEFM_JMODE_ORDERED 0x0040 #define EXT4_DEFM_JMODE_WBACK 0x0060 #define EXT4_DEFM_NOBARRIER 0x0100 #define EXT4_DEFM_BLOCK_VALIDITY 0x0200 #define EXT4_DEFM_DISCARD 0x0400 #define EXT4_DEFM_NODELALLOC 0x0800 /* * Default journal batch times and ioprio. */ #define EXT4_DEF_MIN_BATCH_TIME 0 #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ #define EXT4_DEF_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) /* * Default values for superblock update */ #define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */ #define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */ /* * Minimum number of groups in a flexgroup before we separate out * directories into the first block group of a flexgroup */ #define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 /* * Structure of a directory entry */ #define EXT4_NAME_LEN 255 /* * Base length of the ext4 directory entry excluding the name length */ #define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN) struct ext4_dir_entry { __le32 inode; /* Inode number */ __le16 rec_len; /* Directory entry length */ __le16 name_len; /* Name length */ char name[EXT4_NAME_LEN]; /* File name */ }; /* * Encrypted Casefolded entries require saving the hash on disk. This structure * followed ext4_dir_entry_2's name[name_len] at the next 4 byte aligned * boundary. */ struct ext4_dir_entry_hash { __le32 hash; __le32 minor_hash; }; /* * The new version of the directory entry. 
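 *
 * A minimal sketch of how the ext4_has_unknown_ext4_* helpers above are
 * combined: unknown incompat bits block any mount, and unknown ro-compat
 * bits rule out a read-write mount.  This is a simplified view of what
 * ext4_feature_set_ok() checks; the helper below is illustrative and not
 * used anywhere else in ext4.
 */
static inline bool ext4_example_can_mount_rw(struct super_block *sb)
{
        return !ext4_has_unknown_ext4_incompat_features(sb) &&
               !ext4_has_unknown_ext4_ro_compat_features(sb);
}

/*
 * The new version of the directory entry.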
Since EXT4 structures are * stored in intel byte order, and the name_len field could never be * bigger than 255 chars, it's safe to reclaim the extra byte for the * file_type field. */ struct ext4_dir_entry_2 { __le32 inode; /* Inode number */ __le16 rec_len; /* Directory entry length */ __u8 name_len; /* Name length */ __u8 file_type; /* See file type macros EXT4_FT_* below */ char name[EXT4_NAME_LEN]; /* File name */ }; /* * Access the hashes at the end of ext4_dir_entry_2 */ #define EXT4_DIRENT_HASHES(entry) \ ((struct ext4_dir_entry_hash *) \ (((void *)(entry)) + \ ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND))) #define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(entry)->hash) #define EXT4_DIRENT_MINOR_HASH(entry) \ le32_to_cpu(EXT4_DIRENT_HASHES(entry)->minor_hash) static inline bool ext4_hash_in_dirent(const struct inode *inode) { return IS_CASEFOLDED(inode) && IS_ENCRYPTED(inode); } /* * This is a bogus directory entry at the end of each leaf block that * records checksums. */ struct ext4_dir_entry_tail { __le32 det_reserved_zero1; /* Pretend to be unused */ __le16 det_rec_len; /* 12 */ __u8 det_reserved_zero2; /* Zero name length */ __u8 det_reserved_ft; /* 0xDE, fake file type */ __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ }; #define EXT4_DIRENT_TAIL(block, blocksize) \ ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ ((blocksize) - \ sizeof(struct ext4_dir_entry_tail)))) /* * Ext4 directory file types. Only the low 3 bits are used. The * other bits are reserved for now. */ #define EXT4_FT_UNKNOWN 0 #define EXT4_FT_REG_FILE 1 #define EXT4_FT_DIR 2 #define EXT4_FT_CHRDEV 3 #define EXT4_FT_BLKDEV 4 #define EXT4_FT_FIFO 5 #define EXT4_FT_SOCK 6 #define EXT4_FT_SYMLINK 7 #define EXT4_FT_MAX 8 #define EXT4_FT_DIR_CSUM 0xDE /* * EXT4_DIR_PAD defines the directory entries boundaries * * NOTE: It must be a multiple of 4 */ #define EXT4_DIR_PAD 4 #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) #define EXT4_MAX_REC_LEN ((1<<16)-1) /* * The rec_len is dependent on the type of directory. Directories that are * casefolded and encrypted need to store the hash as well, so we add room for * ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should * pass NULL for dir, as those entries do not use the extra fields. */ static inline unsigned int ext4_dir_rec_len(__u8 name_len, const struct inode *dir) { int rec_len = (name_len + 8 + EXT4_DIR_ROUND); if (dir && ext4_hash_in_dirent(dir)) rec_len += sizeof(struct ext4_dir_entry_hash); return (rec_len & ~EXT4_DIR_ROUND); } /* * If we ever get support for fs block sizes > page_size, we'll need * to remove the #if statements in the next two functions... 
*/ static inline unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) { unsigned len = le16_to_cpu(dlen); #if (PAGE_SIZE >= 65536) if (len == EXT4_MAX_REC_LEN || len == 0) return blocksize; return (len & 65532) | ((len & 3) << 16); #else return len; #endif } static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) { BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)); #if (PAGE_SIZE >= 65536) if (len < 65536) return cpu_to_le16(len); if (len == blocksize) { if (blocksize == 65536) return cpu_to_le16(EXT4_MAX_REC_LEN); else return cpu_to_le16(0); } return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); #else return cpu_to_le16(len); #endif } /* * Hash Tree Directory indexing * (c) Daniel Phillips, 2001 */ #define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \ ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) #define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \ !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir))) #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) /* Legal values for the dx_root hash_version field: */ #define DX_HASH_LEGACY 0 #define DX_HASH_HALF_MD4 1 #define DX_HASH_TEA 2 #define DX_HASH_LEGACY_UNSIGNED 3 #define DX_HASH_HALF_MD4_UNSIGNED 4 #define DX_HASH_TEA_UNSIGNED 5 #define DX_HASH_SIPHASH 6 #define DX_HASH_LAST DX_HASH_SIPHASH static inline u32 ext4_chksum(u32 crc, const void *address, unsigned int length) { return crc32c(crc, address, length); } #ifdef __KERNEL__ /* hash info structure used by the directory hash */ struct dx_hash_info { u32 hash; u32 minor_hash; int hash_version; u32 *seed; }; /* 32 and 64 bit signed EOF for dx directories */ #define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) #define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) /* * Control parameters used by ext4_htree_next_block */ #define HASH_NB_ALWAYS 1 struct ext4_filename { const struct qstr *usr_fname; struct fscrypt_str disk_name; struct dx_hash_info hinfo; #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_str crypto_buf; #endif #if IS_ENABLED(CONFIG_UNICODE) struct qstr cf_name; #endif }; #define fname_name(p) ((p)->disk_name.name) #define fname_usr_name(p) ((p)->usr_fname->name) #define fname_len(p) ((p)->disk_name.len) /* * Describe an inode's exact location on disk and in memory */ struct ext4_iloc { struct buffer_head *bh; unsigned long offset; ext4_group_t block_group; }; static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) { return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); } static inline bool ext4_is_quota_file(struct inode *inode) { return IS_NOQUOTA(inode) && !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL); } /* * This structure is stuffed into the struct file's private_data field * for directories. It is where we put information so that we can do * readdir operations in hash tree order. */ struct dir_private_info { struct rb_root root; struct rb_node *curr_node; struct fname *extra_fname; loff_t last_pos; __u32 curr_hash; __u32 curr_minor_hash; __u32 next_hash; u64 cookie; bool initialized; }; /* calculate the first block number of the group */ static inline ext4_fsblk_t ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) { return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); } /* * Special error return code only used by dx_probe() and its callers. 
*/ #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) /* htree levels for ext4 */ #define EXT4_HTREE_LEVEL_COMPAT 2 #define EXT4_HTREE_LEVEL 3 static inline int ext4_dir_htree_level(struct super_block *sb) { return ext4_has_feature_largedir(sb) ? EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; } /* * Timeout and state flag for lazy initialization inode thread. */ #define EXT4_DEF_LI_WAIT_MULT 10 #define EXT4_DEF_LI_MAX_START_DELAY 5 #define EXT4_LAZYINIT_QUIT 0x0001 #define EXT4_LAZYINIT_RUNNING 0x0002 /* * Lazy inode table initialization info */ struct ext4_lazy_init { unsigned long li_state; struct list_head li_request_list; struct mutex li_list_mtx; }; enum ext4_li_mode { EXT4_LI_MODE_PREFETCH_BBITMAP, EXT4_LI_MODE_ITABLE, }; struct ext4_li_request { struct super_block *lr_super; enum ext4_li_mode lr_mode; ext4_group_t lr_first_not_zeroed; ext4_group_t lr_next_group; struct list_head lr_request; unsigned long lr_next_sched; unsigned long lr_timeout; }; struct ext4_features { struct kobject f_kobj; struct completion f_kobj_unregister; }; /* * This structure will be used for multiple mount protection. It will be * written into the block number saved in the s_mmp_block field in the * superblock. Programs that check MMP should assume that if * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe * to use the filesystem, regardless of how old the timestamp is. */ #define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ #define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ #define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ #define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ struct mmp_struct { __le32 mmp_magic; /* Magic number for MMP */ __le32 mmp_seq; /* Sequence no. updated periodically */ /* * mmp_time, mmp_nodename & mmp_bdevname are only used for information * purposes and do not affect the correctness of the algorithm */ __le64 mmp_time; /* Time last updated */ char mmp_nodename[64]; /* Node which last updated MMP block */ char mmp_bdevname[32]; /* Bdev which last updated MMP block */ /* * mmp_check_interval is used to verify if the MMP block has been * updated on the block device. The value is updated based on the * maximum time to write the MMP block during an update cycle. */ __le16 mmp_check_interval; __le16 mmp_pad1; __le32 mmp_pad2[226]; __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ }; /* arguments passed to the mmp thread */ struct mmpd_data { struct buffer_head *bh; /* bh from initial read_mmp_block() */ struct super_block *sb; /* super block of the fs */ }; /* * Check interval multiplier * The MMP block is written every update interval and initially checked every * update interval x the multiplier (the value is then adapted based on the * write latency). The reason is that writes can be delayed under load and we * don't want readers to incorrectly assume that the filesystem is no longer * in use. */ #define EXT4_MMP_CHECK_MULT 2UL /* * Minimum interval for MMP checking in seconds. */ #define EXT4_MMP_MIN_CHECK_INTERVAL 5UL /* * Maximum interval for MMP checking in seconds. */ #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL /* * Function prototypes */ /* * Ok, these declarations are also in <linux/kernel.h> but none of the * ext4 source programs needs to include it so they are duplicated here. 
*/ # define NORET_TYPE /**/ # define ATTRIB_NORET __attribute__((noreturn)) # define NORET_AND noreturn, /* bitmap.c */ extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); void ext4_block_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); /* balloc.c */ extern void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); extern ext4_group_t ext4_get_group_number(struct super_block *sb, ext4_fsblk_t block); extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned int flags, unsigned long *count, int *errp); extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags); extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); extern struct ext4_group_info *ext4_get_group_info(struct super_block *sb, ext4_group_t group); extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, bool ignore_locked); extern int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh); extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group); extern unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); #if IS_ENABLED(CONFIG_UNICODE) extern int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *fname); static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname) { kfree(fname->cf_name.name); fname->cf_name.name = NULL; } #else static inline int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *fname) { return 0; } static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname) { } #endif /* ext4 encryption related stuff goes here crypto.c */ #ifdef CONFIG_FS_ENCRYPTION extern const struct fscrypt_operations ext4_cryptops; int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname); int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, struct ext4_filename *fname); void ext4_fname_free_filename(struct ext4_filename *fname); int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg); #else /* !CONFIG_FS_ENCRYPTION */ static inline int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname) { fname->usr_fname = iname; fname->disk_name.name = (unsigned char *) iname->name; fname->disk_name.len = iname->len; return ext4_fname_setup_ci_filename(dir, iname, fname); } static inline int 
ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, struct ext4_filename *fname) { return ext4_fname_setup_filename(dir, &dentry->d_name, 1, fname); } static inline void ext4_fname_free_filename(struct ext4_filename *fname) { ext4_fname_free_ci_filename(fname); } static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } #endif /* !CONFIG_FS_ENCRYPTION */ /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct file *, struct ext4_dir_entry_2 *, struct buffer_head *, char *, int, unsigned int); #define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ (de), (bh), (buf), (size), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent, struct fscrypt_str *ent_name); extern void ext4_htree_free_dir_info(struct dir_private_info *p); extern int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); void ext4_insert_dentry(struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, struct ext4_filename *fname); static inline void ext4_update_dx_flag(struct inode *inode) { if (!ext4_has_feature_dir_index(inode->i_sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { /* ext4_iget() should have caught this... */ WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } } static const unsigned char ext4_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; static inline unsigned char get_dtype(struct super_block *sb, int filetype) { if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) return DT_UNKNOWN; return ext4_filetype_table[filetype]; } extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size); /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); /* hash.c */ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, struct dx_hash_info *hinfo); /* ialloc.c */ extern int ext4_mark_inode_used(struct super_block *sb, int ino); extern struct inode *__ext4_new_inode(struct mnt_idmap *, handle_t *, struct inode *, umode_t, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, int handle_type, unsigned int line_no, int nblocks); #define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ __ext4_new_inode(&nop_mnt_idmap, (handle), (dir), (mode), (qstr), \ (goal), (owner), i_flags, 0, 0, 0) #define ext4_new_inode_start_handle(idmap, dir, mode, qstr, goal, owner, \ type, nblocks) \ __ext4_new_inode((idmap), NULL, (dir), (mode), (qstr), (goal), (owner), \ 0, (type), __LINE__, (nblocks)) extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier); extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); /* fast_commit.c */ int ext4_fc_info_show(struct seq_file *seq, void *v); void ext4_fc_init(struct super_block *sb, journal_t *journal); void 
ext4_fc_init_inode(struct inode *inode); void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, struct dentry *dentry); void __ext4_fc_track_link(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry); void ext4_fc_track_link(handle_t *handle, struct dentry *dentry); void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); void ext4_fc_del(struct inode *inode); bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block); void ext4_fc_replay_cleanup(struct super_block *sb); int ext4_fc_commit(journal_t *journal, tid_t commit_tid); int __init ext4_fc_init_dentry_cache(void); void ext4_fc_destroy_dentry_cache(void); int ext4_fc_record_regions(struct super_block *sb, int ino, ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay); /* mballoc.c */ extern const struct seq_operations ext4_mb_seq_groups_ops; extern const struct seq_operations ext4_mb_seq_structs_summary_ops; extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); extern int ext4_mb_init(struct super_block *); extern void ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); extern void ext4_discard_preallocations(struct inode *); extern int __init ext4_init_mballoc(void); extern void ext4_exit_mballoc(void); extern ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, unsigned int nr, int *cnt); extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, unsigned int nr); extern void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags); extern int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, int len, bool state); static inline bool ext4_mb_cr_expensive(enum criteria cr) { return cr >= CR_GOAL_LEN_SLOW; } /* inode.c */ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei); int ext4_inode_is_fast_symlink(struct inode *inode); void ext4_check_map_extents_env(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, bool wait, struct buffer_head **bhs); int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create); int 
ext4_walk_page_buffers(handle_t *handle, struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, int (*fn)(handle_t *handle, struct inode *inode, struct buffer_head *bh)); int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh); void ext4_set_inode_mapping_order(struct inode *inode); #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 typedef enum { EXT4_IGET_NORMAL = 0, EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */ EXT4_IGET_BAD = 0x0004, /* Allow to iget a bad inode */ EXT4_IGET_EA_INODE = 0x0008 /* Inode should contain an EA value */ } ext4_iget_flags; extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line); #define ext4_iget(sb, ino, flags) \ __ext4_iget((sb), (ino), (flags), __func__, __LINE__) extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern u32 ext4_dio_alignment(struct inode *inode); extern int ext4_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void ext4_evict_inode(struct inode *); extern void ext4_clear_inode(struct inode *); extern int ext4_file_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc); extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); extern int ext4_truncate_page_cache_block_range(struct inode *inode, loff_t start, loff_t end); extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks); extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_release_space(struct inode *inode, int to_free); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); static inline bool is_special_ino(struct super_block *sb, unsigned long ino) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; return (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || ino == le32_to_cpu(es->s_usr_quota_inum) || ino == le32_to_cpu(es->s_grp_quota_inum) || ino == le32_to_cpu(es->s_prj_quota_inum) || ino == le32_to_cpu(es->s_orphan_file_inum); } /* indirect.c */ extern int ext4_ind_map_blocks(handle_t 
*handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); int ext4_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa); int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa); extern void ext4_reset_inode_seed(struct inode *inode); int ext4_update_overhead(struct super_block *sb, bool force); int ext4_force_shutdown(struct super_block *sb, u32 flags); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); extern int ext4_ind_migrate(struct inode *inode); /* namei.c */ extern int ext4_init_new_dir(handle_t *handle, struct inode *dir, struct inode *inode); extern int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); extern int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, struct inode *dir, struct ext4_filename *fname, unsigned int offset, struct ext4_dir_entry_2 **res_dir); extern int ext4_generic_delete_entry(struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, void *entry_buf, int buf_size, int csum_size); extern bool ext4_empty_dir(struct inode *inode); /* resize.c */ extern void ext4_kvfree_array_rcu(void *to_free); extern int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input); extern int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t n_blocks_count); extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); extern unsigned int ext4_list_backups(struct super_block *sb, unsigned int *three, unsigned int *five, unsigned int *seven); /* super.c */ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags); extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block); extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail); extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail); extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait); extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); extern __le32 ext4_superblock_csum(struct ext4_super_block *es); extern void ext4_superblock_csum_set(struct super_block *sb); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); extern const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]); extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t block_group, unsigned int flags); extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb, ext4_group_t block_group); extern __printf(7, 8) void __ext4_error(struct super_block *, const char *, unsigned int, bool, int, __u64, const char *, ...); extern __printf(6, 7) void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, 
int, const char *, ...); extern __printf(5, 6) void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, const char *, ...); extern void __ext4_std_error(struct super_block *, const char *, unsigned int, int); extern __printf(4, 5) void __ext4_warning(struct super_block *, const char *, unsigned int, const char *, ...); extern __printf(4, 5) void __ext4_warning_inode(const struct inode *inode, const char *function, unsigned int line, const char *fmt, ...); extern __printf(3, 4) void __ext4_msg(struct super_block *, const char *, const char *, ...); extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, const char *, unsigned int, const char *); extern __printf(7, 8) void __ext4_grp_locked_error(const char *, unsigned int, struct super_block *, ext4_group_t, unsigned long, ext4_fsblk_t, const char *, ...); #define EXT4_ERROR_INODE(inode, fmt, a...) \ ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) #define EXT4_ERROR_INODE_ERR(inode, err, fmt, a...) \ __ext4_error_inode((inode), __func__, __LINE__, 0, (err), (fmt), ## a) #define ext4_error_inode_block(inode, block, err, fmt, a...) \ __ext4_error_inode((inode), __func__, __LINE__, (block), (err), \ (fmt), ## a) #define EXT4_ERROR_FILE(file, block, fmt, a...) \ ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) #define ext4_abort(sb, err, fmt, a...) \ __ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt), ## a) #ifdef CONFIG_PRINTK #define ext4_error_inode(inode, func, line, block, fmt, ...) \ __ext4_error_inode(inode, func, line, block, 0, fmt, ##__VA_ARGS__) #define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ __ext4_error_inode((inode), (func), (line), (block), \ (err), (fmt), ##__VA_ARGS__) #define ext4_error_file(file, func, line, block, fmt, ...) \ __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) #define ext4_error(sb, fmt, ...) \ __ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt), \ ##__VA_ARGS__) #define ext4_error_err(sb, err, fmt, ...) \ __ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt), \ ##__VA_ARGS__) #define ext4_warning(sb, fmt, ...) \ __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_warning_inode(inode, fmt, ...) \ __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_msg(sb, level, fmt, ...) \ __ext4_msg(sb, level, fmt, ##__VA_ARGS__) #define dump_mmp_msg(sb, mmp, msg) \ __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) #define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ fmt, ##__VA_ARGS__) #else #define ext4_error_inode(inode, func, line, block, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error_inode(inode, "", 0, block, 0, " "); \ } while (0) #define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error_inode(inode, "", 0, block, err, " "); \ } while (0) #define ext4_error_file(file, func, line, block, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error_file(file, "", 0, block, " "); \ } while (0) #define ext4_error(sb, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error(sb, "", 0, false, 0, 0, " "); \ } while (0) #define ext4_error_err(sb, err, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error(sb, "", 0, false, err, 0, " "); \ } while (0) #define ext4_warning(sb, fmt, ...) 
\ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_warning(sb, "", 0, " "); \ } while (0) #define ext4_warning_inode(inode, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_warning_inode(inode, "", 0, " "); \ } while (0) #define ext4_msg(sb, level, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_msg(sb, "", " "); \ } while (0) #define dump_mmp_msg(sb, mmp, msg) \ __dump_mmp_msg(sb, mmp, "", 0, "") #define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ } while (0) #endif extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_free_group_clusters(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_used_dirs_count(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_itable_unused_count(struct super_block *sb, struct ext4_group_desc *bg); extern void ext4_block_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_free_group_clusters_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, struct ext4_group_desc *gdp); extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, struct ext4_group_desc *gdp); extern int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed); static inline int ext4_has_group_desc_csum(struct super_block *sb) { return ext4_has_feature_gdt_csum(sb) || ext4_has_feature_metadata_csum(sb); } #define ext4_read_incompat_64bit_val(es, name) \ (((es)->s_feature_incompat & cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT) \ ? 
(ext4_fsblk_t)le32_to_cpu(es->name##_hi) << 32 : 0) | \ le32_to_cpu(es->name##_lo)) static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { return ext4_read_incompat_64bit_val(es, s_blocks_count); } static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) { return ext4_read_incompat_64bit_val(es, s_r_blocks_count); } static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) { return ext4_read_incompat_64bit_val(es, s_free_blocks_count); } static inline void ext4_blocks_count_set(struct ext4_super_block *es, ext4_fsblk_t blk) { es->s_blocks_count_lo = cpu_to_le32((u32)blk); es->s_blocks_count_hi = cpu_to_le32(blk >> 32); } static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, ext4_fsblk_t blk) { es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); } static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, ext4_fsblk_t blk) { es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); } static inline loff_t ext4_isize(struct super_block *sb, struct ext4_inode *raw_inode) { if (ext4_has_feature_largedir(sb) || S_ISREG(le16_to_cpu(raw_inode->i_mode))) return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | le32_to_cpu(raw_inode->i_size_lo); return (loff_t) le32_to_cpu(raw_inode->i_size_lo); } static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) { raw_inode->i_size_lo = cpu_to_le32(i_size); raw_inode->i_size_high = cpu_to_le32(i_size >> 32); } /* * Reading s_groups_count requires using smp_rmb() afterwards. See * the locking protocol documented in the comments of ext4_group_add() * in resize.c */ static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) { ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; smp_rmb(); return ngroups; } static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, ext4_group_t block_group) { return block_group >> sbi->s_log_groups_per_flex; } static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) { return 1 << sbi->s_log_groups_per_flex; } static inline loff_t ext4_get_maxbytes(struct inode *inode) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return inode->i_sb->s_maxbytes; return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; } #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ __ext4_std_error((sb), __func__, __LINE__, (errno)); \ } while (0) #ifdef CONFIG_SMP /* Each CPU can accumulate percpu_counter_batch clusters in their local * counters. So we need to make sure we have free clusters more * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. */ #define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) #else #define EXT4_FREECLUSTERS_WATERMARK 0 #endif /* Update i_disksize. Requires i_rwsem to avoid races with truncate */ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) { WARN_ON_ONCE(S_ISREG(inode->i_mode) && !inode_is_locked(inode)); down_write(&EXT4_I(inode)->i_data_sem); if (newsize > EXT4_I(inode)->i_disksize) WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize); up_write(&EXT4_I(inode)->i_data_sem); } /* Update i_size, i_disksize. 
Requires i_rwsem to avoid races with truncate */ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) { int changed = 0; if (newsize > inode->i_size) { i_size_write(inode, newsize); changed = 1; } if (newsize > EXT4_I(inode)->i_disksize) { ext4_update_i_disksize(inode, newsize); changed |= 2; } return changed; } int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t len); struct ext4_group_info { unsigned long bb_state; #ifdef AGGRESSIVE_CHECK unsigned long bb_check_counter; #endif struct rb_root bb_free_root; ext4_grpblk_t bb_first_free; /* first free block */ ext4_grpblk_t bb_free; /* total free blocks */ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ int bb_avg_fragment_size_order; /* order of average fragment in BG */ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ ext4_group_t bb_group; /* Group number */ struct list_head bb_prealloc_list; #ifdef DOUBLE_CHECK void *bb_bitmap; #endif struct rw_semaphore alloc_sem; ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block * regions, index is order. * bb_counters[3] = 5 means * 5 free 8-block regions. */ }; #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 #define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT \ (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) #define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) #define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_WAS_TRIMMED(grp) \ (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_SET_TRIMMED(grp) \ (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \ (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state))) #define EXT4_MAX_CONTENTION 8 #define EXT4_CONTENTION_THRESHOLD 2 static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, ext4_group_t group) { return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); } /* * Returns true if the filesystem is busy enough that attempts to * access the block group locks has run into contention. */ static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) { return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); } static inline bool ext4_try_lock_group(struct super_block *sb, ext4_group_t group) { if (!spin_trylock(ext4_group_lock_ptr(sb, group))) return false; /* * We're able to grab the lock right away, so drop the lock * contention counter. */ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); return true; } static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) { if (!ext4_try_lock_group(sb, group)) { /* * The lock is busy, so bump the contention counter, * and then wait on the spin lock. 
*/ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, EXT4_MAX_CONTENTION); spin_lock(ext4_group_lock_ptr(sb, group)); } } static inline void ext4_unlock_group(struct super_block *sb, ext4_group_t group) { spin_unlock(ext4_group_lock_ptr(sb, group)); } #ifdef CONFIG_QUOTA static inline bool ext4_quota_capable(struct super_block *sb) { return (test_opt(sb, QUOTA) || ext4_has_feature_quota(sb)); } static inline bool ext4_is_quota_journalled(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); return (ext4_has_feature_quota(sb) || sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]); } int ext4_enable_quotas(struct super_block *sb); #endif /* * Block validity checking */ #define ext4_check_indirect_blockref(inode, bh) \ ext4_check_blockref(__func__, __LINE__, inode, \ (__le32 *)(bh)->b_data, \ EXT4_ADDR_PER_BLOCK((inode)->i_sb)) #define ext4_ind_check_inode(inode) \ ext4_check_blockref(__func__, __LINE__, inode, \ EXT4_I(inode)->i_data, \ EXT4_NDIR_BLOCKS) /* * Inodes and files operations */ /* dir.c */ extern const struct file_operations ext4_dir_operations; /* file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ extern int ext4_get_max_inline_size(struct inode *inode); extern int ext4_find_inline_data_nolock(struct inode *inode); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); extern void ext4_update_final_de(void *de_buf, int old_size, int new_size); int ext4_readpage_inline(struct inode *inode, struct folio *folio); extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct folio **foliop); int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct folio *folio); extern int ext4_generic_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct folio **foliop, void **fsdata, bool da); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); extern int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode); extern int ext4_read_inline_dir(struct file *filp, struct dir_context *ctx, int *has_inline_data); extern int ext4_inlinedir_to_tree(struct file *dir_file, struct inode *dir, ext4_lblk_t block, struct dx_hash_info *hinfo, __u32 start_hash, __u32 start_minor_hash, int *has_inline_data); extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir, int *has_inline_data); extern int ext4_delete_inline_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, int *has_inline_data); extern bool empty_inline_dir(struct inode *dir, int *has_inline_data); extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval); extern void *ext4_read_inline_link(struct inode *inode); struct iomap; extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap); extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline); extern int ext4_convert_inline_data(struct inode *inode); static inline int ext4_has_inline_data(struct inode *inode) { return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && EXT4_I(inode)->i_inline_off; } /* namei.c */ extern 
const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; extern struct dentry *ext4_get_parent(struct dentry *child); extern int ext4_init_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *dir_block, unsigned int parent_ino, void *inline_buf, int inline_size); extern void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize); extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *bh); extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, struct inode *inode, struct dentry *dentry); extern int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry); #define S_SHIFT 12 static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, }; static inline void ext4_set_de_type(struct super_block *sb, struct ext4_dir_entry_2 *de, umode_t mode) { if (ext4_has_feature_filetype(sb)) de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; } /* readpages.c */ extern int ext4_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct folio *folio); extern int __init ext4_init_post_read_processing(void); extern void ext4_exit_post_read_processing(void); /* symlink.c */ extern const struct inode_operations ext4_encrypted_symlink_inode_operations; extern const struct inode_operations ext4_symlink_inode_operations; extern const struct inode_operations ext4_fast_symlink_inode_operations; /* sysfs.c */ extern void ext4_notify_error_sysfs(struct ext4_sb_info *sbi); extern int ext4_register_sysfs(struct super_block *sb); extern void ext4_unregister_sysfs(struct super_block *sb); extern int __init ext4_init_sysfs(void); extern void ext4_exit_sysfs(void); /* block_validity */ extern void ext4_release_system_zone(struct super_block *sb); extern int ext4_setup_system_zone(struct super_block *sb); extern int __init ext4_init_system_zone(void); extern void ext4_exit_system_zone(void); extern int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, unsigned int count); extern int ext4_check_blockref(const char *, unsigned int, struct inode *, __le32 *, unsigned int); extern int ext4_sb_block_valid(struct super_block *sb, struct inode *inode, ext4_fsblk_t start_blk, unsigned int count); /* extents.c */ struct ext4_ext_path; struct ext4_extent; /* * Maximum number of logical blocks in a file; ext4_extent's ee_block is * __le32. 
*/ #define EXT_MAX_BLOCKS 0xffffffff extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_truncate(handle_t *, struct inode *); extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); extern int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); extern struct ext4_ext_path *ext4_ext_insert_extent( handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *newext, int gb_flags); extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path *, int flags); extern void ext4_free_ext_path(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, int mark_unwritten,int *err); extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, int check_cred, int restart_cred, int revoke_cred); extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end); extern int ext4_ext_replay_set_iblocks(struct inode *inode); extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, int len, int unwritten, ext4_fsblk_t pblk); extern int ext4_ext_clear_bb(struct inode *inode); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, struct inode *second); extern void ext4_double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode); extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); /* page-io.c */ extern int __init ext4_init_pageio(void); extern void ext4_exit_pageio(void); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); extern int ext4_put_io_end(ext4_io_end_t *io_end); extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); extern void ext4_io_submit_init(struct ext4_io_submit *io, struct writeback_control *wbc); extern void ext4_end_io_rsv_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page, size_t len); extern 
struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); /* mmp.c */ extern void ext4_stop_mmpd(struct ext4_sb_info *sbi); /* verity.c */ extern const struct fsverity_operations ext4_verityops; /* orphan.c */ extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es); extern void ext4_release_orphan_info(struct super_block *sb); extern int ext4_init_orphan_info(struct super_block *sb); extern int ext4_orphan_file_empty(struct super_block *sb); extern void ext4_orphan_file_block_trigger( struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size); /* * Add new method to test whether block and inode bitmaps are properly * initialized. With uninit_bg reading the block from disk is not enough * to mark the bitmap uptodate. We need to also zero-out the bitmap */ #define BH_BITMAP_UPTODATE BH_JBDPrivateStart static inline int bitmap_uptodate(struct buffer_head *bh) { return (buffer_uptodate(bh) && test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); } static inline void set_bitmap_uptodate(struct buffer_head *bh) { set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } extern int ext4_resize_begin(struct super_block *sb); extern int ext4_resize_end(struct super_block *sb, bool update_backups); static inline void ext4_set_io_unwritten_flag(struct ext4_io_end *io_end) { if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) io_end->flag |= EXT4_IO_END_UNWRITTEN; } static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) { if (io_end->flag & EXT4_IO_END_UNWRITTEN) io_end->flag &= ~EXT4_IO_END_UNWRITTEN; } extern const struct iomap_ops ext4_iomap_ops; extern const struct iomap_ops ext4_iomap_overwrite_ops; extern const struct iomap_ops ext4_iomap_report_ops; static inline int ext4_buffer_uptodate(struct buffer_head *bh) { /* * If the buffer has the write error flag, we have failed * to write out data in the block. In this case, we don't * have to read the block because we may read the old data * successfully. */ if (buffer_write_io_error(bh)) set_buffer_uptodate(bh); return buffer_uptodate(bh); } static inline bool ext4_inode_can_atomic_write(struct inode *inode) { return S_ISREG(inode->i_mode) && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && EXT4_SB(inode->i_sb)->s_awu_min > 0; } extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* _EXT4_H */
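/*
 * Illustrative sketch, not part of ext4.h: a self-contained user-space demo of
 * the rec_len arithmetic declared above. demo_rec_len_to_disk() and
 * demo_rec_len_from_disk() are hypothetical helpers that mirror
 * ext4_rec_len_to_disk()/ext4_rec_len_from_disk() minus the le16 conversion,
 * to show how entry lengths of 64KB and above are squeezed into the 16-bit
 * on-disk field: entry lengths are multiples of EXT4_DIR_PAD, so the two low
 * bits are free to carry bits 16..17 of the real length.
 */
#include <assert.h>

static unsigned int demo_rec_len_to_disk(unsigned int len, unsigned int blocksize)
{
	if (len < 65536)
		return len;				/* fits directly */
	if (len == blocksize)				/* whole-block entry */
		return blocksize == 65536 ? 65535 : 0;
	return (len & 65532) | ((len >> 16) & 3);	/* fold high bits down */
}

static unsigned int demo_rec_len_from_disk(unsigned int dlen, unsigned int blocksize)
{
	if (dlen == 65535 || dlen == 0)
		return blocksize;
	return (dlen & 65532) | ((dlen & 3) << 16);
}

int main(void)
{
	unsigned int blocksize = 262144;	/* hypothetical 256KB dir block */
	unsigned int len = 131072 + 64;		/* an entry spanning > 64KB */

	assert(demo_rec_len_from_disk(demo_rec_len_to_disk(len, blocksize),
				      blocksize) == len);
	assert(demo_rec_len_from_disk(demo_rec_len_to_disk(blocksize, blocksize),
				      blocksize) == blocksize);
	return 0;
}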
/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/module.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tproxy.h> #include <net/inet_sock.h> #include <net/tcp.h> #include <linux/if_ether.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) #include <net/netfilter/ipv6/nf_defrag_ipv6.h> #endif struct nft_tproxy { u8 sreg_addr; u8 sreg_port; u8 family; }; static void nft_tproxy_eval_v4(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_tproxy *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; const struct iphdr *iph = ip_hdr(skb); struct udphdr _hdr, *hp; __be32 taddr = 0; __be16 tport = 0; struct sock *sk; if (pkt->tprot != IPPROTO_TCP && pkt->tprot != IPPROTO_UDP) { regs->verdict.code = NFT_BREAK; return; } hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); if (!hp) { regs->verdict.code = NFT_BREAK; return; } /* check if there's an ongoing connection on the packet addresses, this * happens if the redirect already happened and the current packet * belongs to an already established connection */ sk = nf_tproxy_get_sock_v4(nft_net(pkt), skb, iph->protocol, iph->saddr, iph->daddr, hp->source, hp->dest, skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED); if (priv->sreg_addr) taddr = nft_reg_load_be32(&regs->data[priv->sreg_addr]); taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr); if (priv->sreg_port) tport = nft_reg_load_be16(&regs->data[priv->sreg_port]); if (!tport) tport = hp->dest; /* UDP has no TCP_TIME_WAIT state, so we never enter here */ if (sk && sk->sk_state == TCP_TIME_WAIT) { /* reopening a TIME_WAIT connection needs special handling */ sk = nf_tproxy_handle_time_wait4(nft_net(pkt), skb, taddr, tport, sk); } else if (!sk) { /* no, there's no established connection, check if * there's a listener on the redirected addr/port */ sk = nf_tproxy_get_sock_v4(nft_net(pkt), skb, iph->protocol, iph->saddr, taddr, hp->source, tport, skb->dev, NF_TPROXY_LOOKUP_LISTENER); } if (sk && nf_tproxy_sk_is_transparent(sk))
nf_tproxy_assign_sock(skb, sk); else regs->verdict.code = NFT_BREAK; } #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) static void nft_tproxy_eval_v6(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_tproxy *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; const struct ipv6hdr *iph = ipv6_hdr(skb); int thoff = nft_thoff(pkt); struct udphdr _hdr, *hp; struct in6_addr taddr; __be16 tport = 0; struct sock *sk; int l4proto; memset(&taddr, 0, sizeof(taddr)); if (pkt->tprot != IPPROTO_TCP && pkt->tprot != IPPROTO_UDP) { regs->verdict.code = NFT_BREAK; return; } l4proto = pkt->tprot; hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); if (hp == NULL) { regs->verdict.code = NFT_BREAK; return; } /* check if there's an ongoing connection on the packet addresses, this * happens if the redirect already happened and the current packet * belongs to an already established connection */ sk = nf_tproxy_get_sock_v6(nft_net(pkt), skb, thoff, l4proto, &iph->saddr, &iph->daddr, hp->source, hp->dest, nft_in(pkt), NF_TPROXY_LOOKUP_ESTABLISHED); if (priv->sreg_addr) memcpy(&taddr, &regs->data[priv->sreg_addr], sizeof(taddr)); taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr); if (priv->sreg_port) tport = nft_reg_load_be16(&regs->data[priv->sreg_port]); if (!tport) tport = hp->dest; /* UDP has no TCP_TIME_WAIT state, so we never enter here */ if (sk && sk->sk_state == TCP_TIME_WAIT) { /* reopening a TIME_WAIT connection needs special handling */ sk = nf_tproxy_handle_time_wait6(skb, l4proto, thoff, nft_net(pkt), &taddr, tport, sk); } else if (!sk) { /* no there's no established connection, check if * there's a listener on the redirected addr/port */ sk = nf_tproxy_get_sock_v6(nft_net(pkt), skb, thoff, l4proto, &iph->saddr, &taddr, hp->source, tport, nft_in(pkt), NF_TPROXY_LOOKUP_LISTENER); } /* NOTE: assign_sock consumes our sk reference */ if (sk && nf_tproxy_sk_is_transparent(sk)) nf_tproxy_assign_sock(skb, sk); else regs->verdict.code = NFT_BREAK; } #endif static void nft_tproxy_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_tproxy *priv = nft_expr_priv(expr); switch (nft_pf(pkt)) { case NFPROTO_IPV4: switch (priv->family) { case NFPROTO_IPV4: case NFPROTO_UNSPEC: nft_tproxy_eval_v4(expr, regs, pkt); return; } break; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: switch (priv->family) { case NFPROTO_IPV6: case NFPROTO_UNSPEC: nft_tproxy_eval_v6(expr, regs, pkt); return; } #endif } regs->verdict.code = NFT_BREAK; } static const struct nla_policy nft_tproxy_policy[NFTA_TPROXY_MAX + 1] = { [NFTA_TPROXY_FAMILY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_TPROXY_REG_ADDR] = { .type = NLA_U32 }, [NFTA_TPROXY_REG_PORT] = { .type = NLA_U32 }, }; static int nft_tproxy_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_tproxy *priv = nft_expr_priv(expr); unsigned int alen = 0; int err; if (!tb[NFTA_TPROXY_FAMILY] || (!tb[NFTA_TPROXY_REG_ADDR] && !tb[NFTA_TPROXY_REG_PORT])) return -EINVAL; priv->family = ntohl(nla_get_be32(tb[NFTA_TPROXY_FAMILY])); switch (ctx->family) { case NFPROTO_IPV4: if (priv->family != NFPROTO_IPV4) return -EINVAL; break; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: if (priv->family != NFPROTO_IPV6) return -EINVAL; break; #endif case NFPROTO_INET: break; default: return -EOPNOTSUPP; } /* Address is specified but the rule family is not set accordingly */ if (priv->family == NFPROTO_UNSPEC && 
tb[NFTA_TPROXY_REG_ADDR]) return -EINVAL; switch (priv->family) { case NFPROTO_IPV4: alen = sizeof_field(union nf_inet_addr, in); err = nf_defrag_ipv4_enable(ctx->net); if (err) return err; break; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: alen = sizeof_field(union nf_inet_addr, in6); err = nf_defrag_ipv6_enable(ctx->net); if (err) return err; break; #endif case NFPROTO_UNSPEC: /* No address is specified here */ err = nf_defrag_ipv4_enable(ctx->net); if (err) return err; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) err = nf_defrag_ipv6_enable(ctx->net); if (err) return err; #endif break; default: return -EOPNOTSUPP; } if (tb[NFTA_TPROXY_REG_ADDR]) { err = nft_parse_register_load(ctx, tb[NFTA_TPROXY_REG_ADDR], &priv->sreg_addr, alen); if (err < 0) return err; } if (tb[NFTA_TPROXY_REG_PORT]) { err = nft_parse_register_load(ctx, tb[NFTA_TPROXY_REG_PORT], &priv->sreg_port, sizeof(u16)); if (err < 0) return err; } return 0; } static void nft_tproxy_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_tproxy *priv = nft_expr_priv(expr); switch (priv->family) { case NFPROTO_IPV4: nf_defrag_ipv4_disable(ctx->net); break; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: nf_defrag_ipv6_disable(ctx->net); break; #endif case NFPROTO_UNSPEC: nf_defrag_ipv4_disable(ctx->net); #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) nf_defrag_ipv6_disable(ctx->net); #endif break; } } static int nft_tproxy_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_tproxy *priv = nft_expr_priv(expr); if (nla_put_be32(skb, NFTA_TPROXY_FAMILY, htonl(priv->family))) return -1; if (priv->sreg_addr && nft_dump_register(skb, NFTA_TPROXY_REG_ADDR, priv->sreg_addr)) return -1; if (priv->sreg_port && nft_dump_register(skb, NFTA_TPROXY_REG_PORT, priv->sreg_port)) return -1; return 0; } static int nft_tproxy_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; return nft_chain_validate_hooks(ctx->chain, 1 << NF_INET_PRE_ROUTING); } static struct nft_expr_type nft_tproxy_type; static const struct nft_expr_ops nft_tproxy_ops = { .type = &nft_tproxy_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_tproxy)), .eval = nft_tproxy_eval, .init = nft_tproxy_init, .destroy = nft_tproxy_destroy, .dump = nft_tproxy_dump, .reduce = NFT_REDUCE_READONLY, .validate = nft_tproxy_validate, }; static struct nft_expr_type nft_tproxy_type __read_mostly = { .name = "tproxy", .ops = &nft_tproxy_ops, .policy = nft_tproxy_policy, .maxattr = NFTA_TPROXY_MAX, .owner = THIS_MODULE, }; static int __init nft_tproxy_module_init(void) { return nft_register_expr(&nft_tproxy_type); } static void __exit nft_tproxy_module_exit(void) { nft_unregister_expr(&nft_tproxy_type); } module_init(nft_tproxy_module_init); module_exit(nft_tproxy_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Máté Eckl"); MODULE_DESCRIPTION("nf_tables tproxy support module"); MODULE_ALIAS_NFT_EXPR("tproxy");
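The expression above only redirects traffic to a local socket that is itself flagged as transparent, so the proxy process must opt in from userspace. Below is a minimal illustrative sketch of such a listener, not part of this module: the IP_TRANSPARENT option is the property nf_tproxy_sk_is_transparent() tests for, while the port 9040 and the companion rule "nft add rule inet proxy prerouting tcp dport 80 tproxy ip to :9040 accept" are assumptions made up for the example (exact rule syntax depends on the nft version and ruleset).

#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

#ifndef IP_TRANSPARENT
#define IP_TRANSPARENT 19	/* value from linux/in.h */
#endif

int main(void)
{
	struct sockaddr_in addr;
	int one = 1;
	int fd;

	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0)
		return 1;

	/* Requires CAP_NET_ADMIN; marks the socket transparent so that
	 * nf_tproxy_sk_is_transparent() accepts it and the tproxy
	 * expression can assign diverted skbs to it. */
	if (setsockopt(fd, IPPROTO_IP, IP_TRANSPARENT, &one, sizeof(one)) < 0)
		return 1;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(9040);	/* illustrative port only */

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, 16) < 0)
		return 1;

	/* accept()/relay loop of the actual proxy omitted */
	close(fd);
	return 0;
}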
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The User Datagram Protocol (UDP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Hirokazu Takahashi, <taka@valinux.co.jp> * * Fixes: * Alan Cox : verify_area() calls * Alan Cox : stopped close while in use off icmp * messages. Not a fix but a botch that * for udp at least is 'valid'. * Alan Cox : Fixed icmp handling properly * Alan Cox : Correct error for oversized datagrams * Alan Cox : Tidied select() semantics.
* Alan Cox : udp_err() fixed properly, also now * select and read wake correctly on errors * Alan Cox : udp_send verify_area moved to avoid mem leak * Alan Cox : UDP can count its memory * Alan Cox : send to an unknown connection causes * an ECONNREFUSED off the icmp, but * does NOT close. * Alan Cox : Switched to new sk_buff handlers. No more backlog! * Alan Cox : Using generic datagram code. Even smaller and the PEEK * bug no longer crashes it. * Fred Van Kempen : Net2e support for sk->broadcast. * Alan Cox : Uses skb_free_datagram * Alan Cox : Added get/set sockopt support. * Alan Cox : Broadcasting without option set returns EACCES. * Alan Cox : No wakeup calls. Instead we now use the callbacks. * Alan Cox : Use ip_tos and ip_ttl * Alan Cox : SNMP Mibs * Alan Cox : MSG_DONTROUTE, and 0.0.0.0 support. * Matt Dillon : UDP length checks. * Alan Cox : Smarter af_inet used properly. * Alan Cox : Use new kernel side addressing. * Alan Cox : Incorrect return on truncated datagram receive. * Arnt Gulbrandsen : New udp_send and stuff * Alan Cox : Cache last socket * Alan Cox : Route cache * Jon Peatfield : Minor efficiency fix to sendto(). * Mike Shaver : RFC1122 checks. * Alan Cox : Nonblocking error fix. * Willy Konynenberg : Transparent proxying support. * Mike McLagan : Routing by source * David S. Miller : New socket lookup architecture. * Last socket cache retained as it * does have a high hit rate. * Olaf Kirch : Don't linearise iovec on sendmsg. * Andi Kleen : Some cleanups, cache destination entry * for connect. * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Melvin Smith : Check msg_name not msg_namelen in sendto(), * return ENOTCONN for unconnected sockets (POSIX) * Janos Farkas : don't deliver multi/broadcasts to a different * bound-to-device socket * Hirokazu Takahashi : HW checksumming for outgoing UDP * datagrams. * Hirokazu Takahashi : sendfile() on UDP works now. * Arnaldo C. Melo : convert /proc/net/udp to seq_file * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind * a single port at the same time. * Derek Atkins <derek@ihtfp.com>: Add Encapulation Support * James Chapman : Add L2TP encapsulation type. 
*/ #define pr_fmt(fmt) "UDP: " fmt #include <linux/bpf-cgroup.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <linux/memblock.h> #include <linux/highmem.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/module.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/igmp.h> #include <linux/inetdevice.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/mm.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <linux/sock_diag.h> #include <net/tcp_states.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/net_namespace.h> #include <net/icmp.h> #include <net/inet_hashtables.h> #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/route.h> #include <net/checksum.h> #include <net/gso.h> #include <net/xfrm.h> #include <trace/events/udp.h> #include <linux/static_key.h> #include <linux/btf_ids.h> #include <trace/events/skb.h> #include <net/busy_poll.h> #include "udp_impl.h" #include <net/sock_reuseport.h> #include <net/addrconf.h> #include <net/udp_tunnel.h> #include <net/gro.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6_stubs.h> #endif #include <net/rps.h> struct udp_table udp_table __read_mostly; long sysctl_udp_mem[3] __read_mostly; EXPORT_IPV6_MOD(sysctl_udp_mem); DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc); #define MAX_UDP_PORTS 65536 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN_PERNET) static struct udp_table *udp_get_table_prot(struct sock *sk) { return sk->sk_prot->h.udp_table ? : sock_net(sk)->ipv4.udp_table; } static int udp_lib_lport_inuse(struct net *net, __u16 num, const struct udp_hslot *hslot, unsigned long *bitmap, struct sock *sk, unsigned int log) { kuid_t uid = sk_uid(sk); struct sock *sk2; sk_for_each(sk2, &hslot->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (bitmap || udp_sk(sk2)->udp_port_hash == num) && (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && inet_rcv_saddr_equal(sk, sk2, true)) { if (sk2->sk_reuseport && sk->sk_reuseport && !rcu_access_pointer(sk->sk_reuseport_cb) && uid_eq(uid, sk_uid(sk2))) { if (!bitmap) return 0; } else { if (!bitmap) return 1; __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap); } } } return 0; } /* * Note: we still hold spinlock of primary hash chain, so no other writer * can insert/delete a socket with local_port == num */ static int udp_lib_lport_inuse2(struct net *net, __u16 num, struct udp_hslot *hslot2, struct sock *sk) { kuid_t uid = sk_uid(sk); struct sock *sk2; int res = 0; spin_lock(&hslot2->lock); udp_portaddr_for_each_entry(sk2, &hslot2->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (udp_sk(sk2)->udp_port_hash == num) && (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && inet_rcv_saddr_equal(sk, sk2, true)) { if (sk2->sk_reuseport && sk->sk_reuseport && !rcu_access_pointer(sk->sk_reuseport_cb) && uid_eq(uid, sk_uid(sk2))) { res = 0; } else { res = 1; } break; } } spin_unlock(&hslot2->lock); return res; } static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) { struct net *net = sock_net(sk); kuid_t uid = sk_uid(sk); struct sock *sk2; sk_for_each(sk2, &hslot->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && sk2->sk_family == sk->sk_family && ipv6_only_sock(sk2) == 
ipv6_only_sock(sk) && (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) && (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) { return reuseport_add_sock(sk, sk2, inet_rcv_saddr_any(sk)); } } return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } /** * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 * * @sk: socket struct in question * @snum: port number to look up * @hash2_nulladdr: AF-dependent hash value in secondary hash chains, * with NULL address */ int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned int hash2_nulladdr) { struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2; struct net *net = sock_net(sk); int error = -EADDRINUSE; if (!snum) { DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); unsigned short first, last; int low, high, remaining; unsigned int rand; inet_sk_get_local_port_range(sk, &low, &high); remaining = (high - low) + 1; rand = get_random_u32(); first = reciprocal_scale(rand, remaining) + low; /* * force rand to be an odd multiple of UDP_HTABLE_SIZE */ rand = (rand | 1) * (udptable->mask + 1); last = first + udptable->mask + 1; do { hslot = udp_hashslot(udptable, net, first); bitmap_zero(bitmap, PORTS_PER_CHAIN); spin_lock_bh(&hslot->lock); udp_lib_lport_inuse(net, snum, hslot, bitmap, sk, udptable->log); snum = first; /* * Iterate on all possible values of snum for this hash. * Using steps of an odd multiple of UDP_HTABLE_SIZE * give us randomization and full range coverage. */ do { if (low <= snum && snum <= high && !test_bit(snum >> udptable->log, bitmap) && !inet_is_local_reserved_port(net, snum)) goto found; snum += rand; } while (snum != first); spin_unlock_bh(&hslot->lock); cond_resched(); } while (++first != last); goto fail; } else { hslot = udp_hashslot(udptable, net, snum); spin_lock_bh(&hslot->lock); if (hslot->count > 10) { int exist; unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum; slot2 &= udptable->mask; hash2_nulladdr &= udptable->mask; hslot2 = udp_hashslot2(udptable, slot2); if (hslot->count < hslot2->count) goto scan_primary_hash; exist = udp_lib_lport_inuse2(net, snum, hslot2, sk); if (!exist && (hash2_nulladdr != slot2)) { hslot2 = udp_hashslot2(udptable, hash2_nulladdr); exist = udp_lib_lport_inuse2(net, snum, hslot2, sk); } if (exist) goto fail_unlock; else goto found; } scan_primary_hash: if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0)) goto fail_unlock; } found: inet_sk(sk)->inet_num = snum; udp_sk(sk)->udp_port_hash = snum; udp_sk(sk)->udp_portaddr_hash ^= snum; if (sk_unhashed(sk)) { if (sk->sk_reuseport && udp_reuseport_add_sock(sk, hslot)) { inet_sk(sk)->inet_num = 0; udp_sk(sk)->udp_port_hash = 0; udp_sk(sk)->udp_portaddr_hash ^= snum; goto fail_unlock; } sock_set_flag(sk, SOCK_RCU_FREE); sk_add_node_rcu(sk, &hslot->head); hslot->count++; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock(&hslot2->lock); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) hlist_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node, &hslot2->head); else hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &hslot2->head); hslot2->count++; spin_unlock(&hslot2->lock); } error = 0; fail_unlock: spin_unlock_bh(&hslot->lock); fail: return error; } EXPORT_IPV6_MOD(udp_lib_get_port); int udp_v4_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = ipv4_portaddr_hash(sock_net(sk), 
htonl(INADDR_ANY), snum); unsigned int hash2_partial = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); /* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; return udp_lib_get_port(sk, snum, hash2_nulladdr); } static int compute_score(struct sock *sk, const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, int dif, int sdif) { int score; struct inet_sock *inet; bool dev_match; if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || ipv6_only_sock(sk)) return -1; if (sk->sk_rcv_saddr != daddr) return -1; score = (sk->sk_family == PF_INET) ? 2 : 1; inet = inet_sk(sk); if (inet->inet_daddr) { if (inet->inet_daddr != saddr) return -1; score += 4; } if (inet->inet_dport) { if (inet->inet_dport != sport) return -1; score += 4; } dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif); if (!dev_match) return -1; if (sk->sk_bound_dev_if) score += 4; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; return score; } u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport) { net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret)); return __inet_ehashfn(laddr, lport, faddr, fport, udp_ehash_secret + net_hash_mix(net)); } EXPORT_IPV6_MOD(udp_ehashfn); /** * udp4_lib_lookup1() - Simplified lookup using primary hash (destination port) * @net: Network namespace * @saddr: Source address, network order * @sport: Source port, network order * @daddr: Destination address, network order * @hnum: Destination port, host order * @dif: Destination interface index * @sdif: Destination bridge port index, if relevant * @udptable: Set of UDP hash tables * * Simplified lookup to be used as fallback if no sockets are found due to a * potential race between (receive) address change, and lookup happening before * the rehash operation. This function ignores SO_REUSEPORT groups while scoring * result sockets, because if we have one, we don't need the fallback at all. * * Called under rcu_read_lock(). * * Return: socket with highest matching score if any, NULL if none */ static struct sock *udp4_lib_lookup1(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif, int sdif, const struct udp_table *udptable) { unsigned int slot = udp_hashfn(net, hnum, udptable->mask); struct udp_hslot *hslot = &udptable->hash[slot]; struct sock *sk, *result = NULL; int score, badness = 0; sk_for_each_rcu(sk, &hslot->head) { score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { result = sk; badness = score; } } return result; } /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif, int sdif, struct udp_hslot *hslot2, struct sk_buff *skb) { struct sock *sk, *result; int score, badness; bool need_rescore; result = NULL; badness = 0; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { need_rescore = false; rescore: score = compute_score(need_rescore ? 
result : sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { badness = score; if (need_rescore) continue; if (sk->sk_state == TCP_ESTABLISHED) { result = sk; continue; } result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), saddr, sport, daddr, hnum, udp_ehashfn); if (!result) { result = sk; continue; } /* Fall back to scoring if group has connections */ if (!reuseport_has_conns(sk)) return result; /* Reuseport logic returned an error, keep original score. */ if (IS_ERR(result)) continue; /* compute_score is too long of a function to be * inlined, and calling it again here yields * measureable overhead for some * workloads. Work around it by jumping * backwards to rescore 'result'. */ need_rescore = true; goto rescore; } } return result; } #if IS_ENABLED(CONFIG_BASE_SMALL) static struct sock *udp4_lib_lookup4(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif, int sdif, struct udp_table *udptable) { return NULL; } static void udp_rehash4(struct udp_table *udptable, struct sock *sk, u16 newhash4) { } static void udp_unhash4(struct udp_table *udptable, struct sock *sk) { } #else /* !CONFIG_BASE_SMALL */ static struct sock *udp4_lib_lookup4(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif, int sdif, struct udp_table *udptable) { const __portpair ports = INET_COMBINED_PORTS(sport, hnum); const struct hlist_nulls_node *node; struct udp_hslot *hslot4; unsigned int hash4, slot; struct udp_sock *up; struct sock *sk; hash4 = udp_ehashfn(net, daddr, hnum, saddr, sport); slot = hash4 & udptable->mask; hslot4 = &udptable->hash4[slot]; INET_ADDR_COOKIE(acookie, saddr, daddr); begin: /* SLAB_TYPESAFE_BY_RCU not used, so we don't need to touch sk_refcnt */ udp_lrpa_for_each_entry_rcu(up, node, &hslot4->nulls_head) { sk = (struct sock *)up; if (inet_match(net, sk, acookie, ports, dif, sdif)) return sk; } /* if the nulls value we got at the end of this lookup is not the * expected one, we must restart lookup. We probably met an item that * was moved to another chain due to rehash. */ if (get_nulls_value(node) != slot) goto begin; return NULL; } /* udp_rehash4() only checks hslot4, and hash4_cnt is not processed. */ static void udp_rehash4(struct udp_table *udptable, struct sock *sk, u16 newhash4) { struct udp_hslot *hslot4, *nhslot4; hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash); nhslot4 = udp_hashslot4(udptable, newhash4); udp_sk(sk)->udp_lrpa_hash = newhash4; if (hslot4 != nhslot4) { spin_lock_bh(&hslot4->lock); hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_lrpa_node); hslot4->count--; spin_unlock_bh(&hslot4->lock); spin_lock_bh(&nhslot4->lock); hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_lrpa_node, &nhslot4->nulls_head); nhslot4->count++; spin_unlock_bh(&nhslot4->lock); } } static void udp_unhash4(struct udp_table *udptable, struct sock *sk) { struct udp_hslot *hslot2, *hslot4; if (udp_hashed4(sk)) { hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash); spin_lock(&hslot4->lock); hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_lrpa_node); hslot4->count--; spin_unlock(&hslot4->lock); spin_lock(&hslot2->lock); udp_hash4_dec(hslot2); spin_unlock(&hslot2->lock); } } void udp_lib_hash4(struct sock *sk, u16 hash) { struct udp_hslot *hslot, *hslot2, *hslot4; struct net *net = sock_net(sk); struct udp_table *udptable; /* Connected udp socket can re-connect to another remote address, which * will be handled by rehash. 
Thus no need to redo hash4 here. */ if (udp_hashed4(sk)) return; udptable = net->ipv4.udp_table; hslot = udp_hashslot(udptable, net, udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); hslot4 = udp_hashslot4(udptable, hash); udp_sk(sk)->udp_lrpa_hash = hash; spin_lock_bh(&hslot->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); spin_lock(&hslot4->lock); hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_lrpa_node, &hslot4->nulls_head); hslot4->count++; spin_unlock(&hslot4->lock); spin_lock(&hslot2->lock); udp_hash4_inc(hslot2); spin_unlock(&hslot2->lock); spin_unlock_bh(&hslot->lock); } EXPORT_IPV6_MOD(udp_lib_hash4); /* call with sock lock */ void udp4_hash4(struct sock *sk) { struct net *net = sock_net(sk); unsigned int hash; if (sk_unhashed(sk) || sk->sk_rcv_saddr == htonl(INADDR_ANY)) return; hash = udp_ehashfn(net, sk->sk_rcv_saddr, sk->sk_num, sk->sk_daddr, sk->sk_dport); udp_lib_hash4(sk, hash); } EXPORT_IPV6_MOD(udp4_hash4); #endif /* CONFIG_BASE_SMALL */ /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. -DaveM */ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif, struct udp_table *udptable, struct sk_buff *skb) { unsigned short hnum = ntohs(dport); struct udp_hslot *hslot2; struct sock *result, *sk; unsigned int hash2; hash2 = ipv4_portaddr_hash(net, daddr, hnum); hslot2 = udp_hashslot2(udptable, hash2); if (udp_has_hash4(hslot2)) { result = udp4_lib_lookup4(net, saddr, sport, daddr, hnum, dif, sdif, udptable); if (result) /* udp4_lib_lookup4 return sk or NULL */ return result; } /* Lookup connected or non-wildcard socket */ result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, sdif, hslot2, skb); if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED) goto done; /* Lookup redirect from BPF */ if (static_branch_unlikely(&bpf_sk_lookup_enabled) && udptable == net->ipv4.udp_table) { sk = inet_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr), saddr, sport, daddr, hnum, dif, udp_ehashfn); if (sk) { result = sk; goto done; } } /* Got non-wildcard socket or error on first lookup */ if (result) goto done; /* Lookup wildcard sockets */ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); hslot2 = udp_hashslot2(udptable, hash2); result = udp4_lib_lookup2(net, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif, hslot2, skb); if (!IS_ERR_OR_NULL(result)) goto done; /* Primary hash (destination port) lookup as fallback for this race: * 1. __ip4_datagram_connect() sets sk_rcv_saddr * 2. lookup (this function): new sk_rcv_saddr, hashes not updated yet * 3. rehash operation updating _secondary and four-tuple_ hashes * The primary hash doesn't need an update after 1., so, thanks to this * further step, 1. and 3. don't need to be atomic against the lookup. 
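 * The local port is not touched in step 1., so the primary-hash slot that
 * udp4_lib_lookup1() scans below is still the right one even while the
 * secondary and four-tuple hashes are being rewritten in step 3.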
*/ result = udp4_lib_lookup1(net, saddr, sport, daddr, hnum, dif, sdif, udptable); done: if (IS_ERR(result)) return NULL; return result; } EXPORT_SYMBOL_GPL(__udp4_lib_lookup); static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport, struct udp_table *udptable) { const struct iphdr *iph = ip_hdr(skb); return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, iph->daddr, dport, inet_iif(skb), inet_sdif(skb), udptable, skb); } struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport) { const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct iphdr *iph = (struct iphdr *)(skb->data + offset); struct net *net = dev_net(skb->dev); int iif, sdif; inet_get_iif_sdif(skb, &iif, &sdif); return __udp4_lib_lookup(net, iph->saddr, sport, iph->daddr, dport, iif, sdif, net->ipv4.udp_table, NULL); } /* Must be called under rcu_read_lock(). * Does increment socket refcount. */ #if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4) struct sock *udp4_lib_lookup(const struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { struct sock *sk; sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, 0, net->ipv4.udp_table, NULL); if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; } EXPORT_SYMBOL_GPL(udp4_lib_lookup); #endif static inline bool __udp_is_mcast_sock(struct net *net, const struct sock *sk, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, int dif, int sdif, unsigned short hnum) { const struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || (inet->inet_daddr && inet->inet_daddr != rmt_addr) || (inet->inet_dport != rmt_port && inet->inet_dport) || (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) || ipv6_only_sock(sk) || !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return false; if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif)) return false; return true; } DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); EXPORT_IPV6_MOD(udp_encap_needed_key); #if IS_ENABLED(CONFIG_IPV6) DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); EXPORT_IPV6_MOD(udpv6_encap_needed_key); #endif void udp_encap_enable(void) { static_branch_inc(&udp_encap_needed_key); } EXPORT_SYMBOL(udp_encap_enable); void udp_encap_disable(void) { static_branch_dec(&udp_encap_needed_key); } EXPORT_SYMBOL(udp_encap_disable); /* Handler for tunnels with arbitrary destination ports: no socket lookup, go * through error handlers in encapsulations looking for a match. */ static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info) { int i; for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) { int (*handler)(struct sk_buff *skb, u32 info); const struct ip_tunnel_encap_ops *encap; encap = rcu_dereference(iptun_encaps[i]); if (!encap) continue; handler = encap->err_handler; if (handler && !handler(skb, info)) return 0; } return -ENOENT; } /* Try to match ICMP errors to UDP tunnels by looking up a socket without * reversing source and destination port: this will match tunnels that force the * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that * lwtunnels might actually break this assumption by being configured with * different destination ports on endpoints, in this case we won't be able to * trace ICMP messages back to them. * * If this doesn't match any socket, probe tunnels with arbitrary destination * ports (e.g. 
FoU, GUE): there, the receiving socket is useless, as the port * we've sent packets to won't necessarily match the local destination port. * * Then ask the tunnel implementation to match the error against a valid * association. * * Return an error if we can't find a match, the socket if we need further * processing, zero otherwise. */ static struct sock *__udp4_lib_err_encap(struct net *net, const struct iphdr *iph, struct udphdr *uh, struct udp_table *udptable, struct sock *sk, struct sk_buff *skb, u32 info) { int (*lookup)(struct sock *sk, struct sk_buff *skb); int network_offset, transport_offset; struct udp_sock *up; network_offset = skb_network_offset(skb); transport_offset = skb_transport_offset(skb); /* Network header needs to point to the outer IPv4 header inside ICMP */ skb_reset_network_header(skb); /* Transport header needs to point to the UDP header */ skb_set_transport_header(skb, iph->ihl << 2); if (sk) { up = udp_sk(sk); lookup = READ_ONCE(up->encap_err_lookup); if (lookup && lookup(sk, skb)) sk = NULL; goto out; } sk = __udp4_lib_lookup(net, iph->daddr, uh->source, iph->saddr, uh->dest, skb->dev->ifindex, 0, udptable, NULL); if (sk) { up = udp_sk(sk); lookup = READ_ONCE(up->encap_err_lookup); if (!lookup || lookup(sk, skb)) sk = NULL; } out: if (!sk) sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info)); skb_set_transport_header(skb, transport_offset); skb_set_network_header(skb, network_offset); return sk; } /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should * be closed and the error returned to the user. If err > 0 * it's just the icmp type << 8 | icmp code. * Header points to the ip header of the error packet. We move * on past this. Then (as it used to claim before adjustment) * header points to the first 8 bytes of the udp header. We need * to find the appropriate port. */ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) { struct inet_sock *inet; const struct iphdr *iph = (const struct iphdr *)skb->data; struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2)); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; bool tunnel = false; struct sock *sk; int harderr; int err; struct net *net = dev_net(skb->dev); sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex, inet_sdif(skb), udptable, NULL); if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) { /* No socket for error: try tunnels before discarding */ if (static_branch_unlikely(&udp_encap_needed_key)) { sk = __udp4_lib_err_encap(net, iph, uh, udptable, sk, skb, info); if (!sk) return 0; } else sk = ERR_PTR(-ENOENT); if (IS_ERR(sk)) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return PTR_ERR(sk); } tunnel = true; } err = 0; harderr = 0; inet = inet_sk(sk); switch (type) { default: case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; case ICMP_SOURCE_QUENCH: goto out; case ICMP_PARAMETERPROB: err = EPROTO; harderr = 1; break; case ICMP_DEST_UNREACH: if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ ipv4_sk_update_pmtu(skb, sk, info); if (READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT) { err = EMSGSIZE; harderr = 1; break; } goto out; } err = EHOSTUNREACH; if (code <= NR_ICMP_UNREACH) { harderr = icmp_err_convert[code].fatal; err = icmp_err_convert[code].errno; } break; case ICMP_REDIRECT: ipv4_sk_redirect(skb, sk); goto out; } /* * RFC1122: OK. Passes ICMP errors back to application, as per * 4.1.3.3. 
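 * Only hard errors on connected sockets wake the application unless
 * IP_RECVERR is enabled, in which case the detailed error is also queued
 * via ip_icmp_error() before sk_error_report() is called below.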
*/ if (tunnel) { /* ...not for tunnels though: we don't have a sending socket */ if (udp_sk(sk)->encap_err_rcv) udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest, info, (u8 *)(uh+1)); goto out; } if (!inet_test_bit(RECVERR, sk)) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; } else ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1)); sk->sk_err = err; sk_error_report(sk); out: return 0; } int udp_err(struct sk_buff *skb, u32 info) { return __udp4_lib_err(skb, info, dev_net(skb->dev)->ipv4.udp_table); } /* * Throw away all pending data and cancel the corking. Socket is locked. */ void udp_flush_pending_frames(struct sock *sk) { struct udp_sock *up = udp_sk(sk); if (up->pending) { up->len = 0; WRITE_ONCE(up->pending, 0); ip_flush_pending_frames(sk); } } EXPORT_IPV6_MOD(udp_flush_pending_frames); /** * udp4_hwcsum - handle outgoing HW checksumming * @skb: sk_buff containing the filled-in UDP header * (checksum field must be zeroed out) * @src: source IP address * @dst: destination IP address */ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) { struct udphdr *uh = udp_hdr(skb); int offset = skb_transport_offset(skb); int len = skb->len - offset; int hlen = len; __wsum csum = 0; if (!skb_has_frag_list(skb)) { /* * Only one fragment on the socket. */ skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); } else { struct sk_buff *frags; /* * HW-checksum won't work as there are two or more * fragments on the socket so that all csums of sk_buffs * should be together */ skb_walk_frags(skb, frags) { csum = csum_add(csum, frags->csum); hlen -= frags->len; } csum = skb_checksum(skb, offset, hlen, csum); skb->ip_summed = CHECKSUM_NONE; uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } } EXPORT_SYMBOL_GPL(udp4_hwcsum); /* Function to set UDP checksum for an IPv4 UDP packet. This is intended * for the simple case like when setting the checksum for a UDP tunnel. 
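 * With nocheck the checksum is simply left at zero (allowed for UDP over
 * IPv4); for GSO skbs only the pseudo-header checksum is filled in, since
 * segmentation completes it later; CHECKSUM_PARTIAL input folds in the
 * local checksum offload value via lco_csum(); otherwise the skb is marked
 * CHECKSUM_PARTIAL so the device (or the stack's fallback) finishes the sum.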
*/ void udp_set_csum(bool nocheck, struct sk_buff *skb, __be32 saddr, __be32 daddr, int len) { struct udphdr *uh = udp_hdr(skb); if (nocheck) { uh->check = 0; } else if (skb_is_gso(skb)) { uh->check = ~udp_v4_check(len, saddr, daddr, 0); } else if (skb->ip_summed == CHECKSUM_PARTIAL) { uh->check = 0; uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v4_check(len, saddr, daddr, 0); } } EXPORT_SYMBOL(udp_set_csum); static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, struct inet_cork *cork) { struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); struct udphdr *uh; int err; int is_udplite = IS_UDPLITE(sk); int offset = skb_transport_offset(skb); int len = skb->len - offset; int datalen = len - sizeof(*uh); __wsum csum = 0; /* * Create a UDP header */ uh = udp_hdr(skb); uh->source = inet->inet_sport; uh->dest = fl4->fl4_dport; uh->len = htons(len); uh->check = 0; if (cork->gso_size) { const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); if (hlen + min(datalen, cork->gso_size) > cork->fragsize) { kfree_skb(skb); return -EMSGSIZE; } if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) { kfree_skb(skb); return -EINVAL; } if (sk->sk_no_check_tx) { kfree_skb(skb); return -EINVAL; } if (is_udplite || dst_xfrm(skb_dst(skb))) { kfree_skb(skb); return -EIO; } if (datalen > cork->gso_size) { skb_shinfo(skb)->gso_size = cork->gso_size; skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen, cork->gso_size); /* Don't checksum the payload, skb will get segmented */ goto csum_partial; } } if (is_udplite) /* UDP-Lite */ csum = udplite_csum(skb); else if (sk->sk_no_check_tx) { /* UDP csum off */ skb->ip_summed = CHECKSUM_NONE; goto send; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ csum_partial: udp4_hwcsum(skb, fl4->saddr, fl4->daddr); goto send; } else csum = udp_csum(skb); /* add protocol-dependent pseudo-header */ uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len, sk->sk_protocol, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; send: err = ip_send_skb(sock_net(sk), skb); if (err) { if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) { UDP_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS, is_udplite); err = 0; } } else UDP_INC_STATS(sock_net(sk), UDP_MIB_OUTDATAGRAMS, is_udplite); return err; } /* * Push out all pending data as one UDP datagram. Socket is locked. 
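 * The corked data is materialised with ip_finish_skb() and handed to
 * udp_send_skb(); the pending length and state are cleared whether or not
 * the transmit succeeds.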
*/ int udp_push_pending_frames(struct sock *sk) { struct udp_sock *up = udp_sk(sk); struct inet_sock *inet = inet_sk(sk); struct flowi4 *fl4 = &inet->cork.fl.u.ip4; struct sk_buff *skb; int err = 0; skb = ip_finish_skb(sk, fl4); if (!skb) goto out; err = udp_send_skb(skb, fl4, &inet->cork.base); out: up->len = 0; WRITE_ONCE(up->pending, 0); return err; } EXPORT_IPV6_MOD(udp_push_pending_frames); static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size) { switch (cmsg->cmsg_type) { case UDP_SEGMENT: if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u16))) return -EINVAL; *gso_size = *(__u16 *)CMSG_DATA(cmsg); return 0; default: return -EINVAL; } } int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size) { struct cmsghdr *cmsg; bool need_ip = false; int err; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_UDP) { need_ip = true; continue; } err = __udp_cmsg_send(cmsg, gso_size); if (err) return err; } return need_ip; } EXPORT_IPV6_MOD_GPL(udp_cmsg_send); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); struct flowi4 fl4_stack; struct flowi4 *fl4; int ulen = len; struct ipcm_cookie ipc; struct rtable *rt = NULL; int free = 0; int connected = 0; __be32 daddr, faddr, saddr; u8 scope; __be16 dport; int err, is_udplite = IS_UDPLITE(sk); int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sk_buff *skb; struct ip_options_data opt_copy; int uc_index; if (len > 0xFFFF) return -EMSGSIZE; /* * Check the flags. */ if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ return -EOPNOTSUPP; getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; fl4 = &inet->cork.fl.u.ip4; if (READ_ONCE(up->pending)) { /* * There are pending frames. * The socket lock must be held while it's corked. */ lock_sock(sk); if (likely(up->pending)) { if (unlikely(up->pending != AF_INET)) { release_sock(sk); return -EINVAL; } goto do_append_data; } release_sock(sk); } ulen += sizeof(struct udphdr); /* * Get and verify the address. */ if (usin) { if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { if (usin->sin_family != AF_UNSPEC) return -EAFNOSUPPORT; } daddr = usin->sin_addr.s_addr; dport = usin->sin_port; if (dport == 0) return -EINVAL; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = inet->inet_daddr; dport = inet->inet_dport; /* Open fast path for connected socket. Route will not be used, if at least one option is set. */ connected = 1; } ipcm_init_sk(&ipc, inet); ipc.gso_size = READ_ONCE(up->gso_size); if (msg->msg_controllen) { err = udp_cmsg_send(sk, msg, &ipc.gso_size); if (err > 0) { err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6); connected = 0; } if (unlikely(err < 0)) { kfree(ipc.opt); return err; } if (ipc.opt) free = 1; } if (!ipc.opt) { struct ip_options_rcu *inet_opt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { memcpy(&opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); ipc.opt = &opt_copy.opt; } rcu_read_unlock(); } if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &msg->msg_namelen, &ipc.addr); if (err) goto out_free; if (usin) { if (usin->sin_port == 0) { /* BPF program set invalid port. 
Reject it. */ err = -EINVAL; goto out_free; } daddr = usin->sin_addr.s_addr; dport = usin->sin_port; } } saddr = ipc.addr; ipc.addr = faddr = daddr; if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) { err = -EINVAL; goto out_free; } faddr = ipc.opt->opt.faddr; connected = 0; } scope = ip_sendmsg_scope(inet, &ipc, msg); if (scope == RT_SCOPE_LINK) connected = 0; uc_index = READ_ONCE(inet->uc_index); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = READ_ONCE(inet->mc_index); if (!saddr) saddr = READ_ONCE(inet->mc_addr); connected = 0; } else if (!ipc.oif) { ipc.oif = uc_index; } else if (ipv4_is_lbcast(daddr) && uc_index) { /* oif is set, packet is to local broadcast and * uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. * If so, we want to allow the send using the uc_index. */ if (ipc.oif != uc_index && ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk), uc_index)) { ipc.oif = uc_index; } } if (connected) rt = dst_rtable(sk_dst_check(sk, 0)); if (!rt) { struct net *net = sock_net(sk); __u8 flow_flags = inet_sk_flowi_flags(sk); fl4 = &fl4_stack; flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, flow_flags, faddr, saddr, dport, inet->inet_sport, sk_uid(sk)); security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; if (err == -ENETUNREACH) IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); goto out; } err = -EACCES; if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) goto out; if (connected) sk_dst_set(sk, dst_clone(&rt->dst)); } if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; back_from_confirm: saddr = fl4->saddr; if (!ipc.addr) daddr = ipc.addr = fl4->daddr; /* Lockless fast path for the non-corking case. */ if (!corkreq) { struct inet_cork cork; skb = ip_make_skb(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, &cork, msg->msg_flags); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) err = udp_send_skb(skb, fl4, &cork); goto out; } lock_sock(sk); if (unlikely(up->pending)) { /* The socket is already corked while preparing it. */ /* ... which is an evident application bug. --ANK */ release_sock(sk); net_dbg_ratelimited("socket already corked\n"); err = -EINVAL; goto out; } /* * Now cork the socket to pend data. */ fl4 = &inet->cork.fl.u.ip4; fl4->daddr = daddr; fl4->saddr = saddr; fl4->fl4_dport = dport; fl4->fl4_sport = inet->inet_sport; WRITE_ONCE(up->pending, AF_INET); do_append_data: up->len += ulen; err = ip_append_data(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_flush_pending_frames(sk); else if (!corkreq) err = udp_push_pending_frames(sk); else if (unlikely(skb_queue_empty(&sk->sk_write_queue))) WRITE_ONCE(up->pending, 0); release_sock(sk); out: ip_rt_put(rt); out_free: if (free) kfree(ipc.opt); if (!err) return len; /* * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting * ENOBUFS might not be good (it's not tunable per se), but otherwise * we don't have a good statistic (IpOutDiscards but it can be too many * things). We could add another new stat but at least for now that * seems like overkill. 
 */
	if (err == -ENOBUFS ||
	    test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
		UDP_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS, is_udplite);
	}
	return err;

do_confirm:
	if (msg->msg_flags & MSG_PROBE)
		dst_confirm_neigh(&rt->dst, &fl4->daddr);
	if (!(msg->msg_flags & MSG_PROBE) || len)
		goto back_from_confirm;
	err = 0;
	goto out;
}
EXPORT_SYMBOL(udp_sendmsg);

void udp_splice_eof(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct udp_sock *up = udp_sk(sk);

	if (!READ_ONCE(up->pending) || udp_test_bit(CORK, sk))
		return;

	lock_sock(sk);
	if (up->pending && !udp_test_bit(CORK, sk))
		udp_push_pending_frames(sk);
	release_sock(sk);
}
EXPORT_IPV6_MOD_GPL(udp_splice_eof);

#define UDP_SKB_IS_STATELESS 0x80000000

/* all head states (dst, sk, nf conntrack) except skb extensions are
 * cleared by udp_rcv().
 *
 * We need to preserve secpath, if present, to eventually process
 * IP_CMSG_PASSSEC at recvmsg() time.
 *
 * Other extensions can be cleared.
 */
static bool udp_try_make_stateless(struct sk_buff *skb)
{
	if (!skb_has_extensions(skb))
		return true;

	if (!secpath_exists(skb)) {
		skb_ext_reset(skb);
		return true;
	}

	return false;
}

static void udp_set_dev_scratch(struct sk_buff *skb)
{
	struct udp_dev_scratch *scratch = udp_skb_scratch(skb);

	BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long));
	scratch->_tsize_state = skb->truesize;
#if BITS_PER_LONG == 64
	scratch->len = skb->len;
	scratch->csum_unnecessary = !!skb_csum_unnecessary(skb);
	scratch->is_linear = !skb_is_nonlinear(skb);
#endif
	if (udp_try_make_stateless(skb))
		scratch->_tsize_state |= UDP_SKB_IS_STATELESS;
}

static void udp_skb_csum_unnecessary_set(struct sk_buff *skb)
{
	/* We come here after udp_lib_checksum_complete() returned 0.
	 * This means that __skb_checksum_complete() might have
	 * set skb->csum_valid to 1.
	 * On 64bit platforms, we can set csum_unnecessary
	 * to true, but only if the skb is not shared.
	 */
#if BITS_PER_LONG == 64
	if (!skb_shared(skb))
		udp_skb_scratch(skb)->csum_unnecessary = true;
#endif
}

static int udp_skb_truesize(struct sk_buff *skb)
{
	return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS;
}

static bool udp_skb_has_head_state(struct sk_buff *skb)
{
	return !(udp_skb_scratch(skb)->_tsize_state & UDP_SKB_IS_STATELESS);
}

/* fully reclaim rmem/fwd memory allocated for skb */
static void udp_rmem_release(struct sock *sk, unsigned int size,
			     int partial, bool rx_queue_lock_held)
{
	struct udp_sock *up = udp_sk(sk);
	struct sk_buff_head *sk_queue;
	unsigned int amt;

	if (likely(partial)) {
		up->forward_deficit += size;
		size = up->forward_deficit;
		if (size < READ_ONCE(up->forward_threshold) &&
		    !skb_queue_empty(&up->reader_queue))
			return;
	} else {
		size += up->forward_deficit;
	}
	up->forward_deficit = 0;

	/* acquire the sk_receive_queue for fwd allocated memory scheduling,
	 * if the caller doesn't hold it already
	 */
	sk_queue = &sk->sk_receive_queue;
	if (!rx_queue_lock_held)
		spin_lock(&sk_queue->lock);

	amt = (size + sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1);
	sk_forward_alloc_add(sk, size - amt);

	if (amt)
		__sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT);

	atomic_sub(size, &sk->sk_rmem_alloc);

	/* this can save us from acquiring the rx queue lock on next receive */
	skb_queue_splice_tail_init(sk_queue, &up->reader_queue);

	if (!rx_queue_lock_held)
		spin_unlock(&sk_queue->lock);
}

/* Note: called with reader_queue.lock held.
 * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch.
 * This avoids a cache line miss while receive_queue lock is held.
 * Look at __udp_enqueue_schedule_skb() to find where this copy is done.
 */
void udp_skb_destructor(struct sock *sk, struct sk_buff *skb)
{
	prefetch(&skb->data);
	udp_rmem_release(sk, udp_skb_truesize(skb), 1, false);
}
EXPORT_IPV6_MOD(udp_skb_destructor);

/* as above, but the caller holds the rx queue lock, too */
static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
{
	prefetch(&skb->data);
	udp_rmem_release(sk, udp_skb_truesize(skb), 1, true);
}

/* Idea of busylocks is to let producers grab an extra spinlock
 * to relieve pressure on the receive_queue spinlock shared by consumer.
 * Under flood, this means that only one producer can be in line
 * trying to acquire the receive_queue spinlock.
 * These busylocks can be allocated on a per-CPU basis, instead of a
 * per-socket one (that would consume a cache line per socket).
 */
static int udp_busylocks_log __read_mostly;
static spinlock_t *udp_busylocks __read_mostly;

static spinlock_t *busylock_acquire(void *ptr)
{
	spinlock_t *busy;

	busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log);
	spin_lock(busy);
	return busy;
}

static void busylock_release(spinlock_t *busy)
{
	if (busy)
		spin_unlock(busy);
}

static int udp_rmem_schedule(struct sock *sk, int size)
{
	int delta;

	delta = size - sk->sk_forward_alloc;
	if (delta > 0 && !__sk_mem_schedule(sk, delta, SK_MEM_RECV))
		return -ENOBUFS;

	return 0;
}

int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
{
	struct sk_buff_head *list = &sk->sk_receive_queue;
	unsigned int rmem, rcvbuf;
	spinlock_t *busy = NULL;
	int size, err = -ENOMEM;

	rmem = atomic_read(&sk->sk_rmem_alloc);
	rcvbuf = READ_ONCE(sk->sk_rcvbuf);
	size = skb->truesize;

	/* Immediately drop when the receive queue is full.
	 * Cast to unsigned int performs the boundary check for INT_MAX.
	 */
	if (rmem + size > rcvbuf) {
		if (rcvbuf > INT_MAX >> 1)
			goto drop;

		/* Always allow at least one packet for small buffer.
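		 *
		 * A worked example (numbers purely illustrative): with
		 * rcvbuf clamped to 2304 bytes and skbs of truesize 4096,
		 * the first datagram is accepted (rmem is still 0), while
		 * a second one arriving before the reader drains the queue
		 * is dropped, since rmem (4096) now exceeds rcvbuf.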
*/ if (rmem > rcvbuf) goto drop; } /* Under mem pressure, it might be helpful to help udp_recvmsg() * having linear skbs : * - Reduce memory overhead and thus increase receive queue capacity * - Less cache line misses at copyout() time * - Less work at consume_skb() (less alien page frag freeing) */ if (rmem > (rcvbuf >> 1)) { skb_condense(skb); size = skb->truesize; busy = busylock_acquire(sk); } udp_set_dev_scratch(skb); atomic_add(size, &sk->sk_rmem_alloc); spin_lock(&list->lock); err = udp_rmem_schedule(sk, size); if (err) { spin_unlock(&list->lock); goto uncharge_drop; } sk_forward_alloc_add(sk, -size); /* no need to setup a destructor, we will explicitly release the * forward allocated memory on dequeue */ sock_skb_set_dropcount(sk, skb); __skb_queue_tail(list, skb); spin_unlock(&list->lock); if (!sock_flag(sk, SOCK_DEAD)) INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); busylock_release(busy); return 0; uncharge_drop: atomic_sub(skb->truesize, &sk->sk_rmem_alloc); drop: atomic_inc(&sk->sk_drops); busylock_release(busy); return err; } EXPORT_IPV6_MOD_GPL(__udp_enqueue_schedule_skb); void udp_destruct_common(struct sock *sk) { /* reclaim completely the forward allocated memory */ struct udp_sock *up = udp_sk(sk); unsigned int total = 0; struct sk_buff *skb; skb_queue_splice_tail_init(&sk->sk_receive_queue, &up->reader_queue); while ((skb = __skb_dequeue(&up->reader_queue)) != NULL) { total += skb->truesize; kfree_skb(skb); } udp_rmem_release(sk, total, 0, true); } EXPORT_IPV6_MOD_GPL(udp_destruct_common); static void udp_destruct_sock(struct sock *sk) { udp_destruct_common(sk); inet_sock_destruct(sk); } int udp_init_sock(struct sock *sk) { udp_lib_init_sock(sk); sk->sk_destruct = udp_destruct_sock; set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); return 0; } void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) { if (unlikely(READ_ONCE(udp_sk(sk)->peeking_with_offset))) sk_peek_offset_bwd(sk, len); if (!skb_unref(skb)) return; /* In the more common cases we cleared the head states previously, * see __udp_queue_rcv_skb(). */ if (unlikely(udp_skb_has_head_state(skb))) skb_release_head_state(skb); __consume_stateless_skb(skb); } EXPORT_IPV6_MOD_GPL(skb_consume_udp); static struct sk_buff *__first_packet_length(struct sock *sk, struct sk_buff_head *rcvq, unsigned int *total) { struct sk_buff *skb; while ((skb = skb_peek(rcvq)) != NULL) { if (udp_lib_checksum_complete(skb)) { __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, IS_UDPLITE(sk)); __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, IS_UDPLITE(sk)); atomic_inc(&sk->sk_drops); __skb_unlink(skb, rcvq); *total += skb->truesize; kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); } else { udp_skb_csum_unnecessary_set(skb); break; } } return skb; } /** * first_packet_length - return length of first packet in receive queue * @sk: socket * * Drops all bad checksum frames, until a valid one is found. * Returns the length of found skb, or -1 if none is found. */ static int first_packet_length(struct sock *sk) { struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue; struct sk_buff_head *sk_queue = &sk->sk_receive_queue; unsigned int total = 0; struct sk_buff *skb; int res; spin_lock_bh(&rcvq->lock); skb = __first_packet_length(sk, rcvq, &total); if (!skb && !skb_queue_empty_lockless(sk_queue)) { spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, rcvq); spin_unlock(&sk_queue->lock); skb = __first_packet_length(sk, rcvq, &total); } res = skb ? 
skb->len : -1; if (total) udp_rmem_release(sk, total, 1, false); spin_unlock_bh(&rcvq->lock); return res; } /* * IOCTL requests applicable to the UDP protocol */ int udp_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); return 0; } case SIOCINQ: { *karg = max_t(int, 0, first_packet_length(sk)); return 0; } default: return -ENOIOCTLCMD; } return 0; } EXPORT_IPV6_MOD(udp_ioctl); struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int *off, int *err) { struct sk_buff_head *sk_queue = &sk->sk_receive_queue; struct sk_buff_head *queue; struct sk_buff *last; long timeo; int error; queue = &udp_sk(sk)->reader_queue; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { struct sk_buff *skb; error = sock_error(sk); if (error) break; error = -EAGAIN; do { spin_lock_bh(&queue->lock); skb = __skb_try_recv_from_queue(queue, flags, off, err, &last); if (skb) { if (!(flags & MSG_PEEK)) udp_skb_destructor(sk, skb); spin_unlock_bh(&queue->lock); return skb; } if (skb_queue_empty_lockless(sk_queue)) { spin_unlock_bh(&queue->lock); goto busy_check; } /* refill the reader queue and walk it again * keep both queues locked to avoid re-acquiring * the sk_receive_queue lock if fwd memory scheduling * is needed. */ spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, queue); skb = __skb_try_recv_from_queue(queue, flags, off, err, &last); if (skb && !(flags & MSG_PEEK)) udp_skb_dtor_locked(sk, skb); spin_unlock(&sk_queue->lock); spin_unlock_bh(&queue->lock); if (skb) return skb; busy_check: if (!sk_can_busy_loop(sk)) break; sk_busy_loop(sk, flags & MSG_DONTWAIT); } while (!skb_queue_empty_lockless(sk_queue)); /* sk_queue is empty, reader_queue may contain peeked packets */ } while (timeo && !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, &error, &timeo, (struct sk_buff *)sk_queue)); *err = error; return NULL; } EXPORT_SYMBOL(__skb_recv_udp); int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct sk_buff *skb; int err; try_again: skb = skb_recv_udp(sk, MSG_DONTWAIT, &err); if (!skb) return err; if (udp_lib_checksum_complete(skb)) { int is_udplite = IS_UDPLITE(sk); struct net *net = sock_net(sk); __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite); __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); goto try_again; } WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); return recv_actor(sk, skb); } EXPORT_IPV6_MOD(udp_read_skb); /* * This should be easy, if there is something there we * return it, otherwise we block. */ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; unsigned int ulen, copied; int off, err, peeking = flags & MSG_PEEK; int is_udplite = IS_UDPLITE(sk); bool checksum_valid = false; if (flags & MSG_ERRQUEUE) return ip_recv_error(sk, msg, len, addr_len); try_again: off = sk_peek_offset(sk, flags); skb = __skb_recv_udp(sk, flags, &off, &err); if (!skb) return err; ulen = udp_skb_len(skb); copied = len; if (copied > ulen - off) copied = ulen - off; else if (copied < ulen) msg->msg_flags |= MSG_TRUNC; /* * If checksum is needed at all, try to do it while copying the * data. If the data is truncated, or if we only want a partial * coverage checksum (UDP-Lite), do it before the copy. 
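	 *
	 * Concretely, the three cases handled below are: a full-length,
	 * non-peeking read folds the checksum into the copy via
	 * skb_copy_and_csum_datagram_msg(); a truncated or peeked read
	 * validates the checksum up front; and UDP-Lite partial coverage
	 * is always verified before copying.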
*/ if (copied < ulen || peeking || (is_udplite && UDP_SKB_CB(skb)->partial_cov)) { checksum_valid = udp_skb_csum_unnecessary(skb) || !__udp_lib_checksum_complete(skb); if (!checksum_valid) goto csum_copy_err; } if (checksum_valid || udp_skb_csum_unnecessary(skb)) { if (udp_skb_is_linear(skb)) err = copy_linear_skb(skb, copied, off, &msg->msg_iter); else err = skb_copy_datagram_msg(skb, off, msg, copied); } else { err = skb_copy_and_csum_datagram_msg(skb, off, msg); if (err == -EINVAL) goto csum_copy_err; } if (unlikely(err)) { if (!peeking) { atomic_inc(&sk->sk_drops); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } kfree_skb(skb); return err; } if (!peeking) UDP_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, is_udplite); sock_recv_cmsgs(msg, sk, skb); /* Copy the address. */ if (sin) { sin->sin_family = AF_INET; sin->sin_port = udp_hdr(skb)->source; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, (struct sockaddr *)sin, addr_len); } if (udp_test_bit(GRO_ENABLED, sk)) udp_cmsg_recv(msg, sk, skb); if (inet_cmsg_flags(inet)) ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off); err = copied; if (flags & MSG_TRUNC) err = ulen; skb_consume_udp(sk, skb, peeking ? -err : err); return err; csum_copy_err: if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags, udp_skb_destructor)) { UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); /* starting over for a new packet, but check if we need to yield */ cond_resched(); msg->msg_flags &= ~MSG_TRUNC; goto try_again; } int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { /* This check is replicated from __ip4_datagram_connect() and * intended to prevent BPF program called below from accessing bytes * that are out of the bound specified by user in addr_len. */ if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len); } EXPORT_IPV6_MOD(udp_pre_connect); static int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { int res; lock_sock(sk); res = __ip4_datagram_connect(sk, uaddr, addr_len); if (!res) udp4_hash4(sk); release_sock(sk); return res; } int __udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); /* * 1003.1g - break association. 
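 *
 * This is what a connect() with AF_UNSPEC ultimately performs, e.g.
 * (illustrative userspace sketch; the fd is hypothetical):
 *
 *	struct sockaddr sa = { .sa_family = AF_UNSPEC };
 *
 *	connect(fd, &sa, sizeof(sa));
 *
 * udp_abort() also funnels here to tear the association down.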
*/ sk->sk_state = TCP_CLOSE; inet->inet_daddr = 0; inet->inet_dport = 0; sock_rps_reset_rxhash(sk); sk->sk_bound_dev_if = 0; if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) { inet_reset_saddr(sk); if (sk->sk_prot->rehash && (sk->sk_userlocks & SOCK_BINDPORT_LOCK)) sk->sk_prot->rehash(sk); } if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) { sk->sk_prot->unhash(sk); inet->inet_sport = 0; } sk_dst_reset(sk); return 0; } EXPORT_SYMBOL(__udp_disconnect); int udp_disconnect(struct sock *sk, int flags) { lock_sock(sk); __udp_disconnect(sk, flags); release_sock(sk); return 0; } EXPORT_IPV6_MOD(udp_disconnect); void udp_lib_unhash(struct sock *sk) { if (sk_hashed(sk)) { struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2; sock_rps_delete_flow(sk); hslot = udp_hashslot(udptable, sock_net(sk), udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock_bh(&hslot->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); if (sk_del_node_init_rcu(sk)) { hslot->count--; inet_sk(sk)->inet_num = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_lock(&hslot2->lock); hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); udp_unhash4(udptable, sk); } spin_unlock_bh(&hslot->lock); } } EXPORT_IPV6_MOD(udp_lib_unhash); /* * inet_rcv_saddr was changed, we must rehash secondary hash */ void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4) { if (sk_hashed(sk)) { struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2, *nhslot2; hslot = udp_hashslot(udptable, sock_net(sk), udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); nhslot2 = udp_hashslot2(udptable, newhash); udp_sk(sk)->udp_portaddr_hash = newhash; if (hslot2 != nhslot2 || rcu_access_pointer(sk->sk_reuseport_cb)) { /* we must lock primary chain too */ spin_lock_bh(&hslot->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); if (hslot2 != nhslot2) { spin_lock(&hslot2->lock); hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); spin_lock(&nhslot2->lock); hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &nhslot2->head); nhslot2->count++; spin_unlock(&nhslot2->lock); } spin_unlock_bh(&hslot->lock); } /* Now process hash4 if necessary: * (1) update hslot4; * (2) update hslot2->hash4_cnt. * Note that hslot2/hslot4 should be checked separately, as * either of them may change with the other unchanged. 
*/ if (udp_hashed4(sk)) { spin_lock_bh(&hslot->lock); udp_rehash4(udptable, sk, newhash4); if (hslot2 != nhslot2) { spin_lock(&hslot2->lock); udp_hash4_dec(hslot2); spin_unlock(&hslot2->lock); spin_lock(&nhslot2->lock); udp_hash4_inc(nhslot2); spin_unlock(&nhslot2->lock); } spin_unlock_bh(&hslot->lock); } } } EXPORT_IPV6_MOD(udp_lib_rehash); void udp_v4_rehash(struct sock *sk) { u16 new_hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num); u16 new_hash4 = udp_ehashfn(sock_net(sk), sk->sk_rcv_saddr, sk->sk_num, sk->sk_daddr, sk->sk_dport); udp_lib_rehash(sk, new_hash, new_hash4); } static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; if (inet_sk(sk)->inet_daddr) { sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); sk_incoming_cpu_update(sk); } else { sk_mark_napi_id_once(sk, skb); } rc = __udp_enqueue_schedule_skb(sk, skb); if (rc < 0) { int is_udplite = IS_UDPLITE(sk); int drop_reason; /* Note that an ENOMEM error is charged twice */ if (rc == -ENOMEM) { UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite); drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; } else { UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS, is_udplite); drop_reason = SKB_DROP_REASON_PROTO_MEM; } UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); trace_udp_fail_queue_rcv_skb(rc, sk, skb); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } return 0; } /* returns: * -1: error * 0: success * >0: "udp encap" protocol resubmission * * Note that in the success and error cases, the skb is assumed to * have either been requeued or freed. */ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); /* * Charge it to the socket, dropping if the queue is full. */ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; goto drop; } nf_reset_ct(skb); if (static_branch_unlikely(&udp_encap_needed_key) && READ_ONCE(up->encap_type)) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); /* * This is an encapsulation socket so pass the skb to * the socket's udp_encap_rcv() hook. Otherwise, just * fall through and pass this up the UDP socket. * up->encap_rcv() returns the following value: * =0 if skb was successfully passed to the encap * handler or was discarded by it. * >0 if skb should be passed on to UDP. * <0 if skb should be resubmitted as proto -N */ /* if we're overly short, let UDP handle it */ encap_rcv = READ_ONCE(up->encap_rcv); if (encap_rcv) { int ret; /* Verify checksum before giving to encap */ if (udp_lib_checksum_complete(skb)) goto csum_error; ret = encap_rcv(sk, skb); if (ret <= 0) { __UDP_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, is_udplite); return -ret; } } /* FALLTHROUGH -- it's a UDP Packet */ } /* * UDP-Lite specific tests, ignored on UDP sockets */ if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) { u16 pcrlen = READ_ONCE(up->pcrlen); /* * MIB statistics other than incrementing the error count are * disabled for the following two types of errors: these depend * on the application settings, not on the functioning of the * protocol stack as such. * * RFC 3828 here recommends (sec 3.3): "There should also be a * way ... to ... at least let the receiving application block * delivery of packets with coverage values less than a value * provided by the application." 
*/ if (pcrlen == 0) { /* full coverage was set */ net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n", UDP_SKB_CB(skb)->cscov, skb->len); goto drop; } /* The next case involves violating the min. coverage requested * by the receiver. This is subtle: if receiver wants x and x is * greater than the buffersize/MTU then receiver will complain * that it wants x while sender emits packets of smaller size y. * Therefore the above ...()->partial_cov statement is essential. */ if (UDP_SKB_CB(skb)->cscov < pcrlen) { net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n", UDP_SKB_CB(skb)->cscov, pcrlen); goto drop; } } prefetch(&sk->sk_rmem_alloc); if (rcu_access_pointer(sk->sk_filter) && udp_lib_checksum_complete(skb)) goto csum_error; if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason)) goto drop; udp_csum_pull_header(skb); ipv4_pktinfo_prepare(sk, skb, true); return __udp_queue_rcv_skb(sk, skb); csum_error: drop_reason = SKB_DROP_REASON_UDP_CSUM; __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff *next, *segs; int ret; if (likely(!udp_unexpected_gso(sk, skb))) return udp_queue_rcv_one_skb(sk, skb); BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_GSO_CB_OFFSET); __skb_push(skb, -skb_mac_offset(skb)); segs = udp_rcv_segment(sk, skb, true); skb_list_walk_safe(segs, skb, next) { __skb_pull(skb, skb_transport_offset(skb)); udp_post_segment_fix_csum(skb); ret = udp_queue_rcv_one_skb(sk, skb); if (ret > 0) ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret); } return 0; } /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. */ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old; if (dst_hold_safe(dst)) { old = unrcu_pointer(xchg(&sk->sk_rx_dst, RCU_INITIALIZER(dst))); dst_release(old); return old != dst; } return false; } EXPORT_IPV6_MOD(udp_sk_rx_dst_set); /* * Multicasts and broadcasts go to each listener. * * Note: called only from the BH handler context. 
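 *
 * Delivery works by cloning: every matching socket after the first
 * receives an skb_clone() of the packet, while the first match is
 * remembered and consumes the original skb last, saving one clone in
 * the common single-listener case.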
 */
static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
				    struct udphdr *uh,
				    __be32 saddr, __be32 daddr,
				    struct udp_table *udptable,
				    int proto)
{
	struct sock *sk, *first = NULL;
	unsigned short hnum = ntohs(uh->dest);
	struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum);
	unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10);
	unsigned int offset = offsetof(typeof(*sk), sk_node);
	int dif = skb->dev->ifindex;
	int sdif = inet_sdif(skb);
	struct hlist_node *node;
	struct sk_buff *nskb;

	if (use_hash2) {
		hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) &
			    udptable->mask;
		hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask;
start_lookup:
		hslot = &udptable->hash2[hash2].hslot;
		offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node);
	}

	sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) {
		if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr,
					 uh->source, saddr, dif, sdif, hnum))
			continue;

		if (!first) {
			first = sk;
			continue;
		}
		nskb = skb_clone(skb, GFP_ATOMIC);

		if (unlikely(!nskb)) {
			atomic_inc(&sk->sk_drops);
			__UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
					IS_UDPLITE(sk));
			__UDP_INC_STATS(net, UDP_MIB_INERRORS,
					IS_UDPLITE(sk));
			continue;
		}
		if (udp_queue_rcv_skb(sk, nskb) > 0)
			consume_skb(nskb);
	}

	/* Also lookup *:port if we are using hash2 and haven't done so yet. */
	if (use_hash2 && hash2 != hash2_any) {
		hash2 = hash2_any;
		goto start_lookup;
	}

	if (first) {
		if (udp_queue_rcv_skb(first, skb) > 0)
			consume_skb(skb);
	} else {
		kfree_skb(skb);
		__UDP_INC_STATS(net, UDP_MIB_IGNOREDMULTI,
				proto == IPPROTO_UDPLITE);
	}
	return 0;
}

/* Initialize UDP checksum. If exited with zero value (success),
 * CHECKSUM_UNNECESSARY means that no more checks are required.
 * Otherwise, csum completion requires checksumming packet body,
 * including udp header and folding it to skb->csum.
 */
static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
				 int proto)
{
	int err;

	UDP_SKB_CB(skb)->partial_cov = 0;
	UDP_SKB_CB(skb)->cscov = skb->len;

	if (proto == IPPROTO_UDPLITE) {
		err = udplite_checksum_init(skb, uh);
		if (err)
			return err;

		if (UDP_SKB_CB(skb)->partial_cov) {
			skb->csum = inet_compute_pseudo(skb, proto);
			return 0;
		}
	}

	/* Note, we are only interested in != 0 or == 0, thus the
	 * force to int.
	 */
	err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check,
							inet_compute_pseudo);
	if (err)
		return err;

	if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) {
		/* If SW calculated the value, we know it's bad */
		if (skb->csum_complete_sw)
			return 1;

		/* HW says the value is bad. Let's validate that.
		 * skb->csum is no longer the full packet checksum,
		 * so don't treat it as such.
		 */
		skb_checksum_complete_unset(skb);
	}

	return 0;
}

/* wrapper for udp_queue_rcv_skb taking care of csum conversion and
 * return code conversion for ip layer consumption
 */
static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
			       struct udphdr *uh)
{
	int ret;

	if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk))
		skb_checksum_try_convert(skb, IPPROTO_UDP,
					 inet_compute_pseudo);

	ret = udp_queue_rcv_skb(sk, skb);

	/* a return value > 0 means to resubmit the input, but
	 * it wants the return to be -protocol, or 0
	 */
	if (ret > 0)
		return -ret;
	return 0;
}

/*
 * All we need to do is get the socket, and then do a checksum.
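 *
 * In rough order, __udp4_lib_rcv() below tries: a socket already
 * attached or stolen by early demux (inet_steal_sock()), then the
 * multicast/broadcast path, then a plain unicast hash lookup; if all
 * of these fail, the checksum is verified and an ICMP port
 * unreachable is sent back.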
*/ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { struct sock *sk = NULL; struct udphdr *uh; unsigned short ulen; struct rtable *rt = skb_rtable(skb); __be32 saddr, daddr; struct net *net = dev_net(skb->dev); bool refcounted; int drop_reason; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; /* * Validate the packet. */ if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto drop; /* No space for header. */ uh = udp_hdr(skb); ulen = ntohs(uh->len); saddr = ip_hdr(skb)->saddr; daddr = ip_hdr(skb)->daddr; if (ulen > skb->len) goto short_packet; if (proto == IPPROTO_UDP) { /* UDP validates ulen. */ if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen)) goto short_packet; uh = udp_hdr(skb); } if (udp4_csum_init(skb, uh, proto)) goto csum_error; sk = inet_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest, &refcounted, udp_ehashfn); if (IS_ERR(sk)) goto no_sk; if (sk) { struct dst_entry *dst = skb_dst(skb); int ret; if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst)) udp_sk_rx_dst_set(sk, dst); ret = udp_unicast_rcv_skb(sk, skb, uh); if (refcounted) sock_put(sk); return ret; } if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return __udp4_lib_mcast_deliver(net, skb, uh, saddr, daddr, udptable, proto); sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk) return udp_unicast_rcv_skb(sk, skb, uh); no_sk: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; nf_reset_ct(skb); /* No socket. Drop packet silently, if checksum is wrong */ if (udp_lib_checksum_complete(skb)) goto csum_error; drop_reason = SKB_DROP_REASON_NO_SOCKET; __UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); /* * Hmm. We got an UDP packet to a port to which we * don't wanna listen. Ignore it. */ sk_skb_reason_drop(sk, skb, drop_reason); return 0; short_packet: drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n", proto == IPPROTO_UDPLITE ? "Lite" : "", &saddr, ntohs(uh->source), ulen, skb->len, &daddr, ntohs(uh->dest)); goto drop; csum_error: /* * RFC1122: OK. Discards the bad packet silently (as far as * the network is concerned, anyway) as per 4.1.3.4 (MUST). */ drop_reason = SKB_DROP_REASON_UDP_CSUM; net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n", proto == IPPROTO_UDPLITE ? "Lite" : "", &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), ulen); __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); drop: __UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); sk_skb_reason_drop(sk, skb, drop_reason); return 0; } /* We can only early demux multicast if there is a single matching socket. 
* If more than one socket found returns NULL */ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, int dif, int sdif) { struct udp_table *udptable = net->ipv4.udp_table; unsigned short hnum = ntohs(loc_port); struct sock *sk, *result; struct udp_hslot *hslot; unsigned int slot; slot = udp_hashfn(net, hnum, udptable->mask); hslot = &udptable->hash[slot]; /* Do not bother scanning a too big list */ if (hslot->count > 10) return NULL; result = NULL; sk_for_each_rcu(sk, &hslot->head) { if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr, rmt_port, rmt_addr, dif, sdif, hnum)) { if (result) return NULL; result = sk; } } return result; } /* For unicast we should only early demux connected sockets or we can * break forwarding setups. The chains here can be long so only check * if the first socket is an exact match and if not move on. */ static struct sock *__udp4_lib_demux_lookup(struct net *net, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, int dif, int sdif) { struct udp_table *udptable = net->ipv4.udp_table; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); unsigned short hnum = ntohs(loc_port); struct udp_hslot *hslot2; unsigned int hash2; __portpair ports; struct sock *sk; hash2 = ipv4_portaddr_hash(net, loc_addr, hnum); hslot2 = udp_hashslot2(udptable, hash2); ports = INET_COMBINED_PORTS(rmt_port, hnum); udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { if (inet_match(net, sk, acookie, ports, dif, sdif)) return sk; /* Only check first socket in chain */ break; } return NULL; } int udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct in_device *in_dev = NULL; const struct iphdr *iph; const struct udphdr *uh; struct sock *sk = NULL; struct dst_entry *dst; int dif = skb->dev->ifindex; int sdif = inet_sdif(skb); int ours; /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) return 0; iph = ip_hdr(skb); uh = udp_hdr(skb); if (skb->pkt_type == PACKET_MULTICAST) { in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) return 0; ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, iph->protocol); if (!ours) return 0; sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, dif, sdif); } else if (skb->pkt_type == PACKET_HOST) { sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, dif, sdif); } if (!sk) return 0; skb->sk = sk; DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk)); skb->destructor = sock_pfree; dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, 0); if (dst) { u32 itag = 0; /* set noref for now. 
* any place which wants to hold dst has to call * dst_hold_safe() */ skb_dst_set_noref(skb, dst); /* for unconnected multicast sockets we need to validate * the source on each packet */ if (!inet_sk(sk)->inet_daddr && in_dev) return ip_mc_validate_source(skb, iph->daddr, iph->saddr, ip4h_dscp(iph), skb->dev, in_dev, &itag); } return 0; } int udp_rcv(struct sk_buff *skb) { return __udp4_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP); } void udp_destroy_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); bool slow = lock_sock_fast(sk); /* protects from races with udp_abort() */ sock_set_flag(sk, SOCK_DEAD); udp_flush_pending_frames(sk); unlock_sock_fast(sk, slow); if (static_branch_unlikely(&udp_encap_needed_key)) { if (up->encap_type) { void (*encap_destroy)(struct sock *sk); encap_destroy = READ_ONCE(up->encap_destroy); if (encap_destroy) encap_destroy(sk); } if (udp_test_bit(ENCAP_ENABLED, sk)) { static_branch_dec(&udp_encap_needed_key); udp_tunnel_cleanup_gro(sk); } } } typedef struct sk_buff *(*udp_gro_receive_t)(struct sock *sk, struct list_head *head, struct sk_buff *skb); static void set_xfrm_gro_udp_encap_rcv(__u16 encap_type, unsigned short family, struct sock *sk) { #ifdef CONFIG_XFRM udp_gro_receive_t new_gro_receive; if (udp_test_bit(GRO_ENABLED, sk) && encap_type == UDP_ENCAP_ESPINUDP) { if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) new_gro_receive = ipv6_stub->xfrm6_gro_udp_encap_rcv; else new_gro_receive = xfrm4_gro_udp_encap_rcv; if (udp_sk(sk)->gro_receive != new_gro_receive) { /* * With IPV6_ADDRFORM the gro callback could change * after being set, unregister the old one, if valid. */ if (udp_sk(sk)->gro_receive) udp_tunnel_update_gro_rcv(sk, false); WRITE_ONCE(udp_sk(sk)->gro_receive, new_gro_receive); udp_tunnel_update_gro_rcv(sk, true); } } #endif } /* * Socket option code for UDP */ int udp_lib_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen, int (*push_pending_frames)(struct sock *)) { struct udp_sock *up = udp_sk(sk); int val, valbool; int err = 0; int is_udplite = IS_UDPLITE(sk); if (level == SOL_SOCKET) { err = sk_setsockopt(sk, level, optname, optval, optlen); if (optname == SO_RCVBUF || optname == SO_RCVBUFFORCE) { sockopt_lock_sock(sk); /* paired with READ_ONCE in udp_rmem_release() */ WRITE_ONCE(up->forward_threshold, sk->sk_rcvbuf >> 2); sockopt_release_sock(sk); } return err; } if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; valbool = val ? 
1 : 0; switch (optname) { case UDP_CORK: if (val != 0) { udp_set_bit(CORK, sk); } else { udp_clear_bit(CORK, sk); lock_sock(sk); push_pending_frames(sk); release_sock(sk); } break; case UDP_ENCAP: sockopt_lock_sock(sk); switch (val) { case 0: #ifdef CONFIG_XFRM case UDP_ENCAP_ESPINUDP: set_xfrm_gro_udp_encap_rcv(val, sk->sk_family, sk); #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) WRITE_ONCE(up->encap_rcv, ipv6_stub->xfrm6_udp_encap_rcv); else #endif WRITE_ONCE(up->encap_rcv, xfrm4_udp_encap_rcv); #endif fallthrough; case UDP_ENCAP_L2TPINUDP: WRITE_ONCE(up->encap_type, val); udp_tunnel_encap_enable(sk); break; default: err = -ENOPROTOOPT; break; } sockopt_release_sock(sk); break; case UDP_NO_CHECK6_TX: udp_set_no_check6_tx(sk, valbool); break; case UDP_NO_CHECK6_RX: udp_set_no_check6_rx(sk, valbool); break; case UDP_SEGMENT: if (val < 0 || val > USHRT_MAX) return -EINVAL; WRITE_ONCE(up->gso_size, val); break; case UDP_GRO: sockopt_lock_sock(sk); /* when enabling GRO, accept the related GSO packet type */ if (valbool) udp_tunnel_encap_enable(sk); udp_assign_bit(GRO_ENABLED, sk, valbool); udp_assign_bit(ACCEPT_L4, sk, valbool); set_xfrm_gro_udp_encap_rcv(up->encap_type, sk->sk_family, sk); sockopt_release_sock(sk); break; /* * UDP-Lite's partial checksum coverage (RFC 3828). */ /* The sender sets actual checksum coverage length via this option. * The case coverage > packet length is handled by send module. */ case UDPLITE_SEND_CSCOV: if (!is_udplite) /* Disable the option on UDP sockets */ return -ENOPROTOOPT; if (val != 0 && val < 8) /* Illegal coverage: use default (8) */ val = 8; else if (val > USHRT_MAX) val = USHRT_MAX; WRITE_ONCE(up->pcslen, val); udp_set_bit(UDPLITE_SEND_CC, sk); break; /* The receiver specifies a minimum checksum coverage value. To make * sense, this should be set to at least 8 (as done below). If zero is * used, this again means full checksum coverage. */ case UDPLITE_RECV_CSCOV: if (!is_udplite) /* Disable the option on UDP sockets */ return -ENOPROTOOPT; if (val != 0 && val < 8) /* Avoid silly minimal values. */ val = 8; else if (val > USHRT_MAX) val = USHRT_MAX; WRITE_ONCE(up->pcrlen, val); udp_set_bit(UDPLITE_RECV_CC, sk); break; default: err = -ENOPROTOOPT; break; } return err; } EXPORT_IPV6_MOD(udp_lib_setsockopt); int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET) return udp_lib_setsockopt(sk, level, optname, optval, optlen, udp_push_pending_frames); return ip_setsockopt(sk, level, optname, optval, optlen); } int udp_lib_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct udp_sock *up = udp_sk(sk); int val, len; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; len = min_t(unsigned int, len, sizeof(int)); switch (optname) { case UDP_CORK: val = udp_test_bit(CORK, sk); break; case UDP_ENCAP: val = READ_ONCE(up->encap_type); break; case UDP_NO_CHECK6_TX: val = udp_get_no_check6_tx(sk); break; case UDP_NO_CHECK6_RX: val = udp_get_no_check6_rx(sk); break; case UDP_SEGMENT: val = READ_ONCE(up->gso_size); break; case UDP_GRO: val = udp_test_bit(GRO_ENABLED, sk); break; /* The following two cannot be changed on UDP sockets, the return is * always 0 (which corresponds to the full checksum coverage of UDP). 
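 *
 * In other words, getsockopt(fd, SOL_UDPLITE, UDPLITE_SEND_CSCOV, ...)
 * issued against a plain UDP socket simply reads back 0 here, because
 * pcslen is only ever set through the UDP-Lite-only setsockopt path
 * above.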
 */
	case UDPLITE_SEND_CSCOV:
		val = READ_ONCE(up->pcslen);
		break;

	case UDPLITE_RECV_CSCOV:
		val = READ_ONCE(up->pcrlen);
		break;

	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, &val, len))
		return -EFAULT;

	return 0;
}
EXPORT_IPV6_MOD(udp_lib_getsockopt);

int udp_getsockopt(struct sock *sk, int level, int optname,
		   char __user *optval, int __user *optlen)
{
	if (level == SOL_UDP || level == SOL_UDPLITE)
		return udp_lib_getsockopt(sk, level, optname, optval, optlen);
	return ip_getsockopt(sk, level, optname, optval, optlen);
}

/**
 *	udp_poll - wait for a UDP event.
 *	@file: - file struct
 *	@sock: - socket
 *	@wait: - poll table
 *
 *	This is the same as datagram poll, except for the special case of
 *	blocking sockets. If an application is using a blocking fd
 *	and a packet with a checksum error is in the queue,
 *	it could get a return from select() indicating data available
 *	but then block when reading it. Add special case code
 *	to work around these arguably broken applications.
 */
__poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	__poll_t mask = datagram_poll(file, sock, wait);
	struct sock *sk = sock->sk;

	if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Check for false positives due to checksum errors */
	if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
	    !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
		mask &= ~(EPOLLIN | EPOLLRDNORM);

	/* psock ingress_msg queue should not contain any bad checksum frames */
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
	return mask;
}
EXPORT_IPV6_MOD(udp_poll);

int udp_abort(struct sock *sk, int err)
{
	if (!has_current_bpf_ctx())
		lock_sock(sk);

	/* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing
	 * with close()
	 */
	if (sock_flag(sk, SOCK_DEAD))
		goto out;

	sk->sk_err = err;
	sk_error_report(sk);
	__udp_disconnect(sk, 0);

out:
	if (!has_current_bpf_ctx())
		release_sock(sk);

	return 0;
}
EXPORT_IPV6_MOD_GPL(udp_abort);

struct proto udp_prot = {
	.name			= "UDP",
	.owner			= THIS_MODULE,
	.close			= udp_lib_close,
	.pre_connect		= udp_pre_connect,
	.connect		= udp_connect,
	.disconnect		= udp_disconnect,
	.ioctl			= udp_ioctl,
	.init			= udp_init_sock,
	.destroy		= udp_destroy_sock,
	.setsockopt		= udp_setsockopt,
	.getsockopt		= udp_getsockopt,
	.sendmsg		= udp_sendmsg,
	.recvmsg		= udp_recvmsg,
	.splice_eof		= udp_splice_eof,
	.release_cb		= ip4_datagram_release_cb,
	.hash			= udp_lib_hash,
	.unhash			= udp_lib_unhash,
	.rehash			= udp_v4_rehash,
	.get_port		= udp_v4_get_port,
	.put_port		= udp_lib_unhash,
#ifdef CONFIG_BPF_SYSCALL
	.psock_update_sk_prot	= udp_bpf_update_proto,
#endif
	.memory_allocated	= &net_aligned_data.udp_memory_allocated,
	.per_cpu_fw_alloc	= &udp_memory_per_cpu_fw_alloc,
	.sysctl_mem		= sysctl_udp_mem,
	.sysctl_wmem_offset	= offsetof(struct net, ipv4.sysctl_udp_wmem_min),
	.sysctl_rmem_offset	= offsetof(struct net, ipv4.sysctl_udp_rmem_min),
	.obj_size		= sizeof(struct udp_sock),
	.h.udp_table		= NULL,
	.diag_destroy		= udp_abort,
};
EXPORT_SYMBOL(udp_prot);

/* ------------------------------------------------------------------------ */
#ifdef CONFIG_PROC_FS

static unsigned short seq_file_family(const struct seq_file *seq);

static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
{
	unsigned short family = seq_file_family(seq);

	/* AF_UNSPEC is used as a match all */
	return ((family == AF_UNSPEC || family == sk->sk_family) &&
		net_eq(sock_net(sk), seq_file_net(seq)));
}

#ifdef CONFIG_BPF_SYSCALL
static const struct seq_operations
bpf_iter_udp_seq_ops; #endif static struct udp_table *udp_get_table_seq(struct seq_file *seq, struct net *net) { const struct udp_seq_afinfo *afinfo; #ifdef CONFIG_BPF_SYSCALL if (seq->op == &bpf_iter_udp_seq_ops) return net->ipv4.udp_table; #endif afinfo = pde_data(file_inode(seq->file)); return afinfo->udp_table ? : net->ipv4.udp_table; } static struct sock *udp_get_first(struct seq_file *seq, int start) { struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); struct udp_table *udptable; struct sock *sk; udptable = udp_get_table_seq(seq, net); for (state->bucket = start; state->bucket <= udptable->mask; ++state->bucket) { struct udp_hslot *hslot = &udptable->hash[state->bucket]; if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); sk_for_each(sk, &hslot->head) { if (seq_sk_match(seq, sk)) goto found; } spin_unlock_bh(&hslot->lock); } sk = NULL; found: return sk; } static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) { struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); struct udp_table *udptable; do { sk = sk_next(sk); } while (sk && !seq_sk_match(seq, sk)); if (!sk) { udptable = udp_get_table_seq(seq, net); if (state->bucket <= udptable->mask) spin_unlock_bh(&udptable->hash[state->bucket].lock); return udp_get_first(seq, state->bucket + 1); } return sk; } static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos) { struct sock *sk = udp_get_first(seq, 0); if (sk) while (pos && (sk = udp_get_next(seq, sk)) != NULL) --pos; return pos ? NULL : sk; } void *udp_seq_start(struct seq_file *seq, loff_t *pos) { struct udp_iter_state *state = seq->private; state->bucket = MAX_UDP_PORTS; return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } EXPORT_IPV6_MOD(udp_seq_start); void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; if (v == SEQ_START_TOKEN) sk = udp_get_idx(seq, 0); else sk = udp_get_next(seq, v); ++*pos; return sk; } EXPORT_IPV6_MOD(udp_seq_next); void udp_seq_stop(struct seq_file *seq, void *v) { struct udp_iter_state *state = seq->private; struct udp_table *udptable; udptable = udp_get_table_seq(seq, seq_file_net(seq)); if (state->bucket <= udptable->mask) spin_unlock_bh(&udptable->hash[state->bucket].lock); } EXPORT_IPV6_MOD(udp_seq_stop); /* ------------------------------------------------------------------------ */ static void udp4_format_sock(struct sock *sp, struct seq_file *f, int bucket) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), udp_rqueue_get(sp), 0, 0L, 0, from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); } int udp4_seq_show(struct seq_file *seq, void *v) { seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops"); else { struct udp_iter_state *state = seq->private; udp4_format_sock(v, seq, state->bucket); } seq_pad(seq, '\n'); return 0; } #ifdef CONFIG_BPF_SYSCALL struct bpf_iter__udp { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct udp_sock *, udp_sk); uid_t uid __aligned(8); int bucket __aligned(8); }; union 
bpf_udp_iter_batch_item { struct sock *sk; __u64 cookie; }; struct bpf_udp_iter_state { struct udp_iter_state state; unsigned int cur_sk; unsigned int end_sk; unsigned int max_sk; union bpf_udp_iter_batch_item *batch; }; static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, unsigned int new_batch_sz, gfp_t flags); static struct sock *bpf_iter_udp_resume(struct sock *first_sk, union bpf_udp_iter_batch_item *cookies, int n_cookies) { struct sock *sk = NULL; int i; for (i = 0; i < n_cookies; i++) { sk = first_sk; udp_portaddr_for_each_entry_from(sk) if (cookies[i].cookie == atomic64_read(&sk->sk_cookie)) goto done; } done: return sk; } static struct sock *bpf_iter_udp_batch(struct seq_file *seq) { struct bpf_udp_iter_state *iter = seq->private; struct udp_iter_state *state = &iter->state; unsigned int find_cookie, end_cookie; struct net *net = seq_file_net(seq); struct udp_table *udptable; unsigned int batch_sks = 0; int resume_bucket; int resizes = 0; struct sock *sk; int err = 0; resume_bucket = state->bucket; /* The current batch is done, so advance the bucket. */ if (iter->cur_sk == iter->end_sk) state->bucket++; udptable = udp_get_table_seq(seq, net); again: /* New batch for the next bucket. * Iterate over the hash table to find a bucket with sockets matching * the iterator attributes, and return the first matching socket from * the bucket. The remaining matched sockets from the bucket are batched * before releasing the bucket lock. This allows BPF programs that are * called in seq_show to acquire the bucket lock if needed. */ find_cookie = iter->cur_sk; end_cookie = iter->end_sk; iter->cur_sk = 0; iter->end_sk = 0; batch_sks = 0; for (; state->bucket <= udptable->mask; state->bucket++) { struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot; if (hlist_empty(&hslot2->head)) goto next_bucket; spin_lock_bh(&hslot2->lock); sk = hlist_entry_safe(hslot2->head.first, struct sock, __sk_common.skc_portaddr_node); /* Resume from the first (in iteration order) unseen socket from * the last batch that still exists in resume_bucket. Most of * the time this will just be where the last iteration left off * in resume_bucket unless that socket disappeared between * reads. */ if (state->bucket == resume_bucket) sk = bpf_iter_udp_resume(sk, &iter->batch[find_cookie], end_cookie - find_cookie); fill_batch: udp_portaddr_for_each_entry_from(sk) { if (seq_sk_match(seq, sk)) { if (iter->end_sk < iter->max_sk) { sock_hold(sk); iter->batch[iter->end_sk++].sk = sk; } batch_sks++; } } /* Allocate a larger batch and try again. */ if (unlikely(resizes <= 1 && iter->end_sk && iter->end_sk != batch_sks)) { resizes++; /* First, try with GFP_USER to maximize the chances of * grabbing more memory. */ if (resizes == 1) { spin_unlock_bh(&hslot2->lock); err = bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2, GFP_USER); if (err) return ERR_PTR(err); /* Start over. */ goto again; } /* Next, hold onto the lock, so the bucket doesn't * change while we get the rest of the sockets. */ err = bpf_iter_udp_realloc_batch(iter, batch_sks, GFP_NOWAIT); if (err) { spin_unlock_bh(&hslot2->lock); return ERR_PTR(err); } /* Pick up where we left off. */ sk = iter->batch[iter->end_sk - 1].sk; sk = hlist_entry_safe(sk->__sk_common.skc_portaddr_node.next, struct sock, __sk_common.skc_portaddr_node); batch_sks = iter->end_sk; goto fill_batch; } spin_unlock_bh(&hslot2->lock); if (iter->end_sk) break; next_bucket: resizes = 0; } WARN_ON_ONCE(iter->end_sk != batch_sks); return iter->end_sk ? 
iter->batch[0].sk : NULL; } static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_udp_iter_state *iter = seq->private; struct sock *sk; /* Whenever seq_next() is called, the iter->cur_sk is * done with seq_show(), so unref the iter->cur_sk. */ if (iter->cur_sk < iter->end_sk) sock_put(iter->batch[iter->cur_sk++].sk); /* After updating iter->cur_sk, check if there are more sockets * available in the current bucket batch. */ if (iter->cur_sk < iter->end_sk) sk = iter->batch[iter->cur_sk].sk; else /* Prepare a new batch. */ sk = bpf_iter_udp_batch(seq); ++*pos; return sk; } static void *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos) { /* bpf iter does not support lseek, so it always * continue from where it was stop()-ped. */ if (*pos) return bpf_iter_udp_batch(seq); return SEQ_START_TOKEN; } static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, struct udp_sock *udp_sk, uid_t uid, int bucket) { struct bpf_iter__udp ctx; meta->seq_num--; /* skip SEQ_START_TOKEN */ ctx.meta = meta; ctx.udp_sk = udp_sk; ctx.uid = uid; ctx.bucket = bucket; return bpf_iter_run_prog(prog, &ctx); } static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v) { struct udp_iter_state *state = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; uid_t uid; int ret; if (v == SEQ_START_TOKEN) return 0; lock_sock(sk); if (unlikely(sk_unhashed(sk))) { ret = SEQ_SKIP; goto unlock; } uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); meta.seq = seq; prog = bpf_iter_get_info(&meta, false); ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket); unlock: release_sock(sk); return ret; } static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter) { union bpf_udp_iter_batch_item *item; unsigned int cur_sk = iter->cur_sk; __u64 cookie; /* Remember the cookies of the sockets we haven't seen yet, so we can * pick up where we left off next time around. */ while (cur_sk < iter->end_sk) { item = &iter->batch[cur_sk++]; cookie = sock_gen_cookie(item->sk); sock_put(item->sk); item->cookie = cookie; } } static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v) { struct bpf_udp_iter_state *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)udp_prog_seq_show(prog, &meta, v, 0, 0); } if (iter->cur_sk < iter->end_sk) bpf_iter_udp_put_batch(iter); } static const struct seq_operations bpf_iter_udp_seq_ops = { .start = bpf_iter_udp_seq_start, .next = bpf_iter_udp_seq_next, .stop = bpf_iter_udp_seq_stop, .show = bpf_iter_udp_seq_show, }; #endif static unsigned short seq_file_family(const struct seq_file *seq) { const struct udp_seq_afinfo *afinfo; #ifdef CONFIG_BPF_SYSCALL /* BPF iterator: bpf programs to filter sockets. 
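	 *
	 * Returning AF_UNSPEC makes seq_sk_match() treat the iterator as a
	 * match-all, so an attached BPF program sees both AF_INET and
	 * AF_INET6 UDP sockets and can do its own filtering.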
*/ if (seq->op == &bpf_iter_udp_seq_ops) return AF_UNSPEC; #endif /* Proc fs iterator */ afinfo = pde_data(file_inode(seq->file)); return afinfo->family; } const struct seq_operations udp_seq_ops = { .start = udp_seq_start, .next = udp_seq_next, .stop = udp_seq_stop, .show = udp4_seq_show, }; EXPORT_IPV6_MOD(udp_seq_ops); static struct udp_seq_afinfo udp4_seq_afinfo = { .family = AF_INET, .udp_table = NULL, }; static int __net_init udp4_proc_init_net(struct net *net) { if (!proc_create_net_data("udp", 0444, net->proc_net, &udp_seq_ops, sizeof(struct udp_iter_state), &udp4_seq_afinfo)) return -ENOMEM; return 0; } static void __net_exit udp4_proc_exit_net(struct net *net) { remove_proc_entry("udp", net->proc_net); } static struct pernet_operations udp4_net_ops = { .init = udp4_proc_init_net, .exit = udp4_proc_exit_net, }; int __init udp4_proc_init(void) { return register_pernet_subsys(&udp4_net_ops); } void udp4_proc_exit(void) { unregister_pernet_subsys(&udp4_net_ops); } #endif /* CONFIG_PROC_FS */ static __initdata unsigned long uhash_entries; static int __init set_uhash_entries(char *str) { ssize_t ret; if (!str) return 0; ret = kstrtoul(str, 0, &uhash_entries); if (ret) return 0; if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN) uhash_entries = UDP_HTABLE_SIZE_MIN; return 1; } __setup("uhash_entries=", set_uhash_entries); void __init udp_table_init(struct udp_table *table, const char *name) { unsigned int i, slot_size; slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) + udp_hash4_slot_size(); table->hash = alloc_large_system_hash(name, slot_size, uhash_entries, 21, /* one slot per 2 MB */ 0, &table->log, &table->mask, UDP_HTABLE_SIZE_MIN, UDP_HTABLE_SIZE_MAX); table->hash2 = (void *)(table->hash + (table->mask + 1)); for (i = 0; i <= table->mask; i++) { INIT_HLIST_HEAD(&table->hash[i].head); table->hash[i].count = 0; spin_lock_init(&table->hash[i].lock); } for (i = 0; i <= table->mask; i++) { INIT_HLIST_HEAD(&table->hash2[i].hslot.head); table->hash2[i].hslot.count = 0; spin_lock_init(&table->hash2[i].hslot.lock); } udp_table_hash4_init(table); } u32 udp_flow_hashrnd(void) { static u32 hashrnd __read_mostly; net_get_random_once(&hashrnd, sizeof(hashrnd)); return hashrnd; } EXPORT_SYMBOL(udp_flow_hashrnd); static void __net_init udp_sysctl_init(struct net *net) { net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE; net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE; #ifdef CONFIG_NET_L3_MASTER_DEV net->ipv4.sysctl_udp_l3mdev_accept = 0; #endif } static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries) { struct udp_table *udptable; unsigned int slot_size; int i; udptable = kmalloc(sizeof(*udptable), GFP_KERNEL); if (!udptable) goto out; slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main) + udp_hash4_slot_size(); udptable->hash = vmalloc_huge(hash_entries * slot_size, GFP_KERNEL_ACCOUNT); if (!udptable->hash) goto free_table; udptable->hash2 = (void *)(udptable->hash + hash_entries); udptable->mask = hash_entries - 1; udptable->log = ilog2(hash_entries); for (i = 0; i < hash_entries; i++) { INIT_HLIST_HEAD(&udptable->hash[i].head); udptable->hash[i].count = 0; spin_lock_init(&udptable->hash[i].lock); INIT_HLIST_HEAD(&udptable->hash2[i].hslot.head); udptable->hash2[i].hslot.count = 0; spin_lock_init(&udptable->hash2[i].hslot.lock); } udp_table_hash4_init(udptable); return udptable; free_table: kfree(udptable); out: return NULL; } static void __net_exit udp_pernet_table_free(struct net *net) { struct udp_table *udptable = 
net->ipv4.udp_table; if (udptable == &udp_table) return; kvfree(udptable->hash); kfree(udptable); } static void __net_init udp_set_table(struct net *net) { struct udp_table *udptable; unsigned int hash_entries; struct net *old_net; if (net_eq(net, &init_net)) goto fallback; old_net = current->nsproxy->net_ns; hash_entries = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries); if (!hash_entries) goto fallback; /* Set min to keep the bitmap on stack in udp_lib_get_port() */ if (hash_entries < UDP_HTABLE_SIZE_MIN_PERNET) hash_entries = UDP_HTABLE_SIZE_MIN_PERNET; else hash_entries = roundup_pow_of_two(hash_entries); udptable = udp_pernet_table_alloc(hash_entries); if (udptable) { net->ipv4.udp_table = udptable; } else { pr_warn("Failed to allocate UDP hash table (entries: %u) " "for a netns, fallback to the global one\n", hash_entries); fallback: net->ipv4.udp_table = &udp_table; } } static int __net_init udp_pernet_init(struct net *net) { #if IS_ENABLED(CONFIG_NET_UDP_TUNNEL) int i; /* No tunnel is configured */ for (i = 0; i < ARRAY_SIZE(net->ipv4.udp_tunnel_gro); ++i) { INIT_HLIST_HEAD(&net->ipv4.udp_tunnel_gro[i].list); RCU_INIT_POINTER(net->ipv4.udp_tunnel_gro[i].sk, NULL); } #endif udp_sysctl_init(net); udp_set_table(net); return 0; } static void __net_exit udp_pernet_exit(struct net *net) { udp_pernet_table_free(net); } static struct pernet_operations __net_initdata udp_sysctl_ops = { .init = udp_pernet_init, .exit = udp_pernet_exit, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta, struct udp_sock *udp_sk, uid_t uid, int bucket) static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, unsigned int new_batch_sz, gfp_t flags) { union bpf_udp_iter_batch_item *new_batch; new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch), flags | __GFP_NOWARN); if (!new_batch) return -ENOMEM; if (flags != GFP_NOWAIT) bpf_iter_udp_put_batch(iter); memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk); kvfree(iter->batch); iter->batch = new_batch; iter->max_sk = new_batch_sz; return 0; } #define INIT_BATCH_SZ 16 static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_udp_iter_state *iter = priv_data; int ret; ret = bpf_iter_init_seq_net(priv_data, aux); if (ret) return ret; ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ, GFP_USER); if (ret) bpf_iter_fini_seq_net(priv_data); iter->state.bucket = -1; return ret; } static void bpf_iter_fini_udp(void *priv_data) { struct bpf_udp_iter_state *iter = priv_data; bpf_iter_fini_seq_net(priv_data); kvfree(iter->batch); } static const struct bpf_iter_seq_info udp_seq_info = { .seq_ops = &bpf_iter_udp_seq_ops, .init_seq_private = bpf_iter_init_udp, .fini_seq_private = bpf_iter_fini_udp, .seq_priv_size = sizeof(struct bpf_udp_iter_state), }; static struct bpf_iter_reg udp_reg_info = { .target = "udp", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__udp, udp_sk), PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, }, .seq_info = &udp_seq_info, }; static void __init bpf_iter_register(void) { udp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UDP]; if (bpf_iter_reg_target(&udp_reg_info)) pr_warn("Warning: could not register bpf iterator udp\n"); } #endif void __init udp_init(void) { unsigned long limit; unsigned int i; udp_table_init(&udp_table, "UDP"); limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); sysctl_udp_mem[0] = limit / 4 * 3; sysctl_udp_mem[1] = limit; sysctl_udp_mem[2] = 
sysctl_udp_mem[0] * 2; /* 16 spinlocks per cpu */ udp_busylocks_log = ilog2(nr_cpu_ids) + 4; udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log, GFP_KERNEL); if (!udp_busylocks) panic("UDP: failed to alloc udp_busylocks\n"); for (i = 0; i < (1U << udp_busylocks_log); i++) spin_lock_init(udp_busylocks + i); if (register_pernet_subsys(&udp_sysctl_ops)) panic("UDP: failed to init sysctl parameters.\n"); #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) bpf_iter_register(); #endif }
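The per-netns table setup above clamps the requested entry count to a minimum (so the port bitmap in udp_lib_get_port() stays on the stack) and otherwise rounds it up to a power of two, which is what allows the table to use mask = entries - 1 for hashing. Below is a minimal userspace sketch of that normalization step, not kernel code: UDP_HTABLE_SIZE_MIN_PERNET is assumed to be 32 purely for illustration, and pow2_roundup() stands in for the kernel's roundup_pow_of_two().

#include <stdio.h>

#define UDP_HTABLE_SIZE_MIN_PERNET 32	/* assumed value, for illustration only */

/* Round up to the next power of two (the kernel uses roundup_pow_of_two()). */
static unsigned int pow2_roundup(unsigned int n)
{
	unsigned int p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

/* Mirror the clamp/round-up step performed by udp_set_table() above. */
static unsigned int normalize_hash_entries(unsigned int requested)
{
	if (requested < UDP_HTABLE_SIZE_MIN_PERNET)
		return UDP_HTABLE_SIZE_MIN_PERNET;	/* keep port bitmap on stack */
	return pow2_roundup(requested);
}

int main(void)
{
	unsigned int req[] = { 10, 100, 1024, 5000 };
	unsigned int i;

	for (i = 0; i < sizeof(req) / sizeof(req[0]); i++) {
		unsigned int entries = normalize_hash_entries(req[i]);

		/* mask = entries - 1 only works because entries is a power of two */
		printf("requested %u -> %u slots, mask 0x%x\n",
		       req[i], entries, entries - 1);
	}
	return 0;
}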
// SPDX-License-Identifier: GPL-2.0-only /* * xsave/xrstor support. * * Author: Suresh Siddha <suresh.b.siddha@intel.com> */ #include <linux/bitops.h> #include <linux/compat.h> #include <linux/cpu.h> #include <linux/mman.h> #include <linux/nospec.h> #include <linux/pkeys.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/vmalloc.h> #include <linux/coredump.h> #include <linux/sort.h> #include <asm/fpu/api.h> #include <asm/fpu/regset.h> #include <asm/fpu/signal.h> #include <asm/fpu/xcr.h> #include <asm/cpuid/api.h> #include <asm/msr.h> #include <asm/tlbflush.h> #include <asm/prctl.h> #include <asm/elf.h> #include <uapi/asm/elf.h> #include "context.h" #include "internal.h" #include "legacy.h" #include "xstate.h" #define for_each_extended_xfeature(bit, mask) \ (bit) = FIRST_EXTENDED_XFEATURE; \ for_each_set_bit_from(bit, (unsigned long *)&(mask), 8 * sizeof(mask)) /* * Although we spell it out in here, the Processor Trace * xfeature is completely unused. We use other mechanisms * to save/restore PT state in Linux.
*/ static const char *xfeature_names[] = { "x87 floating point registers", "SSE registers", "AVX registers", "MPX bounds registers", "MPX CSR", "AVX-512 opmask", "AVX-512 Hi256", "AVX-512 ZMM_Hi256", "Processor Trace (unused)", "Protection Keys User registers", "PASID state", "Control-flow User registers", "Control-flow Kernel registers (KVM only)", "unknown xstate feature", "unknown xstate feature", "unknown xstate feature", "unknown xstate feature", "AMX Tile config", "AMX Tile data", "APX registers", "unknown xstate feature", }; static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_FP] = X86_FEATURE_FPU, [XFEATURE_SSE] = X86_FEATURE_XMM, [XFEATURE_YMM] = X86_FEATURE_AVX, [XFEATURE_BNDREGS] = X86_FEATURE_MPX, [XFEATURE_BNDCSR] = X86_FEATURE_MPX, [XFEATURE_OPMASK] = X86_FEATURE_AVX512F, [XFEATURE_ZMM_Hi256] = X86_FEATURE_AVX512F, [XFEATURE_Hi16_ZMM] = X86_FEATURE_AVX512F, [XFEATURE_PT_UNIMPLEMENTED_SO_FAR] = X86_FEATURE_INTEL_PT, [XFEATURE_PKRU] = X86_FEATURE_OSPKE, [XFEATURE_PASID] = X86_FEATURE_ENQCMD, [XFEATURE_CET_USER] = X86_FEATURE_SHSTK, [XFEATURE_CET_KERNEL] = X86_FEATURE_SHSTK, [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, [XFEATURE_APX] = X86_FEATURE_APX, }; static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; /* * Ordering of xstate components in uncompacted format: The xfeature * number does not necessarily indicate its position in the XSAVE buffer. * This array defines the traversal order of xstate features. */ static unsigned int xfeature_uncompact_order[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static inline unsigned int next_xfeature_order(unsigned int i, u64 mask) { for (; xfeature_uncompact_order[i] != -1; i++) { if (mask & BIT_ULL(xfeature_uncompact_order[i])) break; } return i; } /* Iterate xstate features in uncompacted order: */ #define for_each_extended_xfeature_in_order(i, mask) \ for (i = 0; \ i = next_xfeature_order(i, mask), \ xfeature_uncompact_order[i] != -1; \ i++) #define XSTATE_FLAG_SUPERVISOR BIT(0) #define XSTATE_FLAG_ALIGNED64 BIT(1) /* * Return whether the system supports a given xfeature. * * Also return the name of the (most advanced) feature that the caller requested: */ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) { u64 xfeatures_missing = xfeatures_needed & ~fpu_kernel_cfg.max_features; if (unlikely(feature_name)) { long xfeature_idx, max_idx; u64 xfeatures_print; /* * So we use FLS here to be able to print the most advanced * feature that was requested but is missing. 
So if a driver * asks about "XFEATURE_MASK_SSE | XFEATURE_MASK_YMM" we'll print the * missing AVX feature - this is the most informative message * to users: */ if (xfeatures_missing) xfeatures_print = xfeatures_missing; else xfeatures_print = xfeatures_needed; xfeature_idx = fls64(xfeatures_print)-1; max_idx = ARRAY_SIZE(xfeature_names)-1; xfeature_idx = min(xfeature_idx, max_idx); *feature_name = xfeature_names[xfeature_idx]; } if (xfeatures_missing) return 0; return 1; } EXPORT_SYMBOL_GPL(cpu_has_xfeatures); static bool xfeature_is_aligned64(int xfeature_nr) { return xstate_flags[xfeature_nr] & XSTATE_FLAG_ALIGNED64; } static bool xfeature_is_supervisor(int xfeature_nr) { return xstate_flags[xfeature_nr] & XSTATE_FLAG_SUPERVISOR; } static unsigned int xfeature_get_offset(u64 xcomp_bv, int xfeature) { unsigned int offs, i; /* * Non-compacted format and legacy features use the cached fixed * offsets. */ if (!cpu_feature_enabled(X86_FEATURE_XCOMPACTED) || xfeature <= XFEATURE_SSE) return xstate_offsets[xfeature]; /* * Compacted format offsets depend on the actual content of the * compacted xsave area which is determined by the xcomp_bv header * field. */ offs = FXSAVE_SIZE + XSAVE_HDR_SIZE; for_each_extended_xfeature(i, xcomp_bv) { if (xfeature_is_aligned64(i)) offs = ALIGN(offs, 64); if (i == xfeature) break; offs += xstate_sizes[i]; } return offs; } /* * Enable the extended processor state save/restore feature. * Called once per CPU onlining. */ void fpu__init_cpu_xstate(void) { if (!boot_cpu_has(X86_FEATURE_XSAVE) || !fpu_kernel_cfg.max_features) return; cr4_set_bits(X86_CR4_OSXSAVE); /* * Must happen after CR4 setup and before xsetbv() to allow KVM * lazy passthrough. Write independent of the dynamic state static * key as that does not work on the boot CPU. This also ensures * that any stale state is wiped out from XFD. Reset the per CPU * xfd cache too. */ if (cpu_feature_enabled(X86_FEATURE_XFD)) xfd_set_state(init_fpstate.xfd); /* * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user * states can be set here. */ xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * MSR_IA32_XSS sets supervisor states managed by XSAVES. */ if (boot_cpu_has(X86_FEATURE_XSAVES)) { wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } } static bool xfeature_enabled(enum xfeature xfeature) { return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); } static int compare_xstate_offsets(const void *xfeature1, const void *xfeature2) { return xstate_offsets[*(unsigned int *)xfeature1] - xstate_offsets[*(unsigned int *)xfeature2]; } /* * Record the offsets and sizes of various xstates contained * in the XSAVE state memory layout. Also, create an ordered * list of xfeatures for handling out-of-order offsets. */ static void __init setup_xstate_cache(void) { u32 eax, ebx, ecx, edx, xfeature, i = 0; /* * The FP xstates and SSE xstates are legacy states. They are always * in the fixed offsets in the xsave area in either compacted form * or standard form. 
*/ xstate_offsets[XFEATURE_FP] = 0; xstate_sizes[XFEATURE_FP] = offsetof(struct fxregs_state, xmm_space); xstate_offsets[XFEATURE_SSE] = xstate_sizes[XFEATURE_FP]; xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space); for_each_extended_xfeature(xfeature, fpu_kernel_cfg.max_features) { cpuid_count(CPUID_LEAF_XSTATE, xfeature, &eax, &ebx, &ecx, &edx); xstate_sizes[xfeature] = eax; xstate_flags[xfeature] = ecx; /* * If an xfeature is supervisor state, the offset in EBX is * invalid, leave it to -1. */ if (xfeature_is_supervisor(xfeature)) continue; xstate_offsets[xfeature] = ebx; /* Populate the list of xfeatures before sorting */ xfeature_uncompact_order[i++] = xfeature; } /* * Sort xfeatures by their offsets to support out-of-order * offsets in the uncompacted format. */ sort(xfeature_uncompact_order, i, sizeof(unsigned int), compare_xstate_offsets, NULL); } /* * Print out all the supported xstate features: */ static void __init print_xstate_features(void) { int i; for (i = 0; i < XFEATURE_MAX; i++) { u64 mask = BIT_ULL(i); const char *name; if (cpu_has_xfeatures(mask, &name)) pr_info("x86/fpu: Supporting XSAVE feature 0x%03Lx: '%s'\n", mask, name); } } /* * This check is important because it is easy to get XSTATE_* * confused with XSTATE_BIT_*. */ #define CHECK_XFEATURE(nr) do { \ WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ WARN_ON(nr >= XFEATURE_MAX); \ } while (0) /* * Print out xstate component offsets and sizes */ static void __init print_xstate_offset_size(void) { int i; for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, xfeature_get_offset(fpu_kernel_cfg.max_features, i), i, xstate_sizes[i]); } } /* * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ static __init void os_xrstor_booting(struct xregs_state *xstate) { u64 mask = fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSTATE; u32 lmask = mask; u32 hmask = mask >> 32; int err; if (cpu_feature_enabled(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); /* * We should never fault when copying from a kernel buffer, and the FPU * state we set at boot time should be valid. */ WARN_ON_FPU(err); } /* * All supported features have either init state all zeros or are * handled in setup_init_fpu() individually. This is an explicit * feature list and does not use XFEATURE_MASK*SUPPORTED to catch * newly added supported features at build time and make people * actually look at the init state for the new feature. */ #define XFEATURES_INIT_FPSTATE_HANDLED \ (XFEATURE_MASK_FP | \ XFEATURE_MASK_SSE | \ XFEATURE_MASK_YMM | \ XFEATURE_MASK_OPMASK | \ XFEATURE_MASK_ZMM_Hi256 | \ XFEATURE_MASK_Hi16_ZMM | \ XFEATURE_MASK_PKRU | \ XFEATURE_MASK_BNDREGS | \ XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_PASID | \ XFEATURE_MASK_CET_USER | \ XFEATURE_MASK_CET_KERNEL | \ XFEATURE_MASK_XTILE | \ XFEATURE_MASK_APX) /* * setup the xstate image representing the init state */ static void __init setup_init_fpu_buf(void) { BUILD_BUG_ON((XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED) != XFEATURES_INIT_FPSTATE_HANDLED); if (!boot_cpu_has(X86_FEATURE_XSAVE)) return; print_xstate_features(); xstate_init_xcomp_bv(&init_fpstate.regs.xsave, init_fpstate.xfeatures); /* * Init all the features state with header.xfeatures being 0x0 */ os_xrstor_booting(&init_fpstate.regs.xsave); /* * All components are now in init state. 
Read the state back so * that init_fpstate contains all non-zero init state. This only * works with XSAVE, but not with XSAVEOPT and XSAVEC/S because * those use the init optimization which skips writing data for * components in init state. * * XSAVE could be used, but that would require to reshuffle the * data when XSAVEC/S is available because XSAVEC/S uses xstate * compaction. But doing so is a pointless exercise because most * components have an all zeros init state except for the legacy * ones (FP and SSE). Those can be saved with FXSAVE into the * legacy area. Adding new features requires to ensure that init * state is all zeroes or if not to add the necessary handling * here. */ fxsave(&init_fpstate.regs.fxsave); } int xfeature_size(int xfeature_nr) { u32 eax, ebx, ecx, edx; CHECK_XFEATURE(xfeature_nr); cpuid_count(CPUID_LEAF_XSTATE, xfeature_nr, &eax, &ebx, &ecx, &edx); return eax; } /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ static int validate_user_xstate_header(const struct xstate_header *hdr, struct fpstate *fpstate) { /* No unknown or supervisor features may be set */ if (hdr->xfeatures & ~fpstate->user_xfeatures) return -EINVAL; /* Userspace must use the uncompacted format */ if (hdr->xcomp_bv) return -EINVAL; /* * If 'reserved' is shrunken to add a new field, make sure to validate * that new field here! */ BUILD_BUG_ON(sizeof(hdr->reserved) != 48); /* No reserved bits may be set */ if (memchr_inv(hdr->reserved, 0, sizeof(hdr->reserved))) return -EINVAL; return 0; } static void __init __xstate_dump_leaves(void) { int i; u32 eax, ebx, ecx, edx; static int should_dump = 1; if (!should_dump) return; should_dump = 0; /* * Dump out a few leaves past the ones that we support * just in case there are some goodies up there */ for (i = 0; i < XFEATURE_MAX + 10; i++) { cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx); pr_warn("CPUID[%02x, %02x]: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", CPUID_LEAF_XSTATE, i, eax, ebx, ecx, edx); } } #define XSTATE_WARN_ON(x, fmt, ...) do { \ if (WARN_ONCE(x, "XSAVE consistency problem: " fmt, ##__VA_ARGS__)) { \ __xstate_dump_leaves(); \ } \ } while (0) #define XCHECK_SZ(sz, nr, __struct) ({ \ if (WARN_ONCE(sz != sizeof(__struct), \ "[%s]: struct is %zu bytes, cpu state %d bytes\n", \ xfeature_names[nr], sizeof(__struct), sz)) { \ __xstate_dump_leaves(); \ } \ true; \ }) /** * check_xtile_data_against_struct - Check tile data state size. * * Calculate the state size by multiplying the single tile size which is * recorded in a C struct, and the number of tiles that the CPU informs. * Compare the provided size with the calculation. * * @size: The tile data state size * * Returns: 0 on success, -EINVAL on mismatch. */ static int __init check_xtile_data_against_struct(int size) { u32 max_palid, palid, state_size; u32 eax, ebx, ecx, edx; u16 max_tile; /* * Check the maximum palette id: * eax: the highest numbered palette subleaf. */ cpuid_count(CPUID_LEAF_TILE, 0, &max_palid, &ebx, &ecx, &edx); /* * Cross-check each tile size and find the maximum number of * supported tiles. 
*/ for (palid = 1, max_tile = 0; palid <= max_palid; palid++) { u16 tile_size, max; /* * Check the tile size info: * eax[31:16]: bytes per title * ebx[31:16]: the max names (or max number of tiles) */ cpuid_count(CPUID_LEAF_TILE, palid, &eax, &ebx, &edx, &edx); tile_size = eax >> 16; max = ebx >> 16; if (tile_size != sizeof(struct xtile_data)) { pr_err("%s: struct is %zu bytes, cpu xtile %d bytes\n", __stringify(XFEATURE_XTILE_DATA), sizeof(struct xtile_data), tile_size); __xstate_dump_leaves(); return -EINVAL; } if (max > max_tile) max_tile = max; } state_size = sizeof(struct xtile_data) * max_tile; if (size != state_size) { pr_err("%s: calculated size is %u bytes, cpu state %d bytes\n", __stringify(XFEATURE_XTILE_DATA), state_size, size); __xstate_dump_leaves(); return -EINVAL; } return 0; } /* * We have a C struct for each 'xstate'. We need to ensure * that our software representation matches what the CPU * tells us about the state's size. */ static bool __init check_xstate_against_struct(int nr) { /* * Ask the CPU for the size of the state. */ int sz = xfeature_size(nr); /* * Match each CPU state with the corresponding software * structure. */ switch (nr) { case XFEATURE_YMM: return XCHECK_SZ(sz, nr, struct ymmh_struct); case XFEATURE_BNDREGS: return XCHECK_SZ(sz, nr, struct mpx_bndreg_state); case XFEATURE_BNDCSR: return XCHECK_SZ(sz, nr, struct mpx_bndcsr_state); case XFEATURE_OPMASK: return XCHECK_SZ(sz, nr, struct avx_512_opmask_state); case XFEATURE_ZMM_Hi256: return XCHECK_SZ(sz, nr, struct avx_512_zmm_uppers_state); case XFEATURE_Hi16_ZMM: return XCHECK_SZ(sz, nr, struct avx_512_hi16_state); case XFEATURE_PKRU: return XCHECK_SZ(sz, nr, struct pkru_state); case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state); case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg); case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state); case XFEATURE_CET_KERNEL: return XCHECK_SZ(sz, nr, struct cet_supervisor_state); case XFEATURE_APX: return XCHECK_SZ(sz, nr, struct apx_state); case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true; default: XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); return false; } return true; } static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) { unsigned int topmost = fls64(xfeatures) - 1; unsigned int offset, i; if (topmost <= XFEATURE_SSE) return sizeof(struct xregs_state); if (compacted) { offset = xfeature_get_offset(xfeatures, topmost); } else { /* Walk through the xfeature order to pick the last */ for_each_extended_xfeature_in_order(i, xfeatures) topmost = xfeature_uncompact_order[i]; offset = xstate_offsets[topmost]; } return offset + xstate_sizes[topmost]; } /* * This essentially double-checks what the cpu told us about * how large the XSAVE buffer needs to be. We are recalculating * it to be safe. * * Independent XSAVE features allocate their own buffers and are not * covered by these checks. Only the size of the buffer for task->fpu * is checked here. */ static bool __init paranoid_xstate_size_valid(unsigned int kernel_size) { bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); bool xsaves = cpu_feature_enabled(X86_FEATURE_XSAVES); unsigned int size = FXSAVE_SIZE + XSAVE_HDR_SIZE; int i; for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { if (!check_xstate_against_struct(i)) return false; /* * Supervisor state components can be managed only by * XSAVES. 
*/ if (!xsaves && xfeature_is_supervisor(i)) { XSTATE_WARN_ON(1, "Got supervisor feature %d, but XSAVES not advertised\n", i); return false; } } size = xstate_calculate_size(fpu_kernel_cfg.max_features, compacted); XSTATE_WARN_ON(size != kernel_size, "size %u != kernel_size %u\n", size, kernel_size); return size == kernel_size; } /* * Get total size of enabled xstates in XCR0 | IA32_XSS. * * Note the SDM's wording here. "sub-function 0" only enumerates * the size of the *user* states. If we use it to size a buffer * that we use 'XSAVES' on, we could potentially overflow the * buffer because 'XSAVES' saves system states too. * * This also takes compaction into account. So this works for * XSAVEC as well. */ static unsigned int __init get_compacted_size(void) { unsigned int eax, ebx, ecx, edx; /* * - CPUID function 0DH, sub-function 1: * EBX enumerates the size (in bytes) required by * the XSAVES instruction for an XSAVE area * containing all the state components * corresponding to bits currently set in * XCR0 | IA32_XSS. * * When XSAVES is not available but XSAVEC is (virt), then there * are no supervisor states, but XSAVEC still uses compacted * format. */ cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx); return ebx; } /* * Get the total size of the enabled xstates without the independent supervisor * features. */ static unsigned int __init get_xsave_compacted_size(void) { u64 mask = xfeatures_mask_independent(); unsigned int size; if (!mask) return get_compacted_size(); /* Disable independent features. */ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor()); /* * Ask the hardware what size is required of the buffer. * This is the size required for the task->fpu buffer. */ size = get_compacted_size(); /* Re-enable independent features so XSAVES will work on them again. */ wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); return size; } static unsigned int __init get_xsave_size_user(void) { unsigned int eax, ebx, ecx, edx; /* * - CPUID function 0DH, sub-function 0: * EBX enumerates the size (in bytes) required by * the XSAVE instruction for an XSAVE area * containing all the *user* state components * corresponding to bits currently set in XCR0. */ cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx); return ebx; } static int __init init_xstate_size(void) { /* Recompute the context size for enabled features: */ unsigned int user_size, kernel_size, kernel_default_size; bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); /* Uncompacted user space size */ user_size = get_xsave_size_user(); /* * XSAVES kernel size includes supervisor states and uses compacted * format. XSAVEC uses compacted format, but does not save * supervisor states. * * XSAVE[OPT] do not support supervisor states so kernel and user * size is identical. */ if (compacted) kernel_size = get_xsave_compacted_size(); else kernel_size = user_size; kernel_default_size = xstate_calculate_size(fpu_kernel_cfg.default_features, compacted); if (!paranoid_xstate_size_valid(kernel_size)) return -EINVAL; fpu_kernel_cfg.max_size = kernel_size; fpu_user_cfg.max_size = user_size; fpu_kernel_cfg.default_size = kernel_default_size; fpu_user_cfg.default_size = xstate_calculate_size(fpu_user_cfg.default_features, false); guest_default_cfg.size = xstate_calculate_size(guest_default_cfg.features, compacted); return 0; } /* * We enabled the XSAVE hardware, but something went wrong and * we can not use it. Disable it. 
*/ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) { pr_info("x86/fpu: XSAVE disabled\n"); fpu_kernel_cfg.max_features = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); /* Restore the legacy size.*/ fpu_kernel_cfg.max_size = legacy_size; fpu_kernel_cfg.default_size = legacy_size; fpu_user_cfg.max_size = legacy_size; fpu_user_cfg.default_size = legacy_size; guest_default_cfg.size = legacy_size; /* * Prevent enabling the static branch which enables writes to the * XFD MSR. */ init_fpstate.xfd = 0; fpstate_reset(x86_task_fpu(current)); } static u64 __init host_default_mask(void) { /* * Exclude dynamic features (require userspace opt-in) and features * that are supported only for KVM guests. */ return ~((u64)XFEATURE_MASK_USER_DYNAMIC | XFEATURE_MASK_GUEST_SUPERVISOR); } static u64 __init guest_default_mask(void) { /* * Exclude dynamic features, which require userspace opt-in even * for KVM guests. */ return ~(u64)XFEATURE_MASK_USER_DYNAMIC; } /* * Enable and initialize the xsave feature. * Called once per system bootup. */ void __init fpu__init_system_xstate(unsigned int legacy_size) { unsigned int eax, ebx, ecx, edx; u64 xfeatures; int err; int i; if (!boot_cpu_has(X86_FEATURE_FPU)) { pr_info("x86/fpu: No FPU detected\n"); return; } if (!boot_cpu_has(X86_FEATURE_XSAVE)) { pr_info("x86/fpu: x87 FPU will use %s\n", boot_cpu_has(X86_FEATURE_FXSR) ? "FXSAVE" : "FSAVE"); return; } /* * Find user xstates supported by the processor. */ cpuid_count(CPUID_LEAF_XSTATE, 0, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features = eax + ((u64)edx << 32); /* * Find supervisor xstates supported by the processor. */ cpuid_count(CPUID_LEAF_XSTATE, 1, &eax, &ebx, &ecx, &edx); fpu_kernel_cfg.max_features |= ecx + ((u64)edx << 32); if ((fpu_kernel_cfg.max_features & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. Disable XSAVE and try to continue * booting without it. This is too early to BUG(). */ pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", fpu_kernel_cfg.max_features); goto out_disable; } if (fpu_kernel_cfg.max_features & XFEATURE_MASK_APX && fpu_kernel_cfg.max_features & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) { /* * This is a problematic CPU configuration where two * conflicting state components are both enumerated. */ pr_err("x86/fpu: Both APX/MPX present in the CPU's xstate features: 0x%llx.\n", fpu_kernel_cfg.max_features); goto out_disable; } fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features & XFEATURE_MASK_INDEPENDENT; /* * Clear XSAVE features that are disabled in the normal CPUID. */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { unsigned short cid = xsave_cpuid_features[i]; /* Careful: X86_FEATURE_FPU is 0! */ if ((i != XFEATURE_FP && !cid) || !boot_cpu_has(cid)) fpu_kernel_cfg.max_features &= ~BIT_ULL(i); } if (!cpu_feature_enabled(X86_FEATURE_XFD)) fpu_kernel_cfg.max_features &= ~XFEATURE_MASK_USER_DYNAMIC; if (!cpu_feature_enabled(X86_FEATURE_XSAVES)) fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; else fpu_kernel_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED | XFEATURE_MASK_SUPERVISOR_SUPPORTED; fpu_user_cfg.max_features = fpu_kernel_cfg.max_features; fpu_user_cfg.max_features &= XFEATURE_MASK_USER_SUPPORTED; /* * Now, given maximum feature set, determine default values by * applying default masks. 
*/ fpu_kernel_cfg.default_features = fpu_kernel_cfg.max_features & host_default_mask(); fpu_user_cfg.default_features = fpu_user_cfg.max_features & host_default_mask(); guest_default_cfg.features = fpu_kernel_cfg.max_features & guest_default_mask(); /* Store it for paranoia check at the end */ xfeatures = fpu_kernel_cfg.max_features; /* * Initialize the default XFD state in initfp_state and enable the * dynamic sizing mechanism if dynamic states are available. The * static key cannot be enabled here because this runs before * jump_label_init(). This is delayed to an initcall. */ init_fpstate.xfd = fpu_user_cfg.max_features & XFEATURE_MASK_USER_DYNAMIC; /* Set up compaction feature bit */ if (cpu_feature_enabled(X86_FEATURE_XSAVEC) || cpu_feature_enabled(X86_FEATURE_XSAVES)) setup_force_cpu_cap(X86_FEATURE_XCOMPACTED); /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); /* Cache size, offset and flags for initialization */ setup_xstate_cache(); err = init_xstate_size(); if (err) goto out_disable; /* * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ update_regset_xstate_info(fpu_user_cfg.max_size, fpu_user_cfg.max_features); /* * init_fpstate excludes dynamic states as they are large but init * state is zero. */ init_fpstate.size = fpu_kernel_cfg.default_size; init_fpstate.xfeatures = fpu_kernel_cfg.default_features; if (init_fpstate.size > sizeof(init_fpstate.regs)) { pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d)\n", sizeof(init_fpstate.regs), init_fpstate.size); goto out_disable; } setup_init_fpu_buf(); /* * Paranoia check whether something in the setup modified the * xfeatures mask. */ if (xfeatures != fpu_kernel_cfg.max_features) { pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init\n", xfeatures, fpu_kernel_cfg.max_features); goto out_disable; } /* * CPU capabilities initialization runs before FPU init. So * X86_FEATURE_OSXSAVE is not set. Now that XSAVE is completely * functional, set the feature bit so depending code works. */ setup_force_cpu_cap(X86_FEATURE_OSXSAVE); print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", fpu_kernel_cfg.max_features, fpu_kernel_cfg.max_size, boot_cpu_has(X86_FEATURE_XCOMPACTED) ? "compacted" : "standard"); return; out_disable: /* something went wrong, try to boot without any XSAVE support */ fpu__init_disable_system_xstate(legacy_size); } /* * Restore minimal FPU state after suspend: */ void fpu__resume_cpu(void) { /* * Restore XCR0 on xsave capable CPUs: */ if (cpu_feature_enabled(X86_FEATURE_XSAVE)) xsetbv(XCR_XFEATURE_ENABLED_MASK, fpu_user_cfg.max_features); /* * Restore IA32_XSS. The same CPUID bit enumerates support * of XSAVES and MSR_IA32_XSS. */ if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } if (fpu_state_size_dynamic()) wrmsrq(MSR_IA32_XFD, x86_task_fpu(current)->fpstate->xfd); } /* * Given an xstate feature nr, calculate where in the xsave * buffer the state is. Callers should ensure that the buffer * is valid. 
*/ static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { u64 xcomp_bv = xsave->header.xcomp_bv; if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; if (cpu_feature_enabled(X86_FEATURE_XCOMPACTED)) { if (WARN_ON_ONCE(!(xcomp_bv & BIT_ULL(xfeature_nr)))) return NULL; } return (void *)xsave + xfeature_get_offset(xcomp_bv, xfeature_nr); } /* * Given the xsave area and a state inside, this function returns the * address of the state. * * This is the API that is called to get xstate address in either * standard format or compacted format of xsave area. * * Note that if there is no data for the field in the xsave buffer * this will return NULL. * * Inputs: * xstate: the thread's storage area for all FPU data * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, * XFEATURE_SSE, etc...) * Output: * address of the state in the xsave area, or NULL if the * field is not present in the xsave buffer. */ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { /* * Do we even *have* xsave state? */ if (!boot_cpu_has(X86_FEATURE_XSAVE)) return NULL; /* * We should not ever be requesting features that we * have not enabled. */ if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; /* * This assumes the last 'xsave*' instruction to * have requested that 'xfeature_nr' be saved. * If it did not, we might be seeing and old value * of the field in the buffer. * * This can happen because the last 'xsave' did not * request that this feature be saved (unlikely) * or because the "init optimization" caused it * to not be saved. */ if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr))) return NULL; return __raw_xsave_addr(xsave, xfeature_nr); } EXPORT_SYMBOL_GPL(get_xsave_addr); /* * Given an xstate feature nr, calculate where in the xsave buffer the state is. * The xsave buffer should be in standard format, not compacted (e.g. user mode * signal frames). */ void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr) { if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr))) return NULL; return (void __user *)xsave + xstate_offsets[xfeature_nr]; } #ifdef CONFIG_ARCH_HAS_PKEYS /* * This will go out and modify PKRU register to set the access * rights for @pkey to @init_val. */ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val) { u32 old_pkru, new_pkru_bits = 0; int pkey_shift; /* * This check implies XSAVE support. OSPKE only gets * set if we enable XSAVE and we enable PKU in XCR0. */ if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) return -EINVAL; /* * This code should only be called with valid 'pkey' * values originating from in-kernel users. Complain * if a bad value is observed. */ if (WARN_ON_ONCE(pkey >= arch_max_pkey())) return -EINVAL; /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) new_pkru_bits |= PKRU_AD_BIT; if (init_val & PKEY_DISABLE_WRITE) new_pkru_bits |= PKRU_WD_BIT; /* Shift the bits in to the correct place in PKRU for pkey: */ pkey_shift = pkey * PKRU_BITS_PER_PKEY; new_pkru_bits <<= pkey_shift; /* Get old PKRU and mask off any old bits in place: */ old_pkru = read_pkru(); old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); /* Write old part along with new part: */ write_pkru(old_pkru | new_pkru_bits); return 0; } #endif /* ! CONFIG_ARCH_HAS_PKEYS */ static void copy_feature(bool from_xstate, struct membuf *to, void *xstate, void *init_xstate, unsigned int size) { membuf_write(to, from_xstate ? 
xstate : init_xstate, size); } /** * __copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor * @fpstate: The fpstate buffer from which to copy * @xfeatures: The mask of xfeatures to save (XSAVE mode only) * @pkru_val: The PKRU value to store in the PKRU component * @copy_mode: The requested copy mode * * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming * format, i.e. from the kernel internal hardware dependent storage format * to the requested @mode. UABI XSTATE is always uncompacted! * * It supports partial copy but @to.pos always starts from zero. */ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, u64 xfeatures, u32 pkru_val, enum xstate_copy_mode copy_mode) { const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); struct xregs_state *xinit = &init_fpstate.regs.xsave; struct xregs_state *xsave = &fpstate->regs.xsave; unsigned int zerofrom, i, xfeature; struct xstate_header header; u64 mask; memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; /* Mask out the feature bits depending on copy mode */ switch (copy_mode) { case XSTATE_COPY_FP: header.xfeatures &= XFEATURE_MASK_FP; break; case XSTATE_COPY_FX: header.xfeatures &= XFEATURE_MASK_FP | XFEATURE_MASK_SSE; break; case XSTATE_COPY_XSAVE: header.xfeatures &= fpstate->user_xfeatures & xfeatures; break; } /* Copy FP state up to MXCSR */ copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387, &xinit->i387, off_mxcsr); /* Copy MXCSR when SSE or YMM are set in the feature mask */ copy_feature(header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM), &to, &xsave->i387.mxcsr, &xinit->i387.mxcsr, MXCSR_AND_FLAGS_SIZE); /* Copy the remaining FP state */ copy_feature(header.xfeatures & XFEATURE_MASK_FP, &to, &xsave->i387.st_space, &xinit->i387.st_space, sizeof(xsave->i387.st_space)); /* Copy the SSE state - shared with YMM, but independently managed */ copy_feature(header.xfeatures & XFEATURE_MASK_SSE, &to, &xsave->i387.xmm_space, &xinit->i387.xmm_space, sizeof(xsave->i387.xmm_space)); if (copy_mode != XSTATE_COPY_XSAVE) goto out; /* Zero the padding area */ membuf_zero(&to, sizeof(xsave->i387.padding)); /* Copy xsave->i387.sw_reserved */ membuf_write(&to, xstate_fx_sw_bytes, sizeof(xsave->i387.sw_reserved)); /* Copy the user space relevant state of @xsave->header */ membuf_write(&to, &header, sizeof(header)); zerofrom = offsetof(struct xregs_state, extended_state_area); /* * This 'mask' indicates which states to copy from fpstate. * Those extended states that are not present in fpstate are * either disabled or initialized: * * In non-compacted format, disabled features still occupy * state space but there is no state to copy from in the * compacted init_fpstate. The gap tracking will zero these * states. * * The extended features have an all zeroes init state. Thus, * remove them from 'mask' to zero those features in the user * buffer instead of retrieving them from init_fpstate. */ mask = header.xfeatures; for_each_extended_xfeature_in_order(i, mask) { xfeature = xfeature_uncompact_order[i]; /* * If there was a feature or alignment gap, zero the space * in the destination buffer. */ if (zerofrom < xstate_offsets[xfeature]) membuf_zero(&to, xstate_offsets[xfeature] - zerofrom); if (xfeature == XFEATURE_PKRU) { struct pkru_state pkru = {0}; /* * PKRU is not necessarily up to date in the * XSAVE buffer. Use the provided value. 
*/ pkru.pkru = pkru_val; membuf_write(&to, &pkru, sizeof(pkru)); } else { membuf_write(&to, __raw_xsave_addr(xsave, xfeature), xstate_sizes[xfeature]); } /* * Keep track of the last copied state in the non-compacted * target buffer for gap zeroing. */ zerofrom = xstate_offsets[xfeature] + xstate_sizes[xfeature]; } out: if (to.left) membuf_zero(&to, to.left); } /** * copy_xstate_to_uabi_buf - Copy kernel saved xstate to a UABI buffer * @to: membuf descriptor * @tsk: The task from which to copy the saved xstate * @copy_mode: The requested copy mode * * Converts from kernel XSAVE or XSAVES compacted format to UABI conforming * format, i.e. from the kernel internal hardware dependent storage format * to the requested @mode. UABI XSTATE is always uncompacted! * * It supports partial copy but @to.pos always starts from zero. */ void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { __copy_xstate_to_uabi_buf(to, x86_task_fpu(tsk)->fpstate, x86_task_fpu(tsk)->fpstate->user_xfeatures, tsk->thread.pkru, copy_mode); } static int copy_from_buffer(void *dst, unsigned int offset, unsigned int size, const void *kbuf, const void __user *ubuf) { if (kbuf) { memcpy(dst, kbuf + offset, size); } else { if (copy_from_user(dst, ubuf + offset, size)) return -EFAULT; } return 0; } /** * copy_uabi_to_xstate - Copy a UABI format buffer to the kernel xstate * @fpstate: The fpstate buffer to copy to * @kbuf: The UABI format buffer, if it comes from the kernel * @ubuf: The UABI format buffer, if it comes from userspace * @pkru: The location to write the PKRU value to * * Converts from the UABI format into the kernel internal hardware * dependent format. * * This function ultimately has three different callers with distinct PKRU * behavior. * 1. When called from sigreturn the PKRU register will be restored from * @fpstate via an XRSTOR. Correctly copying the UABI format buffer to * @fpstate is sufficient to cover this case, but the caller will also * pass a pointer to the thread_struct's pkru field in @pkru and updating * it is harmless. * 2. When called from ptrace the PKRU register will be restored from the * thread_struct's pkru field. A pointer to that is passed in @pkru. * The kernel will restore it manually, so the XRSTOR behavior that resets * the PKRU register to the hardware init value (0) if the corresponding * xfeatures bit is not set is emulated here. * 3. When called from KVM the PKRU register will be restored from the vcpu's * pkru field. A pointer to that is passed in @pkru. KVM hasn't used * XRSTOR and hasn't had the PKRU resetting behavior described above. To * preserve that KVM behavior, it passes NULL for @pkru if the xfeatures * bit is not set. */ static int copy_uabi_to_xstate(struct fpstate *fpstate, const void *kbuf, const void __user *ubuf, u32 *pkru) { struct xregs_state *xsave = &fpstate->regs.xsave; unsigned int offset, size; struct xstate_header hdr; u64 mask; int i; offset = offsetof(struct xregs_state, header); if (copy_from_buffer(&hdr, offset, sizeof(hdr), kbuf, ubuf)) return -EFAULT; if (validate_user_xstate_header(&hdr, fpstate)) return -EINVAL; /* Validate MXCSR when any of the related features is in use */ mask = XFEATURE_MASK_FP | XFEATURE_MASK_SSE | XFEATURE_MASK_YMM; if (hdr.xfeatures & mask) { u32 mxcsr[2]; offset = offsetof(struct fxregs_state, mxcsr); if (copy_from_buffer(mxcsr, offset, sizeof(mxcsr), kbuf, ubuf)) return -EFAULT; /* Reserved bits in MXCSR must be zero. 
*/ if (mxcsr[0] & ~mxcsr_feature_mask) return -EINVAL; /* SSE and YMM require MXCSR even when FP is not in use. */ if (!(hdr.xfeatures & XFEATURE_MASK_FP)) { xsave->i387.mxcsr = mxcsr[0]; xsave->i387.mxcsr_mask = mxcsr[1]; } } for (i = 0; i < XFEATURE_MAX; i++) { mask = BIT_ULL(i); if (hdr.xfeatures & mask) { void *dst = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; if (copy_from_buffer(dst, offset, size, kbuf, ubuf)) return -EFAULT; } } if (hdr.xfeatures & XFEATURE_MASK_PKRU) { struct pkru_state *xpkru; xpkru = __raw_xsave_addr(xsave, XFEATURE_PKRU); *pkru = xpkru->pkru; } else { /* * KVM may pass NULL here to indicate that it does not need * PKRU updated. */ if (pkru) *pkru = 0; } /* * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': */ xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; /* * Add back in the features that came in from userspace: */ xsave->header.xfeatures |= hdr.xfeatures; return 0; } /* * Convert from a ptrace standard-format kernel buffer to kernel XSAVE[S] * format and copy to the target thread. Used by ptrace and KVM. */ int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u32 *pkru) { return copy_uabi_to_xstate(fpstate, kbuf, NULL, pkru); } /* * Convert from a sigreturn standard-format user-space buffer to kernel * XSAVE[S] format and copy to the target thread. This is called from the * sigreturn() and rt_sigreturn() system calls. */ int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf) { return copy_uabi_to_xstate(x86_task_fpu(tsk)->fpstate, NULL, ubuf, &tsk->thread.pkru); } static bool validate_independent_components(u64 mask) { u64 xchk; if (WARN_ON_FPU(!cpu_feature_enabled(X86_FEATURE_XSAVES))) return false; xchk = ~xfeatures_mask_independent(); if (WARN_ON_ONCE(!mask || mask & xchk)) return false; return true; } /** * xsaves - Save selected components to a kernel xstate buffer * @xstate: Pointer to the buffer * @mask: Feature mask to select the components to save * * The @xstate buffer must be 64 byte aligned and correctly initialized as * XSAVES does not write the full xstate header. Before first use the * buffer should be zeroed otherwise a consecutive XRSTORS from that buffer * can #GP. * * The feature mask must be a subset of the independent features. */ void xsaves(struct xregs_state *xstate, u64 mask) { int err; if (!validate_independent_components(mask)) return; XSTATE_OP(XSAVES, xstate, (u32)mask, (u32)(mask >> 32), err); WARN_ON_ONCE(err); } /** * xrstors - Restore selected components from a kernel xstate buffer * @xstate: Pointer to the buffer * @mask: Feature mask to select the components to restore * * The @xstate buffer must be 64 byte aligned and correctly initialized * otherwise XRSTORS from that buffer can #GP. * * Proper usage is to restore the state which was saved with * xsaves() into @xstate. * * The feature mask must be a subset of the independent features. 
*/ void xrstors(struct xregs_state *xstate, u64 mask) { int err; if (!validate_independent_components(mask)) return; XSTATE_OP(XRSTORS, xstate, (u32)mask, (u32)(mask >> 32), err); WARN_ON_ONCE(err); } #if IS_ENABLED(CONFIG_KVM) void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature) { void *addr = get_xsave_addr(&fpstate->regs.xsave, xfeature); if (addr) memset(addr, 0, xstate_sizes[xfeature]); } EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); #endif #ifdef CONFIG_X86_64 #ifdef CONFIG_X86_DEBUG_FPU /* * Ensure that a subsequent XSAVE* or XRSTOR* instruction with RFBM=@mask * can safely operate on the @fpstate buffer. */ static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) { u64 xfd = __this_cpu_read(xfd_state); if (fpstate->xfd == xfd) return true; /* * The XFD MSR does not match fpstate->xfd. That's invalid when * the passed in fpstate is current's fpstate. */ if (fpstate->xfd == x86_task_fpu(current)->fpstate->xfd) return false; /* * XRSTOR(S) from init_fpstate are always correct as it will just * bring all components into init state and not read from the * buffer. XSAVE(S) raises #PF after init. */ if (fpstate == &init_fpstate) return rstor; /* * XSAVE(S): clone(), fpu_swap_kvm_fpstate() * XRSTORS(S): fpu_swap_kvm_fpstate() */ /* * No XSAVE/XRSTOR instructions (except XSAVE itself) touch * the buffer area for XFD-disabled state components. */ mask &= ~xfd; /* * Remove features which are valid in fpstate. They * have space allocated in fpstate. */ mask &= ~fpstate->xfeatures; /* * Any remaining state components in 'mask' might be written * by XSAVE/XRSTOR. Fail validation it found. */ return !mask; } void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rstor) { WARN_ON_ONCE(!xstate_op_valid(fpstate, mask, rstor)); } #endif /* CONFIG_X86_DEBUG_FPU */ static int __init xfd_update_static_branch(void) { /* * If init_fpstate.xfd has bits set then dynamic features are * available and the dynamic sizing must be enabled. */ if (init_fpstate.xfd) static_branch_enable(&__fpu_state_size_dynamic); return 0; } arch_initcall(xfd_update_static_branch) void fpstate_free(struct fpu *fpu) { if (fpu->fpstate && fpu->fpstate != &fpu->__fpstate) vfree(fpu->fpstate); } /** * fpstate_realloc - Reallocate struct fpstate for the requested new features * * @xfeatures: A bitmap of xstate features which extend the enabled features * of that task * @ksize: The required size for the kernel buffer * @usize: The required size for user space buffers * @guest_fpu: Pointer to a guest FPU container. NULL for host allocations * * Note vs. vmalloc(): If the task with a vzalloc()-allocated buffer * terminates quickly, vfree()-induced IPIs may be a concern, but tasks * with large states are likely to live longer. * * Returns: 0 on success, -ENOMEM on allocation error. */ static int fpstate_realloc(u64 xfeatures, unsigned int ksize, unsigned int usize, struct fpu_guest *guest_fpu) { struct fpu *fpu = x86_task_fpu(current); struct fpstate *curfps, *newfps = NULL; unsigned int fpsize; bool in_use; fpsize = ksize + ALIGN(offsetof(struct fpstate, regs), 64); newfps = vzalloc(fpsize); if (!newfps) return -ENOMEM; newfps->size = ksize; newfps->user_size = usize; newfps->is_valloc = true; /* * When a guest FPU is supplied, use @guest_fpu->fpstate * as reference independent whether it is in use or not. */ curfps = guest_fpu ? 
guest_fpu->fpstate : fpu->fpstate; /* Determine whether @curfps is the active fpstate */ in_use = fpu->fpstate == curfps; if (guest_fpu) { newfps->is_guest = true; newfps->is_confidential = curfps->is_confidential; newfps->in_use = curfps->in_use; guest_fpu->xfeatures |= xfeatures; guest_fpu->uabi_size = usize; } fpregs_lock(); /* * If @curfps is in use, ensure that the current state is in the * registers before swapping fpstate as that might invalidate it * due to layout changes. */ if (in_use && test_thread_flag(TIF_NEED_FPU_LOAD)) fpregs_restore_userregs(); newfps->xfeatures = curfps->xfeatures | xfeatures; newfps->user_xfeatures = curfps->user_xfeatures | xfeatures; newfps->xfd = curfps->xfd & ~xfeatures; /* Do the final updates within the locked region */ xstate_init_xcomp_bv(&newfps->regs.xsave, newfps->xfeatures); if (guest_fpu) { guest_fpu->fpstate = newfps; /* If curfps is active, update the FPU fpstate pointer */ if (in_use) fpu->fpstate = newfps; } else { fpu->fpstate = newfps; } if (in_use) xfd_update_state(fpu->fpstate); fpregs_unlock(); /* Only free valloc'ed state */ if (curfps && curfps->is_valloc) vfree(curfps); return 0; } static int validate_sigaltstack(unsigned int usize) { struct task_struct *thread, *leader = current->group_leader; unsigned long framesize = get_sigframe_size(); lockdep_assert_held(&current->sighand->siglock); /* get_sigframe_size() is based on fpu_user_cfg.max_size */ framesize -= fpu_user_cfg.max_size; framesize += usize; for_each_thread(leader, thread) { if (thread->sas_ss_size && thread->sas_ss_size < framesize) return -ENOSPC; } return 0; } static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) { /* * This deliberately does not exclude !XSAVES as we still might * decide to optionally context switch XCR0 or talk the silicon * vendors into extending XFD for the pre AMX states, especially * AVX512. */ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); struct fpu *fpu = x86_task_fpu(current->group_leader); struct fpu_state_perm *perm; unsigned int ksize, usize; u64 mask; int ret = 0; /* Check whether fully enabled */ if ((permitted & requested) == requested) return 0; /* * Calculate the resulting kernel state size. Note, @permitted also * contains supervisor xfeatures even though supervisor are always * permitted for kernel and guest FPUs, and never permitted for user * FPUs. */ mask = permitted | requested; ksize = xstate_calculate_size(mask, compacted); /* * Calculate the resulting user state size. Take care not to clobber * the supervisor xfeatures in the new mask! */ usize = xstate_calculate_size(mask & XFEATURE_MASK_USER_SUPPORTED, false); if (!guest) { ret = validate_sigaltstack(usize); if (ret) return ret; } perm = guest ? &fpu->guest_perm : &fpu->perm; /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ WRITE_ONCE(perm->__state_perm, mask); /* Protected by sighand lock */ perm->__state_size = ksize; perm->__user_state_size = usize; return ret; } /* * Permissions array to map facilities with more than one component */ static const u64 xstate_prctl_req[XFEATURE_MAX] = { [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE_DATA, }; static int xstate_request_perm(unsigned long idx, bool guest) { u64 permitted, requested; int ret; if (idx >= XFEATURE_MAX) return -EINVAL; /* * Look up the facility mask which can require more than * one xstate component. 
*/ idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req)); requested = xstate_prctl_req[idx]; if (!requested) return -EOPNOTSUPP; if ((fpu_user_cfg.max_features & requested) != requested) return -EOPNOTSUPP; /* Lockless quick check */ permitted = xstate_get_group_perm(guest); if ((permitted & requested) == requested) return 0; /* Protect against concurrent modifications */ spin_lock_irq(&current->sighand->siglock); permitted = xstate_get_group_perm(guest); /* First vCPU allocation locks the permissions. */ if (guest && (permitted & FPU_GUEST_PERM_LOCKED)) ret = -EBUSY; else ret = __xstate_request_perm(permitted, requested, guest); spin_unlock_irq(&current->sighand->siglock); return ret; } int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) { u64 xfd_event = xfd_err & XFEATURE_MASK_USER_DYNAMIC; struct fpu_state_perm *perm; unsigned int ksize, usize; struct fpu *fpu; if (!xfd_event) { if (!guest_fpu) pr_err_once("XFD: Invalid xfd error: %016llx\n", xfd_err); return 0; } /* Protect against concurrent modifications */ spin_lock_irq(&current->sighand->siglock); /* If not permitted let it die */ if ((xstate_get_group_perm(!!guest_fpu) & xfd_event) != xfd_event) { spin_unlock_irq(&current->sighand->siglock); return -EPERM; } fpu = x86_task_fpu(current->group_leader); perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; ksize = perm->__state_size; usize = perm->__user_state_size; /* * The feature is permitted. State size is sufficient. Dropping * the lock is safe here even if more features are added from * another task, the retrieved buffer sizes are valid for the * currently requested feature(s). */ spin_unlock_irq(&current->sighand->siglock); /* * Try to allocate a new fpstate. If that fails there is no way * out. */ if (fpstate_realloc(xfd_event, ksize, usize, guest_fpu)) return -EFAULT; return 0; } int xfd_enable_feature(u64 xfd_err) { return __xfd_enable_feature(xfd_err, NULL); } #else /* CONFIG_X86_64 */ static inline int xstate_request_perm(unsigned long idx, bool guest) { return -EPERM; } #endif /* !CONFIG_X86_64 */ u64 xstate_get_guest_group_perm(void) { return xstate_get_group_perm(true); } EXPORT_SYMBOL_GPL(xstate_get_guest_group_perm); /** * fpu_xstate_prctl - xstate permission operations * @option: A subfunction of arch_prctl() * @arg2: option argument * Return: 0 if successful; otherwise, an error code * * Option arguments: * * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info * ARCH_REQ_XCOMP_PERM: Facility number requested * * For facilities which require more than one XSTATE component, the request * must be the highest state component number related to that facility, * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). */ long fpu_xstate_prctl(int option, unsigned long arg2) { u64 __user *uptr = (u64 __user *)arg2; u64 permitted, supported; unsigned long idx = arg2; bool guest = false; switch (option) { case ARCH_GET_XCOMP_SUPP: supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; return put_user(supported, uptr); case ARCH_GET_XCOMP_PERM: /* * Lockless snapshot as it can also change right after the * dropping the lock. 
*/ permitted = xstate_get_host_group_perm(); permitted &= XFEATURE_MASK_USER_SUPPORTED; return put_user(permitted, uptr); case ARCH_GET_XCOMP_GUEST_PERM: permitted = xstate_get_guest_group_perm(); permitted &= XFEATURE_MASK_USER_SUPPORTED; return put_user(permitted, uptr); case ARCH_REQ_XCOMP_GUEST_PERM: guest = true; fallthrough; case ARCH_REQ_XCOMP_PERM: if (!IS_ENABLED(CONFIG_X86_64)) return -EOPNOTSUPP; return xstate_request_perm(idx, guest); default: return -EINVAL; } } #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 * use in the task. Report -1 if no AVX-512 usage. */ static void avx512_status(struct seq_file *m, struct task_struct *task) { unsigned long timestamp; long delta = -1; /* AVX-512 usage is not tracked for kernel threads. Don't report anything. */ if (task->flags & (PF_KTHREAD | PF_USER_WORKER)) return; timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); if (timestamp) { delta = (long)(jiffies - timestamp); /* * Cap to LONG_MAX if time difference > LONG_MAX */ if (delta < 0) delta = LONG_MAX; delta = jiffies_to_msecs(delta); } seq_put_decimal_ll(m, "AVX512_elapsed_ms:\t", delta); seq_putc(m, '\n'); } /* * Report architecture specific information */ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { /* * Report AVX512 state if the processor and build option supported. */ if (cpu_feature_enabled(X86_FEATURE_AVX512F)) avx512_status(m, task); return 0; } #endif /* CONFIG_PROC_PID_ARCH_STATUS */ #ifdef CONFIG_COREDUMP static const char owner_name[] = "LINUX"; /* * Dump type, size, offset and flag values for every xfeature that is present. */ static int dump_xsave_layout_desc(struct coredump_params *cprm) { int num_records = 0; int i; for_each_extended_xfeature(i, fpu_user_cfg.max_features) { struct x86_xfeat_component xc = { .type = i, .size = xstate_sizes[i], .offset = xstate_offsets[i], /* reserved for future use */ .flags = 0, }; if (!dump_emit(cprm, &xc, sizeof(xc))) return 0; num_records++; } return num_records; } static u32 get_xsave_desc_size(void) { u32 cnt = 0; u32 i; for_each_extended_xfeature(i, fpu_user_cfg.max_features) cnt++; return cnt * (sizeof(struct x86_xfeat_component)); } int elf_coredump_extra_notes_write(struct coredump_params *cprm) { int num_records = 0; struct elf_note en; if (!fpu_user_cfg.max_features) return 0; en.n_namesz = sizeof(owner_name); en.n_descsz = get_xsave_desc_size(); en.n_type = NT_X86_XSAVE_LAYOUT; if (!dump_emit(cprm, &en, sizeof(en))) return 1; if (!dump_emit(cprm, owner_name, en.n_namesz)) return 1; if (!dump_align(cprm, 4)) return 1; num_records = dump_xsave_layout_desc(cprm); if (!num_records) return 1; /* Total size should be equal to the number of records */ if ((sizeof(struct x86_xfeat_component) * num_records) != en.n_descsz) return 1; return 0; } int elf_coredump_extra_notes_size(void) { int size; if (!fpu_user_cfg.max_features) return 0; /* .note header */ size = sizeof(struct elf_note); /* Name plus alignment to 4 bytes */ size += roundup(sizeof(owner_name), 4); size += get_xsave_desc_size(); return size; } #endif /* CONFIG_COREDUMP */
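The permission interface implemented by fpu_xstate_prctl() above is easiest to see from the calling side. The following userspace sketch is not part of the kernel file; it assumes the ARCH_* constants mirror arch/x86/include/uapi/asm/prctl.h and uses XFEATURE_XTILE_DATA (18), the AMX example given in the kernel-doc, as the requested facility.

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Assumed to match arch/x86/include/uapi/asm/prctl.h */
#define ARCH_GET_XCOMP_SUPP	0x1021
#define ARCH_GET_XCOMP_PERM	0x1022
#define ARCH_REQ_XCOMP_PERM	0x1023
/* Highest component of the AMX facility, per the kernel-doc above */
#define XFEATURE_XTILE_DATA	18

int main(void)
{
	uint64_t supp = 0, perm = 0;

	/* Which user xstate components can be requested at all? */
	if (syscall(SYS_arch_prctl, ARCH_GET_XCOMP_SUPP, &supp))
		perror("ARCH_GET_XCOMP_SUPP");

	/* Request AMX by naming its highest component number. */
	if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILE_DATA))
		perror("ARCH_REQ_XCOMP_PERM");

	/* Lockless snapshot of the group permission, as noted above. */
	if (syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &perm))
		perror("ARCH_GET_XCOMP_PERM");

	printf("supported: %#llx permitted: %#llx\n",
	       (unsigned long long)supp, (unsigned long long)perm);
	return 0;
}

On a kernel or CPU without AMX the request is expected to fail, matching the -EOPNOTSUPP paths in xstate_request_perm() above.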
// SPDX-License-Identifier: GPL-2.0-or-later
/* L2TPv3 ethernet pseudowire driver
 *
 * Copyright (c) 2008,2009,2010 Katalix Systems Ltd
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/hash.h>
#include <linux/l2tp.h>
#include <linux/in.h>
#include <linux/etherdevice.h>
#include <linux/spinlock.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/tcp_states.h>
#include <net/protocol.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/netdev_lock.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/udp.h>

#include "l2tp_core.h"

/* Default device name. May be overridden by name specified by user */
#define L2TP_ETH_DEV_NAME	"l2tpeth%d"

/* via netdev_priv() */
struct l2tp_eth {
	struct l2tp_session	*session;
};

/* via l2tp_session_priv() */
struct l2tp_eth_sess {
	struct net_device __rcu	*dev;
};

static int l2tp_eth_dev_init(struct net_device *dev)
{
	eth_hw_addr_random(dev);
	eth_broadcast_addr(dev->broadcast);
	netdev_lockdep_set_classes(dev);

	return 0;
}

static void l2tp_eth_dev_uninit(struct net_device *dev)
{
	struct l2tp_eth *priv = netdev_priv(dev);
	struct l2tp_eth_sess *spriv;

	spriv = l2tp_session_priv(priv->session);
	RCU_INIT_POINTER(spriv->dev, NULL);
	/* No need for synchronize_net() here. We're called by
	 * unregister_netdev*(), which does the synchronisation for us.
*/ } static netdev_tx_t l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev) { struct l2tp_eth *priv = netdev_priv(dev); struct l2tp_session *session = priv->session; unsigned int len = skb->len; int ret = l2tp_xmit_skb(session, skb); if (likely(ret == NET_XMIT_SUCCESS)) dev_dstats_tx_add(dev, len); else dev_dstats_tx_dropped(dev); return NETDEV_TX_OK; } static const struct net_device_ops l2tp_eth_netdev_ops = { .ndo_init = l2tp_eth_dev_init, .ndo_uninit = l2tp_eth_dev_uninit, .ndo_start_xmit = l2tp_eth_dev_xmit, .ndo_set_mac_address = eth_mac_addr, }; static const struct device_type l2tpeth_type = { .name = "l2tpeth", }; static void l2tp_eth_dev_setup(struct net_device *dev) { SET_NETDEV_DEVTYPE(dev, &l2tpeth_type); ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->lltx = true; dev->netdev_ops = &l2tp_eth_netdev_ops; dev->needs_free_netdev = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS; } static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len) { struct l2tp_eth_sess *spriv = l2tp_session_priv(session); struct net_device *dev; if (!pskb_may_pull(skb, ETH_HLEN)) goto error; secpath_reset(skb); /* checksums verified by L2TP */ skb->ip_summed = CHECKSUM_NONE; /* drop outer flow-hash */ skb_clear_hash(skb); skb_dst_drop(skb); nf_reset_ct(skb); rcu_read_lock(); dev = rcu_dereference(spriv->dev); if (!dev) goto error_rcu; if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) dev_dstats_rx_add(dev, data_len); else DEV_STATS_INC(dev, rx_errors); rcu_read_unlock(); return; error_rcu: rcu_read_unlock(); error: kfree_skb(skb); } static void l2tp_eth_delete(struct l2tp_session *session) { struct l2tp_eth_sess *spriv; struct net_device *dev; if (session) { spriv = l2tp_session_priv(session); rtnl_lock(); dev = rtnl_dereference(spriv->dev); if (dev) { unregister_netdevice(dev); rtnl_unlock(); module_put(THIS_MODULE); } else { rtnl_unlock(); } } } static void l2tp_eth_show(struct seq_file *m, void *arg) { struct l2tp_session *session = arg; struct l2tp_eth_sess *spriv = l2tp_session_priv(session); struct net_device *dev; rcu_read_lock(); dev = rcu_dereference(spriv->dev); if (!dev) { rcu_read_unlock(); return; } dev_hold(dev); rcu_read_unlock(); seq_printf(m, " interface %s\n", dev->name); dev_put(dev); } static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel, struct l2tp_session *session, struct net_device *dev) { unsigned int overhead = 0; u32 l3_overhead = 0; u32 mtu; /* if the encap is UDP, account for UDP header size */ if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { overhead += sizeof(struct udphdr); dev->needed_headroom += sizeof(struct udphdr); } lock_sock(tunnel->sock); l3_overhead = kernel_sock_ip_overhead(tunnel->sock); release_sock(tunnel->sock); if (l3_overhead == 0) { /* L3 Overhead couldn't be identified, this could be * because tunnel->sock was NULL or the socket's * address family was not IPv4 or IPv6, * dev mtu stays at 1500. */ return; } /* Adjust MTU, factor overhead - underlay L3, overlay L2 hdr * UDP overhead, if any, was already factored in above. 
*/ overhead += session->hdr_len + ETH_HLEN + l3_overhead; mtu = l2tp_tunnel_dst_mtu(tunnel) - overhead; if (mtu < dev->min_mtu || mtu > dev->max_mtu) dev->mtu = ETH_DATA_LEN - overhead; else dev->mtu = mtu; dev->needed_headroom += session->hdr_len; } static int l2tp_eth_create(struct net *net, struct l2tp_tunnel *tunnel, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) { unsigned char name_assign_type; struct net_device *dev; char name[IFNAMSIZ]; struct l2tp_session *session; struct l2tp_eth *priv; struct l2tp_eth_sess *spriv; int rc; if (cfg->ifname) { strscpy(name, cfg->ifname, IFNAMSIZ); name_assign_type = NET_NAME_USER; } else { strcpy(name, L2TP_ETH_DEV_NAME); name_assign_type = NET_NAME_ENUM; } session = l2tp_session_create(sizeof(*spriv), tunnel, session_id, peer_session_id, cfg); if (IS_ERR(session)) { rc = PTR_ERR(session); goto err; } dev = alloc_netdev(sizeof(*priv), name, name_assign_type, l2tp_eth_dev_setup); if (!dev) { rc = -ENOMEM; goto err_sess; } dev_net_set(dev, net); dev->min_mtu = 0; dev->max_mtu = ETH_MAX_MTU; l2tp_eth_adjust_mtu(tunnel, session, dev); priv = netdev_priv(dev); priv->session = session; session->recv_skb = l2tp_eth_dev_recv; session->session_close = l2tp_eth_delete; if (IS_ENABLED(CONFIG_L2TP_DEBUGFS)) session->show = l2tp_eth_show; spriv = l2tp_session_priv(session); refcount_inc(&session->ref_count); rtnl_lock(); /* Register both device and session while holding the rtnl lock. This * ensures that l2tp_eth_delete() will see that there's a device to * unregister, even if it happened to run before we assign spriv->dev. */ rc = l2tp_session_register(session, tunnel); if (rc < 0) { rtnl_unlock(); goto err_sess_dev; } rc = register_netdevice(dev); if (rc < 0) { rtnl_unlock(); l2tp_session_delete(session); l2tp_session_put(session); free_netdev(dev); return rc; } strscpy(session->ifname, dev->name, IFNAMSIZ); rcu_assign_pointer(spriv->dev, dev); rtnl_unlock(); l2tp_session_put(session); __module_get(THIS_MODULE); return 0; err_sess_dev: l2tp_session_put(session); free_netdev(dev); err_sess: l2tp_session_put(session); err: return rc; } static const struct l2tp_nl_cmd_ops l2tp_eth_nl_cmd_ops = { .session_create = l2tp_eth_create, .session_delete = l2tp_session_delete, }; static int __init l2tp_eth_init(void) { int err = 0; err = l2tp_nl_register_ops(L2TP_PWTYPE_ETH, &l2tp_eth_nl_cmd_ops); if (err) goto err; pr_info("L2TP ethernet pseudowire support (L2TPv3)\n"); return 0; err: return err; } static void __exit l2tp_eth_exit(void) { l2tp_nl_unregister_ops(L2TP_PWTYPE_ETH); } module_init(l2tp_eth_init); module_exit(l2tp_eth_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); MODULE_DESCRIPTION("L2TP ethernet pseudowire driver"); MODULE_VERSION("1.0"); MODULE_ALIAS_L2TP_PWTYPE(5);
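l2tp_eth_adjust_mtu() above shrinks the l2tpeth MTU by the full encapsulation overhead: optional UDP header, L2TP session header, inner Ethernet header and underlay L3 header. Below is a standalone userspace sketch of that arithmetic with purely illustrative values; the 10-byte session header is an assumption, since real lengths depend on cookie and L2-specific sublayer configuration.

#include <stdio.h>

#define ETH_HLEN	14	/* inner Ethernet header added by the pseudowire */

/* Mirrors the overhead sum in l2tp_eth_adjust_mtu(); not kernel code. */
static unsigned int l2tpeth_mtu(unsigned int underlay_mtu,
				unsigned int l3_overhead,  /* 20 IPv4, 40 IPv6 */
				unsigned int udp_overhead, /* 8 for UDP encap, else 0 */
				unsigned int session_hdr_len)
{
	unsigned int overhead = udp_overhead + session_hdr_len +
				ETH_HLEN + l3_overhead;

	return underlay_mtu - overhead;
}

int main(void)
{
	/* Assumed example: IPv4/UDP encapsulation, 10-byte session header. */
	printf("l2tpeth MTU: %u\n", l2tpeth_mtu(1500, 20, 8, 10));
	return 0;
}

With those assumptions the device MTU comes out at 1448; the kernel code above falls back to ETH_DATA_LEN minus the overhead when the computed value lands outside [min_mtu, max_mtu].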
// SPDX-License-Identifier: GPL-2.0 /* * bus.c - bus driver management * * Copyright (c) 2002-3 Patrick Mochel * Copyright (c) 2002-3 Open Source Development Labs * Copyright (c) 2007 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2007 Novell Inc.
* Copyright (c) 2023 Greg Kroah-Hartman <gregkh@linuxfoundation.org> */ #include <linux/async.h> #include <linux/device/bus.h> #include <linux/device.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/string.h> #include <linux/mutex.h> #include <linux/sysfs.h> #include "base.h" #include "power/power.h" /* /sys/devices/system */ static struct kset *system_kset; /* /sys/bus */ static struct kset *bus_kset; #define to_bus_attr(_attr) container_of(_attr, struct bus_attribute, attr) /* * sysfs bindings for drivers */ #define to_drv_attr(_attr) container_of(_attr, struct driver_attribute, attr) #define DRIVER_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \ struct driver_attribute driver_attr_##_name = \ __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) static int __must_check bus_rescan_devices_helper(struct device *dev, void *data); /** * bus_to_subsys - Turn a struct bus_type into a struct subsys_private * * @bus: pointer to the struct bus_type to look up * * The driver core internals needs to work on the subsys_private structure, not * the external struct bus_type pointer. This function walks the list of * registered busses in the system and finds the matching one and returns the * internal struct subsys_private that relates to that bus. * * Note, the reference count of the return value is INCREMENTED if it is not * NULL. A call to subsys_put() must be done when finished with the pointer in * order for it to be properly freed. */ struct subsys_private *bus_to_subsys(const struct bus_type *bus) { struct subsys_private *sp = NULL; struct kobject *kobj; if (!bus || !bus_kset) return NULL; spin_lock(&bus_kset->list_lock); if (list_empty(&bus_kset->list)) goto done; list_for_each_entry(kobj, &bus_kset->list, entry) { struct kset *kset = container_of(kobj, struct kset, kobj); sp = container_of_const(kset, struct subsys_private, subsys); if (sp->bus == bus) goto done; } sp = NULL; done: sp = subsys_get(sp); spin_unlock(&bus_kset->list_lock); return sp; } static const struct bus_type *bus_get(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); if (sp) return bus; return NULL; } static void bus_put(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); /* two puts are required as the call to bus_to_subsys incremented it again */ subsys_put(sp); subsys_put(sp); } static ssize_t drv_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct driver_attribute *drv_attr = to_drv_attr(attr); struct driver_private *drv_priv = to_driver(kobj); ssize_t ret = -EIO; if (drv_attr->show) ret = drv_attr->show(drv_priv->driver, buf); return ret; } static ssize_t drv_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct driver_attribute *drv_attr = to_drv_attr(attr); struct driver_private *drv_priv = to_driver(kobj); ssize_t ret = -EIO; if (drv_attr->store) ret = drv_attr->store(drv_priv->driver, buf, count); return ret; } static const struct sysfs_ops driver_sysfs_ops = { .show = drv_attr_show, .store = drv_attr_store, }; static void driver_release(struct kobject *kobj) { struct driver_private *drv_priv = to_driver(kobj); pr_debug("driver: '%s': %s\n", kobject_name(kobj), __func__); kfree(drv_priv); } static const struct kobj_type driver_ktype = { .sysfs_ops = &driver_sysfs_ops, .release = driver_release, }; /* * sysfs bindings for buses */ static ssize_t bus_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct 
bus_attribute *bus_attr = to_bus_attr(attr); struct subsys_private *subsys_priv = to_subsys_private(kobj); /* return -EIO for reading a bus attribute without show() */ ssize_t ret = -EIO; if (bus_attr->show) ret = bus_attr->show(subsys_priv->bus, buf); return ret; } static ssize_t bus_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct bus_attribute *bus_attr = to_bus_attr(attr); struct subsys_private *subsys_priv = to_subsys_private(kobj); /* return -EIO for writing a bus attribute without store() */ ssize_t ret = -EIO; if (bus_attr->store) ret = bus_attr->store(subsys_priv->bus, buf, count); return ret; } static const struct sysfs_ops bus_sysfs_ops = { .show = bus_attr_show, .store = bus_attr_store, }; int bus_create_file(const struct bus_type *bus, struct bus_attribute *attr) { struct subsys_private *sp = bus_to_subsys(bus); int error; if (!sp) return -EINVAL; error = sysfs_create_file(&sp->subsys.kobj, &attr->attr); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(bus_create_file); void bus_remove_file(const struct bus_type *bus, struct bus_attribute *attr) { struct subsys_private *sp = bus_to_subsys(bus); if (!sp) return; sysfs_remove_file(&sp->subsys.kobj, &attr->attr); subsys_put(sp); } EXPORT_SYMBOL_GPL(bus_remove_file); static void bus_release(struct kobject *kobj) { struct subsys_private *priv = to_subsys_private(kobj); lockdep_unregister_key(&priv->lock_key); kfree(priv); } static const struct kobj_type bus_ktype = { .sysfs_ops = &bus_sysfs_ops, .release = bus_release, }; static int bus_uevent_filter(const struct kobject *kobj) { const struct kobj_type *ktype = get_ktype(kobj); if (ktype == &bus_ktype) return 1; return 0; } static const struct kset_uevent_ops bus_uevent_ops = { .filter = bus_uevent_filter, }; /* Manually detach a device from its associated driver. */ static ssize_t unbind_store(struct device_driver *drv, const char *buf, size_t count) { const struct bus_type *bus = bus_get(drv->bus); struct device *dev; int err = -ENODEV; dev = bus_find_device_by_name(bus, NULL, buf); if (dev && dev->driver == drv) { device_driver_detach(dev); err = count; } put_device(dev); bus_put(bus); return err; } static DRIVER_ATTR_IGNORE_LOCKDEP(unbind, 0200, NULL, unbind_store); /* * Manually attach a device to a driver. * Note: the driver must want to bind to the device, * it is not possible to override the driver's id table. 
*/ static ssize_t bind_store(struct device_driver *drv, const char *buf, size_t count) { const struct bus_type *bus = bus_get(drv->bus); struct device *dev; int err = -ENODEV; dev = bus_find_device_by_name(bus, NULL, buf); if (dev && driver_match_device(drv, dev)) { err = device_driver_attach(drv, dev); if (!err) { /* success */ err = count; } } put_device(dev); bus_put(bus); return err; } static DRIVER_ATTR_IGNORE_LOCKDEP(bind, 0200, NULL, bind_store); static ssize_t drivers_autoprobe_show(const struct bus_type *bus, char *buf) { struct subsys_private *sp = bus_to_subsys(bus); int ret; if (!sp) return -EINVAL; ret = sysfs_emit(buf, "%d\n", sp->drivers_autoprobe); subsys_put(sp); return ret; } static ssize_t drivers_autoprobe_store(const struct bus_type *bus, const char *buf, size_t count) { struct subsys_private *sp = bus_to_subsys(bus); if (!sp) return -EINVAL; if (buf[0] == '0') sp->drivers_autoprobe = 0; else sp->drivers_autoprobe = 1; subsys_put(sp); return count; } static ssize_t drivers_probe_store(const struct bus_type *bus, const char *buf, size_t count) { struct device *dev; int err = -EINVAL; dev = bus_find_device_by_name(bus, NULL, buf); if (!dev) return -ENODEV; if (bus_rescan_devices_helper(dev, NULL) == 0) err = count; put_device(dev); return err; } static struct device *next_device(struct klist_iter *i) { struct klist_node *n = klist_next(i); struct device *dev = NULL; struct device_private *dev_prv; if (n) { dev_prv = to_device_private_bus(n); dev = dev_prv->device; } return dev; } /** * bus_for_each_dev - device iterator. * @bus: bus type. * @start: device to start iterating from. * @data: data for the callback. * @fn: function to be called for each device. * * Iterate over @bus's list of devices, and call @fn for each, * passing it @data. If @start is not NULL, we use that device to * begin iterating from. * * We check the return of @fn each time. If it returns anything * other than 0, we break out and return that value. * * NOTE: The device that returns a non-zero value is not retained * in any way, nor is its refcount incremented. If the caller needs * to retain this data, it should do so, and increment the reference * count in the supplied callback. */ int bus_for_each_dev(const struct bus_type *bus, struct device *start, void *data, device_iter_t fn) { struct subsys_private *sp = bus_to_subsys(bus); struct klist_iter i; struct device *dev; int error = 0; if (!sp) return -EINVAL; klist_iter_init_node(&sp->klist_devices, &i, (start ? &start->p->knode_bus : NULL)); while (!error && (dev = next_device(&i))) error = fn(dev, data); klist_iter_exit(&i); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(bus_for_each_dev); /** * bus_find_device - device iterator for locating a particular device. * @bus: bus type * @start: Device to begin with * @data: Data to pass to match function * @match: Callback function to check device * * This is similar to the bus_for_each_dev() function above, but it * returns a reference to a device that is 'found' for later use, as * determined by the @match callback. * * The callback should return 0 if the device doesn't match and non-zero * if it does. If the callback returns non-zero, this function will * return to the caller and not iterate over any more devices. 
*/ struct device *bus_find_device(const struct bus_type *bus, struct device *start, const void *data, device_match_t match) { struct subsys_private *sp = bus_to_subsys(bus); struct klist_iter i; struct device *dev; if (!sp) return NULL; klist_iter_init_node(&sp->klist_devices, &i, (start ? &start->p->knode_bus : NULL)); while ((dev = next_device(&i))) { if (match(dev, data)) { get_device(dev); break; } } klist_iter_exit(&i); subsys_put(sp); return dev; } EXPORT_SYMBOL_GPL(bus_find_device); static struct device_driver *next_driver(struct klist_iter *i) { struct klist_node *n = klist_next(i); struct driver_private *drv_priv; if (n) { drv_priv = container_of(n, struct driver_private, knode_bus); return drv_priv->driver; } return NULL; } /** * bus_for_each_drv - driver iterator * @bus: bus we're dealing with. * @start: driver to start iterating on. * @data: data to pass to the callback. * @fn: function to call for each driver. * * This is nearly identical to the device iterator above. * We iterate over each driver that belongs to @bus, and call * @fn for each. If @fn returns anything but 0, we break out * and return it. If @start is not NULL, we use it as the head * of the list. * * NOTE: we don't return the driver that returns a non-zero * value, nor do we leave the reference count incremented for that * driver. If the caller needs to know that info, it must set it * in the callback. It must also be sure to increment the refcount * so it doesn't disappear before returning to the caller. */ int bus_for_each_drv(const struct bus_type *bus, struct device_driver *start, void *data, int (*fn)(struct device_driver *, void *)) { struct subsys_private *sp = bus_to_subsys(bus); struct klist_iter i; struct device_driver *drv; int error = 0; if (!sp) return -EINVAL; klist_iter_init_node(&sp->klist_drivers, &i, start ? &start->p->knode_bus : NULL); while ((drv = next_driver(&i)) && !error) error = fn(drv, data); klist_iter_exit(&i); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(bus_for_each_drv); /** * bus_add_device - add device to bus * @dev: device being added * * - Add device's bus attributes. * - Create links to device's bus. * - Add the device to its bus's list of devices. */ int bus_add_device(struct device *dev) { struct subsys_private *sp = bus_to_subsys(dev->bus); int error; if (!sp) { /* * This is a normal operation for many devices that do not * have a bus assigned to them, just say that all went * well. */ return 0; } /* * Reference in sp is now incremented and will be dropped when * the device is removed from the bus */ pr_debug("bus: '%s': add device %s\n", sp->bus->name, dev_name(dev)); error = device_add_groups(dev, sp->bus->dev_groups); if (error) goto out_put; error = sysfs_create_link(&sp->devices_kset->kobj, &dev->kobj, dev_name(dev)); if (error) goto out_groups; error = sysfs_create_link(&dev->kobj, &sp->subsys.kobj, "subsystem"); if (error) goto out_subsys; klist_add_tail(&dev->p->knode_bus, &sp->klist_devices); return 0; out_subsys: sysfs_remove_link(&sp->devices_kset->kobj, dev_name(dev)); out_groups: device_remove_groups(dev, sp->bus->dev_groups); out_put: subsys_put(sp); return error; } /** * bus_probe_device - probe drivers for a new device * @dev: device to probe * * - Automatically probe for a driver if the bus allows it. 
*/ void bus_probe_device(struct device *dev) { struct subsys_private *sp = bus_to_subsys(dev->bus); struct subsys_interface *sif; if (!sp) return; if (sp->drivers_autoprobe) device_initial_probe(dev); mutex_lock(&sp->mutex); list_for_each_entry(sif, &sp->interfaces, node) if (sif->add_dev) sif->add_dev(dev, sif); mutex_unlock(&sp->mutex); subsys_put(sp); } /** * bus_remove_device - remove device from bus * @dev: device to be removed * * - Remove device from all interfaces. * - Remove symlink from bus' directory. * - Delete device from bus's list. * - Detach from its driver. * - Drop reference taken in bus_add_device(). */ void bus_remove_device(struct device *dev) { struct subsys_private *sp = bus_to_subsys(dev->bus); struct subsys_interface *sif; if (!sp) return; mutex_lock(&sp->mutex); list_for_each_entry(sif, &sp->interfaces, node) if (sif->remove_dev) sif->remove_dev(dev, sif); mutex_unlock(&sp->mutex); sysfs_remove_link(&dev->kobj, "subsystem"); sysfs_remove_link(&sp->devices_kset->kobj, dev_name(dev)); device_remove_groups(dev, dev->bus->dev_groups); if (klist_node_attached(&dev->p->knode_bus)) klist_del(&dev->p->knode_bus); pr_debug("bus: '%s': remove device %s\n", dev->bus->name, dev_name(dev)); device_release_driver(dev); /* * Decrement the reference count twice, once for the bus_to_subsys() * call in the start of this function, and the second one from the * reference increment in bus_add_device() */ subsys_put(sp); subsys_put(sp); } static int __must_check add_bind_files(struct device_driver *drv) { int ret; ret = driver_create_file(drv, &driver_attr_unbind); if (ret == 0) { ret = driver_create_file(drv, &driver_attr_bind); if (ret) driver_remove_file(drv, &driver_attr_unbind); } return ret; } static void remove_bind_files(struct device_driver *drv) { driver_remove_file(drv, &driver_attr_bind); driver_remove_file(drv, &driver_attr_unbind); } static BUS_ATTR_WO(drivers_probe); static BUS_ATTR_RW(drivers_autoprobe); static int add_probe_files(const struct bus_type *bus) { int retval; retval = bus_create_file(bus, &bus_attr_drivers_probe); if (retval) goto out; retval = bus_create_file(bus, &bus_attr_drivers_autoprobe); if (retval) bus_remove_file(bus, &bus_attr_drivers_probe); out: return retval; } static void remove_probe_files(const struct bus_type *bus) { bus_remove_file(bus, &bus_attr_drivers_autoprobe); bus_remove_file(bus, &bus_attr_drivers_probe); } static ssize_t uevent_store(struct device_driver *drv, const char *buf, size_t count) { int rc; rc = kobject_synth_uevent(&drv->p->kobj, buf, count); return rc ? rc : count; } static DRIVER_ATTR_WO(uevent); /** * bus_add_driver - Add a driver to the bus. * @drv: driver. 
*/ int bus_add_driver(struct device_driver *drv) { struct subsys_private *sp = bus_to_subsys(drv->bus); struct driver_private *priv; int error = 0; if (!sp) return -EINVAL; /* * Reference in sp is now incremented and will be dropped when * the driver is removed from the bus */ pr_debug("bus: '%s': add driver %s\n", sp->bus->name, drv->name); priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) { error = -ENOMEM; goto out_put_bus; } klist_init(&priv->klist_devices, NULL, NULL); priv->driver = drv; drv->p = priv; priv->kobj.kset = sp->drivers_kset; error = kobject_init_and_add(&priv->kobj, &driver_ktype, NULL, "%s", drv->name); if (error) goto out_unregister; klist_add_tail(&priv->knode_bus, &sp->klist_drivers); if (sp->drivers_autoprobe) { error = driver_attach(drv); if (error) goto out_del_list; } error = module_add_driver(drv->owner, drv); if (error) { printk(KERN_ERR "%s: failed to create module links for %s\n", __func__, drv->name); goto out_detach; } error = driver_create_file(drv, &driver_attr_uevent); if (error) { printk(KERN_ERR "%s: uevent attr (%s) failed\n", __func__, drv->name); } error = driver_add_groups(drv, sp->bus->drv_groups); if (error) { /* How the hell do we get out of this pickle? Give up */ printk(KERN_ERR "%s: driver_add_groups(%s) failed\n", __func__, drv->name); } if (!drv->suppress_bind_attrs) { error = add_bind_files(drv); if (error) { /* Ditto */ printk(KERN_ERR "%s: add_bind_files(%s) failed\n", __func__, drv->name); } } return 0; out_detach: driver_detach(drv); out_del_list: klist_del(&priv->knode_bus); out_unregister: kobject_put(&priv->kobj); /* drv->p is freed in driver_release() */ drv->p = NULL; out_put_bus: subsys_put(sp); return error; } /** * bus_remove_driver - delete driver from bus's knowledge. * @drv: driver. * * Detach the driver from the devices it controls, and remove * it from its bus's list of drivers. Finally, we drop the reference * to the bus we took in bus_add_driver(). */ void bus_remove_driver(struct device_driver *drv) { struct subsys_private *sp = bus_to_subsys(drv->bus); if (!sp) return; pr_debug("bus: '%s': remove driver %s\n", sp->bus->name, drv->name); if (!drv->suppress_bind_attrs) remove_bind_files(drv); driver_remove_groups(drv, sp->bus->drv_groups); driver_remove_file(drv, &driver_attr_uevent); klist_remove(&drv->p->knode_bus); driver_detach(drv); module_remove_driver(drv); kobject_put(&drv->p->kobj); /* * Decrement the reference count twice, once for the bus_to_subsys() * call in the start of this function, and the second one from the * reference increment in bus_add_driver() */ subsys_put(sp); subsys_put(sp); } /* Helper for bus_rescan_devices's iter */ static int __must_check bus_rescan_devices_helper(struct device *dev, void *data) { int ret = 0; if (!dev->driver) { if (dev->parent && dev->bus->need_parent_lock) device_lock(dev->parent); ret = device_attach(dev); if (dev->parent && dev->bus->need_parent_lock) device_unlock(dev->parent); } return ret < 0 ? ret : 0; } /** * bus_rescan_devices - rescan devices on the bus for possible drivers * @bus: the bus to scan. * * This function will look for devices on the bus with no driver * attached and rescan it against existing drivers to see if it matches * any by calling device_attach() for the unbound devices. 
*/ int bus_rescan_devices(const struct bus_type *bus) { return bus_for_each_dev(bus, NULL, NULL, bus_rescan_devices_helper); } EXPORT_SYMBOL_GPL(bus_rescan_devices); /** * device_reprobe - remove driver for a device and probe for a new driver * @dev: the device to reprobe * * This function detaches the attached driver (if any) for the given * device and restarts the driver probing process. It is intended * to use if probing criteria changed during a devices lifetime and * driver attachment should change accordingly. */ int device_reprobe(struct device *dev) { if (dev->driver) device_driver_detach(dev); return bus_rescan_devices_helper(dev, NULL); } EXPORT_SYMBOL_GPL(device_reprobe); static void klist_devices_get(struct klist_node *n) { struct device_private *dev_prv = to_device_private_bus(n); struct device *dev = dev_prv->device; get_device(dev); } static void klist_devices_put(struct klist_node *n) { struct device_private *dev_prv = to_device_private_bus(n); struct device *dev = dev_prv->device; put_device(dev); } static ssize_t bus_uevent_store(const struct bus_type *bus, const char *buf, size_t count) { struct subsys_private *sp = bus_to_subsys(bus); int ret; if (!sp) return -EINVAL; ret = kobject_synth_uevent(&sp->subsys.kobj, buf, count); subsys_put(sp); if (ret) return ret; return count; } /* * "open code" the old BUS_ATTR() macro here. We want to use BUS_ATTR_WO() * here, but can not use it as earlier in the file we have * DEVICE_ATTR_WO(uevent), which would cause a clash with the with the store * function name. */ static struct bus_attribute bus_attr_uevent = __ATTR(uevent, 0200, NULL, bus_uevent_store); /** * bus_register - register a driver-core subsystem * @bus: bus to register * * Once we have that, we register the bus with the kobject * infrastructure, then register the children subsystems it has: * the devices and drivers that belong to the subsystem. 
*/ int bus_register(const struct bus_type *bus) { int retval; struct subsys_private *priv; struct kobject *bus_kobj; struct lock_class_key *key; priv = kzalloc(sizeof(struct subsys_private), GFP_KERNEL); if (!priv) return -ENOMEM; priv->bus = bus; BLOCKING_INIT_NOTIFIER_HEAD(&priv->bus_notifier); bus_kobj = &priv->subsys.kobj; retval = kobject_set_name(bus_kobj, "%s", bus->name); if (retval) goto out; bus_kobj->kset = bus_kset; bus_kobj->ktype = &bus_ktype; priv->drivers_autoprobe = 1; retval = kset_register(&priv->subsys); if (retval) goto out; retval = bus_create_file(bus, &bus_attr_uevent); if (retval) goto bus_uevent_fail; priv->devices_kset = kset_create_and_add("devices", NULL, bus_kobj); if (!priv->devices_kset) { retval = -ENOMEM; goto bus_devices_fail; } priv->drivers_kset = kset_create_and_add("drivers", NULL, bus_kobj); if (!priv->drivers_kset) { retval = -ENOMEM; goto bus_drivers_fail; } INIT_LIST_HEAD(&priv->interfaces); key = &priv->lock_key; lockdep_register_key(key); __mutex_init(&priv->mutex, "subsys mutex", key); klist_init(&priv->klist_devices, klist_devices_get, klist_devices_put); klist_init(&priv->klist_drivers, NULL, NULL); retval = add_probe_files(bus); if (retval) goto bus_probe_files_fail; retval = sysfs_create_groups(bus_kobj, bus->bus_groups); if (retval) goto bus_groups_fail; pr_debug("bus: '%s': registered\n", bus->name); return 0; bus_groups_fail: remove_probe_files(bus); bus_probe_files_fail: kset_unregister(priv->drivers_kset); bus_drivers_fail: kset_unregister(priv->devices_kset); bus_devices_fail: bus_remove_file(bus, &bus_attr_uevent); bus_uevent_fail: kset_unregister(&priv->subsys); /* Above kset_unregister() will kfree @priv */ priv = NULL; out: kfree(priv); return retval; } EXPORT_SYMBOL_GPL(bus_register); /** * bus_unregister - remove a bus from the system * @bus: bus. * * Unregister the child subsystems and the bus itself. 
* Finally, we call bus_put() to release the refcount */ void bus_unregister(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct kobject *bus_kobj; if (!sp) return; pr_debug("bus: '%s': unregistering\n", bus->name); if (sp->dev_root) device_unregister(sp->dev_root); bus_kobj = &sp->subsys.kobj; sysfs_remove_groups(bus_kobj, bus->bus_groups); remove_probe_files(bus); bus_remove_file(bus, &bus_attr_uevent); kset_unregister(sp->drivers_kset); kset_unregister(sp->devices_kset); kset_unregister(&sp->subsys); subsys_put(sp); } EXPORT_SYMBOL_GPL(bus_unregister); int bus_register_notifier(const struct bus_type *bus, struct notifier_block *nb) { struct subsys_private *sp = bus_to_subsys(bus); int retval; if (!sp) return -EINVAL; retval = blocking_notifier_chain_register(&sp->bus_notifier, nb); subsys_put(sp); return retval; } EXPORT_SYMBOL_GPL(bus_register_notifier); int bus_unregister_notifier(const struct bus_type *bus, struct notifier_block *nb) { struct subsys_private *sp = bus_to_subsys(bus); int retval; if (!sp) return -EINVAL; retval = blocking_notifier_chain_unregister(&sp->bus_notifier, nb); subsys_put(sp); return retval; } EXPORT_SYMBOL_GPL(bus_unregister_notifier); void bus_notify(struct device *dev, enum bus_notifier_event value) { struct subsys_private *sp = bus_to_subsys(dev->bus); if (!sp) return; blocking_notifier_call_chain(&sp->bus_notifier, value, dev); subsys_put(sp); } struct kset *bus_get_kset(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct kset *kset; if (!sp) return NULL; kset = &sp->subsys; subsys_put(sp); return kset; } EXPORT_SYMBOL_GPL(bus_get_kset); /* * Yes, this forcibly breaks the klist abstraction temporarily. It * just wants to sort the klist, not change reference counts and * take/drop locks rapidly in the process. It does all this while * holding the lock for the list, so objects can't otherwise be * added/removed while we're swizzling. */ static void device_insertion_sort_klist(struct device *a, struct list_head *list, int (*compare)(const struct device *a, const struct device *b)) { struct klist_node *n; struct device_private *dev_prv; struct device *b; list_for_each_entry(n, list, n_node) { dev_prv = to_device_private_bus(n); b = dev_prv->device; if (compare(a, b) <= 0) { list_move_tail(&a->p->knode_bus.n_node, &b->p->knode_bus.n_node); return; } } list_move_tail(&a->p->knode_bus.n_node, list); } void bus_sort_breadthfirst(const struct bus_type *bus, int (*compare)(const struct device *a, const struct device *b)) { struct subsys_private *sp = bus_to_subsys(bus); LIST_HEAD(sorted_devices); struct klist_node *n, *tmp; struct device_private *dev_prv; struct device *dev; struct klist *device_klist; if (!sp) return; device_klist = &sp->klist_devices; spin_lock(&device_klist->k_lock); list_for_each_entry_safe(n, tmp, &device_klist->k_list, n_node) { dev_prv = to_device_private_bus(n); dev = dev_prv->device; device_insertion_sort_klist(dev, &sorted_devices, compare); } list_splice(&sorted_devices, &device_klist->k_list); spin_unlock(&device_klist->k_lock); subsys_put(sp); } EXPORT_SYMBOL_GPL(bus_sort_breadthfirst); struct subsys_dev_iter { struct klist_iter ki; const struct device_type *type; }; /** * subsys_dev_iter_init - initialize subsys device iterator * @iter: subsys iterator to initialize * @sp: the subsys private (i.e. 
bus) we wanna iterate over * @start: the device to start iterating from, if any * @type: device_type of the devices to iterate over, NULL for all * * Initialize subsys iterator @iter such that it iterates over devices * of @subsys. If @start is set, the list iteration will start there, * otherwise if it is NULL, the iteration starts at the beginning of * the list. */ static void subsys_dev_iter_init(struct subsys_dev_iter *iter, struct subsys_private *sp, struct device *start, const struct device_type *type) { struct klist_node *start_knode = NULL; if (start) start_knode = &start->p->knode_bus; klist_iter_init_node(&sp->klist_devices, &iter->ki, start_knode); iter->type = type; } /** * subsys_dev_iter_next - iterate to the next device * @iter: subsys iterator to proceed * * Proceed @iter to the next device and return it. Returns NULL if * iteration is complete. * * The returned device is referenced and won't be released till * iterator is proceed to the next device or exited. The caller is * free to do whatever it wants to do with the device including * calling back into subsys code. */ static struct device *subsys_dev_iter_next(struct subsys_dev_iter *iter) { struct klist_node *knode; struct device *dev; for (;;) { knode = klist_next(&iter->ki); if (!knode) return NULL; dev = to_device_private_bus(knode)->device; if (!iter->type || iter->type == dev->type) return dev; } } /** * subsys_dev_iter_exit - finish iteration * @iter: subsys iterator to finish * * Finish an iteration. Always call this function after iteration is * complete whether the iteration ran till the end or not. */ static void subsys_dev_iter_exit(struct subsys_dev_iter *iter) { klist_iter_exit(&iter->ki); } int subsys_interface_register(struct subsys_interface *sif) { struct subsys_private *sp; struct subsys_dev_iter iter; struct device *dev; if (!sif || !sif->subsys) return -ENODEV; sp = bus_to_subsys(sif->subsys); if (!sp) return -EINVAL; /* * Reference in sp is now incremented and will be dropped when * the interface is removed from the bus */ mutex_lock(&sp->mutex); list_add_tail(&sif->node, &sp->interfaces); if (sif->add_dev) { subsys_dev_iter_init(&iter, sp, NULL, NULL); while ((dev = subsys_dev_iter_next(&iter))) sif->add_dev(dev, sif); subsys_dev_iter_exit(&iter); } mutex_unlock(&sp->mutex); return 0; } EXPORT_SYMBOL_GPL(subsys_interface_register); void subsys_interface_unregister(struct subsys_interface *sif) { struct subsys_private *sp; struct subsys_dev_iter iter; struct device *dev; if (!sif || !sif->subsys) return; sp = bus_to_subsys(sif->subsys); if (!sp) return; mutex_lock(&sp->mutex); list_del_init(&sif->node); if (sif->remove_dev) { subsys_dev_iter_init(&iter, sp, NULL, NULL); while ((dev = subsys_dev_iter_next(&iter))) sif->remove_dev(dev, sif); subsys_dev_iter_exit(&iter); } mutex_unlock(&sp->mutex); /* * Decrement the reference count twice, once for the bus_to_subsys() * call in the start of this function, and the second one from the * reference increment in subsys_interface_register() */ subsys_put(sp); subsys_put(sp); } EXPORT_SYMBOL_GPL(subsys_interface_unregister); static void system_root_device_release(struct device *dev) { kfree(dev); } static int subsys_register(const struct bus_type *subsys, const struct attribute_group **groups, struct kobject *parent_of_root) { struct subsys_private *sp; struct device *dev; int err; err = bus_register(subsys); if (err < 0) return err; sp = bus_to_subsys(subsys); if (!sp) { err = -EINVAL; goto err_sp; } dev = kzalloc(sizeof(struct device), GFP_KERNEL); if 
(!dev) { err = -ENOMEM; goto err_dev; } err = dev_set_name(dev, "%s", subsys->name); if (err < 0) goto err_name; dev->kobj.parent = parent_of_root; dev->groups = groups; dev->release = system_root_device_release; err = device_register(dev); if (err < 0) goto err_dev_reg; sp->dev_root = dev; subsys_put(sp); return 0; err_dev_reg: put_device(dev); dev = NULL; err_name: kfree(dev); err_dev: subsys_put(sp); err_sp: bus_unregister(subsys); return err; } /** * subsys_system_register - register a subsystem at /sys/devices/system/ * @subsys: system subsystem * @groups: default attributes for the root device * * All 'system' subsystems have a /sys/devices/system/<name> root device * with the name of the subsystem. The root device can carry subsystem- * wide attributes. All registered devices are below this single root * device and are named after the subsystem with a simple enumeration * number appended. The registered devices are not explicitly named; * only 'id' in the device needs to be set. * * Do not use this interface for anything new, it exists for compatibility * with bad ideas only. New subsystems should use plain subsystems; and * add the subsystem-wide attributes should be added to the subsystem * directory itself and not some create fake root-device placed in * /sys/devices/system/<name>. */ int subsys_system_register(const struct bus_type *subsys, const struct attribute_group **groups) { return subsys_register(subsys, groups, &system_kset->kobj); } EXPORT_SYMBOL_GPL(subsys_system_register); /** * subsys_virtual_register - register a subsystem at /sys/devices/virtual/ * @subsys: virtual subsystem * @groups: default attributes for the root device * * All 'virtual' subsystems have a /sys/devices/system/<name> root device * with the name of the subsystem. The root device can carry subsystem-wide * attributes. All registered devices are below this single root device. * There's no restriction on device naming. This is for kernel software * constructs which need sysfs interface. */ int subsys_virtual_register(const struct bus_type *subsys, const struct attribute_group **groups) { struct kobject *virtual_dir; virtual_dir = virtual_device_parent(); if (!virtual_dir) return -ENOMEM; return subsys_register(subsys, groups, virtual_dir); } EXPORT_SYMBOL_GPL(subsys_virtual_register); /** * driver_find - locate driver on a bus by its name. * @name: name of the driver. * @bus: bus to scan for the driver. * * Call kset_find_obj() to iterate over list of drivers on * a bus to find driver by name. Return driver if found. * * This routine provides no locking to prevent the driver it returns * from being unregistered or unloaded while the caller is using it. * The caller is responsible for preventing this. */ struct device_driver *driver_find(const char *name, const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct kobject *k; struct driver_private *priv; if (!sp) return NULL; k = kset_find_obj(sp->drivers_kset, name); subsys_put(sp); if (!k) return NULL; priv = to_driver(k); /* Drop reference added by kset_find_obj() */ kobject_put(k); return priv->driver; } EXPORT_SYMBOL_GPL(driver_find); /* * Warning, the value could go to "removed" instantly after calling this function, so be very * careful when calling it... 
*/ bool bus_is_registered(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); bool is_initialized = false; if (sp) { is_initialized = true; subsys_put(sp); } return is_initialized; } /** * bus_get_dev_root - return a pointer to the "device root" of a bus * @bus: bus to return the device root of. * * If a bus has a "device root" structure, return it, WITH THE REFERENCE * COUNT INCREMENTED. * * Note, when finished with the device, a call to put_device() is required. * * If the device root is not present (or bus is not a valid pointer), NULL * will be returned. */ struct device *bus_get_dev_root(const struct bus_type *bus) { struct subsys_private *sp = bus_to_subsys(bus); struct device *dev_root; if (!sp) return NULL; dev_root = get_device(sp->dev_root); subsys_put(sp); return dev_root; } EXPORT_SYMBOL_GPL(bus_get_dev_root); int __init buses_init(void) { bus_kset = kset_create_and_add("bus", &bus_uevent_ops, NULL); if (!bus_kset) return -ENOMEM; system_kset = kset_create_and_add("system", NULL, &devices_kset->kobj); if (!system_kset) { /* Do error handling here as devices_init() does */ kset_unregister(bus_kset); bus_kset = NULL; pr_err("%s: failed to create and add kset 'system'\n", __func__); return -ENOMEM; } return 0; }
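The subsys_interface machinery above is the consumer-facing piece of this file: registering an interface replays every device already on the bus through add_dev() under sp->mutex, and unregistering replays them through remove_dev() before dropping both references. A minimal sketch of a client module follows, assuming a hypothetical bus my_bus_type exported by some other driver (the names here are illustrative, not part of bus.c):

#include <linux/device.h>
#include <linux/module.h>

extern const struct bus_type my_bus_type;	/* hypothetical bus, defined elsewhere */

static int my_iface_add_dev(struct device *dev, struct subsys_interface *sif)
{
	/* called once per pre-existing device at register time, then on each hot-add */
	dev_info(dev, "tracked by %s\n", sif->name);
	return 0;
}

static void my_iface_remove_dev(struct device *dev, struct subsys_interface *sif)
{
	dev_info(dev, "untracked by %s\n", sif->name);
}

static struct subsys_interface my_iface = {
	.name		= "my_iface",
	.subsys		= &my_bus_type,
	.add_dev	= my_iface_add_dev,
	.remove_dev	= my_iface_remove_dev,
};

static int __init my_iface_init(void)
{
	return subsys_interface_register(&my_iface);
}
module_init(my_iface_init);

static void __exit my_iface_exit(void)
{
	subsys_interface_unregister(&my_iface);
}
module_exit(my_iface_exit);

MODULE_DESCRIPTION("subsys_interface usage sketch");
MODULE_LICENSE("GPL");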
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SMP_H #define __LINUX_SMP_H /* * Generic SMP support * Alan Cox. <alan@redhat.com> */ #include <linux/errno.h> #include <linux/types.h> #include <linux/list.h> #include <linux/cpumask.h> #include <linux/init.h> #include <linux/smp_types.h> typedef void (*smp_call_func_t)(void *info); typedef bool (*smp_cond_func_t)(int cpu, void *info); /* * structure shares (partial) layout with struct irq_work */ struct __call_single_data { struct __call_single_node node; smp_call_func_t func; void *info; }; #define CSD_INIT(_func, _info) \ (struct __call_single_data){ .func = (_func), .info = (_info), } /* Use __aligned() to avoid using 2 cache lines for 1 csd */ typedef struct __call_single_data call_single_data_t __aligned(sizeof(struct __call_single_data)); #define INIT_CSD(_csd, _func, _info) \ do { \ *(_csd) = CSD_INIT((_func), (_info)); \ } while (0) /* * Enqueue an llist_node on the call_single_queue; be very careful, read * flush_smp_call_function_queue() in detail. */ extern void __smp_call_single_queue(int cpu, struct llist_node *node); /* total number of cpus in this system (may exceed NR_CPUS) */ extern unsigned int total_cpus; int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, int wait); void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask); int smp_call_function_single_async(int cpu, call_single_data_t *csd); /* * CPU stopping functions used in panic. All have default weak definitions. * Architecture-dependent code may override them. */ void __noreturn panic_smp_self_stop(void); void __noreturn nmi_panic_self_stop(struct pt_regs *regs); void crash_smp_send_stop(void); /* * Call a function on all processors */ static inline void on_each_cpu(smp_call_func_t func, void *info, int wait) { on_each_cpu_cond_mask(NULL, func, info, wait, cpu_online_mask); } /** * on_each_cpu_mask(): Run a function on processors specified by * cpumask, which may include the local processor. * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed * on other CPUs. * * If @wait is true, then returns once @func has returned.
* * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. The * exception is that it may be used during early boot while * early_boot_irqs_disabled is set. */ static inline void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { on_each_cpu_cond_mask(NULL, func, info, wait, mask); } /* * Call a function on each processor for which the supplied function * cond_func returns a positive value. This may include the local * processor. May be used during early boot while early_boot_irqs_disabled is * set. Use local_irq_save/restore() instead of local_irq_disable/enable(). */ static inline void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait) { on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask); } /* * Architecture specific boot CPU setup. Defined as empty weak function in * init/main.c. Architectures can override it. */ void __init smp_prepare_boot_cpu(void); #ifdef CONFIG_SMP #include <linux/preempt.h> #include <linux/compiler.h> #include <linux/thread_info.h> #include <asm/smp.h> /* * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc. * (defined in asm header): */ /* * stops all CPUs but the current one: */ extern void smp_send_stop(void); /* * sends a 'reschedule' event to another CPU: */ extern void arch_smp_send_reschedule(int cpu); /* * scheduler_ipi() is inline so can't be passed as callback reason, but the * callsite IP should be sufficient for root-causing IPIs sent from here. */ #define smp_send_reschedule(cpu) ({ \ trace_ipi_send_cpu(cpu, _RET_IP_, NULL); \ arch_smp_send_reschedule(cpu); \ }) /* * Prepare machine for booting other CPUs. */ extern void smp_prepare_cpus(unsigned int max_cpus); /* * Bring a CPU up */ extern int __cpu_up(unsigned int cpunum, struct task_struct *tidle); /* * Final polishing of CPUs */ extern void smp_cpus_done(unsigned int max_cpus); /* * Call a function on all other processors */ void smp_call_function(smp_call_func_t func, void *info, int wait); void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait); int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait); void kick_all_cpus_sync(void); void wake_up_all_idle_cpus(void); /* * Generic and arch helpers */ void __init call_function_init(void); void generic_smp_call_function_single_interrupt(void); #define generic_smp_call_function_interrupt \ generic_smp_call_function_single_interrupt extern unsigned int setup_max_cpus; extern void __init setup_nr_cpu_ids(void); extern void __init smp_init(void); extern int __boot_cpu_id; static inline int get_boot_cpu_id(void) { return __boot_cpu_id; } #else /* !SMP */ static inline void smp_send_stop(void) { } /* * These macros fold the SMP functionality into a single CPU system */ #define raw_smp_processor_id() 0 static inline void up_smp_call_function(smp_call_func_t func, void *info) { } #define smp_call_function(func, info, wait) \ (up_smp_call_function(func, info)) static inline void smp_send_reschedule(int cpu) { } #define smp_call_function_many(mask, func, info, wait) \ (up_smp_call_function(func, info)) static inline void call_function_init(void) { } static inline int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait) { return smp_call_function_single(0, func, info, wait); } static inline void kick_all_cpus_sync(void) { } static inline void 
wake_up_all_idle_cpus(void) { } #define setup_max_cpus 0 #ifdef CONFIG_UP_LATE_INIT extern void __init up_late_init(void); static __always_inline void smp_init(void) { up_late_init(); } #else static inline void smp_init(void) { } #endif static inline int get_boot_cpu_id(void) { return 0; } #endif /* !SMP */ /** * raw_smp_processor_id() - get the current (unstable) CPU id * * For when you know what you are doing and need an unstable * CPU id. */ /** * smp_processor_id() - get the current (stable) CPU id * * This is the normal accessor to the CPU id and should be used * whenever possible. * * The CPU id is stable when: * * - IRQs are disabled; * - preemption is disabled; * - the task is CPU affine. * * When CONFIG_DEBUG_PREEMPT is enabled, we verify these assumptions and WARN * when smp_processor_id() is used while the CPU id is not stable. */ /* * Allow the architecture to differentiate between a stable and unstable read. * For example, x86 uses an IRQ-safe asm-volatile read for the unstable case but a * regular asm read for the stable case. */ #ifndef __smp_processor_id #define __smp_processor_id() raw_smp_processor_id() #endif #ifdef CONFIG_DEBUG_PREEMPT extern unsigned int debug_smp_processor_id(void); # define smp_processor_id() debug_smp_processor_id() #else # define smp_processor_id() __smp_processor_id() #endif #define get_cpu() ({ preempt_disable(); __smp_processor_id(); }) #define put_cpu() preempt_enable() /* * Callback to arch code if there's nosmp or maxcpus=0 on the * boot command line: */ extern void arch_disable_smp_support(void); extern void arch_thaw_secondary_cpus_begin(void); extern void arch_thaw_secondary_cpus_end(void); void smp_setup_processor_id(void); int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys); /* SMP core functions */ int smpcfd_prepare_cpu(unsigned int cpu); int smpcfd_dead_cpu(unsigned int cpu); int smpcfd_dying_cpu(unsigned int cpu); #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG bool csd_lock_is_stuck(void); #else static inline bool csd_lock_is_stuck(void) { return false; } #endif #endif /* __LINUX_SMP_H */
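The cross-call API above boils down to one pattern: hand a fast, non-blocking smp_call_func_t to one of the on_each_cpu*() or smp_call_function*() helpers and decide whether to wait for completion. A hedged sketch of typical use from a module (module and symbol names are invented for illustration):

#include <linux/module.h>
#include <linux/smp.h>
#include <linux/atomic.h>

static atomic_t visits = ATOMIC_INIT(0);

/* runs in IPI context with IRQs disabled: must be fast and non-blocking */
static void bump(void *info)
{
	atomic_inc(&visits);
}

static int __init xcall_demo_init(void)
{
	int ret;

	/* run bump() on every online CPU, waiting for all of them to finish */
	on_each_cpu(bump, NULL, 1);
	pr_info("visits after on_each_cpu: %d\n", atomic_read(&visits));

	/* run it once more, on CPU 0 only, again synchronously */
	ret = smp_call_function_single(0, bump, NULL, 1);
	pr_info("visits after single call: %d\n", atomic_read(&visits));
	return ret;
}
module_init(xcall_demo_init);

static void __exit xcall_demo_exit(void) { }
module_exit(xcall_demo_exit);

MODULE_DESCRIPTION("cross-CPU call sketch");
MODULE_LICENSE("GPL");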
/* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions and Declarations for tuple. * * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * - generalize L3 protocol dependent part. * * Derived from include/linux/netfilter_ipv4/ip_conntrack_tuple.h */ #ifndef _NF_CONNTRACK_TUPLE_H #define _NF_CONNTRACK_TUPLE_H #include <linux/netfilter/x_tables.h> #include <linux/netfilter/nf_conntrack_tuple_common.h> #include <linux/list_nulls.h> /* A `tuple' is a structure containing the information to uniquely identify a connection. i.e. if two packets have the same tuple, they are in the same connection; if not, they are not. We divide the structure along "manipulatable" and "non-manipulatable" lines, for the benefit of the NAT code. */ #define NF_CT_TUPLE_L3SIZE ARRAY_SIZE(((union nf_inet_addr *)NULL)->all) /* The manipulable part of the tuple. */ struct nf_conntrack_man { union nf_inet_addr u3; union nf_conntrack_man_proto u; /* Layer 3 protocol */ u_int16_t l3num; }; /* This contains the information to distinguish a connection. */ struct nf_conntrack_tuple { struct nf_conntrack_man src; /* These are the parts of the tuple which are fixed. */ struct { union nf_inet_addr u3; union { /* Add other protocols here. */ __be16 all; struct { __be16 port; } tcp; struct { __be16 port; } udp; struct { u_int8_t type, code; } icmp; struct { __be16 port; } dccp; struct { __be16 port; } sctp; struct { __be16 key; } gre; } u; /* The protocol. */ u_int8_t protonum; /* The direction must be ignored for the tuplehash */ struct { } __nfct_hash_offsetend; /* The direction (for tuplehash) */ u_int8_t dir; } dst; }; struct nf_conntrack_tuple_mask { struct { union nf_inet_addr u3; union nf_conntrack_man_proto u; } src; }; static inline void nf_ct_dump_tuple_ip(const struct nf_conntrack_tuple *t) { #ifdef DEBUG printk("tuple %p: %u %pI4:%hu -> %pI4:%hu\n", t, t->dst.protonum, &t->src.u3.ip, ntohs(t->src.u.all), &t->dst.u3.ip, ntohs(t->dst.u.all)); #endif } static inline void nf_ct_dump_tuple_ipv6(const struct nf_conntrack_tuple *t) { #ifdef DEBUG printk("tuple %p: %u %pI6 %hu -> %pI6 %hu\n", t, t->dst.protonum, t->src.u3.all, ntohs(t->src.u.all), t->dst.u3.all, ntohs(t->dst.u.all)); #endif } static inline void nf_ct_dump_tuple(const struct nf_conntrack_tuple *t) { switch (t->src.l3num) { case AF_INET: nf_ct_dump_tuple_ip(t); break; case AF_INET6: nf_ct_dump_tuple_ipv6(t); break; } } /* If we're the first tuple, it's the original dir.
*/ #define NF_CT_DIRECTION(h) \ ((enum ip_conntrack_dir)(h)->tuple.dst.dir) /* Connections have two entries in the hash table: one for each way */ struct nf_conntrack_tuple_hash { struct hlist_nulls_node hnnode; struct nf_conntrack_tuple tuple; }; static inline bool __nf_ct_tuple_src_equal(const struct nf_conntrack_tuple *t1, const struct nf_conntrack_tuple *t2) { return (nf_inet_addr_cmp(&t1->src.u3, &t2->src.u3) && t1->src.u.all == t2->src.u.all && t1->src.l3num == t2->src.l3num); } static inline bool __nf_ct_tuple_dst_equal(const struct nf_conntrack_tuple *t1, const struct nf_conntrack_tuple *t2) { return (nf_inet_addr_cmp(&t1->dst.u3, &t2->dst.u3) && t1->dst.u.all == t2->dst.u.all && t1->dst.protonum == t2->dst.protonum); } static inline bool nf_ct_tuple_equal(const struct nf_conntrack_tuple *t1, const struct nf_conntrack_tuple *t2) { return __nf_ct_tuple_src_equal(t1, t2) && __nf_ct_tuple_dst_equal(t1, t2); } static inline bool nf_ct_tuple_mask_equal(const struct nf_conntrack_tuple_mask *m1, const struct nf_conntrack_tuple_mask *m2) { return (nf_inet_addr_cmp(&m1->src.u3, &m2->src.u3) && m1->src.u.all == m2->src.u.all); } static inline bool nf_ct_tuple_src_mask_cmp(const struct nf_conntrack_tuple *t1, const struct nf_conntrack_tuple *t2, const struct nf_conntrack_tuple_mask *mask) { int count; for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++) { if ((t1->src.u3.all[count] ^ t2->src.u3.all[count]) & mask->src.u3.all[count]) return false; } if ((t1->src.u.all ^ t2->src.u.all) & mask->src.u.all) return false; if (t1->src.l3num != t2->src.l3num || t1->dst.protonum != t2->dst.protonum) return false; return true; } static inline bool nf_ct_tuple_mask_cmp(const struct nf_conntrack_tuple *t, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple_mask *mask) { return nf_ct_tuple_src_mask_cmp(t, tuple, mask) && __nf_ct_tuple_dst_equal(t, tuple); } #endif /* _NF_CONNTRACK_TUPLE_H */
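To make the tuple layout and the equality helpers above concrete, here is a hedged sketch (not from the kernel tree) that fills in two IPv4/TCP tuples and compares them; mk_tcp_tuple() is an invented helper:

#include <linux/string.h>
#include <linux/in.h>
#include <linux/bug.h>
#include <net/netfilter/nf_conntrack_tuple.h>

/* invented helper: build an IPv4/TCP tuple for saddr:sport -> daddr:dport */
static void mk_tcp_tuple(struct nf_conntrack_tuple *t, __be32 saddr,
			 __be16 sport, __be32 daddr, __be16 dport)
{
	memset(t, 0, sizeof(*t));
	t->src.l3num = AF_INET;		/* manipulatable half: L3 + source */
	t->src.u3.ip = saddr;
	t->src.u.tcp.port = sport;
	t->dst.u3.ip = daddr;		/* fixed half: destination + protocol */
	t->dst.u.tcp.port = dport;
	t->dst.protonum = IPPROTO_TCP;
}

static void tuple_demo(void)
{
	struct nf_conntrack_tuple a, b;

	mk_tcp_tuple(&a, htonl(0xc0a80001), htons(12345),
		     htonl(0xc0a80002), htons(80));
	b = a;
	WARN_ON(!nf_ct_tuple_equal(&a, &b));	/* identical tuples match */

	b.dst.u.tcp.port = htons(443);		/* dst halves now differ... */
	WARN_ON(nf_ct_tuple_equal(&a, &b));	/* ...so the tuples do not */
	WARN_ON(!__nf_ct_tuple_src_equal(&a, &b)); /* but src halves still do */
}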
// SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/module.h> #include <net/ip.h> #include <linux/ipv6.h> #include <linux/icmp.h> #include <net/ipv6.h> #include <net/tcp.h> #include <net/udp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_tcpudp.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> MODULE_DESCRIPTION("Xtables: TCP, UDP and UDP-Lite match"); MODULE_LICENSE("GPL"); MODULE_ALIAS("xt_tcp"); MODULE_ALIAS("xt_udp"); MODULE_ALIAS("ipt_udp"); MODULE_ALIAS("ipt_tcp"); MODULE_ALIAS("ip6t_udp"); MODULE_ALIAS("ip6t_tcp"); MODULE_ALIAS("ipt_icmp"); MODULE_ALIAS("ip6t_icmp6"); /* Returns true if the port is matched by the range, false otherwise */ static inline bool port_match(u_int16_t min, u_int16_t max, u_int16_t port, bool invert) { return (port >= min && port <= max) ^ invert; } static bool tcp_find_option(u_int8_t option, const struct sk_buff *skb, unsigned int protoff, unsigned int optlen, bool invert, bool *hotdrop) { /* tcp.doff is only 4 bits, i.e. max 15 * 4 bytes */ const u_int8_t *op; u_int8_t _opt[60 - sizeof(struct tcphdr)]; unsigned int i; pr_debug("finding option\n"); if (!optlen) return invert; /* If we don't have the whole header, drop packet. */ op = skb_header_pointer(skb, protoff + sizeof(struct tcphdr), optlen, _opt); if (op == NULL) { *hotdrop = true; return false; } for (i = 0; i < optlen; ) { if (op[i] == option) return !invert; if (op[i] < 2) i++; else i += op[i+1]?:1; } return invert; } static bool tcp_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct tcphdr *th; struct tcphdr _tcph; const struct xt_tcp *tcpinfo = par->matchinfo; if (par->fragoff != 0) { /* To quote Alan: Don't allow a fragment of TCP 8 bytes in. Nobody normal causes this. Its a cracker trying to break in by doing a flag overwrite to pass the direction checks. */ if (par->fragoff == 1) { pr_debug("Dropping evil TCP offset=1 frag.\n"); par->hotdrop = true; } /* Must not be a fragment. */ return false; } th = skb_header_pointer(skb, par->thoff, sizeof(_tcph), &_tcph); if (th == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop.
*/ pr_debug("Dropping evil TCP offset=0 tinygram.\n"); par->hotdrop = true; return false; } if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1], ntohs(th->source), !!(tcpinfo->invflags & XT_TCP_INV_SRCPT))) return false; if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1], ntohs(th->dest), !!(tcpinfo->invflags & XT_TCP_INV_DSTPT))) return false; if (!NF_INVF(tcpinfo, XT_TCP_INV_FLAGS, (((unsigned char *)th)[13] & tcpinfo->flg_mask) == tcpinfo->flg_cmp)) return false; if (tcpinfo->option) { if (th->doff * 4 < sizeof(_tcph)) { par->hotdrop = true; return false; } if (!tcp_find_option(tcpinfo->option, skb, par->thoff, th->doff*4 - sizeof(_tcph), tcpinfo->invflags & XT_TCP_INV_OPTION, &par->hotdrop)) return false; } return true; } static int tcp_mt_check(const struct xt_mtchk_param *par) { const struct xt_tcp *tcpinfo = par->matchinfo; /* Must specify no unknown invflags */ return (tcpinfo->invflags & ~XT_TCP_INV_MASK) ? -EINVAL : 0; } static bool udp_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct udphdr *uh; struct udphdr _udph; const struct xt_udp *udpinfo = par->matchinfo; /* Must not be a fragment. */ if (par->fragoff != 0) return false; uh = skb_header_pointer(skb, par->thoff, sizeof(_udph), &_udph); if (uh == NULL) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. */ pr_debug("Dropping evil UDP tinygram.\n"); par->hotdrop = true; return false; } return port_match(udpinfo->spts[0], udpinfo->spts[1], ntohs(uh->source), !!(udpinfo->invflags & XT_UDP_INV_SRCPT)) && port_match(udpinfo->dpts[0], udpinfo->dpts[1], ntohs(uh->dest), !!(udpinfo->invflags & XT_UDP_INV_DSTPT)); } static int udp_mt_check(const struct xt_mtchk_param *par) { const struct xt_udp *udpinfo = par->matchinfo; /* Must specify no unknown invflags */ return (udpinfo->invflags & ~XT_UDP_INV_MASK) ? -EINVAL : 0; } /* Returns 1 if the type and code is matched by the range, 0 otherwise */ static bool type_code_in_range(u8 test_type, u8 min_code, u8 max_code, u8 type, u8 code) { return type == test_type && code >= min_code && code <= max_code; } static bool icmp_type_code_match(u8 test_type, u8 min_code, u8 max_code, u8 type, u8 code, bool invert) { return (test_type == 0xFF || type_code_in_range(test_type, min_code, max_code, type, code)) ^ invert; } static bool icmp6_type_code_match(u8 test_type, u8 min_code, u8 max_code, u8 type, u8 code, bool invert) { return type_code_in_range(test_type, min_code, max_code, type, code) ^ invert; } static bool icmp_match(const struct sk_buff *skb, struct xt_action_param *par) { const struct icmphdr *ic; struct icmphdr _icmph; const struct ipt_icmp *icmpinfo = par->matchinfo; /* Must not be a fragment. */ if (par->fragoff != 0) return false; ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); if (!ic) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. */ par->hotdrop = true; return false; } return icmp_type_code_match(icmpinfo->type, icmpinfo->code[0], icmpinfo->code[1], ic->type, ic->code, !!(icmpinfo->invflags & IPT_ICMP_INV)); } static bool icmp6_match(const struct sk_buff *skb, struct xt_action_param *par) { const struct icmp6hdr *ic; struct icmp6hdr _icmph; const struct ip6t_icmp *icmpinfo = par->matchinfo; /* Must not be a fragment. */ if (par->fragoff != 0) return false; ic = skb_header_pointer(skb, par->thoff, sizeof(_icmph), &_icmph); if (!ic) { /* We've been asked to examine this packet, and we * can't. Hence, no choice but to drop. 
*/ par->hotdrop = true; return false; } return icmp6_type_code_match(icmpinfo->type, icmpinfo->code[0], icmpinfo->code[1], ic->icmp6_type, ic->icmp6_code, !!(icmpinfo->invflags & IP6T_ICMP_INV)); } static int icmp_checkentry(const struct xt_mtchk_param *par) { const struct ipt_icmp *icmpinfo = par->matchinfo; return (icmpinfo->invflags & ~IPT_ICMP_INV) ? -EINVAL : 0; } static int icmp6_checkentry(const struct xt_mtchk_param *par) { const struct ip6t_icmp *icmpinfo = par->matchinfo; return (icmpinfo->invflags & ~IP6T_ICMP_INV) ? -EINVAL : 0; } static struct xt_match tcpudp_mt_reg[] __read_mostly = { { .name = "tcp", .family = NFPROTO_IPV4, .checkentry = tcp_mt_check, .match = tcp_mt, .matchsize = sizeof(struct xt_tcp), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, { .name = "tcp", .family = NFPROTO_IPV6, .checkentry = tcp_mt_check, .match = tcp_mt, .matchsize = sizeof(struct xt_tcp), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, { .name = "udp", .family = NFPROTO_IPV4, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDP, .me = THIS_MODULE, }, { .name = "udp", .family = NFPROTO_IPV6, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDP, .me = THIS_MODULE, }, { .name = "udplite", .family = NFPROTO_IPV4, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDPLITE, .me = THIS_MODULE, }, { .name = "udplite", .family = NFPROTO_IPV6, .checkentry = udp_mt_check, .match = udp_mt, .matchsize = sizeof(struct xt_udp), .proto = IPPROTO_UDPLITE, .me = THIS_MODULE, }, { .name = "icmp", .match = icmp_match, .matchsize = sizeof(struct ipt_icmp), .checkentry = icmp_checkentry, .proto = IPPROTO_ICMP, .family = NFPROTO_IPV4, .me = THIS_MODULE, }, { .name = "icmp6", .match = icmp6_match, .matchsize = sizeof(struct ip6t_icmp), .checkentry = icmp6_checkentry, .proto = IPPROTO_ICMPV6, .family = NFPROTO_IPV6, .me = THIS_MODULE, }, }; static int __init tcpudp_mt_init(void) { return xt_register_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg)); } static void __exit tcpudp_mt_exit(void) { xt_unregister_matches(tcpudp_mt_reg, ARRAY_SIZE(tcpudp_mt_reg)); } module_init(tcpudp_mt_init); module_exit(tcpudp_mt_exit);
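All of the range checks in this match module funnel through the same pattern: an in-range test XORed with an invert flag, which is how the `!` prefix of an iptables rule such as `-p tcp ! --dport 1000:2000` is applied. A standalone restatement of the port_match() logic (user-space C, for illustration only):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* same logic as port_match() in xt_tcpudp.c */
static bool port_match(uint16_t min, uint16_t max, uint16_t port, bool invert)
{
	return (port >= min && port <= max) ^ invert;
}

int main(void)
{
	printf("%d\n", port_match(1000, 2000, 1500, false)); /* 1: in range */
	printf("%d\n", port_match(1000, 2000, 1500, true));  /* 0: in range, rule inverted */
	printf("%d\n", port_match(1000, 2000, 80, true));    /* 1: out of range, rule inverted */
	return 0;
}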
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * * This file is part of the SCTP kernel implementation * * Please send any bug reports or fixes you make to the * email addresses: * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Randall Stewart <randall@sctp.chicago.il.us> * Ken Morneau <kmorneau@cisco.com> * Qiaobing Xie <qxie1@email.mot.com> * La Monte H.P.
Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Xingang Guo <xingang.guo@intel.com> * Hui Huang <hui.huang@nokia.com> * Sridhar Samudrala <sri@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> * Dajiang Zhang <dajiang.zhang@nokia.com> * Ardelle Fan <ardelle.fan@intel.com> * Ryan Layer <rmlayer@us.ibm.com> * Anup Pemmaiah <pemmaiah@cc.usu.edu> * Kevin Gao <kevin.gao@intel.com> */ #ifndef __sctp_structs_h__ #define __sctp_structs_h__ #include <linux/ktime.h> #include <linux/generic-radix-tree.h> #include <linux/rhashtable-types.h> #include <linux/socket.h> /* linux/in.h needs this!! */ #include <linux/in.h> /* We get struct sockaddr_in. */ #include <linux/in6.h> /* We get struct in6_addr */ #include <linux/ipv6.h> #include <asm/param.h> /* We get MAXHOSTNAMELEN. */ #include <linux/atomic.h> /* This gets us atomic counters. */ #include <linux/skbuff.h> /* We need sk_buff_head. */ #include <linux/workqueue.h> /* We need tq_struct. */ #include <linux/sctp.h> /* We need sctp* header structs. */ #include <net/sctp/auth.h> /* We need auth specific structs */ #include <net/ip.h> /* For inet_skb_parm */ /* A convenience structure for handling sockaddr structures. * We should wean ourselves off this. */ union sctp_addr { struct sockaddr_inet sa; /* Large enough for both address families */ struct sockaddr_in v4; struct sockaddr_in6 v6; }; /* Forward declarations for data structures. */ struct sctp_globals; struct sctp_endpoint; struct sctp_association; struct sctp_transport; struct sctp_packet; struct sctp_chunk; struct sctp_inq; struct sctp_outq; struct sctp_bind_addr; struct sctp_ulpq; struct sctp_ep_common; struct crypto_shash; struct sctp_stream; #include <net/sctp/tsnmap.h> #include <net/sctp/ulpevent.h> #include <net/sctp/ulpqueue.h> #include <net/sctp/stream_interleave.h> /* Structures useful for managing bind/connect. */ struct sctp_bind_bucket { unsigned short port; signed char fastreuse; signed char fastreuseport; kuid_t fastuid; struct hlist_node node; struct hlist_head owner; struct net *net; }; struct sctp_bind_hashbucket { spinlock_t lock; struct hlist_head chain; }; /* Used for hashing all associations. */ struct sctp_hashbucket { rwlock_t lock; struct hlist_head chain; } __attribute__((__aligned__(8))); /* The SCTP globals structure. */ extern struct sctp_globals { /* This is a list of groups of functions for each address * family that we support. */ struct list_head address_families; /* This is the hash of all endpoints. */ struct sctp_hashbucket *ep_hashtable; /* This is the sctp port control hash. */ struct sctp_bind_hashbucket *port_hashtable; /* This is the hash of all transports. */ struct rhltable transport_hashtable; /* Sizes of above hashtables. */ int ep_hashsize; int port_hashsize; /* Default initialization values to be applied to new associations. */ __u16 max_instreams; __u16 max_outstreams; /* Flag to indicate whether computing and verifying checksum * is disabled. 
*/ bool checksum_disable; } sctp_globals; #define sctp_max_instreams (sctp_globals.max_instreams) #define sctp_max_outstreams (sctp_globals.max_outstreams) #define sctp_address_families (sctp_globals.address_families) #define sctp_ep_hashsize (sctp_globals.ep_hashsize) #define sctp_ep_hashtable (sctp_globals.ep_hashtable) #define sctp_port_hashsize (sctp_globals.port_hashsize) #define sctp_port_hashtable (sctp_globals.port_hashtable) #define sctp_transport_hashtable (sctp_globals.transport_hashtable) #define sctp_checksum_disable (sctp_globals.checksum_disable) /* SCTP Socket type: UDP or TCP style. */ enum sctp_socket_type { SCTP_SOCKET_UDP = 0, SCTP_SOCKET_UDP_HIGH_BANDWIDTH, SCTP_SOCKET_TCP }; /* Per socket SCTP information. */ struct sctp_sock { /* inet_sock has to be the first member of sctp_sock */ struct inet_sock inet; /* What kind of a socket is this? */ enum sctp_socket_type type; /* PF_ family specific functions. */ struct sctp_pf *pf; /* Access to HMAC transform. */ struct crypto_shash *hmac; char *sctp_hmac_alg; /* What is our base endpointer? */ struct sctp_endpoint *ep; struct sctp_bind_bucket *bind_hash; /* Various Socket Options. */ __u16 default_stream; __u32 default_ppid; __u16 default_flags; __u32 default_context; __u32 default_timetolive; __u32 default_rcv_context; int max_burst; /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to * the destination address every heartbeat interval. This value * will be inherited by all new associations. */ __u32 hbinterval; __u32 probe_interval; __be16 udp_port; __be16 encap_port; /* This is the max_retrans value for new associations. */ __u16 pathmaxrxt; __u32 flowlabel; __u8 dscp; __u16 pf_retrans; __u16 ps_retrans; /* The initial Path MTU to use for new associations. */ __u32 pathmtu; /* The default SACK delay timeout for new associations. */ __u32 sackdelay; __u32 sackfreq; /* Flags controlling Heartbeat, SACK delay, and Path MTU Discovery. */ __u32 param_flags; __u32 default_ss; struct sctp_rtoinfo rtoinfo; struct sctp_paddrparams paddrparam; struct sctp_assocparams assocparams; /* * These two structures must be grouped together for the usercopy * whitelist region. */ __u16 subscribe; struct sctp_initmsg initmsg; int user_frag; __u32 autoclose; __u32 adaptation_ind; __u32 pd_point; __u16 nodelay:1, pf_expose:2, reuse:1, disable_fragments:1, v4mapped:1, frag_interleave:1, recvrcvinfo:1, recvnxtinfo:1, data_ready_signalled:1; atomic_t pd_mode; /* Fields after this point will be skipped on copies, like on accept * and peeloff operations */ /* Receive to here while partial delivery is in effect. */ struct sk_buff_head pd_lobby; struct list_head auto_asconf_list; int do_auto_asconf; }; #define sctp_sk(ptr) container_of_const(ptr, struct sctp_sock, inet.sk) static inline struct sock *sctp_opt2sk(const struct sctp_sock *sp) { return (struct sock *)sp; } #if IS_ENABLED(CONFIG_IPV6) struct sctp6_sock { struct sctp_sock sctp; struct ipv6_pinfo inet6; }; #endif /* CONFIG_IPV6 */ /* This is our APPLICATION-SPECIFIC state cookie. * THIS IS NOT DICTATED BY THE SPECIFICATION. */ /* These are the parts of an association which we send in the cookie. * Most of these are straight out of: * RFC2960 12.2 Parameters necessary per association (i.e. the TCB) * */ struct sctp_cookie { /* My : Tag expected in every inbound packet and sent * Verification: in the INIT or INIT ACK chunk. * Tag : */ __u32 my_vtag; /* Peer's : Tag expected in every outbound packet except * Verification: in the INIT chunk. 
* Tag : */ __u32 peer_vtag; /* The rest of these are not from the spec, but really need to * be in the cookie. */ /* My Tie Tag : Assist in discovering a restarting association. */ __u32 my_ttag; /* Peer's Tie Tag: Assist in discovering a restarting association. */ __u32 peer_ttag; /* When does this cookie expire? */ ktime_t expiration; /* Number of inbound/outbound streams which are set * and negotiated during the INIT process. */ __u16 sinit_num_ostreams; __u16 sinit_max_instreams; /* This is the first sequence number I used. */ __u32 initial_tsn; /* This holds the originating address of the INIT packet. */ union sctp_addr peer_addr; /* IG Section 2.35.3 * Include the source port of the INIT-ACK */ __u16 my_port; __u8 prsctp_capable; /* Padding for future use */ __u8 padding; __u32 adaptation_ind; __u8 auth_random[sizeof(struct sctp_paramhdr) + SCTP_AUTH_RANDOM_LENGTH]; __u8 auth_hmacs[SCTP_AUTH_NUM_HMACS * sizeof(__u16) + 2]; __u8 auth_chunks[sizeof(struct sctp_paramhdr) + SCTP_AUTH_MAX_CHUNKS]; /* This is a shim for my peer's INIT packet, followed by * a copy of the raw address list of the association. * The length of the raw address list is saved in the * raw_addr_list_len field, which will be used at the time when * the association TCB is re-constructed from the cookie. */ __u32 raw_addr_list_len; /* struct sctp_init_chunk peer_init[]; */ }; /* The format of our cookie that we send to our peer. */ struct sctp_signed_cookie { __u8 signature[SCTP_SECRET_SIZE]; __u32 __pad; /* force sctp_cookie alignment to 64 bits */ struct sctp_cookie c; } __packed; /* This is another convenience type to allocate memory for address * params for the maximum size and pass such structures around * internally. */ union sctp_addr_param { struct sctp_paramhdr p; struct sctp_ipv4addr_param v4; struct sctp_ipv6addr_param v6; }; /* A convenience type to allow walking through the various * parameters and avoid casting all over the place. */ union sctp_params { void *v; struct sctp_paramhdr *p; struct sctp_cookie_preserve_param *life; struct sctp_hostname_param *dns; struct sctp_cookie_param *cookie; struct sctp_supported_addrs_param *sat; struct sctp_ipv4addr_param *v4; struct sctp_ipv6addr_param *v6; union sctp_addr_param *addr; struct sctp_adaptation_ind_param *aind; struct sctp_supported_ext_param *ext; struct sctp_random_param *random; struct sctp_chunks_param *chunks; struct sctp_hmac_algo_param *hmac_algo; struct sctp_addip_param *addip; }; /* RFC 2960. Section 3.3.5 Heartbeat. * Heartbeat Information: variable length * The Sender-specific Heartbeat Info field should normally include * information about the sender's current time when this HEARTBEAT * chunk is sent and the destination transport address to which this * HEARTBEAT is sent (see Section 8.3). */ struct sctp_sender_hb_info { struct sctp_paramhdr param_hdr; union sctp_addr daddr; unsigned long sent_at; __u64 hb_nonce; __u32 probe_size; }; int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp); int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid); void sctp_stream_free(struct sctp_stream *stream); void sctp_stream_clear(struct sctp_stream *stream); void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new); /* What is the current SSN number for this stream? */ #define sctp_ssn_peek(stream, type, sid) \ (sctp_stream_##type((stream), (sid))->ssn) /* Return the next SSN number for this stream. 
*/ #define sctp_ssn_next(stream, type, sid) \ (sctp_stream_##type((stream), (sid))->ssn++) /* Skip over this ssn and all below. */ #define sctp_ssn_skip(stream, type, sid, ssn) \ (sctp_stream_##type((stream), (sid))->ssn = ssn + 1) /* What is the current MID number for this stream? */ #define sctp_mid_peek(stream, type, sid) \ (sctp_stream_##type((stream), (sid))->mid) /* Return the next MID number for this stream. */ #define sctp_mid_next(stream, type, sid) \ (sctp_stream_##type((stream), (sid))->mid++) /* Skip over this mid and all below. */ #define sctp_mid_skip(stream, type, sid, mid) \ (sctp_stream_##type((stream), (sid))->mid = mid + 1) /* What is the current MID_uo number for this stream? */ #define sctp_mid_uo_peek(stream, type, sid) \ (sctp_stream_##type((stream), (sid))->mid_uo) /* Return the next MID_uo number for this stream. */ #define sctp_mid_uo_next(stream, type, sid) \ (sctp_stream_##type((stream), (sid))->mid_uo++) /* * Pointers to address related SCTP functions. * (i.e. things that depend on the address family.) */ struct sctp_af { int (*sctp_xmit) (struct sk_buff *skb, struct sctp_transport *); int (*setsockopt) (struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt) (struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); void (*get_dst) (struct sctp_transport *t, union sctp_addr *saddr, struct flowi *fl, struct sock *sk); void (*get_saddr) (struct sctp_sock *sk, struct sctp_transport *t, struct flowi *fl); void (*copy_addrlist) (struct list_head *, struct net_device *); int (*cmp_addr) (const union sctp_addr *addr1, const union sctp_addr *addr2); void (*addr_copy) (union sctp_addr *dst, union sctp_addr *src); void (*from_skb) (union sctp_addr *, struct sk_buff *skb, int saddr); void (*from_sk) (union sctp_addr *, struct sock *sk); bool (*from_addr_param) (union sctp_addr *, union sctp_addr_param *, __be16 port, int iif); int (*to_addr_param) (const union sctp_addr *, union sctp_addr_param *); int (*addr_valid) (union sctp_addr *, struct sctp_sock *, const struct sk_buff *); enum sctp_scope (*scope)(union sctp_addr *); void (*inaddr_any) (union sctp_addr *, __be16); int (*is_any) (const union sctp_addr *); int (*available) (union sctp_addr *, struct sctp_sock *); int (*skb_iif) (const struct sk_buff *sk); int (*skb_sdif)(const struct sk_buff *sk); int (*is_ce) (const struct sk_buff *sk); void (*seq_dump_addr)(struct seq_file *seq, union sctp_addr *addr); void (*ecn_capable)(struct sock *sk); __u16 net_header_len; int sockaddr_len; int (*ip_options_len)(struct sock *sk); sa_family_t sa_family; struct list_head list; }; struct sctp_af *sctp_get_af_specific(sa_family_t); int sctp_register_af(struct sctp_af *); /* Protocol family functions. 
*/ struct sctp_pf { void (*event_msgname)(struct sctp_ulpevent *, char *, int *); void (*skb_msgname) (struct sk_buff *, char *, int *); int (*af_supported) (sa_family_t, struct sctp_sock *); int (*cmp_addr) (const union sctp_addr *, const union sctp_addr *, struct sctp_sock *); int (*bind_verify) (struct sctp_sock *, union sctp_addr *); int (*send_verify) (struct sctp_sock *, union sctp_addr *); int (*supported_addrs)(const struct sctp_sock *, __be16 *); struct sock *(*create_accept_sk) (struct sock *sk, struct sctp_association *asoc, bool kern); int (*addr_to_user)(struct sctp_sock *sk, union sctp_addr *addr); void (*to_sk_saddr)(union sctp_addr *, struct sock *sk); void (*to_sk_daddr)(union sctp_addr *, struct sock *sk); void (*copy_ip_options)(struct sock *sk, struct sock *newsk); struct sctp_af *af; }; /* Structure to track chunk fragments that have been acked, but peer * fragments of the same message have not. */ struct sctp_datamsg { /* Chunks waiting to be submitted to lower layer. */ struct list_head chunks; /* Reference counting. */ refcount_t refcnt; /* When is this message no longer interesting to the peer? */ unsigned long expires_at; /* Did the message fail to send? */ int send_error; u8 send_failed:1, can_delay:1, /* should this message be Nagle delayed */ abandoned:1; /* should this message be abandoned */ }; struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *, struct sctp_sndrcvinfo *, struct iov_iter *); void sctp_datamsg_free(struct sctp_datamsg *); void sctp_datamsg_put(struct sctp_datamsg *); void sctp_chunk_fail(struct sctp_chunk *, int error); int sctp_chunk_abandoned(struct sctp_chunk *); /* RFC2960 1.4 Key Terms * * o Chunk: A unit of information within an SCTP packet, consisting of * a chunk header and chunk-specific content. * * As a matter of convenience, we remember the SCTP common header for * each chunk as well as a few other header pointers... */ struct sctp_chunk { struct list_head list; refcount_t refcnt; /* How many times this chunk has been sent, for prsctp RTX policy */ int sent_count; union { /* This is our link to the per-transport transmitted list. */ struct list_head transmitted_list; /* List in specific stream outq */ struct list_head stream_list; }; /* This field is used by chunks that hold fragmented data. * For the first fragment this is the list that holds the rest of * fragments. For the remaining fragments, this is the link to the * frag_list maintained in the first fragment. */ struct list_head frag_list; /* This points to the sk_buff containing the actual data. */ struct sk_buff *skb; union { /* In case of GSO packets, this will store the head one */ struct sk_buff *head_skb; /* In case of auth enabled, this will point to the shkey */ struct sctp_shared_key *shkey; }; /* These are the SCTP headers in reverse order in a packet. * Note that some of these may happen more than once. In that * case, we point at the "current" one, whatever that means * for that level of header. */ /* We point this at the FIRST TLV parameter to chunk_hdr.
*/ union sctp_params param_hdr; union { __u8 *v; struct sctp_datahdr *data_hdr; struct sctp_inithdr *init_hdr; struct sctp_sackhdr *sack_hdr; struct sctp_heartbeathdr *hb_hdr; struct sctp_sender_hb_info *hbs_hdr; struct sctp_shutdownhdr *shutdown_hdr; struct sctp_signed_cookie *cookie_hdr; struct sctp_ecnehdr *ecne_hdr; struct sctp_cwrhdr *ecn_cwr_hdr; struct sctp_errhdr *err_hdr; struct sctp_addiphdr *addip_hdr; struct sctp_fwdtsn_hdr *fwdtsn_hdr; struct sctp_authhdr *auth_hdr; struct sctp_idatahdr *idata_hdr; struct sctp_ifwdtsn_hdr *ifwdtsn_hdr; } subh; __u8 *chunk_end; struct sctp_chunkhdr *chunk_hdr; struct sctphdr *sctp_hdr; /* This needs to be recoverable for SCTP_SEND_FAILED events. */ struct sctp_sndrcvinfo sinfo; /* Which association does this belong to? */ struct sctp_association *asoc; /* What endpoint received this chunk? */ struct sctp_ep_common *rcvr; /* We fill this in if we are calculating RTT. */ unsigned long sent_at; /* What is the origin IP address for this chunk? */ union sctp_addr source; /* Destination address for this chunk. */ union sctp_addr dest; /* For outbound message, track all fragments for SEND_FAILED. */ struct sctp_datamsg *msg; /* For an inbound chunk, this tells us where it came from. * For an outbound chunk, it tells us where we'd like it to * go. It is NULL if we have no preference. */ struct sctp_transport *transport; /* SCTP-AUTH: For the special case inbound processing of COOKIE-ECHO * we need to save a pointer to the AUTH chunk, since the SCTP-AUTH * spec violates the principal premise that all chunks are processed * in order. */ struct sk_buff *auth_chunk; #define SCTP_CAN_FRTX 0x0 #define SCTP_NEED_FRTX 0x1 #define SCTP_DONT_FRTX 0x2 __u16 rtt_in_progress:1, /* This chunk used for RTT calc? */ has_tsn:1, /* Does this chunk have a TSN yet? */ has_ssn:1, /* Does this chunk have an SSN yet? */ #define has_mid has_ssn singleton:1, /* Only chunk in the packet? */ end_of_packet:1, /* Last chunk in the packet? */ ecn_ce_done:1, /* Have we processed the ECN CE bit? */ pdiscard:1, /* Discard the whole packet now? */ tsn_gap_acked:1, /* Is this chunk acked by a GAP ACK? */ data_accepted:1, /* At least 1 chunk accepted */ auth:1, /* IN: was auth'ed | OUT: needs auth */ has_asconf:1, /* IN: have seen an asconf before */ pmtu_probe:1, /* Used by PLPMTUD, can be set in a HB chunk */ tsn_missing_report:2, /* Data chunk missing counter. */ fast_retransmit:2; /* Is this chunk fast retransmitted? */ }; #define sctp_chunk_retransmitted(chunk) (chunk->sent_count > 1) void sctp_chunk_hold(struct sctp_chunk *); void sctp_chunk_put(struct sctp_chunk *); int sctp_user_addto_chunk(struct sctp_chunk *chunk, int len, struct iov_iter *from); void sctp_chunk_free(struct sctp_chunk *); void *sctp_addto_chunk(struct sctp_chunk *, int len, const void *data); struct sctp_chunk *sctp_chunkify(struct sk_buff *, const struct sctp_association *, struct sock *, gfp_t gfp); void sctp_init_addrs(struct sctp_chunk *, union sctp_addr *, union sctp_addr *); const union sctp_addr *sctp_source(const struct sctp_chunk *chunk); static inline __u16 sctp_chunk_stream_no(struct sctp_chunk *ch) { return ntohs(ch->subh.data_hdr->stream); } enum { SCTP_ADDR_NEW, /* new address added to assoc/ep */ SCTP_ADDR_SRC, /* address can be used as source */ SCTP_ADDR_DEL, /* address about to be deleted */ }; /* This is a structure for holding either an IPv6 or an IPv4 address.
*/ struct sctp_sockaddr_entry { struct list_head list; struct rcu_head rcu; union sctp_addr a; __u8 state; __u8 valid; }; #define SCTP_ADDRESS_TICK_DELAY 500 /* This structure holds lists of chunks as we are assembling for * transmission. */ struct sctp_packet { /* These are the SCTP header values (host order) for the packet. */ __u16 source_port; __u16 destination_port; __u32 vtag; /* This contains the payload chunks. */ struct list_head chunk_list; /* This is the overhead of the sctp and ip headers. */ size_t overhead; /* This is the total size of all chunks INCLUDING padding. */ size_t size; /* This is the maximum size this packet may have */ size_t max_size; /* The packet is destined for this transport address. * The function we finally use to pass down to the next lower * layer lives in the transport structure. */ struct sctp_transport *transport; /* pointer to the auth chunk for this packet */ struct sctp_chunk *auth; u8 has_cookie_echo:1, /* This packet contains a COOKIE-ECHO chunk. */ has_sack:1, /* This packet contains a SACK chunk. */ has_auth:1, /* This packet contains an AUTH chunk */ has_data:1, /* This packet contains at least 1 DATA chunk */ ipfragok:1; /* So let ip fragment this packet */ }; void sctp_packet_init(struct sctp_packet *, struct sctp_transport *, __u16 sport, __u16 dport); void sctp_packet_config(struct sctp_packet *, __u32 vtag, int); enum sctp_xmit sctp_packet_transmit_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk, int one_packet, gfp_t gfp); enum sctp_xmit sctp_packet_append_chunk(struct sctp_packet *packet, struct sctp_chunk *chunk); int sctp_packet_transmit(struct sctp_packet *, gfp_t); void sctp_packet_free(struct sctp_packet *); static inline int sctp_packet_empty(struct sctp_packet *packet) { return packet->size == packet->overhead; } /* This represents a remote transport address. * For local transport addresses, we just use union sctp_addr. * * RFC2960 Section 1.4 Key Terms * * o Transport address: A Transport Address is traditionally defined * by Network Layer address, Transport Layer protocol and Transport * Layer port number. In the case of SCTP running over IP, a * transport address is defined by the combination of an IP address * and an SCTP port number (where SCTP is the Transport protocol). * * RFC2960 Section 7.1 SCTP Differences from TCP Congestion control * * o The sender keeps a separate congestion control parameter set for * each of the destination addresses it can send to (not each * source-destination pair but for each destination). The parameters * should decay if the address is not used for a long enough time * period. * */ struct sctp_transport { /* A list of transports. */ struct list_head transports; struct rhlist_head node; /* Reference counting. */ refcount_t refcnt; __u32 dead:1, /* RTO-Pending : A flag used to track if one of the DATA * chunks sent to this address is currently being * used to compute a RTT. If this flag is 0, * the next DATA chunk sent to this destination * should be used to compute a RTT and this flag * should be set. Every time the RTT * calculation completes (i.e. the DATA chunk * is SACK'd) clear this flag. */ rto_pending:1, /* * hb_sent : a flag that signals that we have a pending * heartbeat. */ hb_sent:1, /* Is the Path MTU update pending on this transport */ pmtu_pending:1, dst_pending_confirm:1, /* need to confirm neighbour */ /* Has this transport moved the ctsn since we last sacked */ sack_generation:1; u32 dst_cookie; struct flowi fl; /* This is the peer's IP address and port. 
*/ union sctp_addr ipaddr; /* These are the functions we call to handle LLP stuff. */ struct sctp_af *af_specific; /* Which association do we belong to? */ struct sctp_association *asoc; /* RFC2960 * * 12.3 Per Transport Address Data * * For each destination transport address in the peer's * address list derived from the INIT or INIT ACK chunk, a * number of data elements needs to be maintained including: */ /* RTO : The current retransmission timeout value. */ unsigned long rto; __u32 rtt; /* This is the most recent RTT. */ /* RTTVAR : The current RTT variation. */ __u32 rttvar; /* SRTT : The current smoothed round trip time. */ __u32 srtt; /* * These are the congestion stats. */ /* cwnd : The current congestion window. */ __u32 cwnd; /* This is the actual cwnd. */ /* ssthresh : The current slow start threshold value. */ __u32 ssthresh; /* partial : The tracking method for increase of cwnd when in * bytes acked : congestion avoidance mode (see Section 6.2.2) */ __u32 partial_bytes_acked; /* Data that has been sent, but not acknowledged. */ __u32 flight_size; __u32 burst_limited; /* Holds old cwnd when max.burst is applied */ /* Destination */ struct dst_entry *dst; /* Source address. */ union sctp_addr saddr; /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to * the destination address every heartbeat interval. */ unsigned long hbinterval; unsigned long probe_interval; /* SACK delay timeout */ unsigned long sackdelay; __u32 sackfreq; atomic_t mtu_info; /* When was the last time that we heard from this transport? We use * this to pick new active and retran paths. */ ktime_t last_time_heard; /* When was the last time that we sent a chunk using this * transport? We use this to check for idle transports */ unsigned long last_time_sent; /* Last time(in jiffies) when cwnd is reduced due to the congestion * indication based on ECNE chunk. */ unsigned long last_time_ecne_reduced; __be16 encap_port; /* This is the max_retrans value for the transport and will * be initialized from the assocs value. This can be changed * using the SCTP_SET_PEER_ADDR_PARAMS socket option. */ __u16 pathmaxrxt; __u32 flowlabel; __u8 dscp; /* This is the partially failed retrans value for the transport * and will be initialized from the assocs value. This can be changed * using the SCTP_PEER_ADDR_THLDS socket option */ __u16 pf_retrans; /* Used for primary path switchover. */ __u16 ps_retrans; /* PMTU : The current known path MTU. */ __u32 pathmtu; /* Flags controlling Heartbeat, SACK delay, and Path MTU Discovery. */ __u32 param_flags; /* The number of times INIT has been sent on this transport. */ int init_sent_count; /* state : The current state of this destination, * : i.e. SCTP_ACTIVE, SCTP_INACTIVE, SCTP_UNKNOWN. */ int state; /* These are the error stats for this destination. */ /* Error count : The current error count for this destination. */ unsigned short error_count; /* Per : A timer used by each destination. * Destination : * Timer : * * [Everywhere else in the text this is called T3-rtx. -ed] */ struct timer_list T3_rtx_timer; /* Heartbeat timer is per destination. */ struct timer_list hb_timer; /* Timer to handle ICMP proto unreachable envets */ struct timer_list proto_unreach_timer; /* Timer to handler reconf chunk rtx */ struct timer_list reconf_timer; /* Timer to send a probe HB packet for PLPMTUD */ struct timer_list probe_timer; /* Since we're using per-destination retransmission timers * (see above), we're also using per-destination "transmitted" * queues. 
This probably ought to be a private struct * accessible only within the outqueue, but it's not, yet. */ struct list_head transmitted; /* We build bundle-able packets for this transport here. */ struct sctp_packet packet; /* This is the list of transports that have chunks to send. */ struct list_head send_ready; /* State information saved for SFR_CACC algorithm. The key * idea in SFR_CACC is to maintain state at the sender on a * per-destination basis when a changeover happens. * char changeover_active; * char cycling_changeover; * __u32 next_tsn_at_change; * char cacc_saw_newack; */ struct { /* An unsigned integer, which stores the next TSN to be * used by the sender, at the moment of changeover. */ __u32 next_tsn_at_change; /* A flag which indicates the occurrence of a changeover */ char changeover_active; /* A flag which indicates whether the change of primary is * the first switch to this destination address during an * active switch. */ char cycling_changeover; /* A temporary flag, which is used during the processing of * a SACK to estimate the causative TSN(s)'s group. */ char cacc_saw_newack; } cacc; struct { __u16 pmtu; __u16 probe_size; __u16 probe_high; __u8 probe_count; __u8 state; } pl; /* plpmtud related */ /* 64-bit random number sent with heartbeat. */ __u64 hb_nonce; struct rcu_head rcu; }; struct sctp_transport *sctp_transport_new(struct net *, const union sctp_addr *, gfp_t); void sctp_transport_set_owner(struct sctp_transport *, struct sctp_association *); void sctp_transport_route(struct sctp_transport *, union sctp_addr *, struct sctp_sock *); void sctp_transport_pmtu(struct sctp_transport *, struct sock *sk); void sctp_transport_free(struct sctp_transport *); void sctp_transport_reset_t3_rtx(struct sctp_transport *); void sctp_transport_reset_hb_timer(struct sctp_transport *); void sctp_transport_reset_reconf_timer(struct sctp_transport *transport); void sctp_transport_reset_probe_timer(struct sctp_transport *transport); void sctp_transport_reset_raise_timer(struct sctp_transport *transport); int sctp_transport_hold(struct sctp_transport *); void sctp_transport_put(struct sctp_transport *); void sctp_transport_update_rto(struct sctp_transport *, __u32); void sctp_transport_raise_cwnd(struct sctp_transport *, __u32, __u32); void sctp_transport_lower_cwnd(struct sctp_transport *t, enum sctp_lower_cwnd reason); void sctp_transport_burst_limited(struct sctp_transport *); void sctp_transport_burst_reset(struct sctp_transport *); unsigned long sctp_transport_timeout(struct sctp_transport *); void sctp_transport_reset(struct sctp_transport *t); bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu); void sctp_transport_immediate_rtx(struct sctp_transport *); void sctp_transport_dst_release(struct sctp_transport *t); void sctp_transport_dst_confirm(struct sctp_transport *t); void sctp_transport_pl_send(struct sctp_transport *t); bool sctp_transport_pl_recv(struct sctp_transport *t); /* This is the structure we use to queue packets as they come into * SCTP. We write packets to it and read chunks from it. */ struct sctp_inq { /* This is actually a queue of sctp_chunk each * containing a partially decoded packet. */ struct list_head in_chunk_list; /* This is the packet which is currently off the in queue and is * being worked on through the inbound chunk processing. */ struct sctp_chunk *in_progress; /* This is the delayed task to finish delivering inbound * messages. 
*/ struct work_struct immediate; }; void sctp_inq_init(struct sctp_inq *); void sctp_inq_free(struct sctp_inq *); void sctp_inq_push(struct sctp_inq *, struct sctp_chunk *packet); struct sctp_chunk *sctp_inq_pop(struct sctp_inq *); struct sctp_chunkhdr *sctp_inq_peek(struct sctp_inq *); void sctp_inq_set_th_handler(struct sctp_inq *, work_func_t); /* This is the structure we use to hold outbound chunks. You push * chunks in and they automatically pop out the other end as bundled * packets (it calls (*output_handler)()). * * This structure covers sections 6.3, 6.4, 6.7, 6.8, 6.10, 7., 8.1, * and 8.2 of the v13 draft. * * It handles retransmissions. The connection to the timeout portion * of the state machine is through sctp_..._timeout() and timeout_handler. * * If you feed it SACKs, it will eat them. * * If you give it big chunks, it will fragment them. * * It assigns TSN's to data chunks. This happens at the last possible * instant before transmission. * * When free()'d, it empties itself out via output_handler(). */ struct sctp_outq { struct sctp_association *asoc; /* Data pending that has never been transmitted. */ struct list_head out_chunk_list; /* Stream scheduler being used */ struct sctp_sched_ops *sched; unsigned int out_qlen; /* Total length of queued data chunks. */ /* Error of send failed, may used in SCTP_SEND_FAILED event. */ unsigned int error; /* These are control chunks we want to send. */ struct list_head control_chunk_list; /* These are chunks that have been sacked but are above the * CTSN, or cumulative tsn ack point. */ struct list_head sacked; /* Put chunks on this list to schedule them for * retransmission. */ struct list_head retransmit; /* Put chunks on this list to save them for FWD TSN processing as * they were abandoned. */ struct list_head abandoned; /* How many unackd bytes do we have in-flight? */ __u32 outstanding_bytes; /* Are we doing fast-rtx on this queue */ char fast_rtx; /* Corked? */ char cork; }; void sctp_outq_init(struct sctp_association *, struct sctp_outq *); void sctp_outq_teardown(struct sctp_outq *); void sctp_outq_free(struct sctp_outq*); void sctp_outq_tail(struct sctp_outq *, struct sctp_chunk *chunk, gfp_t); int sctp_outq_sack(struct sctp_outq *, struct sctp_chunk *); int sctp_outq_is_empty(const struct sctp_outq *); void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, enum sctp_retransmit_reason reason); void sctp_retransmit_mark(struct sctp_outq *, struct sctp_transport *, __u8); void sctp_outq_uncork(struct sctp_outq *, gfp_t gfp); void sctp_prsctp_prune(struct sctp_association *asoc, struct sctp_sndrcvinfo *sinfo, int msg_len); void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn); /* Uncork and flush an outqueue. */ static inline void sctp_outq_cork(struct sctp_outq *q) { q->cork = 1; } /* SCTP skb control block. * sctp_input_cb is currently used on rx and sock rx queue */ struct sctp_input_cb { union { struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) struct inet6_skb_parm h6; #endif } header; struct sctp_chunk *chunk; struct sctp_af *af; __be16 encap_port; }; #define SCTP_INPUT_CB(__skb) ((struct sctp_input_cb *)&((__skb)->cb[0])) struct sctp_output_cb { struct sk_buff *last; }; #define SCTP_OUTPUT_CB(__skb) ((struct sctp_output_cb *)&((__skb)->cb[0])) static inline const struct sk_buff *sctp_gso_headskb(const struct sk_buff *skb) { const struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; return chunk->head_skb ? 
: skb; } /* These bind address data fields common between endpoints and associations */ struct sctp_bind_addr { /* RFC 2960 12.1 Parameters necessary for the SCTP instance * * SCTP Port: The local SCTP port number the endpoint is * bound to. */ __u16 port; /* RFC 2960 12.1 Parameters necessary for the SCTP instance * * Address List: The list of IP addresses that this instance * has bound. This information is passed to one's * peer(s) in INIT and INIT ACK chunks. */ struct list_head address_list; }; void sctp_bind_addr_init(struct sctp_bind_addr *, __u16 port); void sctp_bind_addr_free(struct sctp_bind_addr *); int sctp_bind_addr_copy(struct net *net, struct sctp_bind_addr *dest, const struct sctp_bind_addr *src, enum sctp_scope scope, gfp_t gfp, int flags); int sctp_bind_addr_dup(struct sctp_bind_addr *dest, const struct sctp_bind_addr *src, gfp_t gfp); int sctp_add_bind_addr(struct sctp_bind_addr *, union sctp_addr *, int new_size, __u8 addr_state, gfp_t gfp); int sctp_del_bind_addr(struct sctp_bind_addr *, union sctp_addr *); int sctp_bind_addr_match(struct sctp_bind_addr *, const union sctp_addr *, struct sctp_sock *); int sctp_bind_addr_conflict(struct sctp_bind_addr *, const union sctp_addr *, struct sctp_sock *, struct sctp_sock *); int sctp_bind_addr_state(const struct sctp_bind_addr *bp, const union sctp_addr *addr); int sctp_bind_addrs_check(struct sctp_sock *sp, struct sctp_sock *sp2, int cnt2); union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp, const union sctp_addr *addrs, int addrcnt, struct sctp_sock *opt); union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp, int *addrs_len, gfp_t gfp); int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw, int len, __u16 port, gfp_t gfp); enum sctp_scope sctp_scope(const union sctp_addr *addr); int sctp_in_scope(struct net *net, const union sctp_addr *addr, const enum sctp_scope scope); int sctp_is_any(struct sock *sk, const union sctp_addr *addr); int sctp_is_ep_boundall(struct sock *sk); /* What type of endpoint? */ enum sctp_endpoint_type { SCTP_EP_TYPE_SOCKET, SCTP_EP_TYPE_ASSOCIATION, }; /* * A common base class to bridge the implementation view of a * socket (usually listening) endpoint versus an association's * local endpoint. * This common structure is useful for several purposes: * 1) Common interface for lookup routines. * a) Subfunctions work for either endpoint or association * b) Single interface to lookup allows hiding the lookup lock rather * than acquiring it externally. * 2) Common interface for the inbound chunk handling/state machine. * 3) Common object handling routines for reference counting, etc. * 4) Disentangle association lookup from endpoint lookup, where we * do not have to find our endpoint to find our association. * */ struct sctp_ep_common { /* Runtime type information. What kind of endpoint is this? */ enum sctp_endpoint_type type; /* Some fields to help us manage this object. * refcnt - Reference count access to this object. * dead - Do not attempt to use this object. */ refcount_t refcnt; bool dead; /* What socket does this endpoint belong to? */ struct sock *sk; /* Cache netns and it won't change once set */ struct net *net; /* This is where we receive inbound chunks. */ struct sctp_inq inqueue; /* This substructure includes the defining parameters of the * endpoint: * bind_addr.port is our shared port number. * bind_addr.address_list is our set of local IP addresses. 
*/ struct sctp_bind_addr bind_addr; }; /* RFC Section 1.4 Key Terms * * o SCTP endpoint: The logical sender/receiver of SCTP packets. On a * multi-homed host, an SCTP endpoint is represented to its peers as a * combination of a set of eligible destination transport addresses to * which SCTP packets can be sent and a set of eligible source * transport addresses from which SCTP packets can be received. * All transport addresses used by an SCTP endpoint must use the * same port number, but can use multiple IP addresses. A transport * address used by an SCTP endpoint must not be used by another * SCTP endpoint. In other words, a transport address is unique * to an SCTP endpoint. * * From an implementation perspective, each socket has one of these. * A TCP-style socket will have exactly one association on one of * these. An UDP-style socket will have multiple associations hanging * off one of these. */ struct sctp_endpoint { /* Common substructure for endpoint and association. */ struct sctp_ep_common base; /* Fields to help us manage our entries in the hash tables. */ struct hlist_node node; int hashent; /* Associations: A list of current associations and mappings * to the data consumers for each association. This * may be in the form of a hash table or other * implementation dependent structure. The data * consumers may be process identification * information such as file descriptors, named pipe * pointer, or table pointers dependent on how SCTP * is implemented. */ /* This is really a list of struct sctp_association entries. */ struct list_head asocs; /* Secret Key: A secret key used by this endpoint to compute * the MAC. This SHOULD be a cryptographic quality * random number with a sufficient length. * Discussion in [RFC1750] can be helpful in * selection of the key. */ __u8 secret_key[SCTP_SECRET_SIZE]; /* digest: This is a digest of the sctp cookie. This field is * only used on the receive path when we try to validate * that the cookie has not been tampered with. We put * this here so we pre-allocate this once and can re-use * on every receive. */ __u8 *digest; /* sendbuf acct. policy. */ __u32 sndbuf_policy; /* rcvbuf acct. policy. */ __u32 rcvbuf_policy; /* SCTP AUTH: array of the HMACs that will be allocated * we need this per association so that we don't serialize */ struct crypto_shash **auth_hmacs; /* SCTP-AUTH: hmacs for the endpoint encoded into parameter */ struct sctp_hmac_algo_param *auth_hmacs_list; /* SCTP-AUTH: chunks to authenticate encoded into parameter */ struct sctp_chunks_param *auth_chunk_list; /* SCTP-AUTH: endpoint shared keys */ struct list_head endpoint_shared_keys; __u16 active_key_id; __u8 ecn_enable:1, auth_enable:1, intl_enable:1, prsctp_enable:1, asconf_enable:1, reconf_enable:1; __u8 strreset_enable; struct rcu_head rcu; }; /* Recover the outer endpoint structure. */ static inline struct sctp_endpoint *sctp_ep(struct sctp_ep_common *base) { struct sctp_endpoint *ep; ep = container_of(base, struct sctp_endpoint, base); return ep; } /* These are function signatures for manipulating endpoints. 
*/ struct sctp_endpoint *sctp_endpoint_new(struct sock *, gfp_t); void sctp_endpoint_free(struct sctp_endpoint *); void sctp_endpoint_put(struct sctp_endpoint *); int sctp_endpoint_hold(struct sctp_endpoint *ep); void sctp_endpoint_add_asoc(struct sctp_endpoint *, struct sctp_association *); struct sctp_association *sctp_endpoint_lookup_assoc( const struct sctp_endpoint *ep, const union sctp_addr *paddr, struct sctp_transport **); bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, const union sctp_addr *paddr); struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep, struct net *net, const union sctp_addr *laddr, int dif, int sdif); bool sctp_has_association(struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, int dif, int sdif); int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, enum sctp_cid cid, struct sctp_init_chunk *peer_init, struct sctp_chunk *chunk, struct sctp_chunk **err_chunk); int sctp_process_init(struct sctp_association *, struct sctp_chunk *chunk, const union sctp_addr *peer, struct sctp_init_chunk *init, gfp_t gfp); __u32 sctp_generate_tag(const struct sctp_endpoint *); __u32 sctp_generate_tsn(const struct sctp_endpoint *); struct sctp_inithdr_host { __u32 init_tag; __u32 a_rwnd; __u16 num_outbound_streams; __u16 num_inbound_streams; __u32 initial_tsn; }; struct sctp_stream_priorities { /* List of priorities scheduled */ struct list_head prio_sched; /* List of streams scheduled */ struct list_head active; /* The next stream in line */ struct sctp_stream_out_ext *next; __u16 prio; __u16 users; }; struct sctp_stream_out_ext { __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; struct list_head outq; /* chunks enqueued by this stream */ union { struct { /* Scheduled streams list */ struct list_head prio_list; struct sctp_stream_priorities *prio_head; }; /* Fields used by RR scheduler */ struct { struct list_head rr_list; }; struct { struct list_head fc_list; __u32 fc_length; __u16 fc_weight; }; }; }; struct sctp_stream_out { union { __u32 mid; __u16 ssn; }; __u32 mid_uo; struct sctp_stream_out_ext *ext; __u8 state; }; struct sctp_stream_in { union { __u32 mid; __u16 ssn; }; __u32 mid_uo; __u32 fsn; __u32 fsn_uo; char pd_mode; char pd_mode_uo; }; struct sctp_stream { GENRADIX(struct sctp_stream_out) out; GENRADIX(struct sctp_stream_in) in; __u16 outcnt; __u16 incnt; /* Current stream being sent, if any */ struct sctp_stream_out *out_curr; union { /* Fields used by priority scheduler */ struct { /* List of priorities scheduled */ struct list_head prio_list; }; /* Fields used by RR scheduler */ struct { /* List of streams scheduled */ struct list_head rr_list; /* The next stream in line */ struct sctp_stream_out_ext *rr_next; }; struct { struct list_head fc_list; }; }; struct sctp_stream_interleave *si; }; static inline struct sctp_stream_out *sctp_stream_out( struct sctp_stream *stream, __u16 sid) { return genradix_ptr(&stream->out, sid); } static inline struct sctp_stream_in *sctp_stream_in( struct sctp_stream *stream, __u16 sid) { return genradix_ptr(&stream->in, sid); } #define SCTP_SO(s, i) sctp_stream_out((s), (i)) #define SCTP_SI(s, i) sctp_stream_in((s), (i)) #define SCTP_STREAM_CLOSED 0x00 #define SCTP_STREAM_OPEN 0x01 static inline __u16 sctp_datachk_len(const struct sctp_stream *stream) { return stream->si->data_chunk_len; } static inline __u16 sctp_datahdr_len(const struct sctp_stream *stream) { return 
stream->si->data_chunk_len - sizeof(struct sctp_chunkhdr); } static inline __u16 sctp_ftsnchk_len(const struct sctp_stream *stream) { return stream->si->ftsn_chunk_len; } static inline __u16 sctp_ftsnhdr_len(const struct sctp_stream *stream) { return stream->si->ftsn_chunk_len - sizeof(struct sctp_chunkhdr); } /* SCTP_GET_ASSOC_STATS counters */ struct sctp_priv_assoc_stats { /* Maximum observed rto in the association during subsequent * observations. Value is set to 0 if no RTO measurement took place * The transport where the max_rto was observed is returned in * obs_rto_ipaddr */ struct sockaddr_storage obs_rto_ipaddr; __u64 max_obs_rto; /* Total In and Out SACKs received and sent */ __u64 isacks; __u64 osacks; /* Total In and Out packets received and sent */ __u64 opackets; __u64 ipackets; /* Total retransmitted chunks */ __u64 rtxchunks; /* TSN received > next expected */ __u64 outofseqtsns; /* Duplicate Chunks received */ __u64 idupchunks; /* Gap Ack Blocks received */ __u64 gapcnt; /* Unordered data chunks sent and received */ __u64 ouodchunks; __u64 iuodchunks; /* Ordered data chunks sent and received */ __u64 oodchunks; __u64 iodchunks; /* Control chunks sent and received */ __u64 octrlchunks; __u64 ictrlchunks; }; /* RFC2960 * * 12. Recommended Transmission Control Block (TCB) Parameters * * This section details a recommended set of parameters that should * be contained within the TCB for an implementation. This section is * for illustrative purposes and should not be deemed as requirements * on an implementation or as an exhaustive list of all parameters * inside an SCTP TCB. Each implementation may need its own additional * parameters for optimization. */ /* Here we have information about each individual association. */ struct sctp_association { /* A base structure common to endpoint and association. * In this context, it represents the associations's view * of the local endpoint of the association. */ struct sctp_ep_common base; /* Associations on the same socket. */ struct list_head asocs; /* association id. */ sctp_assoc_t assoc_id; /* This is our parent endpoint. */ struct sctp_endpoint *ep; /* These are those association elements needed in the cookie. */ struct sctp_cookie c; /* This is all information about our peer. */ struct { /* transport_addr_list * * Peer : A list of SCTP transport addresses that the * Transport : peer is bound to. This information is derived * Address : from the INIT or INIT ACK and is used to * List : associate an inbound packet with a given * : association. Normally this information is * : hashed or keyed for quick lookup and access * : of the TCB. * : The list is also initialized with the list * : of addresses passed with the sctp_connectx() * : call. * * It is a list of SCTP_transport's. */ struct list_head transport_addr_list; /* rwnd * * Peer Rwnd : Current calculated value of the peer's rwnd. */ __u32 rwnd; /* transport_count * * Peer : A count of the number of peer addresses * Transport : in the Peer Transport Address List. * Address : * Count : */ __u16 transport_count; /* port * The transport layer port number. */ __u16 port; /* primary_path * * Primary : This is the current primary destination * Path : transport address of the peer endpoint. It * : may also specify a source transport address * : on this endpoint. * * All of these paths live on transport_addr_list. * * At the bakeoffs, we discovered that the intent of * primaryPath is that it only changes when the ULP * asks to have it changed. 
We add the activePath to * designate the connection we are currently using to * transmit new data and most control chunks. */ struct sctp_transport *primary_path; /* Cache the primary path address here, when we * need a an address for msg_name. */ union sctp_addr primary_addr; /* active_path * The path that we are currently using to * transmit new data and most control chunks. */ struct sctp_transport *active_path; /* retran_path * * RFC2960 6.4 Multi-homed SCTP Endpoints * ... * Furthermore, when its peer is multi-homed, an * endpoint SHOULD try to retransmit a chunk to an * active destination transport address that is * different from the last destination address to * which the DATA chunk was sent. */ struct sctp_transport *retran_path; /* Pointer to last transport I have sent on. */ struct sctp_transport *last_sent_to; /* This is the last transport I have received DATA on. */ struct sctp_transport *last_data_from; /* * Mapping An array of bits or bytes indicating which out of * Array order TSN's have been received (relative to the * Last Rcvd TSN). If no gaps exist, i.e. no out of * order packets have been received, this array * will be set to all zero. This structure may be * in the form of a circular buffer or bit array. * * Last Rcvd : This is the last TSN received in * TSN : sequence. This value is set initially by * : taking the peer's Initial TSN, received in * : the INIT or INIT ACK chunk, and subtracting * : one from it. * * Throughout most of the specification this is called the * "Cumulative TSN ACK Point". In this case, we * ignore the advice in 12.2 in favour of the term * used in the bulk of the text. This value is hidden * in tsn_map--we get it by calling sctp_tsnmap_get_ctsn(). */ struct sctp_tsnmap tsn_map; /* This mask is used to disable sending the ASCONF chunk * with specified parameter to peer. */ __be16 addip_disabled_mask; /* These are capabilities which our peer advertised. */ __u16 ecn_capable:1, /* Can peer do ECN? */ ipv4_address:1, /* Peer understands IPv4 addresses? */ ipv6_address:1, /* Peer understands IPv6 addresses? */ asconf_capable:1, /* Does peer support ADDIP? */ prsctp_capable:1, /* Can peer do PR-SCTP? */ reconf_capable:1, /* Can peer do RE-CONFIG? */ intl_capable:1, /* Can peer do INTERLEAVE */ auth_capable:1, /* Is peer doing SCTP-AUTH? */ /* sack_needed: * This flag indicates if the next received * packet is to be responded to with a * SACK. This is initialized to 0. When a packet * is received sack_cnt is incremented. If this value * reaches 2 or more, a SACK is sent and the * value is reset to 0. Note: This is used only * when no DATA chunks are received out of * order. When DATA chunks are out of order, * SACK's are not delayed (see Section 6). */ sack_needed:1, /* Do we need to sack the peer? */ sack_generation:1, zero_window_announced:1; __u32 sack_cnt; __u32 adaptation_ind; /* Adaptation Code point. */ struct sctp_inithdr_host i; void *cookie; int cookie_len; /* ADDIP Section 4.2 Upon reception of an ASCONF Chunk. * C1) ... "Peer-Serial-Number'. This value MUST be initialized to the * Initial TSN Value minus 1 */ __u32 addip_serial; /* SCTP-AUTH: We need to know pears random number, hmac list * and authenticated chunk list. All that is part of the * cookie and these are just pointers to those locations */ struct sctp_random_param *peer_random; struct sctp_chunks_param *peer_chunks; struct sctp_hmac_algo_param *peer_hmacs; } peer; /* State : A state variable indicating what state the * : association is in, i.e. 
COOKIE-WAIT, * : COOKIE-ECHOED, ESTABLISHED, SHUTDOWN-PENDING, * : SHUTDOWN-SENT, SHUTDOWN-RECEIVED, SHUTDOWN-ACK-SENT. * * Note: No "CLOSED" state is illustrated since if a * association is "CLOSED" its TCB SHOULD be removed. * * In this implementation we DO have a CLOSED * state which is used during initiation and shutdown. * * State takes values from SCTP_STATE_*. */ enum sctp_state state; /* Overall : The overall association error count. * Error Count : [Clear this any time I get something.] */ int overall_error_count; /* The cookie life I award for any cookie. */ ktime_t cookie_life; /* These are the association's initial, max, and min RTO values. * These values will be initialized by system defaults, but can * be modified via the SCTP_RTOINFO socket option. */ unsigned long rto_initial; unsigned long rto_max; unsigned long rto_min; /* Maximum number of new data packets that can be sent in a burst. */ int max_burst; /* This is the max_retrans value for the association. This value will * be initialized from system defaults, but can be * modified by the SCTP_ASSOCINFO socket option. */ int max_retrans; /* This is the partially failed retrans value for the transport * and will be initialized from the assocs value. This can be * changed using the SCTP_PEER_ADDR_THLDS socket option */ __u16 pf_retrans; /* Used for primary path switchover. */ __u16 ps_retrans; /* Maximum number of times the endpoint will retransmit INIT */ __u16 max_init_attempts; /* How many times have we resent an INIT? */ __u16 init_retries; /* The largest timeout or RTO value to use in attempting an INIT */ unsigned long max_init_timeo; /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to * the destination address every heartbeat interval. This value * will be inherited by all new transports. */ unsigned long hbinterval; unsigned long probe_interval; __be16 encap_port; /* This is the max_retrans value for new transports in the * association. */ __u16 pathmaxrxt; __u32 flowlabel; __u8 dscp; /* Flag that path mtu update is pending */ __u8 pmtu_pending; /* Association : The smallest PMTU discovered for all of the * PMTU : peer's transport addresses. */ __u32 pathmtu; /* Flags controlling Heartbeat, SACK delay, and Path MTU Discovery. */ __u32 param_flags; __u32 sackfreq; /* SACK delay timeout */ unsigned long sackdelay; unsigned long timeouts[SCTP_NUM_TIMEOUT_TYPES]; struct timer_list timers[SCTP_NUM_TIMEOUT_TYPES]; /* Transport to which SHUTDOWN chunk was last sent. */ struct sctp_transport *shutdown_last_sent_to; /* Transport to which INIT chunk was last sent. */ struct sctp_transport *init_last_sent_to; /* How many times have we resent a SHUTDOWN */ int shutdown_retries; /* Next TSN : The next TSN number to be assigned to a new * : DATA chunk. This is sent in the INIT or INIT * : ACK chunk to the peer and incremented each * : time a DATA chunk is assigned a TSN * : (normally just prior to transmit or during * : fragmentation). */ __u32 next_tsn; /* * Last Rcvd : This is the last TSN received in sequence. This value * TSN : is set initially by taking the peer's Initial TSN, * : received in the INIT or INIT ACK chunk, and * : subtracting one from it. * * Most of RFC 2960 refers to this as the Cumulative TSN Ack Point. */ __u32 ctsn_ack_point; /* PR-SCTP Advanced.Peer.Ack.Point */ __u32 adv_peer_ack_point; /* Highest TSN that is acknowledged by incoming SACKs. 
*/ __u32 highest_sacked; /* TSN marking the fast recovery exit point */ __u32 fast_recovery_exit; /* Flag to track the current fast recovery state */ __u8 fast_recovery; /* The number of unacknowledged data chunks. Reported through * the SCTP_STATUS sockopt. */ __u16 unack_data; /* The total number of data chunks that we've had to retransmit * as the result of a T3 timer expiration */ __u32 rtx_data_chunks; /* This is the association's receive buffer space. This value is used * to set a_rwnd field in an INIT or a SACK chunk. */ __u32 rwnd; /* This is the last advertised value of rwnd over a SACK chunk. */ __u32 a_rwnd; /* Number of bytes by which the rwnd has slopped. The rwnd is allowed * to slop over a maximum of the association's frag_point. */ __u32 rwnd_over; /* Keeps treack of rwnd pressure. This happens when we have * a window, but not receive buffer (i.e small packets). This one * is releases slowly (1 PMTU at a time ). */ __u32 rwnd_press; /* This is the sndbuf size in use for the association. * This corresponds to the sndbuf size for the association, * as specified in the sk->sndbuf. */ int sndbuf_used; /* This is the amount of memory that this association has allocated * in the receive path at any given time. */ atomic_t rmem_alloc; /* This is the wait queue head for send requests waiting on * the association sndbuf space. */ wait_queue_head_t wait; /* The message size at which SCTP fragmentation will occur. */ __u32 frag_point; __u32 user_frag; /* Counter used to count INIT errors. */ int init_err_counter; /* Count the number of INIT cycles (for doubling timeout). */ int init_cycle; /* Default send parameters. */ __u16 default_stream; __u16 default_flags; __u32 default_ppid; __u32 default_context; __u32 default_timetolive; /* Default receive parameters */ __u32 default_rcv_context; /* Stream arrays */ struct sctp_stream stream; /* All outbound chunks go through this structure. */ struct sctp_outq outqueue; /* A smart pipe that will handle reordering and fragmentation, * as well as handle passing events up to the ULP. */ struct sctp_ulpq ulpq; /* Last TSN that caused an ECNE Chunk to be sent. */ __u32 last_ecne_tsn; /* Last TSN that caused a CWR Chunk to be sent. */ __u32 last_cwr_tsn; /* How many duplicated TSNs have we seen? */ int numduptsns; /* These are to support * "SCTP Extensions for Dynamic Reconfiguration of IP Addresses * and Enforcement of Flow and Message Limits" * <draft-ietf-tsvwg-addip-sctp-02.txt> * or "ADDIP" for short. */ /* ADDIP Section 4.1.1 Congestion Control of ASCONF Chunks * * R1) One and only one ASCONF Chunk MAY be in transit and * unacknowledged at any one time. If a sender, after sending * an ASCONF chunk, decides it needs to transfer another * ASCONF Chunk, it MUST wait until the ASCONF-ACK Chunk * returns from the previous ASCONF Chunk before sending a * subsequent ASCONF. Note this restriction binds each side, * so at any time two ASCONF may be in-transit on any given * association (one sent from each endpoint). * * [This is our one-and-only-one ASCONF in flight. If we do * not have an ASCONF in flight, this is NULL.] */ struct sctp_chunk *addip_last_asconf; /* ADDIP Section 5.2 Upon reception of an ASCONF Chunk. * * This is needed to implement items E1 - E4 of the updated * spec. Here is the justification: * * Since the peer may bundle multiple ASCONF chunks toward us, * we now need the ability to cache multiple ACKs. The section * describes in detail how they are cached and cleaned up. 
*/ struct list_head asconf_ack_list; /* These ASCONF chunks are waiting to be sent. * * These chunks can't be pushed to outqueue until receiving * ASCONF_ACK for the previous ASCONF indicated by * addip_last_asconf, so as to guarantee that only one ASCONF * is in flight at any time. * * ADDIP Section 4.1.1 Congestion Control of ASCONF Chunks * * In defining the ASCONF Chunk transfer procedures, it is * essential that these transfers MUST NOT cause congestion * within the network. To achieve this, we place these * restrictions on the transfer of ASCONF Chunks: * * R1) One and only one ASCONF Chunk MAY be in transit and * unacknowledged at any one time. If a sender, after sending * an ASCONF chunk, decides it needs to transfer another * ASCONF Chunk, it MUST wait until the ASCONF-ACK Chunk * returns from the previous ASCONF Chunk before sending a * subsequent ASCONF. Note this restriction binds each side, * so at any time two ASCONF may be in-transit on any given * association (one sent from each endpoint). * * * [I really think this is EXACTLY the sort of intelligence * which already resides in sctp_outq. Please move this * queue and its supporting logic down there. --piggy] */ struct list_head addip_chunk_list; /* ADDIP Section 4.1 ASCONF Chunk Procedures * * A2) A serial number should be assigned to the Chunk. The * serial number SHOULD be a monotonically increasing * number. The serial number SHOULD be initialized at * the start of the association to the same value as the * Initial TSN and every time a new ASCONF chunk is created * it is incremented by one after assigning the serial number * to the newly created chunk. * * ADDIP * 3.1.1 Address/Stream Configuration Change Chunk (ASCONF) * * Serial Number : 32 bits (unsigned integer) * * This value represents a Serial Number for the ASCONF * Chunk. The valid range of Serial Number is from 0 to * 4294967295 (2^32 - 1). Serial Numbers wrap back to 0 * after reaching 4294967295. */ __u32 addip_serial; int src_out_of_asoc_ok; union sctp_addr *asconf_addr_del_pending; struct sctp_transport *new_transport; /* SCTP AUTH: list of the endpoint shared keys. These * keys are provided out of band by the user application * and can't change during the lifetime of the association */ struct list_head endpoint_shared_keys; /* SCTP AUTH: * The current generated association shared key (secret) */ struct sctp_auth_bytes *asoc_shared_key; struct sctp_shared_key *shkey; /* SCTP AUTH: hmac id of the first peer requested algorithm * that we support. */ __u16 default_hmac_id; __u16 active_key_id; __u8 need_ecne:1, /* Need to send an ECNE Chunk? */ temp:1, /* Is it a temporary association? */ pf_expose:2, /* Expose pf state? */ force_delay:1; __u8 strreset_enable; __u8 strreset_outstanding; /* request param count on the fly */ __u32 strreset_outseq; /* Update after receiving response */ __u32 strreset_inseq; /* Update after receiving request */ __u32 strreset_result[2]; /* save the results of last 2 responses */ struct sctp_chunk *strreset_chunk; /* save request chunk */ struct sctp_priv_assoc_stats stats; int sent_cnt_removable; __u16 subscribe; __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; /* Security identifiers from incoming (INIT). These are set by * security_sctp_assoc_request(). These will only be used by * SCTP TCP type sockets and peeled off connections as they * cause a new socket to be generated. security_sctp_sk_clone() * will then plug these into the new socket. 
*/ u32 secid; u32 peer_secid; struct rcu_head rcu; }; /* An eyecatcher for determining if we are really looking at an * association data structure. */ enum { SCTP_ASSOC_EYECATCHER = 0xa550c123, }; /* Recover the outer association structure. */ static inline struct sctp_association *sctp_assoc(struct sctp_ep_common *base) { struct sctp_association *asoc; asoc = container_of(base, struct sctp_association, base); return asoc; } /* These are function signatures for manipulating associations. */ struct sctp_association * sctp_association_new(const struct sctp_endpoint *ep, const struct sock *sk, enum sctp_scope scope, gfp_t gfp); void sctp_association_free(struct sctp_association *); void sctp_association_put(struct sctp_association *); void sctp_association_hold(struct sctp_association *); struct sctp_transport *sctp_assoc_choose_alter_transport( struct sctp_association *, struct sctp_transport *); void sctp_assoc_update_retran_path(struct sctp_association *); struct sctp_transport *sctp_assoc_lookup_paddr(const struct sctp_association *, const union sctp_addr *); int sctp_assoc_lookup_laddr(struct sctp_association *asoc, const union sctp_addr *laddr); struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *, const union sctp_addr *address, const gfp_t gfp, const int peer_state); void sctp_assoc_rm_peer(struct sctp_association *asoc, struct sctp_transport *peer); void sctp_assoc_control_transport(struct sctp_association *asoc, struct sctp_transport *transport, enum sctp_transport_cmd command, sctp_sn_error_t error); struct sctp_transport *sctp_assoc_lookup_tsn(struct sctp_association *, __u32); void sctp_assoc_migrate(struct sctp_association *, struct sock *); int sctp_assoc_update(struct sctp_association *old, struct sctp_association *new); __u32 sctp_association_get_next_tsn(struct sctp_association *); void sctp_assoc_update_frag_point(struct sctp_association *asoc); void sctp_assoc_set_pmtu(struct sctp_association *asoc, __u32 pmtu); void sctp_assoc_sync_pmtu(struct sctp_association *asoc); void sctp_assoc_rwnd_increase(struct sctp_association *, unsigned int); void sctp_assoc_rwnd_decrease(struct sctp_association *, unsigned int); void sctp_assoc_set_primary(struct sctp_association *, struct sctp_transport *); void sctp_assoc_del_nonprimary_peers(struct sctp_association *, struct sctp_transport *); int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, enum sctp_scope scope, gfp_t gfp); int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *, struct sctp_cookie*, gfp_t gfp); int sctp_assoc_set_id(struct sctp_association *, gfp_t); void sctp_assoc_clean_asconf_ack_cache(const struct sctp_association *asoc); struct sctp_chunk *sctp_assoc_lookup_asconf_ack( const struct sctp_association *asoc, __be32 serial); void sctp_asconf_queue_teardown(struct sctp_association *asoc); int sctp_cmp_addr_exact(const union sctp_addr *ss1, const union sctp_addr *ss2); struct sctp_chunk *sctp_get_ecne_prepend(struct sctp_association *asoc); /* A convenience structure to parse out SCTP specific CMSGs. */ struct sctp_cmsgs { struct sctp_initmsg *init; struct sctp_sndrcvinfo *srinfo; struct sctp_sndinfo *sinfo; struct sctp_prinfo *prinfo; struct sctp_authinfo *authinfo; struct msghdr *addrs_msg; }; /* Structure for tracking memory objects */ struct sctp_dbg_objcnt_entry { char *label; atomic_t *counter; }; #endif /* __sctp_structs_h__ */
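The sctp_ep_common base above is shared by endpoints and associations: its enum sctp_endpoint_type field records which outer object the base is embedded in, and sctp_ep()/sctp_assoc() recover that object with container_of(). The following is a minimal sketch, assuming the declarations above are in scope via <net/sctp/structs.h> (plus <linux/printk.h> for pr_debug); the helper name is hypothetical and only illustrates how a receive-path caller holding chunk->rcvr might dispatch on the type field.

/*
 * Hypothetical helper, not part of the header above: given the
 * sctp_ep_common base that received a chunk (chunk->rcvr), use the
 * runtime type information to recover the outer endpoint or
 * association via the container_of-based accessors defined above.
 */
static void example_dispatch_rcvr(struct sctp_ep_common *rcvr)
{
	switch (rcvr->type) {
	case SCTP_EP_TYPE_SOCKET: {
		struct sctp_endpoint *ep = sctp_ep(rcvr);

		pr_debug("chunk received on endpoint bound to port %u\n",
			 ep->base.bind_addr.port);
		break;
	}
	case SCTP_EP_TYPE_ASSOCIATION: {
		struct sctp_association *asoc = sctp_assoc(rcvr);

		pr_debug("chunk received on association in state %d\n",
			 asoc->state);
		break;
	}
	}
}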
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Portions of this file
 * Copyright(c) 2016-2017 Intel Deutschland GmbH
 * Copyright (C) 2018, 2021-2025 Intel Corporation
 */
#ifndef __CFG80211_RDEV_OPS
#define __CFG80211_RDEV_OPS
#include <linux/rtnetlink.h>
#include <net/cfg80211.h>
#include "core.h" #include "trace.h" static inline int rdev_suspend(struct cfg80211_registered_device *rdev, struct cfg80211_wowlan *wowlan) { int ret; trace_rdev_suspend(&rdev->wiphy, wowlan); ret = rdev->ops->suspend(&rdev->wiphy, wowlan); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_resume(struct cfg80211_registered_device *rdev) { int ret; trace_rdev_resume(&rdev->wiphy); ret = rdev->ops->resume(&rdev->wiphy); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_set_wakeup(struct cfg80211_registered_device *rdev, bool enabled) { trace_rdev_set_wakeup(&rdev->wiphy, enabled); rdev->ops->set_wakeup(&rdev->wiphy, enabled); trace_rdev_return_void(&rdev->wiphy); } static inline struct wireless_dev *rdev_add_virtual_intf(struct cfg80211_registered_device *rdev, char *name, unsigned char name_assign_type, enum nl80211_iftype type, struct vif_params *params) { struct wireless_dev *ret; trace_rdev_add_virtual_intf(&rdev->wiphy, name, type); ret = rdev->ops->add_virtual_intf(&rdev->wiphy, name, name_assign_type, type, params); trace_rdev_return_wdev(&rdev->wiphy, ret); return ret; } static inline int rdev_del_virtual_intf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { int ret; trace_rdev_del_virtual_intf(&rdev->wiphy, wdev); ret = rdev->ops->del_virtual_intf(&rdev->wiphy, wdev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_virtual_intf(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype type, struct vif_params *params) { int ret; trace_rdev_change_virtual_intf(&rdev->wiphy, dev, type); ret = rdev->ops->change_virtual_intf(&rdev->wiphy, dev, type, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_add_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params) { int ret; trace_rdev_add_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr, params->mode); ret = rdev->ops->add_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params*)) { int ret; trace_rdev_get_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr); ret = rdev->ops->get_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr, cookie, callback); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index, bool pairwise, const u8 *mac_addr) { int ret; trace_rdev_del_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr); ret = rdev->ops->del_key(&rdev->wiphy, netdev, link_id, key_index, pairwise, mac_addr); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_default_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index, bool unicast, bool multicast) { int ret; trace_rdev_set_default_key(&rdev->wiphy, netdev, link_id, key_index, unicast, multicast); ret = rdev->ops->set_default_key(&rdev->wiphy, netdev, link_id, key_index, unicast, multicast); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int 
rdev_set_default_mgmt_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index) { int ret; trace_rdev_set_default_mgmt_key(&rdev->wiphy, netdev, link_id, key_index); ret = rdev->ops->set_default_mgmt_key(&rdev->wiphy, netdev, link_id, key_index); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_default_beacon_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, int link_id, u8 key_index) { int ret; trace_rdev_set_default_beacon_key(&rdev->wiphy, netdev, link_id, key_index); ret = rdev->ops->set_default_beacon_key(&rdev->wiphy, netdev, link_id, key_index); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_start_ap(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ap_settings *settings) { int ret; trace_rdev_start_ap(&rdev->wiphy, dev, settings); ret = rdev->ops->start_ap(&rdev->wiphy, dev, settings); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_beacon(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ap_update *info) { int ret; trace_rdev_change_beacon(&rdev->wiphy, dev, info); ret = rdev->ops->change_beacon(&rdev->wiphy, dev, info); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_stop_ap(struct cfg80211_registered_device *rdev, struct net_device *dev, unsigned int link_id) { int ret; trace_rdev_stop_ap(&rdev->wiphy, dev, link_id); ret = rdev->ops->stop_ap(&rdev->wiphy, dev, link_id); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_add_station(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *mac, struct station_parameters *params) { int ret; trace_rdev_add_station(&rdev->wiphy, dev, mac, params); ret = rdev->ops->add_station(&rdev->wiphy, dev, mac, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_station(struct cfg80211_registered_device *rdev, struct net_device *dev, struct station_del_parameters *params) { int ret; trace_rdev_del_station(&rdev->wiphy, dev, params); ret = rdev->ops->del_station(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_station(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *mac, struct station_parameters *params) { int ret; trace_rdev_change_station(&rdev->wiphy, dev, mac, params); ret = rdev->ops->change_station(&rdev->wiphy, dev, mac, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_station(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *mac, struct station_info *sinfo) { int ret; trace_rdev_get_station(&rdev->wiphy, dev, mac); ret = rdev->ops->get_station(&rdev->wiphy, dev, mac, sinfo); trace_rdev_return_int_station_info(&rdev->wiphy, ret, sinfo); return ret; } static inline int rdev_dump_station(struct cfg80211_registered_device *rdev, struct net_device *dev, int idx, u8 *mac, struct station_info *sinfo) { int ret; trace_rdev_dump_station(&rdev->wiphy, dev, idx, mac); ret = rdev->ops->dump_station(&rdev->wiphy, dev, idx, mac, sinfo); trace_rdev_return_int_station_info(&rdev->wiphy, ret, sinfo); return ret; } static inline int rdev_add_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *next_hop) { int ret; trace_rdev_add_mpath(&rdev->wiphy, dev, dst, next_hop); ret = rdev->ops->add_mpath(&rdev->wiphy, dev, dst, next_hop); 
trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst) { int ret; trace_rdev_del_mpath(&rdev->wiphy, dev, dst); ret = rdev->ops->del_mpath(&rdev->wiphy, dev, dst); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *next_hop) { int ret; trace_rdev_change_mpath(&rdev->wiphy, dev, dst, next_hop); ret = rdev->ops->change_mpath(&rdev->wiphy, dev, dst, next_hop); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *next_hop, struct mpath_info *pinfo) { int ret; trace_rdev_get_mpath(&rdev->wiphy, dev, dst, next_hop); ret = rdev->ops->get_mpath(&rdev->wiphy, dev, dst, next_hop, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_get_mpp(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *mpp, struct mpath_info *pinfo) { int ret; trace_rdev_get_mpp(&rdev->wiphy, dev, dst, mpp); ret = rdev->ops->get_mpp(&rdev->wiphy, dev, dst, mpp, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_dump_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, int idx, u8 *dst, u8 *next_hop, struct mpath_info *pinfo) { int ret; trace_rdev_dump_mpath(&rdev->wiphy, dev, idx, dst, next_hop); ret = rdev->ops->dump_mpath(&rdev->wiphy, dev, idx, dst, next_hop, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_dump_mpp(struct cfg80211_registered_device *rdev, struct net_device *dev, int idx, u8 *dst, u8 *mpp, struct mpath_info *pinfo) { int ret; trace_rdev_dump_mpp(&rdev->wiphy, dev, idx, dst, mpp); ret = rdev->ops->dump_mpp(&rdev->wiphy, dev, idx, dst, mpp, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_get_mesh_config(struct cfg80211_registered_device *rdev, struct net_device *dev, struct mesh_config *conf) { int ret; trace_rdev_get_mesh_config(&rdev->wiphy, dev); ret = rdev->ops->get_mesh_config(&rdev->wiphy, dev, conf); trace_rdev_return_int_mesh_config(&rdev->wiphy, ret, conf); return ret; } static inline int rdev_update_mesh_config(struct cfg80211_registered_device *rdev, struct net_device *dev, u32 mask, const struct mesh_config *nconf) { int ret; trace_rdev_update_mesh_config(&rdev->wiphy, dev, mask, nconf); ret = rdev->ops->update_mesh_config(&rdev->wiphy, dev, mask, nconf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_join_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev, const struct mesh_config *conf, const struct mesh_setup *setup) { int ret; trace_rdev_join_mesh(&rdev->wiphy, dev, conf, setup); ret = rdev->ops->join_mesh(&rdev->wiphy, dev, conf, setup); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_leave_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev) { int ret; trace_rdev_leave_mesh(&rdev->wiphy, dev); ret = rdev->ops->leave_mesh(&rdev->wiphy, dev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_join_ocb(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ocb_setup *setup) { int ret; trace_rdev_join_ocb(&rdev->wiphy, dev, setup); ret = 
rdev->ops->join_ocb(&rdev->wiphy, dev, setup); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_leave_ocb(struct cfg80211_registered_device *rdev, struct net_device *dev) { int ret; trace_rdev_leave_ocb(&rdev->wiphy, dev); ret = rdev->ops->leave_ocb(&rdev->wiphy, dev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_bss(struct cfg80211_registered_device *rdev, struct net_device *dev, struct bss_parameters *params) { int ret; trace_rdev_change_bss(&rdev->wiphy, dev, params); ret = rdev->ops->change_bss(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_inform_bss(struct cfg80211_registered_device *rdev, struct cfg80211_bss *bss, const struct cfg80211_bss_ies *ies, void *drv_data) { trace_rdev_inform_bss(&rdev->wiphy, bss); if (rdev->ops->inform_bss) rdev->ops->inform_bss(&rdev->wiphy, bss, ies, drv_data); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_set_txq_params(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ieee80211_txq_params *params) { int ret; trace_rdev_set_txq_params(&rdev->wiphy, dev, params); ret = rdev->ops->set_txq_params(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_libertas_set_mesh_channel(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ieee80211_channel *chan) { int ret; trace_rdev_libertas_set_mesh_channel(&rdev->wiphy, dev, chan); ret = rdev->ops->libertas_set_mesh_channel(&rdev->wiphy, dev, chan); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_monitor_channel(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_set_monitor_channel(&rdev->wiphy, dev, chandef); ret = rdev->ops->set_monitor_channel(&rdev->wiphy, dev, chandef); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_scan(struct cfg80211_registered_device *rdev, struct cfg80211_scan_request_int *request) { int ret; if (WARN_ON_ONCE(!request->req.n_ssids && request->req.ssids)) return -EINVAL; trace_rdev_scan(&rdev->wiphy, request); ret = rdev->ops->scan(&rdev->wiphy, &request->req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_abort_scan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { trace_rdev_abort_scan(&rdev->wiphy, wdev); rdev->ops->abort_scan(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_auth_request *req) { int ret; trace_rdev_auth(&rdev->wiphy, dev, req); ret = rdev->ops->auth(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_assoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_assoc_request *req) { int ret; trace_rdev_assoc(&rdev->wiphy, dev, req); ret = rdev->ops->assoc(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_deauth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_deauth_request *req) { int ret; trace_rdev_deauth(&rdev->wiphy, dev, req); ret = rdev->ops->deauth(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_disassoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_disassoc_request 
*req) { int ret; trace_rdev_disassoc(&rdev->wiphy, dev, req); ret = rdev->ops->disassoc(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_connect(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_connect_params *sme) { int ret; trace_rdev_connect(&rdev->wiphy, dev, sme); ret = rdev->ops->connect(&rdev->wiphy, dev, sme); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_update_connect_params(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_connect_params *sme, u32 changed) { int ret; trace_rdev_update_connect_params(&rdev->wiphy, dev, sme, changed); ret = rdev->ops->update_connect_params(&rdev->wiphy, dev, sme, changed); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_disconnect(struct cfg80211_registered_device *rdev, struct net_device *dev, u16 reason_code) { int ret; trace_rdev_disconnect(&rdev->wiphy, dev, reason_code); ret = rdev->ops->disconnect(&rdev->wiphy, dev, reason_code); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_join_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ibss_params *params) { int ret; trace_rdev_join_ibss(&rdev->wiphy, dev, params); ret = rdev->ops->join_ibss(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_leave_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev) { int ret; trace_rdev_leave_ibss(&rdev->wiphy, dev); ret = rdev->ops->leave_ibss(&rdev->wiphy, dev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, int radio_idx, u32 changed) { int ret = -EOPNOTSUPP; trace_rdev_set_wiphy_params(&rdev->wiphy, radio_idx, changed); if (rdev->ops->set_wiphy_params) ret = rdev->ops->set_wiphy_params(&rdev->wiphy, radio_idx, changed); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_tx_power(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, int radio_idx, enum nl80211_tx_power_setting type, int mbm) { int ret; trace_rdev_set_tx_power(&rdev->wiphy, wdev, radio_idx, type, mbm); ret = rdev->ops->set_tx_power(&rdev->wiphy, wdev, radio_idx, type, mbm); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_tx_power(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, int radio_idx, unsigned int link_id, int *dbm) { int ret; trace_rdev_get_tx_power(&rdev->wiphy, wdev, radio_idx, link_id); ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, radio_idx, link_id, dbm); trace_rdev_return_int_int(&rdev->wiphy, ret, *dbm); return ret; } static inline int rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev, struct net_device *dev, const bool enabled) { int ret; trace_rdev_set_multicast_to_unicast(&rdev->wiphy, dev, enabled); ret = rdev->ops->set_multicast_to_unicast(&rdev->wiphy, dev, enabled); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_txq_stats(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_txq_stats *txqstats) { int ret; trace_rdev_get_txq_stats(&rdev->wiphy, wdev); ret = rdev->ops->get_txq_stats(&rdev->wiphy, wdev, txqstats); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev) { 
trace_rdev_rfkill_poll(&rdev->wiphy); rdev->ops->rfkill_poll(&rdev->wiphy); trace_rdev_return_void(&rdev->wiphy); } #ifdef CONFIG_NL80211_TESTMODE static inline int rdev_testmode_cmd(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, void *data, int len) { int ret; trace_rdev_testmode_cmd(&rdev->wiphy, wdev); ret = rdev->ops->testmode_cmd(&rdev->wiphy, wdev, data, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_testmode_dump(struct cfg80211_registered_device *rdev, struct sk_buff *skb, struct netlink_callback *cb, void *data, int len) { int ret; trace_rdev_testmode_dump(&rdev->wiphy); ret = rdev->ops->testmode_dump(&rdev->wiphy, skb, cb, data, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } #endif static inline int rdev_set_bitrate_mask(struct cfg80211_registered_device *rdev, struct net_device *dev, unsigned int link_id, const u8 *peer, const struct cfg80211_bitrate_mask *mask) { int ret; trace_rdev_set_bitrate_mask(&rdev->wiphy, dev, link_id, peer, mask); ret = rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, link_id, peer, mask); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_dump_survey(struct cfg80211_registered_device *rdev, struct net_device *netdev, int idx, struct survey_info *info) { int ret; trace_rdev_dump_survey(&rdev->wiphy, netdev, idx); ret = rdev->ops->dump_survey(&rdev->wiphy, netdev, idx, info); if (ret < 0) trace_rdev_return_int(&rdev->wiphy, ret); else trace_rdev_return_int_survey_info(&rdev->wiphy, ret, info); return ret; } static inline int rdev_set_pmksa(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_pmksa *pmksa) { int ret; trace_rdev_set_pmksa(&rdev->wiphy, netdev, pmksa); ret = rdev->ops->set_pmksa(&rdev->wiphy, netdev, pmksa); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_pmksa(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_pmksa *pmksa) { int ret; trace_rdev_del_pmksa(&rdev->wiphy, netdev, pmksa); ret = rdev->ops->del_pmksa(&rdev->wiphy, netdev, pmksa); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_flush_pmksa(struct cfg80211_registered_device *rdev, struct net_device *netdev) { int ret; trace_rdev_flush_pmksa(&rdev->wiphy, netdev); ret = rdev->ops->flush_pmksa(&rdev->wiphy, netdev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_remain_on_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct ieee80211_channel *chan, unsigned int duration, u64 *cookie) { int ret; trace_rdev_remain_on_channel(&rdev->wiphy, wdev, chan, duration); ret = rdev->ops->remain_on_channel(&rdev->wiphy, wdev, chan, duration, cookie); trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); return ret; } static inline int rdev_cancel_remain_on_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u64 cookie) { int ret; trace_rdev_cancel_remain_on_channel(&rdev->wiphy, wdev, cookie); ret = rdev->ops->cancel_remain_on_channel(&rdev->wiphy, wdev, cookie); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_mgmt_tx(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie) { int ret; trace_rdev_mgmt_tx(&rdev->wiphy, wdev, params); ret = rdev->ops->mgmt_tx(&rdev->wiphy, wdev, params, cookie); trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); return ret; } static inline int 
rdev_tx_control_port(struct cfg80211_registered_device *rdev, struct net_device *dev, const void *buf, size_t len, const u8 *dest, __be16 proto, const bool noencrypt, int link, u64 *cookie) { int ret; trace_rdev_tx_control_port(&rdev->wiphy, dev, buf, len, dest, proto, noencrypt, link); ret = rdev->ops->tx_control_port(&rdev->wiphy, dev, buf, len, dest, proto, noencrypt, link, cookie); if (cookie) trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); else trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_mgmt_tx_cancel_wait(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u64 cookie) { int ret; trace_rdev_mgmt_tx_cancel_wait(&rdev->wiphy, wdev, cookie); ret = rdev->ops->mgmt_tx_cancel_wait(&rdev->wiphy, wdev, cookie); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_power_mgmt(struct cfg80211_registered_device *rdev, struct net_device *dev, bool enabled, int timeout) { int ret; trace_rdev_set_power_mgmt(&rdev->wiphy, dev, enabled, timeout); ret = rdev->ops->set_power_mgmt(&rdev->wiphy, dev, enabled, timeout); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_cqm_rssi_config(struct cfg80211_registered_device *rdev, struct net_device *dev, s32 rssi_thold, u32 rssi_hyst) { int ret; trace_rdev_set_cqm_rssi_config(&rdev->wiphy, dev, rssi_thold, rssi_hyst); ret = rdev->ops->set_cqm_rssi_config(&rdev->wiphy, dev, rssi_thold, rssi_hyst); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_cqm_rssi_range_config(struct cfg80211_registered_device *rdev, struct net_device *dev, s32 low, s32 high) { int ret; trace_rdev_set_cqm_rssi_range_config(&rdev->wiphy, dev, low, high); ret = rdev->ops->set_cqm_rssi_range_config(&rdev->wiphy, dev, low, high); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_cqm_txe_config(struct cfg80211_registered_device *rdev, struct net_device *dev, u32 rate, u32 pkts, u32 intvl) { int ret; trace_rdev_set_cqm_txe_config(&rdev->wiphy, dev, rate, pkts, intvl); ret = rdev->ops->set_cqm_txe_config(&rdev->wiphy, dev, rate, pkts, intvl); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_update_mgmt_frame_registrations(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct mgmt_frame_regs *upd) { might_sleep(); trace_rdev_update_mgmt_frame_registrations(&rdev->wiphy, wdev, upd); if (rdev->ops->update_mgmt_frame_registrations) rdev->ops->update_mgmt_frame_registrations(&rdev->wiphy, wdev, upd); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_set_antenna(struct cfg80211_registered_device *rdev, int radio_idx, u32 tx_ant, u32 rx_ant) { int ret; trace_rdev_set_antenna(&rdev->wiphy, radio_idx, tx_ant, rx_ant); ret = rdev->ops->set_antenna(&rdev->wiphy, radio_idx, tx_ant, rx_ant); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_antenna(struct cfg80211_registered_device *rdev, int radio_idx, u32 *tx_ant, u32 *rx_ant) { int ret; trace_rdev_get_antenna(&rdev->wiphy, radio_idx); ret = rdev->ops->get_antenna(&rdev->wiphy, radio_idx, tx_ant, rx_ant); if (ret) trace_rdev_return_int(&rdev->wiphy, ret); else trace_rdev_return_int_tx_rx(&rdev->wiphy, ret, *tx_ant, *rx_ant); return ret; } static inline int rdev_sched_scan_start(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_sched_scan_request *request) { int ret; trace_rdev_sched_scan_start(&rdev->wiphy, dev, request->reqid); ret =
rdev->ops->sched_scan_start(&rdev->wiphy, dev, request); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_sched_scan_stop(struct cfg80211_registered_device *rdev, struct net_device *dev, u64 reqid) { int ret; trace_rdev_sched_scan_stop(&rdev->wiphy, dev, reqid); ret = rdev->ops->sched_scan_stop(&rdev->wiphy, dev, reqid); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_rekey_data(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_gtk_rekey_data *data) { int ret; trace_rdev_set_rekey_data(&rdev->wiphy, dev); ret = rdev->ops->set_rekey_data(&rdev->wiphy, dev, data); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_tdls_mgmt(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *peer, int link_id, u8 action_code, u8 dialog_token, u16 status_code, u32 peer_capability, bool initiator, const u8 *buf, size_t len) { int ret; trace_rdev_tdls_mgmt(&rdev->wiphy, dev, peer, link_id, action_code, dialog_token, status_code, peer_capability, initiator, buf, len); ret = rdev->ops->tdls_mgmt(&rdev->wiphy, dev, peer, link_id, action_code, dialog_token, status_code, peer_capability, initiator, buf, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_tdls_oper(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *peer, enum nl80211_tdls_operation oper) { int ret; trace_rdev_tdls_oper(&rdev->wiphy, dev, peer, oper); ret = rdev->ops->tdls_oper(&rdev->wiphy, dev, peer, oper); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_probe_client(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *peer, u64 *cookie) { int ret; trace_rdev_probe_client(&rdev->wiphy, dev, peer); ret = rdev->ops->probe_client(&rdev->wiphy, dev, peer, cookie); trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); return ret; } static inline int rdev_set_noack_map(struct cfg80211_registered_device *rdev, struct net_device *dev, u16 noack_map) { int ret; trace_rdev_set_noack_map(&rdev->wiphy, dev, noack_map); ret = rdev->ops->set_noack_map(&rdev->wiphy, dev, noack_map); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, unsigned int link_id, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_get_channel(&rdev->wiphy, wdev, link_id); ret = rdev->ops->get_channel(&rdev->wiphy, wdev, link_id, chandef); trace_rdev_return_chandef(&rdev->wiphy, ret, chandef); return ret; } static inline int rdev_start_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { int ret; trace_rdev_start_p2p_device(&rdev->wiphy, wdev); ret = rdev->ops->start_p2p_device(&rdev->wiphy, wdev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_stop_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { trace_rdev_stop_p2p_device(&rdev->wiphy, wdev); rdev->ops->stop_p2p_device(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_start_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf) { int ret; trace_rdev_start_nan(&rdev->wiphy, wdev, conf); ret = rdev->ops->start_nan(&rdev->wiphy, wdev, conf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_stop_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { 
trace_rdev_stop_nan(&rdev->wiphy, wdev); rdev->ops->stop_nan(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_add_nan_func(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_nan_func *nan_func) { int ret; trace_rdev_add_nan_func(&rdev->wiphy, wdev, nan_func); ret = rdev->ops->add_nan_func(&rdev->wiphy, wdev, nan_func); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_del_nan_func(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u64 cookie) { trace_rdev_del_nan_func(&rdev->wiphy, wdev, cookie); rdev->ops->del_nan_func(&rdev->wiphy, wdev, cookie); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_nan_change_conf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf, u32 changes) { int ret; trace_rdev_nan_change_conf(&rdev->wiphy, wdev, conf, changes); if (rdev->ops->nan_change_conf) ret = rdev->ops->nan_change_conf(&rdev->wiphy, wdev, conf, changes); else ret = -EOPNOTSUPP; trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_mac_acl(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_acl_data *params) { int ret; trace_rdev_set_mac_acl(&rdev->wiphy, dev, params); ret = rdev->ops->set_mac_acl(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_update_ft_ies(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_update_ft_ies_params *ftie) { int ret; trace_rdev_update_ft_ies(&rdev->wiphy, dev, ftie); ret = rdev->ops->update_ft_ies(&rdev->wiphy, dev, ftie); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_crit_proto_start(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, enum nl80211_crit_proto_id protocol, u16 duration) { int ret; trace_rdev_crit_proto_start(&rdev->wiphy, wdev, protocol, duration); ret = rdev->ops->crit_proto_start(&rdev->wiphy, wdev, protocol, duration); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_crit_proto_stop(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { trace_rdev_crit_proto_stop(&rdev->wiphy, wdev); rdev->ops->crit_proto_stop(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_channel_switch(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_csa_settings *params) { int ret; trace_rdev_channel_switch(&rdev->wiphy, dev, params); ret = rdev->ops->channel_switch(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_qos_map(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_qos_map *qos_map) { int ret = -EOPNOTSUPP; if (rdev->ops->set_qos_map) { trace_rdev_set_qos_map(&rdev->wiphy, dev, qos_map); ret = rdev->ops->set_qos_map(&rdev->wiphy, dev, qos_map); trace_rdev_return_int(&rdev->wiphy, ret); } return ret; } static inline int rdev_set_ap_chanwidth(struct cfg80211_registered_device *rdev, struct net_device *dev, unsigned int link_id, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_set_ap_chanwidth(&rdev->wiphy, dev, link_id, chandef); ret = rdev->ops->set_ap_chanwidth(&rdev->wiphy, dev, link_id, chandef); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_add_tx_ts(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 tsid, const u8 *peer, u8 user_prio, u16 
admitted_time) { int ret = -EOPNOTSUPP; trace_rdev_add_tx_ts(&rdev->wiphy, dev, tsid, peer, user_prio, admitted_time); if (rdev->ops->add_tx_ts) ret = rdev->ops->add_tx_ts(&rdev->wiphy, dev, tsid, peer, user_prio, admitted_time); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_tx_ts(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 tsid, const u8 *peer) { int ret = -EOPNOTSUPP; trace_rdev_del_tx_ts(&rdev->wiphy, dev, tsid, peer); if (rdev->ops->del_tx_ts) ret = rdev->ops->del_tx_ts(&rdev->wiphy, dev, tsid, peer); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_tdls_channel_switch(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *addr, u8 oper_class, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_tdls_channel_switch(&rdev->wiphy, dev, addr, oper_class, chandef); ret = rdev->ops->tdls_channel_switch(&rdev->wiphy, dev, addr, oper_class, chandef); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_tdls_cancel_channel_switch(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *addr) { trace_rdev_tdls_cancel_channel_switch(&rdev->wiphy, dev, addr); rdev->ops->tdls_cancel_channel_switch(&rdev->wiphy, dev, addr); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_start_radar_detection(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_chan_def *chandef, u32 cac_time_ms, int link_id) { int ret = -EOPNOTSUPP; trace_rdev_start_radar_detection(&rdev->wiphy, dev, chandef, cac_time_ms, link_id); if (rdev->ops->start_radar_detection) ret = rdev->ops->start_radar_detection(&rdev->wiphy, dev, chandef, cac_time_ms, link_id); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_end_cac(struct cfg80211_registered_device *rdev, struct net_device *dev, unsigned int link_id) { trace_rdev_end_cac(&rdev->wiphy, dev, link_id); if (rdev->ops->end_cac) rdev->ops->end_cac(&rdev->wiphy, dev, link_id); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_set_mcast_rate(struct cfg80211_registered_device *rdev, struct net_device *dev, int mcast_rate[NUM_NL80211_BANDS]) { int ret = -EOPNOTSUPP; trace_rdev_set_mcast_rate(&rdev->wiphy, dev, mcast_rate); if (rdev->ops->set_mcast_rate) ret = rdev->ops->set_mcast_rate(&rdev->wiphy, dev, mcast_rate); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_coalesce(struct cfg80211_registered_device *rdev, struct cfg80211_coalesce *coalesce) { int ret = -EOPNOTSUPP; trace_rdev_set_coalesce(&rdev->wiphy, coalesce); if (rdev->ops->set_coalesce) ret = rdev->ops->set_coalesce(&rdev->wiphy, coalesce); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_pmk(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_pmk_conf *pmk_conf) { int ret = -EOPNOTSUPP; trace_rdev_set_pmk(&rdev->wiphy, dev, pmk_conf); if (rdev->ops->set_pmk) ret = rdev->ops->set_pmk(&rdev->wiphy, dev, pmk_conf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_pmk(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *aa) { int ret = -EOPNOTSUPP; trace_rdev_del_pmk(&rdev->wiphy, dev, aa); if (rdev->ops->del_pmk) ret = rdev->ops->del_pmk(&rdev->wiphy, dev, aa); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_external_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct 
cfg80211_external_auth_params *params) { int ret = -EOPNOTSUPP; trace_rdev_external_auth(&rdev->wiphy, dev, params); if (rdev->ops->external_auth) ret = rdev->ops->external_auth(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_ftm_responder_stats(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ftm_responder_stats *ftm_stats) { int ret = -EOPNOTSUPP; trace_rdev_get_ftm_responder_stats(&rdev->wiphy, dev, ftm_stats); if (rdev->ops->get_ftm_responder_stats) ret = rdev->ops->get_ftm_responder_stats(&rdev->wiphy, dev, ftm_stats); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_start_pmsr(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_pmsr_request *request) { int ret = -EOPNOTSUPP; trace_rdev_start_pmsr(&rdev->wiphy, wdev, request->cookie); if (rdev->ops->start_pmsr) ret = rdev->ops->start_pmsr(&rdev->wiphy, wdev, request); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_abort_pmsr(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_pmsr_request *request) { trace_rdev_abort_pmsr(&rdev->wiphy, wdev, request->cookie); if (rdev->ops->abort_pmsr) rdev->ops->abort_pmsr(&rdev->wiphy, wdev, request); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_update_owe_info(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_update_owe_info *oweinfo) { int ret = -EOPNOTSUPP; trace_rdev_update_owe_info(&rdev->wiphy, dev, oweinfo); if (rdev->ops->update_owe_info) ret = rdev->ops->update_owe_info(&rdev->wiphy, dev, oweinfo); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_probe_mesh_link(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *dest, const void *buf, size_t len) { int ret; trace_rdev_probe_mesh_link(&rdev->wiphy, dev, dest, buf, len); ret = rdev->ops->probe_mesh_link(&rdev->wiphy, dev, buf, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_tid_config(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_tid_config *tid_conf) { int ret; trace_rdev_set_tid_config(&rdev->wiphy, dev, tid_conf); ret = rdev->ops->set_tid_config(&rdev->wiphy, dev, tid_conf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_reset_tid_config(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *peer, u8 tids) { int ret; trace_rdev_reset_tid_config(&rdev->wiphy, dev, peer, tids); ret = rdev->ops->reset_tid_config(&rdev->wiphy, dev, peer, tids); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_sar_specs(struct cfg80211_registered_device *rdev, struct cfg80211_sar_specs *sar) { int ret; trace_rdev_set_sar_specs(&rdev->wiphy, sar); ret = rdev->ops->set_sar_specs(&rdev->wiphy, sar); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_color_change(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_color_change_settings *params) { int ret; trace_rdev_color_change(&rdev->wiphy, dev, params); ret = rdev->ops->color_change(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_fils_aad(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_fils_aad *fils_aad) { int ret = -EOPNOTSUPP; trace_rdev_set_fils_aad(&rdev->wiphy, dev, fils_aad); 
if (rdev->ops->set_fils_aad) ret = rdev->ops->set_fils_aad(&rdev->wiphy, dev, fils_aad); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_radar_background(struct cfg80211_registered_device *rdev, struct cfg80211_chan_def *chandef) { struct wiphy *wiphy = &rdev->wiphy; int ret = -EOPNOTSUPP; trace_rdev_set_radar_background(wiphy, chandef); if (rdev->ops->set_radar_background) ret = rdev->ops->set_radar_background(wiphy, chandef); trace_rdev_return_int(wiphy, ret); return ret; } static inline int rdev_add_intf_link(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, unsigned int link_id) { int ret = 0; trace_rdev_add_intf_link(&rdev->wiphy, wdev, link_id); if (rdev->ops->add_intf_link) ret = rdev->ops->add_intf_link(&rdev->wiphy, wdev, link_id); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_del_intf_link(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, unsigned int link_id) { trace_rdev_del_intf_link(&rdev->wiphy, wdev, link_id); if (rdev->ops->del_intf_link) rdev->ops->del_intf_link(&rdev->wiphy, wdev, link_id); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_add_link_station(struct cfg80211_registered_device *rdev, struct net_device *dev, struct link_station_parameters *params) { int ret = -EOPNOTSUPP; trace_rdev_add_link_station(&rdev->wiphy, dev, params); if (rdev->ops->add_link_station) ret = rdev->ops->add_link_station(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_mod_link_station(struct cfg80211_registered_device *rdev, struct net_device *dev, struct link_station_parameters *params) { int ret = -EOPNOTSUPP; trace_rdev_mod_link_station(&rdev->wiphy, dev, params); if (rdev->ops->mod_link_station) ret = rdev->ops->mod_link_station(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_link_station(struct cfg80211_registered_device *rdev, struct net_device *dev, struct link_station_del_parameters *params) { int ret = -EOPNOTSUPP; trace_rdev_del_link_station(&rdev->wiphy, dev, params); if (rdev->ops->del_link_station) ret = rdev->ops->del_link_station(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_hw_timestamp(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_set_hw_timestamp *hwts) { struct wiphy *wiphy = &rdev->wiphy; int ret = -EOPNOTSUPP; trace_rdev_set_hw_timestamp(wiphy, dev, hwts); if (rdev->ops->set_hw_timestamp) ret = rdev->ops->set_hw_timestamp(wiphy, dev, hwts); trace_rdev_return_int(wiphy, ret); return ret; } static inline int rdev_set_ttlm(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ttlm_params *params) { struct wiphy *wiphy = &rdev->wiphy; int ret = -EOPNOTSUPP; trace_rdev_set_ttlm(wiphy, dev, params); if (rdev->ops->set_ttlm) ret = rdev->ops->set_ttlm(wiphy, dev, params); trace_rdev_return_int(wiphy, ret); return ret; } static inline u32 rdev_get_radio_mask(struct cfg80211_registered_device *rdev, struct net_device *dev) { struct wiphy *wiphy = &rdev->wiphy; if (!rdev->ops->get_radio_mask) return 0; return rdev->ops->get_radio_mask(wiphy, dev); } static inline int rdev_assoc_ml_reconf(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ml_reconf_req *req) { struct wiphy *wiphy = &rdev->wiphy; int ret = -EOPNOTSUPP; trace_rdev_assoc_ml_reconf(wiphy, dev, req); if 
(rdev->ops->assoc_ml_reconf) ret = rdev->ops->assoc_ml_reconf(wiphy, dev, req); trace_rdev_return_int(wiphy, ret); return ret; } static inline int rdev_set_epcs(struct cfg80211_registered_device *rdev, struct net_device *dev, bool val) { struct wiphy *wiphy = &rdev->wiphy; int ret = -EOPNOTSUPP; trace_rdev_set_epcs(wiphy, dev, val); if (rdev->ops->set_epcs) ret = rdev->ops->set_epcs(wiphy, dev, val); trace_rdev_return_int(wiphy, ret); return ret; } #endif /* __CFG80211_RDEV_OPS */
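Every wrapper in the block above follows the same shape: emit the entry tracepoint, call the driver op, emit the return tracepoint, and hand the driver's result back; ops that drivers may leave unimplemented additionally default to -EOPNOTSUPP. A minimal sketch of that optional-op variant, using a hypothetical do_foo op and trace_rdev_do_foo tracepoint (neither exists in cfg80211):

static inline int
rdev_do_foo(struct cfg80211_registered_device *rdev, struct net_device *dev)
{
	int ret = -EOPNOTSUPP;

	/* hypothetical entry tracepoint, mirroring the trace_rdev_*() calls above */
	trace_rdev_do_foo(&rdev->wiphy, dev);
	if (rdev->ops->do_foo)			/* hypothetical optional op */
		ret = rdev->ops->do_foo(&rdev->wiphy, dev);
	trace_rdev_return_int(&rdev->wiphy, ret);
	return ret;
}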
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_TLB_H #define _ASM_X86_TLB_H #define tlb_flush tlb_flush static inline void tlb_flush(struct mmu_gather *tlb); #include <asm-generic/tlb.h> #include <linux/kernel.h> #include <vdso/bits.h> #include <vdso/page.h> static inline void tlb_flush(struct mmu_gather *tlb) { unsigned long start = 0UL, end = TLB_FLUSH_ALL; unsigned int stride_shift = tlb_get_unmap_shift(tlb); if (!tlb->fullmm && !tlb->need_flush_all) { start = tlb->start; end = tlb->end; } flush_tlb_mm_range(tlb->mm, start, end, stride_shift, tlb->freed_tables); } static inline void invlpg(unsigned long addr) { asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); } enum addr_stride { PTE_STRIDE = 0, PMD_STRIDE = 1 }; /* * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination * of the three. For example: * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address * - FLAG_PCID: invalidate all TLB entries matching the PCID * * The first is used to invalidate (kernel) mappings at a particular * address across all processes. * * The latter invalidates all TLB entries matching a PCID. */ #define INVLPGB_FLAG_VA BIT(0) #define INVLPGB_FLAG_PCID BIT(1) #define INVLPGB_FLAG_ASID BIT(2) #define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3) #define INVLPGB_FLAG_FINAL_ONLY BIT(4) #define INVLPGB_FLAG_INCLUDE_NESTED BIT(5) /* The implied mode when all bits are clear: */ #define INVLPGB_MODE_ALL_NONGLOBALS 0UL #ifdef CONFIG_BROADCAST_TLB_FLUSH /* * INVLPGB does broadcast TLB invalidation across all the CPUs in the system. * * The INVLPGB instruction is weakly ordered, and a batch of invalidations can * be done in a parallel fashion. * * The instruction takes the number of extra pages to invalidate, beyond the * first page, while __invlpgb gets the more human readable number of pages to * invalidate. * * The bits in rax[0:2] determine respectively which components of the address * (VA, PCID, ASID) get compared when flushing. If neither bits are set, *any* * address in the specified range matches. * * Since it is desired to only flush TLB entries for the ASID that is executing * the instruction (a host/hypervisor or a guest), the ASID valid bit should * always be set. On a host/hypervisor, the hardware will use the ASID value * specified in EDX[15:0] (which should be 0). On a guest, the hardware will * use the actual ASID value of the guest. * * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from * this CPU have completed. */ static inline void __invlpgb(unsigned long asid, unsigned long pcid, unsigned long addr, u16 nr_pages, enum addr_stride stride, u8 flags) { u64 rax = addr | flags | INVLPGB_FLAG_ASID; u32 ecx = (stride << 31) | (nr_pages - 1); u32 edx = (pcid << 16) | asid; /* The low bits in rax are for flags. Verify addr is clean. */ VM_WARN_ON_ONCE(addr & ~PAGE_MASK); /* INVLPGB; supported in binutils >= 2.36.
*/ asm volatile(".byte 0x0f, 0x01, 0xfe" :: "a" (rax), "c" (ecx), "d" (edx)); } static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { __invlpgb(asid, pcid, 0, 1, 0, flags); } static inline void __tlbsync(void) { /* * TLBSYNC waits for INVLPGB instructions originating on the same CPU * to have completed. Warn if the task could have been migrated and * thus might not be waiting on all the INVLPGBs issued during this * TLB invalidation sequence. */ cant_migrate(); /* TLBSYNC: supported in binutils >= 2.36. */ asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory"); } #else /* Some compilers (I'm looking at you clang!) simply can't do DCE */ static inline void __invlpgb(unsigned long asid, unsigned long pcid, unsigned long addr, u16 nr_pages, enum addr_stride s, u8 flags) { } static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { } static inline void __tlbsync(void) { } #endif static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid, unsigned long addr, u16 nr, bool stride) { enum addr_stride str = stride ? PMD_STRIDE : PTE_STRIDE; u8 flags = INVLPGB_FLAG_PCID | INVLPGB_FLAG_VA; __invlpgb(0, pcid, addr, nr, str, flags); } /* Flush all mappings for a given PCID, not including globals. */ static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid) { __invlpgb_all(0, pcid, INVLPGB_FLAG_PCID); } /* Flush all mappings, including globals, for all PCIDs. */ static inline void invlpgb_flush_all(void) { /* * TLBSYNC at the end needs to make sure all flushes done on the * current CPU have been executed system-wide. Therefore, prevent * migration in between; disabling preemption is sufficient for that * and cheaper than disabling migration. */ guard(preempt)(); __invlpgb_all(0, 0, INVLPGB_FLAG_INCLUDE_GLOBAL); __tlbsync(); } /* Flush addr, including globals, for all PCIDs. */ static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr) { __invlpgb(0, 0, addr, nr, PTE_STRIDE, INVLPGB_FLAG_INCLUDE_GLOBAL); } /* Flush all mappings for all PCIDs except globals. */ static inline void invlpgb_flush_all_nonglobals(void) { guard(preempt)(); __invlpgb_all(0, 0, INVLPGB_MODE_ALL_NONGLOBALS); __tlbsync(); } #endif /* _ASM_X86_TLB_H */
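The *_nosync helpers above only queue broadcast invalidations; waiting for completion is a separate TLBSYNC step, and both must happen on the same CPU. A minimal sketch of that batch-then-sync pattern (hypothetical caller, made-up PCID and addresses), mirroring what invlpgb_flush_all() does internally:

/* Illustrative only: batch two ranged flushes for one PCID, then wait once. */
static inline void example_flush_two_ranges(unsigned long pcid,
					    unsigned long addr1,
					    unsigned long addr2)
{
	/* TLBSYNC must run on the CPU that issued the INVLPGBs */
	guard(preempt)();

	invlpgb_flush_user_nr_nosync(pcid, addr1, 4, false);	/* 4 PTE-stride pages */
	invlpgb_flush_user_nr_nosync(pcid, addr2, 1, false);	/* 1 more page */
	__tlbsync();						/* wait for both */
}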
// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) 2017 Netronome Systems, Inc. * Copyright (C) 2019 Mellanox Technologies. All rights reserved */ #include <linux/completion.h> #include <linux/device.h> #include <linux/idr.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/refcount.h> #include <linux/slab.h> #include <linux/sysfs.h> #include "netdevsim.h" static DEFINE_IDA(nsim_bus_dev_ids); static LIST_HEAD(nsim_bus_dev_list); static DEFINE_MUTEX(nsim_bus_dev_list_lock); static bool nsim_bus_enable; static refcount_t nsim_bus_devs; /* Including the bus itself. */ static DECLARE_COMPLETION(nsim_bus_devs_released); static struct nsim_bus_dev *to_nsim_bus_dev(struct device *dev) { return container_of(dev, struct nsim_bus_dev, dev); } static ssize_t nsim_bus_dev_numvfs_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); unsigned int num_vfs; int ret; ret = kstrtouint(buf, 0, &num_vfs); if (ret) return ret; device_lock(dev); ret = -ENOENT; if (dev_get_drvdata(dev)) ret = nsim_drv_configure_vfs(nsim_bus_dev, num_vfs); device_unlock(dev); return ret ?
ret : count; } static ssize_t nsim_bus_dev_numvfs_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); return sprintf(buf, "%u\n", nsim_bus_dev->num_vfs); } static struct device_attribute nsim_bus_dev_numvfs_attr = __ATTR(sriov_numvfs, 0664, nsim_bus_dev_numvfs_show, nsim_bus_dev_numvfs_store); static ssize_t new_port_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); u8 eth_addr[ETH_ALEN] = {}; unsigned int port_index; bool addr_set = false; int ret; /* Prevent to use nsim_bus_dev before initialization. */ if (!smp_load_acquire(&nsim_bus_dev->init)) return -EBUSY; ret = sscanf(buf, "%u %hhx:%hhx:%hhx:%hhx:%hhx:%hhx", &port_index, &eth_addr[0], &eth_addr[1], &eth_addr[2], &eth_addr[3], &eth_addr[4], &eth_addr[5]); switch (ret) { case 7: if (!is_valid_ether_addr(eth_addr)) { pr_err("The supplied perm_addr is not a valid MAC address\n"); return -EINVAL; } addr_set = true; fallthrough; case 1: break; default: pr_err("Format for adding new port is \"id [perm_addr]\" (uint MAC).\n"); return -EINVAL; } ret = nsim_drv_port_add(nsim_bus_dev, NSIM_DEV_PORT_TYPE_PF, port_index, addr_set ? eth_addr : NULL); return ret ? ret : count; } static struct device_attribute nsim_bus_dev_new_port_attr = __ATTR_WO(new_port); static ssize_t del_port_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); unsigned int port_index; int ret; /* Prevent to use nsim_bus_dev before initialization. */ if (!smp_load_acquire(&nsim_bus_dev->init)) return -EBUSY; ret = kstrtouint(buf, 0, &port_index); if (ret) return ret; ret = nsim_drv_port_del(nsim_bus_dev, NSIM_DEV_PORT_TYPE_PF, port_index); return ret ? ret : count; } static struct device_attribute nsim_bus_dev_del_port_attr = __ATTR_WO(del_port); static struct attribute *nsim_bus_dev_attrs[] = { &nsim_bus_dev_numvfs_attr.attr, &nsim_bus_dev_new_port_attr.attr, &nsim_bus_dev_del_port_attr.attr, NULL, }; static const struct attribute_group nsim_bus_dev_attr_group = { .attrs = nsim_bus_dev_attrs, }; static const struct attribute_group *nsim_bus_dev_attr_groups[] = { &nsim_bus_dev_attr_group, NULL, }; static void nsim_bus_dev_release(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev; nsim_bus_dev = container_of(dev, struct nsim_bus_dev, dev); kfree(nsim_bus_dev); if (refcount_dec_and_test(&nsim_bus_devs)) complete(&nsim_bus_devs_released); } static const struct device_type nsim_bus_dev_type = { .groups = nsim_bus_dev_attr_groups, .release = nsim_bus_dev_release, }; static struct nsim_bus_dev * nsim_bus_dev_new(unsigned int id, unsigned int port_count, unsigned int num_queues); static ssize_t new_device_store(const struct bus_type *bus, const char *buf, size_t count) { unsigned int id, port_count, num_queues; struct nsim_bus_dev *nsim_bus_dev; int err; err = sscanf(buf, "%u %u %u", &id, &port_count, &num_queues); switch (err) { case 1: port_count = 1; fallthrough; case 2: num_queues = 1; fallthrough; case 3: if (id > INT_MAX) { pr_err("Value of \"id\" is too big.\n"); return -EINVAL; } break; default: pr_err("Format for adding new device is \"id port_count num_queues\" (uint uint unit).\n"); return -EINVAL; } mutex_lock(&nsim_bus_dev_list_lock); /* Prevent to use resource before initialization. 
*/ if (!smp_load_acquire(&nsim_bus_enable)) { err = -EBUSY; goto err; } nsim_bus_dev = nsim_bus_dev_new(id, port_count, num_queues); if (IS_ERR(nsim_bus_dev)) { err = PTR_ERR(nsim_bus_dev); goto err; } refcount_inc(&nsim_bus_devs); /* Allow using nsim_bus_dev */ smp_store_release(&nsim_bus_dev->init, true); list_add_tail(&nsim_bus_dev->list, &nsim_bus_dev_list); mutex_unlock(&nsim_bus_dev_list_lock); return count; err: mutex_unlock(&nsim_bus_dev_list_lock); return err; } static BUS_ATTR_WO(new_device); static void nsim_bus_dev_del(struct nsim_bus_dev *nsim_bus_dev); static ssize_t del_device_store(const struct bus_type *bus, const char *buf, size_t count) { struct nsim_bus_dev *nsim_bus_dev, *tmp; unsigned int id; int err; err = sscanf(buf, "%u", &id); switch (err) { case 1: if (id > INT_MAX) { pr_err("Value of \"id\" is too big.\n"); return -EINVAL; } break; default: pr_err("Format for deleting device is \"id\" (uint).\n"); return -EINVAL; } err = -ENOENT; mutex_lock(&nsim_bus_dev_list_lock); /* Prevent to use resource before initialization. */ if (!smp_load_acquire(&nsim_bus_enable)) { mutex_unlock(&nsim_bus_dev_list_lock); return -EBUSY; } list_for_each_entry_safe(nsim_bus_dev, tmp, &nsim_bus_dev_list, list) { if (nsim_bus_dev->dev.id != id) continue; list_del(&nsim_bus_dev->list); nsim_bus_dev_del(nsim_bus_dev); err = 0; break; } mutex_unlock(&nsim_bus_dev_list_lock); return !err ? count : err; } static BUS_ATTR_WO(del_device); static ssize_t link_device_store(const struct bus_type *bus, const char *buf, size_t count) { struct netdevsim *nsim_a, *nsim_b, *peer; struct net_device *dev_a, *dev_b; unsigned int ifidx_a, ifidx_b; int netnsfd_a, netnsfd_b, err; struct net *ns_a, *ns_b; err = sscanf(buf, "%d:%u %d:%u", &netnsfd_a, &ifidx_a, &netnsfd_b, &ifidx_b); if (err != 4) { pr_err("Format for linking two devices is \"netnsfd_a:ifidx_a netnsfd_b:ifidx_b\" (int uint int uint).\n"); return -EINVAL; } ns_a = get_net_ns_by_fd(netnsfd_a); if (IS_ERR(ns_a)) { pr_err("Could not find netns with fd: %d\n", netnsfd_a); return -EINVAL; } ns_b = get_net_ns_by_fd(netnsfd_b); if (IS_ERR(ns_b)) { pr_err("Could not find netns with fd: %d\n", netnsfd_b); put_net(ns_a); return -EINVAL; } err = -EINVAL; rtnl_lock(); dev_a = __dev_get_by_index(ns_a, ifidx_a); if (!dev_a) { pr_err("Could not find device with ifindex %u in netnsfd %d\n", ifidx_a, netnsfd_a); goto out_err; } if (!netdev_is_nsim(dev_a)) { pr_err("Device with ifindex %u in netnsfd %d is not a netdevsim\n", ifidx_a, netnsfd_a); goto out_err; } dev_b = __dev_get_by_index(ns_b, ifidx_b); if (!dev_b) { pr_err("Could not find device with ifindex %u in netnsfd %d\n", ifidx_b, netnsfd_b); goto out_err; } if (!netdev_is_nsim(dev_b)) { pr_err("Device with ifindex %u in netnsfd %d is not a netdevsim\n", ifidx_b, netnsfd_b); goto out_err; } if (dev_a == dev_b) { pr_err("Cannot link a netdevsim to itself\n"); goto out_err; } err = -EBUSY; nsim_a = netdev_priv(dev_a); peer = rtnl_dereference(nsim_a->peer); if (peer) { pr_err("Netdevsim %d:%u is already linked\n", netnsfd_a, ifidx_a); goto out_err; } nsim_b = netdev_priv(dev_b); peer = rtnl_dereference(nsim_b->peer); if (peer) { pr_err("Netdevsim %d:%u is already linked\n", netnsfd_b, ifidx_b); goto out_err; } err = 0; rcu_assign_pointer(nsim_a->peer, nsim_b); rcu_assign_pointer(nsim_b->peer, nsim_a); out_err: put_net(ns_b); put_net(ns_a); rtnl_unlock(); return !err ? 
count : err; } static BUS_ATTR_WO(link_device); static ssize_t unlink_device_store(const struct bus_type *bus, const char *buf, size_t count) { struct netdevsim *nsim, *peer; struct net_device *dev; unsigned int ifidx; int netnsfd, err; struct net *ns; err = sscanf(buf, "%u:%u", &netnsfd, &ifidx); if (err != 2) { pr_err("Format for unlinking a device is \"netnsfd:ifidx\" (int uint).\n"); return -EINVAL; } ns = get_net_ns_by_fd(netnsfd); if (IS_ERR(ns)) { pr_err("Could not find netns with fd: %d\n", netnsfd); return -EINVAL; } err = -EINVAL; rtnl_lock(); dev = __dev_get_by_index(ns, ifidx); if (!dev) { pr_err("Could not find device with ifindex %u in netnsfd %d\n", ifidx, netnsfd); goto out_put_netns; } if (!netdev_is_nsim(dev)) { pr_err("Device with ifindex %u in netnsfd %d is not a netdevsim\n", ifidx, netnsfd); goto out_put_netns; } nsim = netdev_priv(dev); peer = rtnl_dereference(nsim->peer); if (!peer) goto out_put_netns; err = 0; RCU_INIT_POINTER(nsim->peer, NULL); RCU_INIT_POINTER(peer->peer, NULL); synchronize_net(); netif_tx_wake_all_queues(dev); netif_tx_wake_all_queues(peer->netdev); out_put_netns: put_net(ns); rtnl_unlock(); return !err ? count : err; } static BUS_ATTR_WO(unlink_device); static struct attribute *nsim_bus_attrs[] = { &bus_attr_new_device.attr, &bus_attr_del_device.attr, &bus_attr_link_device.attr, &bus_attr_unlink_device.attr, NULL }; ATTRIBUTE_GROUPS(nsim_bus); static int nsim_bus_probe(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); return nsim_drv_probe(nsim_bus_dev); } static void nsim_bus_remove(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); nsim_drv_remove(nsim_bus_dev); } static int nsim_num_vf(struct device *dev) { struct nsim_bus_dev *nsim_bus_dev = to_nsim_bus_dev(dev); return nsim_bus_dev->num_vfs; } static const struct bus_type nsim_bus = { .name = DRV_NAME, .dev_name = DRV_NAME, .bus_groups = nsim_bus_groups, .probe = nsim_bus_probe, .remove = nsim_bus_remove, .num_vf = nsim_num_vf, }; #define NSIM_BUS_DEV_MAX_VFS 4 static struct nsim_bus_dev * nsim_bus_dev_new(unsigned int id, unsigned int port_count, unsigned int num_queues) { struct nsim_bus_dev *nsim_bus_dev; int err; nsim_bus_dev = kzalloc(sizeof(*nsim_bus_dev), GFP_KERNEL); if (!nsim_bus_dev) return ERR_PTR(-ENOMEM); err = ida_alloc_range(&nsim_bus_dev_ids, id, id, GFP_KERNEL); if (err < 0) goto err_nsim_bus_dev_free; nsim_bus_dev->dev.id = err; nsim_bus_dev->dev.bus = &nsim_bus; nsim_bus_dev->dev.type = &nsim_bus_dev_type; nsim_bus_dev->port_count = port_count; nsim_bus_dev->num_queues = num_queues; nsim_bus_dev->initial_net = current->nsproxy->net_ns; nsim_bus_dev->max_vfs = NSIM_BUS_DEV_MAX_VFS; /* Disallow using nsim_bus_dev */ smp_store_release(&nsim_bus_dev->init, false); err = device_register(&nsim_bus_dev->dev); if (err) goto err_nsim_bus_dev_id_free; return nsim_bus_dev; err_nsim_bus_dev_id_free: ida_free(&nsim_bus_dev_ids, nsim_bus_dev->dev.id); put_device(&nsim_bus_dev->dev); nsim_bus_dev = NULL; err_nsim_bus_dev_free: kfree(nsim_bus_dev); return ERR_PTR(err); } static void nsim_bus_dev_del(struct nsim_bus_dev *nsim_bus_dev) { /* Disallow using nsim_bus_dev */ smp_store_release(&nsim_bus_dev->init, false); ida_free(&nsim_bus_dev_ids, nsim_bus_dev->dev.id); device_unregister(&nsim_bus_dev->dev); } static struct device_driver nsim_driver = { .name = DRV_NAME, .bus = &nsim_bus, .owner = THIS_MODULE, }; int nsim_bus_init(void) { int err; err = bus_register(&nsim_bus); if (err) return err; err = 
driver_register(&nsim_driver); if (err) goto err_bus_unregister; refcount_set(&nsim_bus_devs, 1); /* Allow using resources */ smp_store_release(&nsim_bus_enable, true); return 0; err_bus_unregister: bus_unregister(&nsim_bus); return err; } void nsim_bus_exit(void) { struct nsim_bus_dev *nsim_bus_dev, *tmp; /* Disallow using resources */ smp_store_release(&nsim_bus_enable, false); if (refcount_dec_and_test(&nsim_bus_devs)) complete(&nsim_bus_devs_released); mutex_lock(&nsim_bus_dev_list_lock); list_for_each_entry_safe(nsim_bus_dev, tmp, &nsim_bus_dev_list, list) { list_del(&nsim_bus_dev->list); nsim_bus_dev_del(nsim_bus_dev); } mutex_unlock(&nsim_bus_dev_list_lock); wait_for_completion(&nsim_bus_devs_released); driver_unregister(&nsim_driver); bus_unregister(&nsim_bus); }
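Throughout the file above, nsim_bus_enable and nsim_bus_dev->init act as publication flags: the writer finishes all initialization first and only then flips the flag with smp_store_release(), while readers test it with smp_load_acquire() before touching anything behind it. A minimal standalone sketch of that acquire/release gating, with hypothetical names:

struct example_obj {
	int value;
	bool ready;
};

/* Writer: initialize everything, then publish. */
static void example_publish(struct example_obj *obj)
{
	obj->value = 42;
	smp_store_release(&obj->ready, true);
}

/* Reader: either sees "not ready" or a fully initialized object. */
static int example_use(struct example_obj *obj)
{
	if (!smp_load_acquire(&obj->ready))
		return -EBUSY;
	return obj->value;
}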
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_TABLES_IPV4_H_ #define _NF_TABLES_IPV4_H_ #include <net/netfilter/nf_tables.h> #include <net/ip.h> static inline void nft_set_pktinfo_ipv4(struct nft_pktinfo *pkt) { struct iphdr *ip; ip = ip_hdr(pkt->skb); pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = ip->protocol; pkt->thoff = ip_hdrlen(pkt->skb); pkt->fragoff = ntohs(ip->frag_off) & IP_OFFSET; } static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) { struct iphdr *iph, _iph; u32 len, thoff, skb_len; iph = skb_header_pointer(pkt->skb, skb_network_offset(pkt->skb), sizeof(*iph), &_iph); if (!iph) return -1; if (iph->ihl < 5 || iph->version != 4) return -1; len = iph_totlen(pkt->skb, iph); thoff = iph->ihl * 4; skb_len = pkt->skb->len - skb_network_offset(pkt->skb); if (skb_len < len) return -1; else if (len < thoff) return -1; else if (thoff < sizeof(*iph)) return -1; pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = iph->protocol; pkt->thoff = skb_network_offset(pkt->skb) + thoff; pkt->fragoff = ntohs(iph->frag_off) & IP_OFFSET; return 0; } static inline void nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) { if (__nft_set_pktinfo_ipv4_validate(pkt) < 0) nft_set_pktinfo_unspec(pkt); } static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt) { struct iphdr *iph; u32 len, thoff; if (!pskb_may_pull(pkt->skb, sizeof(*iph))) return -1; iph = ip_hdr(pkt->skb); if (iph->ihl < 5 || iph->version != 4) goto inhdr_error; len = iph_totlen(pkt->skb, iph); thoff = iph->ihl * 4; if (pkt->skb->len < len) { __IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INTRUNCATEDPKTS); return -1; } else if (len < thoff) { goto inhdr_error; } else if (thoff < sizeof(*iph)) { return -1; } pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = iph->protocol; pkt->thoff = thoff; pkt->fragoff = ntohs(iph->frag_off) & IP_OFFSET; return 0; inhdr_error: __IP_INC_STATS(nft_net(pkt), IPSTATS_MIB_INHDRERRORS); return -1; } #endif
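The three comparisons in the validation helpers above enforce one ordering: the data actually present must cover the IP total length, which in turn must cover the header length (ihl * 4), which must be at least the 20-byte minimum header. A standalone sketch of the same ordering over plain lengths, independent of skb handling (hypothetical helper name):

/* Illustrative only: buffer_len >= ip_total_len >= ihl * 4 >= 20. */
static inline bool example_ipv4_lengths_ok(u32 buffer_len, u32 ip_total_len, u8 ihl)
{
	u32 header_len = ihl * 4;

	if (ihl < 5)				/* header shorter than 20 bytes */
		return false;
	if (ip_total_len < header_len)		/* total length must cover the header */
		return false;
	if (buffer_len < ip_total_len)		/* buffer must cover the total length */
		return false;
	return true;
}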
// SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/act_simple.c Simple example of an action * * Authors: Jamal Hadi Salim (2005-8) */ #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> #include <linux/tc_act/tc_defact.h> #include <net/tc_act/tc_defact.h> static struct tc_action_ops act_simp_ops; #define SIMP_MAX_DATA 32 TC_INDIRECT_SCOPE int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_defact *d = to_defact(a); spin_lock(&d->tcf_lock); tcf_lastuse_update(&d->tcf_tm); bstats_update(&d->tcf_bstats, skb); /* print policy string followed by _ then packet count * Example if this was the 3rd packet and the string was "hello" * then it would look like "hello_3" (without quotes) */ pr_info("simple: %s_%llu\n", (char *)d->tcfd_defdata, u64_stats_read(&d->tcf_bstats.packets)); spin_unlock(&d->tcf_lock); return d->tcf_action; } static void tcf_simp_release(struct tc_action *a) { struct tcf_defact *d = to_defact(a); kfree(d->tcfd_defdata); } static int alloc_defdata(struct tcf_defact *d, const struct nlattr *defdata) { d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL); if (unlikely(!d->tcfd_defdata)) return -ENOMEM; nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); return 0; } static int reset_policy(struct tc_action *a, const struct nlattr *defdata, struct tc_defact *p, struct tcf_proto *tp, struct netlink_ext_ack *extack) { struct tcf_chain *goto_ch = NULL; struct tcf_defact *d; int err; err = tcf_action_check_ctrlact(p->action, tp, &goto_ch, extack); if (err < 0) return err; d = to_defact(a); spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(a, p->action, goto_ch); memset(d->tcfd_defdata, 0, SIMP_MAX_DATA); nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); spin_unlock_bh(&d->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); return 0; } static const struct nla_policy simple_policy[TCA_DEF_MAX + 1] = { [TCA_DEF_PARMS] = { .len = sizeof(struct tc_defact) }, [TCA_DEF_DATA] = { .type = NLA_STRING, .len = SIMP_MAX_DATA }, }; static int tcf_simp_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_simp_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_DEF_MAX + 1]; struct tcf_chain *goto_ch = NULL; struct tc_defact *parm;
struct tcf_defact *d; bool exists = false; int ret = 0, err; u32 index; if (nla == NULL) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_DEF_MAX, nla, simple_policy, NULL); if (err < 0) return err; if (tb[TCA_DEF_PARMS] == NULL) return -EINVAL; parm = nla_data(tb[TCA_DEF_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return ACT_P_BOUND; if (tb[TCA_DEF_DATA] == NULL) { if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } if (!exists) { ret = tcf_idr_create(tn, index, est, a, &act_simp_ops, bind, false, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } d = to_defact(*a); err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; err = alloc_defdata(d, tb[TCA_DEF_DATA]); if (err < 0) goto put_chain; tcf_action_set_ctrlact(*a, parm->action, goto_ch); ret = ACT_P_CREATED; } else { if (!(flags & TCA_ACT_FLAGS_REPLACE)) { err = -EEXIST; goto release_idr; } err = reset_policy(*a, tb[TCA_DEF_DATA], parm, tp, extack); if (err) goto release_idr; } return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_defact *d = to_defact(a); struct tc_defact opt = { .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, .bindcnt = atomic_read(&d->tcf_bindcnt) - bind, }; struct tcf_t t; spin_lock_bh(&d->tcf_lock); opt.action = d->tcf_action; if (nla_put(skb, TCA_DEF_PARMS, sizeof(opt), &opt) || nla_put_string(skb, TCA_DEF_DATA, d->tcfd_defdata)) goto nla_put_failure; tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_DEF_TM, sizeof(t), &t, TCA_DEF_PAD)) goto nla_put_failure; spin_unlock_bh(&d->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&d->tcf_lock); nlmsg_trim(skb, b); return -1; } static struct tc_action_ops act_simp_ops = { .kind = "simple", .id = TCA_ID_SIMP, .owner = THIS_MODULE, .act = tcf_simp_act, .dump = tcf_simp_dump, .cleanup = tcf_simp_release, .init = tcf_simp_init, .size = sizeof(struct tcf_defact), }; MODULE_ALIAS_NET_ACT("simple"); static __net_init int simp_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_simp_ops.net_id); return tc_action_net_init(net, tn, &act_simp_ops); } static void __net_exit simp_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_simp_ops.net_id); } static struct pernet_operations simp_net_ops = { .init = simp_init_net, .exit_batch = simp_exit_net, .id = &act_simp_ops.net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2005)"); MODULE_DESCRIPTION("Simple example action"); MODULE_LICENSE("GPL"); static int __init simp_init_module(void) { int ret = tcf_register_action(&act_simp_ops, &simp_net_ops); if (!ret) pr_info("Simple TC action Loaded\n"); return ret; } static void __exit simp_cleanup_module(void) { tcf_unregister_action(&act_simp_ops, &simp_net_ops); } module_init(simp_init_module); module_exit(simp_cleanup_module);
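alloc_defdata() and reset_policy() copy the configured policy string into a fixed 32-byte (SIMP_MAX_DATA) buffer with nla_strscpy(), which copies at most 31 characters and always NUL-terminates; the nla_policy entry additionally caps the attribute length at parse time. Each matching packet then logs "<string>_<packet count>". Below is a small userspace sketch of that bounded-copy behaviour and log format; bounded_copy() is a stand-in for nla_strscpy() and all names are illustrative.

/* Sketch of the "simple" action's string handling: fixed 32-byte buffer,
 * truncating copy, and the "<string>_<packets>" log line.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define SIMP_MAX_DATA 32	/* same limit as the module above */

static void bounded_copy(char *dst, const char *src, size_t dst_size)
{
	/* copy at most dst_size - 1 bytes and always NUL-terminate */
	snprintf(dst, dst_size, "%s", src);
}

int main(void)
{
	char defdata[SIMP_MAX_DATA];
	uint64_t packets = 3;

	bounded_copy(defdata, "hello", sizeof(defdata));
	/* matches the pr_info() format in tcf_simp_act(): "simple: hello_3" */
	printf("simple: %s_%llu\n", defdata, (unsigned long long)packets);

	/* a 40-character string ends up as 31 characters plus the NUL */
	bounded_copy(defdata, "0123456789012345678901234567890123456789",
		     sizeof(defdata));
	printf("truncated to %zu chars: %s\n", strlen(defdata), defdata);
	return 0;
}

From userspace the action is typically attached with iproute2, e.g. something along the lines of `tc filter add ... action simple sdata "hello"`, though the exact filter syntax depends on the iproute2 version in use.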
// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/fat/dir.c
 *
 *  directory handling functions for fat-based filesystems
 *
 *  Written 1992,1993 by Werner Almesberger
 *
 *  Hidden files 1995 by Albert Cahalan <albert@ccs.neu.edu> <adc@coe.neu.edu>
 *
 *  VFAT extensions by Gordon Chaffee <chaffee@plateau.cs.berkeley.edu>
 *  Merged with msdos fs by Henrik Storner <storner@osiris.ping.dk>
 *  Rewritten for constant inumbers.
Plugged buffer overrun in readdir(). AV * Short name translation 1999, 2001 by Wolfram Pienkoss <wp@bszh.de> */ #include <linux/slab.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <linux/iversion.h> #include "fat.h" /* * Maximum buffer size of short name. * [(MSDOS_NAME + '.') * max one char + nul] * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul] */ #define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1) /* * Maximum buffer size of unicode chars from slots. * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)] */ #define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1) #define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t)) static inline unsigned char fat_tolower(unsigned char c) { return ((c >= 'A') && (c <= 'Z')) ? c+32 : c; } static inline loff_t fat_make_i_pos(struct super_block *sb, struct buffer_head *bh, struct msdos_dir_entry *de) { return ((loff_t)bh->b_blocknr << MSDOS_SB(sb)->dir_per_block_bits) | (de - (struct msdos_dir_entry *)bh->b_data); } static inline void fat_dir_readahead(struct inode *dir, sector_t iblock, sector_t phys) { struct super_block *sb = dir->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh; int sec; /* This is not a first sector of cluster, or sec_per_clus == 1 */ if ((iblock & (sbi->sec_per_clus - 1)) || sbi->sec_per_clus == 1) return; /* root dir of FAT12/FAT16 */ if (!is_fat32(sbi) && (dir->i_ino == MSDOS_ROOT_INO)) return; bh = sb_find_get_block(sb, phys); if (bh == NULL || !buffer_uptodate(bh)) { for (sec = 0; sec < sbi->sec_per_clus; sec++) sb_breadahead(sb, phys + sec); } brelse(bh); } /* Returns the inode number of the directory entry at offset pos. If bh is non-NULL, it is brelse'd before. Pos is incremented. The buffer header is returned in bh. AV. Most often we do it item-by-item. Makes sense to optimize. AV. OK, there we go: if both bh and de are non-NULL we assume that we just AV. want the next entry (took one explicit de=NULL in vfat/namei.c). AV. It's done in fat_get_entry() (inlined), here the slow case lives. AV. Additionally, when we return -1 (i.e. reached the end of directory) AV. we make bh NULL. */ static int fat__get_entry(struct inode *dir, loff_t *pos, struct buffer_head **bh, struct msdos_dir_entry **de) { struct super_block *sb = dir->i_sb; sector_t phys, iblock; unsigned long mapped_blocks; int err, offset; next: brelse(*bh); *bh = NULL; iblock = *pos >> sb->s_blocksize_bits; err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false); if (err || !phys) return -1; /* beyond EOF or error */ fat_dir_readahead(dir, iblock, phys); *bh = sb_bread(sb, phys); if (*bh == NULL) { fat_msg_ratelimit(sb, KERN_ERR, "Directory bread(block %llu) failed", (llu)phys); /* skip this block */ *pos = (iblock + 1) << sb->s_blocksize_bits; goto next; } offset = *pos & (sb->s_blocksize - 1); *pos += sizeof(struct msdos_dir_entry); *de = (struct msdos_dir_entry *)((*bh)->b_data + offset); return 0; } static inline int fat_get_entry(struct inode *dir, loff_t *pos, struct buffer_head **bh, struct msdos_dir_entry **de) { /* Fast stuff first */ if (*bh && *de && (*de - (struct msdos_dir_entry *)(*bh)->b_data) < MSDOS_SB(dir->i_sb)->dir_per_block - 1) { *pos += sizeof(struct msdos_dir_entry); (*de)++; return 0; } return fat__get_entry(dir, pos, bh, de); } /* * Convert Unicode 16 to UTF-8, translated Unicode, or ASCII. * If uni_xlate is enabled and we can't get a 1:1 conversion, use a * colon as an escape character since it is normally invalid on the vfat * filesystem. 
The following four characters are the hexadecimal digits * of Unicode value. This lets us do a full dump and restore of Unicode * filenames. We could get into some trouble with long Unicode names, * but ignore that right now. * Ahem... Stack smashing in ring 0 isn't fun. Fixed. */ static int uni16_to_x8(struct super_block *sb, unsigned char *ascii, const wchar_t *uni, int len, struct nls_table *nls) { int uni_xlate = MSDOS_SB(sb)->options.unicode_xlate; const wchar_t *ip; wchar_t ec; unsigned char *op; int charlen; ip = uni; op = ascii; while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) { ec = *ip++; charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE); if (charlen > 0) { op += charlen; len -= charlen; } else { if (uni_xlate == 1) { *op++ = ':'; op = hex_byte_pack(op, ec >> 8); op = hex_byte_pack(op, ec); len -= 5; } else { *op++ = '?'; len--; } } } if (unlikely(*ip)) { fat_msg(sb, KERN_WARNING, "filename was truncated while converting."); } *op = 0; return op - ascii; } static inline int fat_uni_to_x8(struct super_block *sb, const wchar_t *uni, unsigned char *buf, int size) { struct msdos_sb_info *sbi = MSDOS_SB(sb); if (sbi->options.utf8) return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS, UTF16_HOST_ENDIAN, buf, size); else return uni16_to_x8(sb, buf, uni, size, sbi->nls_io); } static inline int fat_short2uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni) { int charlen; charlen = t->char2uni(c, clen, uni); if (charlen < 0) { *uni = 0x003f; /* a question mark */ charlen = 1; } return charlen; } static inline int fat_short2lower_uni(struct nls_table *t, unsigned char *c, int clen, wchar_t *uni) { int charlen; wchar_t wc; charlen = t->char2uni(c, clen, &wc); if (charlen < 0) { *uni = 0x003f; /* a question mark */ charlen = 1; } else if (charlen <= 1) { unsigned char nc = t->charset2lower[*c]; if (!nc) nc = *c; charlen = t->char2uni(&nc, 1, uni); if (charlen < 0) { *uni = 0x003f; /* a question mark */ charlen = 1; } } else *uni = wc; return charlen; } static inline int fat_shortname2uni(struct nls_table *nls, unsigned char *buf, int buf_size, wchar_t *uni_buf, unsigned short opt, int lower) { int len = 0; if (opt & VFAT_SFN_DISPLAY_LOWER) len = fat_short2lower_uni(nls, buf, buf_size, uni_buf); else if (opt & VFAT_SFN_DISPLAY_WIN95) len = fat_short2uni(nls, buf, buf_size, uni_buf); else if (opt & VFAT_SFN_DISPLAY_WINNT) { if (lower) len = fat_short2lower_uni(nls, buf, buf_size, uni_buf); else len = fat_short2uni(nls, buf, buf_size, uni_buf); } else len = fat_short2uni(nls, buf, buf_size, uni_buf); return len; } static inline int fat_name_match(struct msdos_sb_info *sbi, const unsigned char *a, int a_len, const unsigned char *b, int b_len) { if (a_len != b_len) return 0; if (sbi->options.name_check != 's') return !nls_strnicmp(sbi->nls_io, a, b, a_len); else return !memcmp(a, b, a_len); } enum { PARSE_INVALID = 1, PARSE_NOT_LONGNAME, PARSE_EOF, }; /** * fat_parse_long - Parse extended directory entry. * * @dir: Pointer to the inode that represents the directory. * @pos: On input, contains the starting position to read from. * On output, updated with the new position. * @bh: Pointer to the buffer head that may be used for reading directory * entries. May be updated. * @de: On input, points to the current directory entry. * On output, points to the next directory entry. * @unicode: Pointer to a buffer where the parsed Unicode long filename will be * stored. * @nr_slots: Pointer to a variable that will store the number of longname * slots found. 
* * This function returns zero on success, negative value on error, or one of * the following: * * %PARSE_INVALID - Directory entry is invalid. * %PARSE_NOT_LONGNAME - Directory entry does not contain longname. * %PARSE_EOF - Directory has no more entries. */ static int fat_parse_long(struct inode *dir, loff_t *pos, struct buffer_head **bh, struct msdos_dir_entry **de, wchar_t **unicode, unsigned char *nr_slots) { struct msdos_dir_slot *ds; unsigned char id, slot, slots, alias_checksum; if (!*unicode) { *unicode = __getname(); if (!*unicode) { brelse(*bh); return -ENOMEM; } } parse_long: ds = (struct msdos_dir_slot *)*de; id = ds->id; if (!(id & 0x40)) return PARSE_INVALID; slots = id & ~0x40; if (slots > 20 || !slots) /* ceil(256 * 2 / 26) */ return PARSE_INVALID; *nr_slots = slots; alias_checksum = ds->alias_checksum; slot = slots; while (1) { int offset; slot--; offset = slot * 13; fat16_towchar(*unicode + offset, ds->name0_4, 5); fat16_towchar(*unicode + offset + 5, ds->name5_10, 6); fat16_towchar(*unicode + offset + 11, ds->name11_12, 2); if (ds->id & 0x40) (*unicode)[offset + 13] = 0; if (fat_get_entry(dir, pos, bh, de) < 0) return PARSE_EOF; if (slot == 0) break; ds = (struct msdos_dir_slot *)*de; if (ds->attr != ATTR_EXT) return PARSE_NOT_LONGNAME; if ((ds->id & ~0x40) != slot) goto parse_long; if (ds->alias_checksum != alias_checksum) goto parse_long; } if ((*de)->name[0] == DELETED_FLAG) return PARSE_INVALID; if ((*de)->attr == ATTR_EXT) goto parse_long; if (IS_FREE((*de)->name) || ((*de)->attr & ATTR_VOLUME)) return PARSE_INVALID; if (fat_checksum((*de)->name) != alias_checksum) *nr_slots = 0; return 0; } /** * fat_parse_short - Parse MS-DOS (short) directory entry. * @sb: superblock * @de: directory entry to parse * @name: FAT_MAX_SHORT_SIZE array in which to place extracted name * @dot_hidden: Nonzero == prepend '.' to names with ATTR_HIDDEN * * Returns the number of characters extracted into 'name'. */ static int fat_parse_short(struct super_block *sb, const struct msdos_dir_entry *de, unsigned char *name, int dot_hidden) { const struct msdos_sb_info *sbi = MSDOS_SB(sb); int isvfat = sbi->options.isvfat; int nocase = sbi->options.nocase; unsigned short opt_shortname = sbi->options.shortname; struct nls_table *nls_disk = sbi->nls_disk; wchar_t uni_name[14]; unsigned char c, work[MSDOS_NAME]; unsigned char *ptname = name; int chi, chl, i, j, k; int dotoffset = 0; int name_len = 0, uni_len = 0; if (!isvfat && dot_hidden && (de->attr & ATTR_HIDDEN)) { *ptname++ = '.'; dotoffset = 1; } memcpy(work, de->name, sizeof(work)); /* For an explanation of the special treatment of 0x05 in * filenames, see msdos_format_name in namei_msdos.c */ if (work[0] == 0x05) work[0] = 0xE5; /* Filename */ for (i = 0, j = 0; i < 8;) { c = work[i]; if (!c) break; chl = fat_shortname2uni(nls_disk, &work[i], 8 - i, &uni_name[j++], opt_shortname, de->lcase & CASE_LOWER_BASE); if (chl <= 1) { if (!isvfat) ptname[i] = nocase ? c : fat_tolower(c); i++; if (c != ' ') { name_len = i; uni_len = j; } } else { uni_len = j; if (isvfat) i += min(chl, 8-i); else { for (chi = 0; chi < chl && i < 8; chi++, i++) ptname[i] = work[i]; } if (chl) name_len = i; } } i = name_len; j = uni_len; fat_short2uni(nls_disk, ".", 1, &uni_name[j++]); if (!isvfat) ptname[i] = '.'; i++; /* Extension */ for (k = 8; k < MSDOS_NAME;) { c = work[k]; if (!c) break; chl = fat_shortname2uni(nls_disk, &work[k], MSDOS_NAME - k, &uni_name[j++], opt_shortname, de->lcase & CASE_LOWER_EXT); if (chl <= 1) { k++; if (!isvfat) ptname[i] = nocase ? 
c : fat_tolower(c); i++; if (c != ' ') { name_len = i; uni_len = j; } } else { uni_len = j; if (isvfat) { int offset = min(chl, MSDOS_NAME-k); k += offset; i += offset; } else { for (chi = 0; chi < chl && k < MSDOS_NAME; chi++, i++, k++) { ptname[i] = work[k]; } } if (chl) name_len = i; } } if (name_len > 0) { name_len += dotoffset; if (sbi->options.isvfat) { uni_name[uni_len] = 0x0000; name_len = fat_uni_to_x8(sb, uni_name, name, FAT_MAX_SHORT_SIZE); } } return name_len; } /* * Return values: negative -> error/not found, 0 -> found. */ int fat_search_long(struct inode *inode, const unsigned char *name, int name_len, struct fat_slot_info *sinfo) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh = NULL; struct msdos_dir_entry *de; unsigned char nr_slots; wchar_t *unicode = NULL; unsigned char bufname[FAT_MAX_SHORT_SIZE]; loff_t cpos = 0; int err, len; err = -ENOENT; while (1) { if (fat_get_entry(inode, &cpos, &bh, &de) == -1) goto end_of_dir; parse_record: nr_slots = 0; if (de->name[0] == DELETED_FLAG) continue; if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME)) continue; if (de->attr != ATTR_EXT && IS_FREE(de->name)) continue; if (de->attr == ATTR_EXT) { int status = fat_parse_long(inode, &cpos, &bh, &de, &unicode, &nr_slots); if (status < 0) { err = status; goto end_of_dir; } else if (status == PARSE_INVALID) continue; else if (status == PARSE_NOT_LONGNAME) goto parse_record; else if (status == PARSE_EOF) goto end_of_dir; } /* Never prepend '.' to hidden files here. * That is done only for msdos mounts (and only when * 'dotsOK=yes'); if we are executing here, it is in the * context of a vfat mount. */ len = fat_parse_short(sb, de, bufname, 0); if (len == 0) continue; /* Compare shortname */ if (fat_name_match(sbi, name, name_len, bufname, len)) goto found; if (nr_slots) { void *longname = unicode + FAT_MAX_UNI_CHARS; int size = PATH_MAX - FAT_MAX_UNI_SIZE; /* Compare longname */ len = fat_uni_to_x8(sb, unicode, longname, size); if (fat_name_match(sbi, name, name_len, longname, len)) goto found; } } found: nr_slots++; /* include the de */ sinfo->slot_off = cpos - nr_slots * sizeof(*de); sinfo->nr_slots = nr_slots; sinfo->de = de; sinfo->bh = bh; sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); err = 0; end_of_dir: if (unicode) __putname(unicode); return err; } EXPORT_SYMBOL_GPL(fat_search_long); struct fat_ioctl_filldir_callback { struct dir_context ctx; void __user *dirent; int result; /* for dir ioctl */ const char *longname; int long_len; const char *shortname; int short_len; }; static int __fat_readdir(struct inode *inode, struct file *file, struct dir_context *ctx, int short_only, struct fat_ioctl_filldir_callback *both) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh; struct msdos_dir_entry *de; unsigned char nr_slots; wchar_t *unicode = NULL; unsigned char bufname[FAT_MAX_SHORT_SIZE]; int isvfat = sbi->options.isvfat; const char *fill_name = NULL; int fake_offset = 0; loff_t cpos; int short_len = 0, fill_len = 0; int ret = 0; mutex_lock(&sbi->s_lock); cpos = ctx->pos; /* Fake . and .. for the root directory. 
*/ if (inode->i_ino == MSDOS_ROOT_INO) { if (!dir_emit_dots(file, ctx)) goto out; if (ctx->pos == 2) { fake_offset = 1; cpos = 0; } } if (cpos & (sizeof(struct msdos_dir_entry) - 1)) { ret = -ENOENT; goto out; } bh = NULL; get_new: if (fat_get_entry(inode, &cpos, &bh, &de) == -1) goto end_of_dir; parse_record: nr_slots = 0; /* * Check for long filename entry, but if short_only, we don't * need to parse long filename. */ if (isvfat && !short_only) { if (de->name[0] == DELETED_FLAG) goto record_end; if (de->attr != ATTR_EXT && (de->attr & ATTR_VOLUME)) goto record_end; if (de->attr != ATTR_EXT && IS_FREE(de->name)) goto record_end; } else { if ((de->attr & ATTR_VOLUME) || IS_FREE(de->name)) goto record_end; } if (isvfat && de->attr == ATTR_EXT) { int status = fat_parse_long(inode, &cpos, &bh, &de, &unicode, &nr_slots); if (status < 0) { bh = NULL; ret = status; goto end_of_dir; } else if (status == PARSE_INVALID) goto record_end; else if (status == PARSE_NOT_LONGNAME) goto parse_record; else if (status == PARSE_EOF) goto end_of_dir; if (nr_slots) { void *longname = unicode + FAT_MAX_UNI_CHARS; int size = PATH_MAX - FAT_MAX_UNI_SIZE; int len = fat_uni_to_x8(sb, unicode, longname, size); fill_name = longname; fill_len = len; /* !both && !short_only, so we don't need shortname. */ if (!both) goto start_filldir; short_len = fat_parse_short(sb, de, bufname, sbi->options.dotsOK); if (short_len == 0) goto record_end; /* hack for fat_ioctl_filldir() */ both->longname = fill_name; both->long_len = fill_len; both->shortname = bufname; both->short_len = short_len; fill_name = NULL; fill_len = 0; goto start_filldir; } } short_len = fat_parse_short(sb, de, bufname, sbi->options.dotsOK); if (short_len == 0) goto record_end; fill_name = bufname; fill_len = short_len; start_filldir: ctx->pos = cpos - (nr_slots + 1) * sizeof(struct msdos_dir_entry); if (fake_offset && ctx->pos < 2) ctx->pos = 2; if (!memcmp(de->name, MSDOS_DOT, MSDOS_NAME)) { if (!dir_emit_dot(file, ctx)) goto fill_failed; } else if (!memcmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { if (!dir_emit_dotdot(file, ctx)) goto fill_failed; } else { unsigned long inum; loff_t i_pos = fat_make_i_pos(sb, bh, de); struct inode *tmp = fat_iget(sb, i_pos); if (tmp) { inum = tmp->i_ino; iput(tmp); } else inum = iunique(sb, MSDOS_ROOT_INO); if (!dir_emit(ctx, fill_name, fill_len, inum, (de->attr & ATTR_DIR) ? 
DT_DIR : DT_REG)) goto fill_failed; } record_end: fake_offset = 0; ctx->pos = cpos; goto get_new; end_of_dir: if (fake_offset && cpos < 2) ctx->pos = 2; else ctx->pos = cpos; fill_failed: brelse(bh); if (unicode) __putname(unicode); out: mutex_unlock(&sbi->s_lock); return ret; } static int fat_readdir(struct file *file, struct dir_context *ctx) { return __fat_readdir(file_inode(file), file, ctx, 0, NULL); } #define FAT_IOCTL_FILLDIR_FUNC(func, dirent_type) \ static bool func(struct dir_context *ctx, const char *name, int name_len, \ loff_t offset, u64 ino, unsigned int d_type) \ { \ struct fat_ioctl_filldir_callback *buf = \ container_of(ctx, struct fat_ioctl_filldir_callback, ctx); \ struct dirent_type __user *d1 = buf->dirent; \ struct dirent_type __user *d2 = d1 + 1; \ \ if (buf->result) \ return false; \ buf->result++; \ \ if (name != NULL) { \ /* dirent has only short name */ \ if (name_len >= sizeof(d1->d_name)) \ name_len = sizeof(d1->d_name) - 1; \ \ if (put_user(0, &d2->d_name[0]) || \ put_user(0, &d2->d_reclen) || \ copy_to_user(d1->d_name, name, name_len) || \ put_user(0, d1->d_name + name_len) || \ put_user(name_len, &d1->d_reclen)) \ goto efault; \ } else { \ /* dirent has short and long name */ \ const char *longname = buf->longname; \ int long_len = buf->long_len; \ const char *shortname = buf->shortname; \ int short_len = buf->short_len; \ \ if (long_len >= sizeof(d1->d_name)) \ long_len = sizeof(d1->d_name) - 1; \ if (short_len >= sizeof(d1->d_name)) \ short_len = sizeof(d1->d_name) - 1; \ \ if (copy_to_user(d2->d_name, longname, long_len) || \ put_user(0, d2->d_name + long_len) || \ put_user(long_len, &d2->d_reclen) || \ put_user(ino, &d2->d_ino) || \ put_user(offset, &d2->d_off) || \ copy_to_user(d1->d_name, shortname, short_len) || \ put_user(0, d1->d_name + short_len) || \ put_user(short_len, &d1->d_reclen)) \ goto efault; \ } \ return true; \ efault: \ buf->result = -EFAULT; \ return false; \ } FAT_IOCTL_FILLDIR_FUNC(fat_ioctl_filldir, __fat_dirent) static int fat_ioctl_readdir(struct inode *inode, struct file *file, void __user *dirent, filldir_t filldir, int short_only, int both) { struct fat_ioctl_filldir_callback buf = { .ctx.actor = filldir, .dirent = dirent }; int ret; buf.dirent = dirent; buf.result = 0; inode_lock_shared(inode); buf.ctx.pos = file->f_pos; ret = -ENOENT; if (!IS_DEADDIR(inode)) { ret = __fat_readdir(inode, file, &buf.ctx, short_only, both ? &buf : NULL); file->f_pos = buf.ctx.pos; } inode_unlock_shared(inode); if (ret >= 0) ret = buf.result; return ret; } static long fat_dir_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg; int short_only, both; switch (cmd) { case VFAT_IOCTL_READDIR_SHORT: short_only = 1; both = 0; break; case VFAT_IOCTL_READDIR_BOTH: short_only = 0; both = 1; break; default: return fat_generic_ioctl(filp, cmd, arg); } /* * Yes, we don't need this put_user() absolutely. However old * code didn't return the right value. So, app use this value, * in order to check whether it is EOF. 
*/ if (put_user(0, &d1->d_reclen)) return -EFAULT; return fat_ioctl_readdir(inode, filp, d1, fat_ioctl_filldir, short_only, both); } #ifdef CONFIG_COMPAT #define VFAT_IOCTL_READDIR_BOTH32 _IOR('r', 1, struct compat_dirent[2]) #define VFAT_IOCTL_READDIR_SHORT32 _IOR('r', 2, struct compat_dirent[2]) FAT_IOCTL_FILLDIR_FUNC(fat_compat_ioctl_filldir, compat_dirent) static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd, unsigned long arg) { struct inode *inode = file_inode(filp); struct compat_dirent __user *d1 = compat_ptr(arg); int short_only, both; switch (cmd) { case VFAT_IOCTL_READDIR_SHORT32: short_only = 1; both = 0; break; case VFAT_IOCTL_READDIR_BOTH32: short_only = 0; both = 1; break; default: return fat_generic_ioctl(filp, cmd, (unsigned long)arg); } /* * Yes, we don't need this put_user() absolutely. However old * code didn't return the right value. So, app use this value, * in order to check whether it is EOF. */ if (put_user(0, &d1->d_reclen)) return -EFAULT; return fat_ioctl_readdir(inode, filp, d1, fat_compat_ioctl_filldir, short_only, both); } #endif /* CONFIG_COMPAT */ const struct file_operations fat_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = fat_readdir, .unlocked_ioctl = fat_dir_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = fat_compat_dir_ioctl, #endif .fsync = fat_file_fsync, }; static int fat_get_short_entry(struct inode *dir, loff_t *pos, struct buffer_head **bh, struct msdos_dir_entry **de) { while (fat_get_entry(dir, pos, bh, de) >= 0) { /* free entry or long name entry or volume label */ if (!IS_FREE((*de)->name) && !((*de)->attr & ATTR_VOLUME)) return 0; } return -ENOENT; } /* * The ".." entry can not provide the "struct fat_slot_info" information * for inode, nor a usable i_pos. So, this function provides some information * only. * * Since this function walks through the on-disk inodes within a directory, * callers are responsible for taking any locks necessary to prevent the * directory from changing. */ int fat_get_dotdot_entry(struct inode *dir, struct buffer_head **bh, struct msdos_dir_entry **de) { loff_t offset = 0; *de = NULL; while (fat_get_short_entry(dir, &offset, bh, de) >= 0) { if (!strncmp((*de)->name, MSDOS_DOTDOT, MSDOS_NAME)) return 0; } return -ENOENT; } EXPORT_SYMBOL_GPL(fat_get_dotdot_entry); /* See if directory is empty */ int fat_dir_empty(struct inode *dir) { struct buffer_head *bh; struct msdos_dir_entry *de; loff_t cpos; int result = 0; bh = NULL; cpos = 0; while (fat_get_short_entry(dir, &cpos, &bh, &de) >= 0) { if (strncmp(de->name, MSDOS_DOT , MSDOS_NAME) && strncmp(de->name, MSDOS_DOTDOT, MSDOS_NAME)) { result = -ENOTEMPTY; break; } } brelse(bh); return result; } EXPORT_SYMBOL_GPL(fat_dir_empty); /* * fat_subdirs counts the number of sub-directories of dir. It can be run * on directories being created. */ int fat_subdirs(struct inode *dir) { struct buffer_head *bh; struct msdos_dir_entry *de; loff_t cpos; int count = 0; bh = NULL; cpos = 0; while (fat_get_short_entry(dir, &cpos, &bh, &de) >= 0) { if (de->attr & ATTR_DIR) count++; } brelse(bh); return count; } /* * Scans a directory for a given file (name points to its formatted name). * Returns an error code or zero. 
*/ int fat_scan(struct inode *dir, const unsigned char *name, struct fat_slot_info *sinfo) { struct super_block *sb = dir->i_sb; sinfo->slot_off = 0; sinfo->bh = NULL; while (fat_get_short_entry(dir, &sinfo->slot_off, &sinfo->bh, &sinfo->de) >= 0) { if (!strncmp(sinfo->de->name, name, MSDOS_NAME)) { sinfo->slot_off -= sizeof(*sinfo->de); sinfo->nr_slots = 1; sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); return 0; } } return -ENOENT; } EXPORT_SYMBOL_GPL(fat_scan); /* * Scans a directory for a given logstart. * Returns an error code or zero. */ int fat_scan_logstart(struct inode *dir, int i_logstart, struct fat_slot_info *sinfo) { struct super_block *sb = dir->i_sb; sinfo->slot_off = 0; sinfo->bh = NULL; while (fat_get_short_entry(dir, &sinfo->slot_off, &sinfo->bh, &sinfo->de) >= 0) { if (fat_get_start(MSDOS_SB(sb), sinfo->de) == i_logstart) { sinfo->slot_off -= sizeof(*sinfo->de); sinfo->nr_slots = 1; sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); return 0; } } return -ENOENT; } static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots) { struct super_block *sb = dir->i_sb; struct buffer_head *bh; struct msdos_dir_entry *de, *endp; int err = 0, orig_slots; while (nr_slots) { bh = NULL; if (fat_get_entry(dir, &pos, &bh, &de) < 0) { err = -EIO; break; } orig_slots = nr_slots; endp = (struct msdos_dir_entry *)(bh->b_data + sb->s_blocksize); while (nr_slots && de < endp) { de->name[0] = DELETED_FLAG; de++; nr_slots--; } mark_buffer_dirty_inode(bh, dir); if (IS_DIRSYNC(dir)) err = sync_dirty_buffer(bh); brelse(bh); if (err) break; /* pos is *next* de's position, so this does `- sizeof(de)' */ pos += ((orig_slots - nr_slots) * sizeof(*de)) - sizeof(*de); } return err; } int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo) { struct super_block *sb = dir->i_sb; struct msdos_dir_entry *de; struct buffer_head *bh; int err = 0, nr_slots; /* * First stage: Remove the shortname. By this, the directory * entry is removed. */ nr_slots = sinfo->nr_slots; de = sinfo->de; sinfo->de = NULL; bh = sinfo->bh; sinfo->bh = NULL; while (nr_slots && de >= (struct msdos_dir_entry *)bh->b_data) { de->name[0] = DELETED_FLAG; de--; nr_slots--; } mark_buffer_dirty_inode(bh, dir); if (IS_DIRSYNC(dir)) err = sync_dirty_buffer(bh); brelse(bh); if (err) return err; inode_inc_iversion(dir); if (nr_slots) { /* * Second stage: remove the remaining longname slots. 
* (This directory entry is already removed, and so return * the success) */ err = __fat_remove_entries(dir, sinfo->slot_off, nr_slots); if (err) { fat_msg(sb, KERN_WARNING, "Couldn't remove the long name slots"); } } fat_truncate_time(dir, NULL, S_ATIME|S_MTIME); if (IS_DIRSYNC(dir)) (void)fat_sync_inode(dir); else mark_inode_dirty(dir); return 0; } EXPORT_SYMBOL_GPL(fat_remove_entries); static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used, struct buffer_head **bhs, int nr_bhs) { struct super_block *sb = dir->i_sb; sector_t last_blknr = blknr + MSDOS_SB(sb)->sec_per_clus; int err, i, n; /* Zeroing the unused blocks on this cluster */ blknr += nr_used; n = nr_used; while (blknr < last_blknr) { bhs[n] = sb_getblk(sb, blknr); if (!bhs[n]) { err = -ENOMEM; goto error; } /* Avoid race with userspace read via bdev */ lock_buffer(bhs[n]); memset(bhs[n]->b_data, 0, sb->s_blocksize); set_buffer_uptodate(bhs[n]); unlock_buffer(bhs[n]); mark_buffer_dirty_inode(bhs[n], dir); n++; blknr++; if (n == nr_bhs) { if (IS_DIRSYNC(dir)) { err = fat_sync_bhs(bhs, n); if (err) goto error; } for (i = 0; i < n; i++) brelse(bhs[i]); n = 0; } } if (IS_DIRSYNC(dir)) { err = fat_sync_bhs(bhs, n); if (err) goto error; } for (i = 0; i < n; i++) brelse(bhs[i]); return 0; error: for (i = 0; i < n; i++) bforget(bhs[i]); return err; } int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts) { struct super_block *sb = dir->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bhs[MAX_BUF_PER_PAGE]; struct msdos_dir_entry *de; sector_t blknr; __le16 date, time; u8 time_cs; int err, cluster; err = fat_alloc_clusters(dir, &cluster, 1); if (err) goto error; blknr = fat_clus_to_blknr(sbi, cluster); bhs[0] = sb_getblk(sb, blknr); if (!bhs[0]) { err = -ENOMEM; goto error_free; } fat_time_unix2fat(sbi, ts, &time, &date, &time_cs); de = (struct msdos_dir_entry *)bhs[0]->b_data; /* Avoid race with userspace read via bdev */ lock_buffer(bhs[0]); /* filling the new directory slots ("." and ".." entries) */ memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME); memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME); de->attr = de[1].attr = ATTR_DIR; de[0].lcase = de[1].lcase = 0; de[0].time = de[1].time = time; de[0].date = de[1].date = date; if (sbi->options.isvfat) { /* extra timestamps */ de[0].ctime = de[1].ctime = time; de[0].ctime_cs = de[1].ctime_cs = time_cs; de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = date; } else { de[0].ctime = de[1].ctime = 0; de[0].ctime_cs = de[1].ctime_cs = 0; de[0].adate = de[0].cdate = de[1].adate = de[1].cdate = 0; } fat_set_start(&de[0], cluster); fat_set_start(&de[1], MSDOS_I(dir)->i_logstart); de[0].size = de[1].size = 0; memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); set_buffer_uptodate(bhs[0]); unlock_buffer(bhs[0]); mark_buffer_dirty_inode(bhs[0], dir); err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); if (err) goto error_free; return cluster; error_free: fat_free_clusters(dir, cluster); error: return err; } EXPORT_SYMBOL_GPL(fat_alloc_new_dir); static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, int *nr_cluster, struct msdos_dir_entry **de, struct buffer_head **bh, loff_t *i_pos) { struct super_block *sb = dir->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bhs[MAX_BUF_PER_PAGE]; sector_t blknr, start_blknr, last_blknr; unsigned long size, copy; int err, i, n, offset, cluster[2]; /* * The minimum cluster size is 512bytes, and maximum entry * size is 32*slots (672bytes). 
So, iff the cluster size is * 512bytes, we may need two clusters. */ size = nr_slots * sizeof(struct msdos_dir_entry); *nr_cluster = (size + (sbi->cluster_size - 1)) >> sbi->cluster_bits; BUG_ON(*nr_cluster > 2); err = fat_alloc_clusters(dir, cluster, *nr_cluster); if (err) goto error; /* * First stage: Fill the directory entry. NOTE: This cluster * is not referenced from any inode yet, so updates order is * not important. */ i = n = copy = 0; do { start_blknr = blknr = fat_clus_to_blknr(sbi, cluster[i]); last_blknr = start_blknr + sbi->sec_per_clus; while (blknr < last_blknr) { bhs[n] = sb_getblk(sb, blknr); if (!bhs[n]) { err = -ENOMEM; goto error_nomem; } /* fill the directory entry */ copy = min(size, sb->s_blocksize); /* Avoid race with userspace read via bdev */ lock_buffer(bhs[n]); memcpy(bhs[n]->b_data, slots, copy); set_buffer_uptodate(bhs[n]); unlock_buffer(bhs[n]); mark_buffer_dirty_inode(bhs[n], dir); slots += copy; size -= copy; if (!size) break; n++; blknr++; } } while (++i < *nr_cluster); memset(bhs[n]->b_data + copy, 0, sb->s_blocksize - copy); offset = copy - sizeof(struct msdos_dir_entry); get_bh(bhs[n]); *bh = bhs[n]; *de = (struct msdos_dir_entry *)((*bh)->b_data + offset); *i_pos = fat_make_i_pos(sb, *bh, *de); /* Second stage: clear the rest of cluster, and write outs */ err = fat_zeroed_cluster(dir, start_blknr, ++n, bhs, MAX_BUF_PER_PAGE); if (err) goto error_free; return cluster[0]; error_free: brelse(*bh); *bh = NULL; n = 0; error_nomem: for (i = 0; i < n; i++) bforget(bhs[i]); fat_free_clusters(dir, cluster[0]); error: return err; } int fat_add_entries(struct inode *dir, void *slots, int nr_slots, struct fat_slot_info *sinfo) { struct super_block *sb = dir->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */ struct msdos_dir_entry *de; int err, free_slots, i, nr_bhs; loff_t pos, i_pos; sinfo->nr_slots = nr_slots; /* First stage: search free directory entries */ free_slots = nr_bhs = 0; bh = prev = NULL; pos = 0; err = -ENOSPC; while (fat_get_entry(dir, &pos, &bh, &de) > -1) { /* check the maximum size of directory */ if (pos >= FAT_MAX_DIR_SIZE) goto error; if (IS_FREE(de->name)) { if (prev != bh) { get_bh(bh); bhs[nr_bhs] = prev = bh; nr_bhs++; } free_slots++; if (free_slots == nr_slots) goto found; } else { for (i = 0; i < nr_bhs; i++) brelse(bhs[i]); prev = NULL; free_slots = nr_bhs = 0; } } if (dir->i_ino == MSDOS_ROOT_INO) { if (!is_fat32(sbi)) goto error; } else if (MSDOS_I(dir)->i_start == 0) { fat_msg(sb, KERN_ERR, "Corrupted directory (i_pos %lld)", MSDOS_I(dir)->i_pos); err = -EIO; goto error; } found: err = 0; pos -= free_slots * sizeof(*de); nr_slots -= free_slots; if (free_slots) { /* * Second stage: filling the free entries with new entries. * NOTE: If this slots has shortname, first, we write * the long name slots, then write the short name. */ int size = free_slots * sizeof(*de); int offset = pos & (sb->s_blocksize - 1); int long_bhs = nr_bhs - (nr_slots == 0); /* Fill the long name slots. */ for (i = 0; i < long_bhs; i++) { int copy = min_t(int, sb->s_blocksize - offset, size); memcpy(bhs[i]->b_data + offset, slots, copy); mark_buffer_dirty_inode(bhs[i], dir); offset = 0; slots += copy; size -= copy; } if (long_bhs && IS_DIRSYNC(dir)) err = fat_sync_bhs(bhs, long_bhs); if (!err && i < nr_bhs) { /* Fill the short name slot. 
*/ int copy = min_t(int, sb->s_blocksize - offset, size); memcpy(bhs[i]->b_data + offset, slots, copy); mark_buffer_dirty_inode(bhs[i], dir); if (IS_DIRSYNC(dir)) err = sync_dirty_buffer(bhs[i]); } for (i = 0; i < nr_bhs; i++) brelse(bhs[i]); if (err) goto error_remove; } if (nr_slots) { int cluster, nr_cluster; /* * Third stage: allocate the cluster for new entries. * And initialize the cluster with new entries, then * add the cluster to dir. */ cluster = fat_add_new_entries(dir, slots, nr_slots, &nr_cluster, &de, &bh, &i_pos); if (cluster < 0) { err = cluster; goto error_remove; } err = fat_chain_add(dir, cluster, nr_cluster); if (err) { fat_free_clusters(dir, cluster); goto error_remove; } if (dir->i_size & (sbi->cluster_size - 1)) { fat_fs_error(sb, "Odd directory size"); dir->i_size = (dir->i_size + sbi->cluster_size - 1) & ~((loff_t)sbi->cluster_size - 1); } dir->i_size += nr_cluster << sbi->cluster_bits; MSDOS_I(dir)->mmu_private += nr_cluster << sbi->cluster_bits; } sinfo->slot_off = pos; sinfo->de = de; sinfo->bh = bh; sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); return 0; error: brelse(bh); for (i = 0; i < nr_bhs; i++) brelse(bhs[i]); return err; error_remove: brelse(bh); if (free_slots) __fat_remove_entries(dir, pos, free_slots); return err; } EXPORT_SYMBOL_GPL(fat_add_entries);
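Throughout this file, fat_make_i_pos() identifies a directory entry by packing the buffer's block number and the entry's slot index within that block into a single loff_t: the block number is shifted left by dir_per_block_bits (log2 of the number of 32-byte entries per block) and the slot index fills the low bits. The sketch below reproduces that arithmetic, assuming 512-byte blocks (16 entries per block, so 4 slot bits); the helper names are illustrative, not the kernel's.

/* Sketch of the i_pos packing used by fat_make_i_pos(): block number in the
 * high bits, slot index within the block in the low bits.
 */
#include <stdio.h>
#include <stdint.h>

#define DIR_ENTRY_SIZE		32
#define BLOCK_SIZE		512				/* assumed block size */
#define DIR_PER_BLOCK		(BLOCK_SIZE / DIR_ENTRY_SIZE)	/* 16 */
#define DIR_PER_BLOCK_BITS	4				/* log2(16) */

static uint64_t make_i_pos(uint64_t blocknr, unsigned int slot)
{
	return (blocknr << DIR_PER_BLOCK_BITS) | slot;
}

int main(void)
{
	uint64_t i_pos = make_i_pos(1234, 7);	/* entry 7 in block 1234 */

	printf("i_pos   = %llu\n", (unsigned long long)i_pos);
	printf("blocknr = %llu\n", (unsigned long long)(i_pos >> DIR_PER_BLOCK_BITS));
	printf("slot    = %llu\n", (unsigned long long)(i_pos & (DIR_PER_BLOCK - 1)));
	return 0;
}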
// SPDX-License-Identifier: GPL-2.0
/*
 * USB-ACPI glue code
 *
 * Copyright 2012 Red Hat <mjg@redhat.com>
 */
#include <linux/module.h>
#include <linux/usb.h>
#include <linux/device.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/pci.h>
#include <linux/usb/hcd.h>

#include "hub.h"

/**
 * usb_acpi_power_manageable - check whether usb port has
 * acpi power resource.
 * @hdev: USB device belonging to the usb hub
 * @index: zero-based port index
 *
 * Return true if the port has an acpi power resource, false otherwise.
 */
bool usb_acpi_power_manageable(struct usb_device *hdev, int index)
{
	acpi_handle port_handle;
	int port1 = index + 1;

	port_handle = usb_get_hub_port_acpi_handle(hdev, port1);
	if (port_handle)
		return acpi_bus_power_manageable(port_handle);
	else
		return false;
}
EXPORT_SYMBOL_GPL(usb_acpi_power_manageable);

#define UUID_USB_CONTROLLER_DSM "ce2ee385-00e6-48cb-9f05-2edb927c4899"
#define USB_DSM_DISABLE_U1_U2_FOR_PORT	5

/**
 * usb_acpi_port_lpm_incapable - check if lpm should be disabled for a port.
 * @hdev: USB device belonging to the usb hub
 * @index: zero based port index
 *
 * Some USB3 ports may not support USB3 link power management U1/U2 states
 * due to different retimer setup. ACPI provides a _DSM method which returns 0x01
 * if U1 and U2 states should be disabled.
Evaluate _DSM with: * Arg0: UUID = ce2ee385-00e6-48cb-9f05-2edb927c4899 * Arg1: Revision ID = 0 * Arg2: Function Index = 5 * Arg3: (empty) * * Return 1 if USB3 port is LPM incapable, negative on error, otherwise 0 */ int usb_acpi_port_lpm_incapable(struct usb_device *hdev, int index) { union acpi_object *obj; acpi_handle port_handle; int port1 = index + 1; guid_t guid; int ret; ret = guid_parse(UUID_USB_CONTROLLER_DSM, &guid); if (ret) return ret; port_handle = usb_get_hub_port_acpi_handle(hdev, port1); if (!port_handle) { dev_dbg(&hdev->dev, "port-%d no acpi handle\n", port1); return -ENODEV; } if (!acpi_check_dsm(port_handle, &guid, 0, BIT(USB_DSM_DISABLE_U1_U2_FOR_PORT))) { dev_dbg(&hdev->dev, "port-%d no _DSM function %d\n", port1, USB_DSM_DISABLE_U1_U2_FOR_PORT); return -ENODEV; } obj = acpi_evaluate_dsm_typed(port_handle, &guid, 0, USB_DSM_DISABLE_U1_U2_FOR_PORT, NULL, ACPI_TYPE_INTEGER); if (!obj) { dev_dbg(&hdev->dev, "evaluate port-%d _DSM failed\n", port1); return -EINVAL; } if (obj->integer.value == 0x01) ret = 1; ACPI_FREE(obj); return ret; } EXPORT_SYMBOL_GPL(usb_acpi_port_lpm_incapable); /** * usb_acpi_set_power_state - control usb port's power via acpi power * resource * @hdev: USB device belonging to the usb hub * @index: port index based zero * @enable: power state expected to be set * * Notice to use usb_acpi_power_manageable() to check whether the usb port * has acpi power resource before invoking this function. * * Returns 0 on success, else negative errno. */ int usb_acpi_set_power_state(struct usb_device *hdev, int index, bool enable) { struct usb_hub *hub = usb_hub_to_struct_hub(hdev); struct usb_port *port_dev; acpi_handle port_handle; unsigned char state; int port1 = index + 1; int error = -EINVAL; if (!hub) return -ENODEV; port_dev = hub->ports[port1 - 1]; port_handle = (acpi_handle) usb_get_hub_port_acpi_handle(hdev, port1); if (!port_handle) return error; if (enable) state = ACPI_STATE_D0; else state = ACPI_STATE_D3_COLD; error = acpi_bus_set_power(port_handle, state); if (!error) dev_dbg(&port_dev->dev, "acpi: power was set to %d\n", enable); else dev_dbg(&port_dev->dev, "acpi: power failed to be set\n"); return error; } EXPORT_SYMBOL_GPL(usb_acpi_set_power_state); /** * usb_acpi_add_usb4_devlink - add device link to USB4 Host Interface for tunneled USB3 devices * * @udev: Tunneled USB3 device connected to a roothub. * * Adds a device link between a tunneled USB3 device and the USB4 Host Interface * device to ensure correct runtime PM suspend and resume order. This function * should only be called for tunneled USB3 devices. * The USB4 Host Interface this tunneled device depends on is found from the roothub * port ACPI device specific data _DSD entry. 
* * Return: negative error code on failure, 0 otherwise */ static int usb_acpi_add_usb4_devlink(struct usb_device *udev) { struct device_link *link; struct usb_port *port_dev; struct usb_hub *hub; if (!udev->parent || udev->parent->parent) return 0; hub = usb_hub_to_struct_hub(udev->parent); if (!hub) return 0; port_dev = hub->ports[udev->portnum - 1]; struct fwnode_handle *nhi_fwnode __free(fwnode_handle) = fwnode_find_reference(dev_fwnode(&port_dev->dev), "usb4-host-interface", 0); if (IS_ERR(nhi_fwnode) || !nhi_fwnode->dev) return 0; link = device_link_add(&port_dev->child->dev, nhi_fwnode->dev, DL_FLAG_STATELESS | DL_FLAG_RPM_ACTIVE | DL_FLAG_PM_RUNTIME); if (!link) { dev_err(&port_dev->dev, "Failed to created device link from %s to %s\n", dev_name(&port_dev->child->dev), dev_name(nhi_fwnode->dev)); return -EINVAL; } dev_dbg(&port_dev->dev, "Created device link from %s to %s\n", dev_name(&port_dev->child->dev), dev_name(nhi_fwnode->dev)); udev->usb4_link = link; return 0; } /* * Private to usb-acpi, all the core needs to know is that * port_dev->location is non-zero when it has been set by the firmware. */ #define USB_ACPI_LOCATION_VALID (1 << 31) static void usb_acpi_get_connect_type(struct usb_port *port_dev, acpi_handle *handle) { enum usb_port_connect_type connect_type = USB_PORT_CONNECT_TYPE_UNKNOWN; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; union acpi_object *upc = NULL; struct acpi_pld_info *pld = NULL; acpi_status status; /* * According to 9.14 in ACPI Spec 6.2. _PLD indicates whether usb port * is user visible and _UPC indicates whether it is connectable. If * the port was visible and connectable, it could be freely connected * and disconnected with USB devices. If no visible and connectable, * a usb device is directly hard-wired to the port. If no visible and * no connectable, the port would be not used. */ if (acpi_get_physical_device_location(handle, &pld) && pld) port_dev->location = USB_ACPI_LOCATION_VALID | pld->group_token << 8 | pld->group_position; status = acpi_evaluate_object(handle, "_UPC", NULL, &buffer); if (ACPI_FAILURE(status)) goto out; upc = buffer.pointer; if (!upc || (upc->type != ACPI_TYPE_PACKAGE) || upc->package.count != 4) goto out; /* UPC states port is connectable */ if (upc->package.elements[0].integer.value) if (!pld) ; /* keep connect_type as unknown */ else if (pld->user_visible) connect_type = USB_PORT_CONNECT_TYPE_HOT_PLUG; else connect_type = USB_PORT_CONNECT_TYPE_HARD_WIRED; else connect_type = USB_PORT_NOT_USED; out: port_dev->connect_type = connect_type; kfree(upc); ACPI_FREE(pld); } static struct acpi_device * usb_acpi_get_companion_for_port(struct usb_port *port_dev) { struct usb_device *udev; struct acpi_device *adev; acpi_handle *parent_handle; int port1; /* Get the struct usb_device point of port's hub */ udev = to_usb_device(port_dev->dev.parent->parent); /* * The root hub ports' parent is the root hub. The non-root-hub * ports' parent is the parent hub port which the hub is * connected to. 
*/ if (!udev->parent) { adev = ACPI_COMPANION(&udev->dev); port1 = usb_hcd_find_raw_port_number(bus_to_hcd(udev->bus), port_dev->portnum); } else { parent_handle = usb_get_hub_port_acpi_handle(udev->parent, udev->portnum); if (!parent_handle) return NULL; adev = acpi_fetch_acpi_dev(parent_handle); port1 = port_dev->portnum; } return acpi_find_child_by_adr(adev, port1); } static struct acpi_device * usb_acpi_find_companion_for_port(struct usb_port *port_dev) { struct acpi_device *adev; adev = usb_acpi_get_companion_for_port(port_dev); if (!adev) return NULL; usb_acpi_get_connect_type(port_dev, adev->handle); return adev; } static struct acpi_device * usb_acpi_find_companion_for_device(struct usb_device *udev) { struct acpi_device *adev; struct usb_port *port_dev; struct usb_hub *hub; if (!udev->parent) { /* * root hub is only child (_ADR=0) under its parent, the HC. * sysdev pointer is the HC as seen from firmware. */ adev = ACPI_COMPANION(udev->bus->sysdev); return acpi_find_child_device(adev, 0, false); } hub = usb_hub_to_struct_hub(udev->parent); if (!hub) return NULL; /* Tunneled USB3 devices depend on USB4 Host Interface, set device link to it */ if (udev->speed >= USB_SPEED_SUPER && udev->tunnel_mode != USB_LINK_NATIVE) usb_acpi_add_usb4_devlink(udev); /* * This is an embedded USB device connected to a port and such * devices share port's ACPI companion. */ port_dev = hub->ports[udev->portnum - 1]; return usb_acpi_get_companion_for_port(port_dev); } static struct acpi_device *usb_acpi_find_companion(struct device *dev) { /* * The USB hierarchy like following: * * Device (EHC1) * Device (HUBN) * Device (PR01) * Device (PR11) * Device (PR12) * Device (FN12) * Device (FN13) * Device (PR13) * ... * where HUBN is root hub, and PRNN are USB ports and devices * connected to them, and FNNN are individualk functions for * connected composite USB devices. PRNN and FNNN may contain * _CRS and other methods describing sideband resources for * the connected device. * * On the kernel side both root hub and embedded USB devices are * represented as instances of usb_device structure, and ports * are represented as usb_port structures, so the whole process * is split into 2 parts: finding companions for devices and * finding companions for ports. * * Note that we do not handle individual functions of composite * devices yet, for that we would need to assign companions to * devices corresponding to USB interfaces. */ if (is_usb_device(dev)) return usb_acpi_find_companion_for_device(to_usb_device(dev)); else if (is_usb_port(dev)) return usb_acpi_find_companion_for_port(to_usb_port(dev)); return NULL; } static bool usb_acpi_bus_match(struct device *dev) { return is_usb_device(dev) || is_usb_port(dev); } static struct acpi_bus_type usb_acpi_bus = { .name = "USB", .match = usb_acpi_bus_match, .find_companion = usb_acpi_find_companion, }; int usb_acpi_register(void) { return register_acpi_bus_type(&usb_acpi_bus); } void usb_acpi_unregister(void) { unregister_acpi_bus_type(&usb_acpi_bus); }
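usb_acpi_get_connect_type() above folds the ACPI _PLD group token and group position into port_dev->location, with bit 31 (USB_ACPI_LOCATION_VALID) marking that firmware supplied the value. The following is a small sketch of that bit layout under the assumption that both fields fit in a byte; encode_location() and the field widths are illustrative, not a definitive description of _PLD.

/* Sketch of the port-location encoding built from _PLD data: bit 31 flags a
 * firmware-provided value, the group token sits above the group position.
 */
#include <stdio.h>
#include <stdint.h>

#define USB_ACPI_LOCATION_VALID	(1u << 31)

static uint32_t encode_location(uint8_t group_token, uint8_t group_position)
{
	return USB_ACPI_LOCATION_VALID |
	       ((uint32_t)group_token << 8) |
	       group_position;
}

int main(void)
{
	uint32_t loc = encode_location(2, 5);

	printf("valid    = %u\n", !!(loc & USB_ACPI_LOCATION_VALID));
	printf("token    = %u\n", (loc >> 8) & 0xff);
	printf("position = %u\n", loc & 0xff);
	return 0;
}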
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */

/* Kernel module implementing an IP set type: the hash:ip,port,net type */

#include <linux/jhash.h>
#include <linux/module.h>
#include <linux/ip.h>
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/random.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/netlink.h>
#include <net/tcp.h>

#include <linux/netfilter.h>
#include <linux/netfilter/ipset/pfxlen.h>
#include <linux/netfilter/ipset/ip_set.h>
#include <linux/netfilter/ipset/ip_set_getport.h>
#include <linux/netfilter/ipset/ip_set_hash.h>

#define IPSET_TYPE_REV_MIN	0
/*				0    Comments support added */
/*				1    Forceadd support added */
/*				2    skbinfo support added */
#define IPSET_TYPE_REV_MAX	3 /* bucketsize, initval support added */

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Oliver Smith <oliver@8.c.9.b.0.7.4.0.1.0.0.2.ip6.arpa>");
IP_SET_MODULE_DESC("hash:net,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX);
MODULE_ALIAS("ip_set_hash:net,port,net");

/* Type specific function prefix */
#define HTYPE		hash_netportnet
#define IP_SET_HASH_WITH_PROTO
#define
IP_SET_HASH_WITH_NETS #define IPSET_NET_COUNT 2 #define IP_SET_HASH_WITH_NET0 /* IPv4 variant */ /* Member elements */ struct hash_netportnet4_elem { union { __be32 ip[2]; __be64 ipcmp; }; __be16 port; union { u8 cidr[2]; u16 ccmp; }; u16 padding; u8 nomatch; u8 proto; }; /* Common functions */ static bool hash_netportnet4_data_equal(const struct hash_netportnet4_elem *ip1, const struct hash_netportnet4_elem *ip2, u32 *multi) { return ip1->ipcmp == ip2->ipcmp && ip1->ccmp == ip2->ccmp && ip1->port == ip2->port && ip1->proto == ip2->proto; } static int hash_netportnet4_do_data_match(const struct hash_netportnet4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_netportnet4_data_set_flags(struct hash_netportnet4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static void hash_netportnet4_data_reset_flags(struct hash_netportnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_netportnet4_data_reset_elem(struct hash_netportnet4_elem *elem, struct hash_netportnet4_elem *orig) { elem->ip[1] = orig->ip[1]; } static void hash_netportnet4_data_netmask(struct hash_netportnet4_elem *elem, u8 cidr, bool inner) { if (inner) { elem->ip[1] &= ip_set_netmask(cidr); elem->cidr[1] = cidr; } else { elem->ip[0] &= ip_set_netmask(cidr); elem->cidr[0] = cidr; } } static bool hash_netportnet4_data_list(struct sk_buff *skb, const struct hash_netportnet4_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip[0]) || nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip[1]) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_netportnet4_data_next(struct hash_netportnet4_elem *next, const struct hash_netportnet4_elem *d) { next->ipcmp = d->ipcmp; next->port = d->port; } #define MTYPE hash_netportnet4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static void hash_netportnet4_init(struct hash_netportnet4_elem *e) { e->cidr[0] = HOST_MASK; e->cidr[1] = HOST_MASK; } static int hash_netportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netportnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) e.ccmp = (HOST_MASK << (sizeof(e.cidr[0]) * 8)) | HOST_MASK; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0]); ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip[1]); e.ip[0] &= ip_set_netmask(e.cidr[0]); e.ip[1] &= ip_set_netmask(e.cidr[1]); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static u32 hash_netportnet4_range_to_cidr(u32 from, u32 to, u8 *cidr) { if (from == 0 && to == UINT_MAX) { *cidr = 0; return to; } return ip_set_range_to_cidr(from, to, cidr); } static int hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_netportnet4 *h = 
set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netportnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, p = 0, port, port_to; u32 ip2_from = 0, ip2_to = 0, ip2, i = 0; bool with_ports = false; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); hash_netportnet4_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (e.cidr[0] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } if (tb[IPSET_ATTR_CIDR2]) { e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); if (e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || with_ports || tb[IPSET_ATTR_IP2_TO])) { e.ip[0] = htonl(ip & ip_set_hostmask(e.cidr[0])); e.ip[1] = htonl(ip2_from & ip_set_hostmask(e.cidr[1])); ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 
0 : ret; } ip_to = ip; if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip > ip_to) swap(ip, ip_to); if (unlikely(ip + UINT_MAX == ip_to)) return -IPSET_ERR_HASH_RANGE; } else { ip_set_mask_from_to(ip, ip_to, e.cidr[0]); } port_to = port = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); } ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); if (ret) return ret; if (ip2_from > ip2_to) swap(ip2_from, ip2_to); if (unlikely(ip2_from + UINT_MAX == ip2_to)) return -IPSET_ERR_HASH_RANGE; } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); } if (retried) { ip = ntohl(h->next.ip[0]); p = ntohs(h->next.port); ip2 = ntohl(h->next.ip[1]); } else { p = port; ip2 = ip2_from; } do { e.ip[0] = htonl(ip); ip = hash_netportnet4_range_to_cidr(ip, ip_to, &e.cidr[0]); for (; p <= port_to; p++) { e.port = htons(p); do { i++; e.ip[1] = htonl(ip2); if (i > IPSET_MAX_RANGE) { hash_netportnet4_data_next(&h->next, &e); return -ERANGE; } ip2 = hash_netportnet4_range_to_cidr(ip2, ip2_to, &e.cidr[1]); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } while (ip2++ < ip2_to); ip2 = ip2_from; } p = port; } while (ip++ < ip_to); return ret; } /* IPv6 variant */ struct hash_netportnet6_elem { union nf_inet_addr ip[2]; __be16 port; union { u8 cidr[2]; u16 ccmp; }; u16 padding; u8 nomatch; u8 proto; }; /* Common functions */ static bool hash_netportnet6_data_equal(const struct hash_netportnet6_elem *ip1, const struct hash_netportnet6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip[0].in6, &ip2->ip[0].in6) && ipv6_addr_equal(&ip1->ip[1].in6, &ip2->ip[1].in6) && ip1->ccmp == ip2->ccmp && ip1->port == ip2->port && ip1->proto == ip2->proto; } static int hash_netportnet6_do_data_match(const struct hash_netportnet6_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_netportnet6_data_set_flags(struct hash_netportnet6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static void hash_netportnet6_data_reset_flags(struct hash_netportnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_netportnet6_data_reset_elem(struct hash_netportnet6_elem *elem, struct hash_netportnet6_elem *orig) { elem->ip[1] = orig->ip[1]; } static void hash_netportnet6_data_netmask(struct hash_netportnet6_elem *elem, u8 cidr, bool inner) { if (inner) { ip6_netmask(&elem->ip[1], cidr); elem->cidr[1] = cidr; } else { ip6_netmask(&elem->ip[0], cidr); elem->cidr[0] = cidr; } } static bool hash_netportnet6_data_list(struct sk_buff *skb, const struct hash_netportnet6_elem *data) { u32 flags = data->nomatch ? 
IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip[0].in6) || nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip[1].in6) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_CIDR, data->cidr[0]) || nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr[1]) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_netportnet6_data_next(struct hash_netportnet6_elem *next, const struct hash_netportnet6_elem *d) { next->port = d->port; } #undef MTYPE #undef HOST_MASK #define MTYPE hash_netportnet6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static void hash_netportnet6_init(struct hash_netportnet6_elem *e) { e->cidr[0] = HOST_MASK; e->cidr[1] = HOST_MASK; } static int hash_netportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_netportnet6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netportnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); e.cidr[0] = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK); e.cidr[1] = INIT_CIDR(h->nets[0].cidr[1], HOST_MASK); if (adt == IPSET_TEST) e.ccmp = (HOST_MASK << (sizeof(u8) * 8)) | HOST_MASK; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip[0].in6); ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip[1].in6); ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_netportnet6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_netportnet6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netportnet6_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); hash_netportnet6_init(&e); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_IP2_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip[0]); if (ret) return ret; ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip[1]); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; if (tb[IPSET_ATTR_CIDR]) { e.cidr[0] = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (e.cidr[0] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } if (tb[IPSET_ATTR_CIDR2]) { e.cidr[1] = nla_get_u8(tb[IPSET_ATTR_CIDR2]); if (e.cidr[1] > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ip6_netmask(&e.ip[0], e.cidr[0]); ip6_netmask(&e.ip[1], e.cidr[1]); e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & 
IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { e.port = htons(port); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } static struct ip_set_type hash_netportnet_type __read_mostly = { .name = "hash:net,port,net", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2 | IPSET_TYPE_NOMATCH, .dimension = IPSET_DIM_THREE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_netportnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_CIDR2] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_netportnet_init(void) { return ip_set_type_register(&hash_netportnet_type); } static void __exit hash_netportnet_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_netportnet_type); } module_init(hash_netportnet_init); module_exit(hash_netportnet_fini);
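In hash_netportnet4_uadt() above, an add request given as an address range is expanded into a sequence of CIDR blocks: hash_netportnet4_range_to_cidr() repeatedly returns the last address covered by the largest prefix that starts at the current address and stays inside the range, and the do/while loop then advances past that block. The self-contained C sketch below reproduces that splitting logic for illustration only; toy_range_to_cidr() is a simplified stand-in written for this example, not the kernel's ip_set_range_to_cidr(), and the sample addresses are arbitrary.

#include <stdio.h>
#include <stdint.h>

/*
 * Simplified stand-in for the range-to-prefix step used by the uadt loop:
 * pick the largest CIDR block that starts at 'from' and does not run past
 * 'to', report its prefix length in *cidr and return its last address.
 * Addresses are plain host-order integers; the full 0.0.0.0-255.255.255.255
 * case, which the driver handles separately, is ignored here.
 */
static uint32_t toy_range_to_cidr(uint32_t from, uint32_t to, uint8_t *cidr)
{
	for (uint8_t prefix = 1; prefix < 32; prefix++) {
		uint32_t mask = ~0u << (32 - prefix);
		uint32_t last = from | ~mask;

		/* block must be aligned on 'from' and end at or before 'to' */
		if ((from & mask) == from && last <= to) {
			*cidr = prefix;
			return last;
		}
	}
	*cidr = 32;		/* no larger block fits: single address */
	return from;
}

static void print_block(uint32_t ip, uint8_t cidr)
{
	printf("%u.%u.%u.%u/%u\n", (ip >> 24) & 0xff, (ip >> 16) & 0xff,
	       (ip >> 8) & 0xff, ip & 0xff, cidr);
}

int main(void)
{
	uint32_t ip = 0x0a000001;	/* 10.0.0.1 */
	uint32_t ip_to = 0x0a00000e;	/* 10.0.0.14 */
	uint8_t cidr;

	/* same iteration pattern as the uadt loop: advance by whole blocks */
	do {
		uint32_t last = toy_range_to_cidr(ip, ip_to, &cidr);

		print_block(ip, cidr);
		ip = last;
	} while (ip++ < ip_to);

	return 0;
}

Compiled and run, this prints 10.0.0.1/32, 10.0.0.2/31, 10.0.0.4/30, 10.0.0.8/30, 10.0.0.12/31 and 10.0.0.14/32 for the example range, i.e. the same kind of block list the uadt loop feeds to the add function one element at a time.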
/*
 *	linux/drivers/video/vgacon.c -- Low level VGA based console driver
 *
 *	Created 28 Sep 1997 by Geert Uytterhoeven
 *
 *	Rewritten by Martin Mares <mj@ucw.cz>, July 1998
 *
 *	This file is based on the old console.c, vga.c and vesa_blank.c drivers.
 *
 *	Copyright (C) 1991, 1992  Linus Torvalds
 *			    1995  Jay Estabrook
 *
 *	User definable mapping table and font loading by Eugene G. Crosser,
 *	<crosser@average.org>
 *
 *	Improved loadable font/UTF-8 support by H. Peter Anvin
 *	Feb-Sep 1995 <peter.anvin@linux.org>
 *
 *	Colour palette handling, by Simon Tatham
 *	17-Jun-95 <sgt20@cam.ac.uk>
 *
 *	if 512 char mode is already enabled don't re-enable it,
 *	because it causes screen to flicker, by Mitja Horvat
 *	5-May-96 <mitja.horvat@guest.arnes.si>
 *
 *	Use 2 outw instead of 4 outb_p to reduce erroneous text
 *	flashing on RHS of screen during heavy console scrolling.
 *	Oct 1996, Paul Gortmaker.
 *
 *
 *	This file is subject to the terms and conditions of the GNU General Public
 *	License.  See the file COPYING in the main directory of this archive for
 *	more details.
*/ #include <linux/module.h> #include <linux/types.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/console.h> #include <linux/string.h> #include <linux/kd.h> #include <linux/slab.h> #include <linux/vt_kern.h> #include <linux/sched.h> #include <linux/selection.h> #include <linux/spinlock.h> #include <linux/ioport.h> #include <linux/init.h> #include <linux/screen_info.h> #include <video/vga.h> #include <asm/io.h> static DEFINE_RAW_SPINLOCK(vga_lock); static int cursor_size_lastfrom; static int cursor_size_lastto; static u32 vgacon_xres; static u32 vgacon_yres; static struct vgastate vgastate; #define BLANK 0x0020 #define VGA_FONTWIDTH 8 /* VGA does not support fontwidths != 8 */ /* * Interface used by the world */ static bool vgacon_set_origin(struct vc_data *c); static struct uni_pagedict *vgacon_uni_pagedir; static int vgacon_refcount; /* Description of the hardware situation */ static unsigned long vga_vram_base __read_mostly; /* Base of video memory */ static unsigned long vga_vram_end __read_mostly; /* End of video memory */ static unsigned int vga_vram_size __read_mostly; /* Size of video memory */ static u16 vga_video_port_reg __read_mostly; /* Video register select port */ static u16 vga_video_port_val __read_mostly; /* Video register value port */ static unsigned int vga_video_num_columns; /* Number of text columns */ static unsigned int vga_video_num_lines; /* Number of text lines */ static bool vga_can_do_color; /* Do we support colors? */ static unsigned int vga_default_font_height __read_mostly; /* Height of default screen font */ static unsigned char vga_video_type __read_mostly; /* Card type */ static enum vesa_blank_mode vga_vesa_blanked; static bool vga_palette_blanked; static bool vga_is_gfx; static bool vga_512_chars; static int vga_video_font_height; static int vga_scan_lines __read_mostly; static unsigned int vga_rolled_over; /* last vc_origin offset before wrap */ static struct screen_info *vga_si; static bool vga_hardscroll_enabled; static bool vga_hardscroll_user_enable = true; static int __init no_scroll(char *str) { /* * Disabling scrollback is required for the Braillex ib80-piezo * Braille reader made by F.H. Papenmeier (Germany). * Use the "no-scroll" bootflag. */ vga_hardscroll_user_enable = vga_hardscroll_enabled = false; return 1; } __setup("no-scroll", no_scroll); /* * By replacing the four outb_p with two back to back outw, we can reduce * the window of opportunity to see text mislocated to the RHS of the * console during heavy scrolling activity. However there is the remote * possibility that some pre-dinosaur hardware won't like the back to back * I/O. Since the Xservers get away with it, we should be able to as well. */ static inline void write_vga(unsigned char reg, unsigned int val) { unsigned int v1, v2; unsigned long flags; /* * ddprintk might set the console position from interrupt * handlers, thus the write has to be IRQ-atomic. 
*/ raw_spin_lock_irqsave(&vga_lock, flags); v1 = reg + (val & 0xff00); v2 = reg + 1 + ((val << 8) & 0xff00); outw(v1, vga_video_port_reg); outw(v2, vga_video_port_reg); raw_spin_unlock_irqrestore(&vga_lock, flags); } static inline void vga_set_mem_top(struct vc_data *c) { write_vga(12, (c->vc_visible_origin - vga_vram_base) / 2); } static void vgacon_scrolldelta(struct vc_data *c, int lines) { unsigned long scr_end = c->vc_scr_end - vga_vram_base; unsigned long vorigin = c->vc_visible_origin - vga_vram_base; unsigned long origin = c->vc_origin - vga_vram_base; int margin = c->vc_size_row * 4; int from, wrap, from_off, avail; /* Turn scrollback off */ if (!lines) { c->vc_visible_origin = c->vc_origin; return; } /* Do we have already enough to allow jumping from 0 to the end? */ if (vga_rolled_over > scr_end + margin) { from = scr_end; wrap = vga_rolled_over + c->vc_size_row; } else { from = 0; wrap = vga_vram_size; } from_off = (vorigin - from + wrap) % wrap + lines * c->vc_size_row; avail = (origin - from + wrap) % wrap; /* Only a little piece would be left? Show all incl. the piece! */ if (avail < 2 * margin) margin = 0; if (from_off < margin) from_off = 0; if (from_off > avail - margin) from_off = avail; c->vc_visible_origin = vga_vram_base + (from + from_off) % wrap; vga_set_mem_top(c); } static void vgacon_restore_screen(struct vc_data *c) { if (c->vc_origin != c->vc_visible_origin) vgacon_scrolldelta(c, 0); } static const char *vgacon_startup(void) { const char *display_desc = NULL; u16 saved1, saved2; volatile u16 *p; if (!vga_si || vga_si->orig_video_isVGA == VIDEO_TYPE_VLFB || vga_si->orig_video_isVGA == VIDEO_TYPE_EFI) { no_vga: #ifdef CONFIG_DUMMY_CONSOLE conswitchp = &dummy_con; return conswitchp->con_startup(); #else return NULL; #endif } /* vga_si reasonably initialized? */ if ((vga_si->orig_video_lines == 0) || (vga_si->orig_video_cols == 0)) goto no_vga; /* VGA16 modes are not handled by VGACON */ if ((vga_si->orig_video_mode == 0x0D) || /* 320x200/4 */ (vga_si->orig_video_mode == 0x0E) || /* 640x200/4 */ (vga_si->orig_video_mode == 0x10) || /* 640x350/4 */ (vga_si->orig_video_mode == 0x12) || /* 640x480/4 */ (vga_si->orig_video_mode == 0x6A)) /* 800x600/4 (VESA) */ goto no_vga; vga_video_num_lines = vga_si->orig_video_lines; vga_video_num_columns = vga_si->orig_video_cols; vgastate.vgabase = NULL; if (vga_si->orig_video_mode == 7) { /* Monochrome display */ vga_vram_base = 0xb0000; vga_video_port_reg = VGA_CRT_IM; vga_video_port_val = VGA_CRT_DM; if ((vga_si->orig_video_ega_bx & 0xff) != 0x10) { static struct resource ega_console_resource = { .name = "ega", .flags = IORESOURCE_IO, .start = 0x3B0, .end = 0x3BF }; vga_video_type = VIDEO_TYPE_EGAM; vga_vram_size = 0x8000; display_desc = "EGA+"; request_resource(&ioport_resource, &ega_console_resource); } else { static struct resource mda1_console_resource = { .name = "mda", .flags = IORESOURCE_IO, .start = 0x3B0, .end = 0x3BB }; static struct resource mda2_console_resource = { .name = "mda", .flags = IORESOURCE_IO, .start = 0x3BF, .end = 0x3BF }; vga_video_type = VIDEO_TYPE_MDA; vga_vram_size = 0x2000; display_desc = "*MDA"; request_resource(&ioport_resource, &mda1_console_resource); request_resource(&ioport_resource, &mda2_console_resource); vga_video_font_height = 14; } } else { /* If not, it is color. 
*/ vga_can_do_color = true; vga_vram_base = 0xb8000; vga_video_port_reg = VGA_CRT_IC; vga_video_port_val = VGA_CRT_DC; if ((vga_si->orig_video_ega_bx & 0xff) != 0x10) { int i; vga_vram_size = 0x8000; if (!vga_si->orig_video_isVGA) { static struct resource ega_console_resource = { .name = "ega", .flags = IORESOURCE_IO, .start = 0x3C0, .end = 0x3DF }; vga_video_type = VIDEO_TYPE_EGAC; display_desc = "EGA"; request_resource(&ioport_resource, &ega_console_resource); } else { static struct resource vga_console_resource = { .name = "vga+", .flags = IORESOURCE_IO, .start = 0x3C0, .end = 0x3DF }; vga_video_type = VIDEO_TYPE_VGAC; display_desc = "VGA+"; request_resource(&ioport_resource, &vga_console_resource); /* * Normalise the palette registers, to point * the 16 screen colours to the first 16 * DAC entries. */ for (i = 0; i < 16; i++) { inb_p(VGA_IS1_RC); outb_p(i, VGA_ATT_W); outb_p(i, VGA_ATT_W); } outb_p(0x20, VGA_ATT_W); /* * Now set the DAC registers back to their * default values */ for (i = 0; i < 16; i++) { outb_p(color_table[i], VGA_PEL_IW); outb_p(default_red[i], VGA_PEL_D); outb_p(default_grn[i], VGA_PEL_D); outb_p(default_blu[i], VGA_PEL_D); } } } else { static struct resource cga_console_resource = { .name = "cga", .flags = IORESOURCE_IO, .start = 0x3D4, .end = 0x3D5 }; vga_video_type = VIDEO_TYPE_CGA; vga_vram_size = 0x2000; display_desc = "*CGA"; request_resource(&ioport_resource, &cga_console_resource); vga_video_font_height = 8; } } vga_vram_base = VGA_MAP_MEM(vga_vram_base, vga_vram_size); vga_vram_end = vga_vram_base + vga_vram_size; /* * Find out if there is a graphics card present. * Are there smarter methods around? */ p = (volatile u16 *) vga_vram_base; saved1 = scr_readw(p); saved2 = scr_readw(p + 1); scr_writew(0xAA55, p); scr_writew(0x55AA, p + 1); if (scr_readw(p) != 0xAA55 || scr_readw(p + 1) != 0x55AA) { scr_writew(saved1, p); scr_writew(saved2, p + 1); goto no_vga; } scr_writew(0x55AA, p); scr_writew(0xAA55, p + 1); if (scr_readw(p) != 0x55AA || scr_readw(p + 1) != 0xAA55) { scr_writew(saved1, p); scr_writew(saved2, p + 1); goto no_vga; } scr_writew(saved1, p); scr_writew(saved2, p + 1); if (vga_video_type == VIDEO_TYPE_EGAC || vga_video_type == VIDEO_TYPE_VGAC || vga_video_type == VIDEO_TYPE_EGAM) { vga_hardscroll_enabled = vga_hardscroll_user_enable; vga_default_font_height = vga_si->orig_video_points; vga_video_font_height = vga_si->orig_video_points; /* This may be suboptimal but is a safe bet - go with it */ vga_scan_lines = vga_video_font_height * vga_video_num_lines; } vgacon_xres = vga_si->orig_video_cols * VGA_FONTWIDTH; vgacon_yres = vga_scan_lines; return display_desc; } static void vgacon_init(struct vc_data *c, bool init) { struct uni_pagedict *p; /* * We cannot be loaded as a module, therefore init will be 1 * if we are the default console, however if we are a fallback * console, for example if fbcon has failed registration, then * init will be 0, so we need to make sure our boot parameters * have been copied to the console structure for vgacon_resize * ultimately called by vc_resize. Any subsequent calls to * vgacon_init init will have init set to 0 too. 
*/ c->vc_can_do_color = vga_can_do_color; c->vc_scan_lines = vga_scan_lines; c->vc_font.height = c->vc_cell_height = vga_video_font_height; /* set dimensions manually if init is true since vc_resize() will fail */ if (init) { c->vc_cols = vga_video_num_columns; c->vc_rows = vga_video_num_lines; } else vc_resize(c, vga_video_num_columns, vga_video_num_lines); c->vc_complement_mask = 0x7700; if (vga_512_chars) c->vc_hi_font_mask = 0x0800; p = *c->uni_pagedict_loc; if (c->uni_pagedict_loc != &vgacon_uni_pagedir) { con_free_unimap(c); c->uni_pagedict_loc = &vgacon_uni_pagedir; vgacon_refcount++; } if (!vgacon_uni_pagedir && p) con_set_default_unimap(c); /* Only set the default if the user didn't deliberately override it */ if (global_cursor_default == -1) global_cursor_default = !(vga_si->flags & VIDEO_FLAGS_NOCURSOR); } static void vgacon_deinit(struct vc_data *c) { /* When closing the active console, reset video origin */ if (con_is_visible(c)) { c->vc_visible_origin = vga_vram_base; vga_set_mem_top(c); } if (!--vgacon_refcount) con_free_unimap(c); c->uni_pagedict_loc = &c->uni_pagedict; con_set_default_unimap(c); } static u8 vgacon_build_attr(struct vc_data *c, u8 color, enum vc_intensity intensity, bool blink, bool underline, bool reverse, bool italic) { u8 attr = color; if (vga_can_do_color) { if (italic) attr = (attr & 0xF0) | c->vc_itcolor; else if (underline) attr = (attr & 0xf0) | c->vc_ulcolor; else if (intensity == VCI_HALF_BRIGHT) attr = (attr & 0xf0) | c->vc_halfcolor; } if (reverse) attr = ((attr) & 0x88) | ((((attr) >> 4) | ((attr) << 4)) & 0x77); if (blink) attr ^= 0x80; if (intensity == VCI_BOLD) attr ^= 0x08; if (!vga_can_do_color) { if (italic) attr = (attr & 0xF8) | 0x02; else if (underline) attr = (attr & 0xf8) | 0x01; else if (intensity == VCI_HALF_BRIGHT) attr = (attr & 0xf0) | 0x08; } return attr; } static void vgacon_invert_region(struct vc_data *c, u16 * p, int count) { const bool col = vga_can_do_color; while (count--) { u16 a = scr_readw(p); if (col) a = ((a) & 0x88ff) | (((a) & 0x7000) >> 4) | (((a) & 0x0700) << 4); else a ^= ((a & 0x0700) == 0x0100) ? 0x7000 : 0x7700; scr_writew(a, p++); } } static void vgacon_set_cursor_size(int from, int to) { unsigned long flags; int curs, cure; if ((from == cursor_size_lastfrom) && (to == cursor_size_lastto)) return; cursor_size_lastfrom = from; cursor_size_lastto = to; raw_spin_lock_irqsave(&vga_lock, flags); if (vga_video_type >= VIDEO_TYPE_VGAC) { outb_p(VGA_CRTC_CURSOR_START, vga_video_port_reg); curs = inb_p(vga_video_port_val); outb_p(VGA_CRTC_CURSOR_END, vga_video_port_reg); cure = inb_p(vga_video_port_val); } else { curs = 0; cure = 0; } curs = (curs & 0xc0) | from; cure = (cure & 0xe0) | to; outb_p(VGA_CRTC_CURSOR_START, vga_video_port_reg); outb_p(curs, vga_video_port_val); outb_p(VGA_CRTC_CURSOR_END, vga_video_port_reg); outb_p(cure, vga_video_port_val); raw_spin_unlock_irqrestore(&vga_lock, flags); } static void vgacon_cursor(struct vc_data *c, bool enable) { unsigned int c_height; if (c->vc_mode != KD_TEXT) return; vgacon_restore_screen(c); c_height = c->vc_cell_height; write_vga(14, (c->vc_pos - vga_vram_base) / 2); if (!enable) { if (vga_video_type >= VIDEO_TYPE_VGAC) vgacon_set_cursor_size(31, 30); else vgacon_set_cursor_size(31, 31); return; } switch (CUR_SIZE(c->vc_cursor_type)) { case CUR_UNDERLINE: vgacon_set_cursor_size(c_height - (c_height < 10 ? 2 : 3), c_height - (c_height < 10 ? 1 : 2)); break; case CUR_TWO_THIRDS: vgacon_set_cursor_size(c_height / 3, c_height - (c_height < 10 ? 
1 : 2)); break; case CUR_LOWER_THIRD: vgacon_set_cursor_size(c_height * 2 / 3, c_height - (c_height < 10 ? 1 : 2)); break; case CUR_LOWER_HALF: vgacon_set_cursor_size(c_height / 2, c_height - (c_height < 10 ? 1 : 2)); break; case CUR_NONE: if (vga_video_type >= VIDEO_TYPE_VGAC) vgacon_set_cursor_size(31, 30); else vgacon_set_cursor_size(31, 31); break; default: vgacon_set_cursor_size(1, c_height); break; } } static void vgacon_doresize(struct vc_data *c, unsigned int width, unsigned int height) { unsigned long flags; unsigned int scanlines = height * c->vc_cell_height; u8 scanlines_lo = 0, r7 = 0, vsync_end = 0, mode, max_scan; raw_spin_lock_irqsave(&vga_lock, flags); vgacon_xres = width * VGA_FONTWIDTH; vgacon_yres = height * c->vc_cell_height; if (vga_video_type >= VIDEO_TYPE_VGAC) { outb_p(VGA_CRTC_MAX_SCAN, vga_video_port_reg); max_scan = inb_p(vga_video_port_val); if (max_scan & 0x80) scanlines <<= 1; outb_p(VGA_CRTC_MODE, vga_video_port_reg); mode = inb_p(vga_video_port_val); if (mode & 0x04) scanlines >>= 1; scanlines -= 1; scanlines_lo = scanlines & 0xff; outb_p(VGA_CRTC_OVERFLOW, vga_video_port_reg); r7 = inb_p(vga_video_port_val) & ~0x42; if (scanlines & 0x100) r7 |= 0x02; if (scanlines & 0x200) r7 |= 0x40; /* deprotect registers */ outb_p(VGA_CRTC_V_SYNC_END, vga_video_port_reg); vsync_end = inb_p(vga_video_port_val); outb_p(VGA_CRTC_V_SYNC_END, vga_video_port_reg); outb_p(vsync_end & ~0x80, vga_video_port_val); } outb_p(VGA_CRTC_H_DISP, vga_video_port_reg); outb_p(width - 1, vga_video_port_val); outb_p(VGA_CRTC_OFFSET, vga_video_port_reg); outb_p(width >> 1, vga_video_port_val); if (vga_video_type >= VIDEO_TYPE_VGAC) { outb_p(VGA_CRTC_V_DISP_END, vga_video_port_reg); outb_p(scanlines_lo, vga_video_port_val); outb_p(VGA_CRTC_OVERFLOW, vga_video_port_reg); outb_p(r7,vga_video_port_val); /* reprotect registers */ outb_p(VGA_CRTC_V_SYNC_END, vga_video_port_reg); outb_p(vsync_end, vga_video_port_val); } raw_spin_unlock_irqrestore(&vga_lock, flags); } static bool vgacon_switch(struct vc_data *c) { int x = c->vc_cols * VGA_FONTWIDTH; int y = c->vc_rows * c->vc_cell_height; int rows = vga_si->orig_video_lines * vga_default_font_height/ c->vc_cell_height; /* * We need to save screen size here as it's the only way * we can spot the screen has been resized and we need to * set size of freshly allocated screens ourselves. */ vga_video_num_columns = c->vc_cols; vga_video_num_lines = c->vc_rows; /* We can only copy out the size of the video buffer here, * otherwise we get into VGA BIOS */ if (!vga_is_gfx) { scr_memcpyw((u16 *) c->vc_origin, (u16 *) c->vc_screenbuf, c->vc_screenbuf_size > vga_vram_size ? 
vga_vram_size : c->vc_screenbuf_size); if ((vgacon_xres != x || vgacon_yres != y) && (!(vga_video_num_columns % 2) && vga_video_num_columns <= vga_si->orig_video_cols && vga_video_num_lines <= rows)) vgacon_doresize(c, c->vc_cols, c->vc_rows); } return false; /* Redrawing not needed */ } static void vga_set_palette(struct vc_data *vc, const unsigned char *table) { int i, j; vga_w(vgastate.vgabase, VGA_PEL_MSK, 0xff); for (i = j = 0; i < 16; i++) { vga_w(vgastate.vgabase, VGA_PEL_IW, table[i]); vga_w(vgastate.vgabase, VGA_PEL_D, vc->vc_palette[j++] >> 2); vga_w(vgastate.vgabase, VGA_PEL_D, vc->vc_palette[j++] >> 2); vga_w(vgastate.vgabase, VGA_PEL_D, vc->vc_palette[j++] >> 2); } } static void vgacon_set_palette(struct vc_data *vc, const unsigned char *table) { if (vga_video_type != VIDEO_TYPE_VGAC || vga_palette_blanked || !con_is_visible(vc)) return; vga_set_palette(vc, table); } /* structure holding original VGA register settings */ static struct { unsigned char SeqCtrlIndex; /* Sequencer Index reg. */ unsigned char CrtCtrlIndex; /* CRT-Contr. Index reg. */ unsigned char CrtMiscIO; /* Miscellaneous register */ unsigned char HorizontalTotal; /* CRT-Controller:00h */ unsigned char HorizDisplayEnd; /* CRT-Controller:01h */ unsigned char StartHorizRetrace; /* CRT-Controller:04h */ unsigned char EndHorizRetrace; /* CRT-Controller:05h */ unsigned char Overflow; /* CRT-Controller:07h */ unsigned char StartVertRetrace; /* CRT-Controller:10h */ unsigned char EndVertRetrace; /* CRT-Controller:11h */ unsigned char ModeControl; /* CRT-Controller:17h */ unsigned char ClockingMode; /* Seq-Controller:01h */ } vga_state; static void vga_vesa_blank(struct vgastate *state, enum vesa_blank_mode mode) { /* save original values of VGA controller registers */ if (!vga_vesa_blanked) { raw_spin_lock_irq(&vga_lock); vga_state.SeqCtrlIndex = vga_r(state->vgabase, VGA_SEQ_I); vga_state.CrtCtrlIndex = inb_p(vga_video_port_reg); vga_state.CrtMiscIO = vga_r(state->vgabase, VGA_MIS_R); raw_spin_unlock_irq(&vga_lock); outb_p(0x00, vga_video_port_reg); /* HorizontalTotal */ vga_state.HorizontalTotal = inb_p(vga_video_port_val); outb_p(0x01, vga_video_port_reg); /* HorizDisplayEnd */ vga_state.HorizDisplayEnd = inb_p(vga_video_port_val); outb_p(0x04, vga_video_port_reg); /* StartHorizRetrace */ vga_state.StartHorizRetrace = inb_p(vga_video_port_val); outb_p(0x05, vga_video_port_reg); /* EndHorizRetrace */ vga_state.EndHorizRetrace = inb_p(vga_video_port_val); outb_p(0x07, vga_video_port_reg); /* Overflow */ vga_state.Overflow = inb_p(vga_video_port_val); outb_p(0x10, vga_video_port_reg); /* StartVertRetrace */ vga_state.StartVertRetrace = inb_p(vga_video_port_val); outb_p(0x11, vga_video_port_reg); /* EndVertRetrace */ vga_state.EndVertRetrace = inb_p(vga_video_port_val); outb_p(0x17, vga_video_port_reg); /* ModeControl */ vga_state.ModeControl = inb_p(vga_video_port_val); vga_state.ClockingMode = vga_rseq(state->vgabase, VGA_SEQ_CLOCK_MODE); } /* assure that video is enabled */ /* "0x20" is VIDEO_ENABLE_bit in register 01 of sequencer */ raw_spin_lock_irq(&vga_lock); vga_wseq(state->vgabase, VGA_SEQ_CLOCK_MODE, vga_state.ClockingMode | 0x20); /* test for vertical retrace in process.... */ if ((vga_state.CrtMiscIO & 0x80) == 0x80) vga_w(state->vgabase, VGA_MIS_W, vga_state.CrtMiscIO & 0xEF); /* * Set <End of vertical retrace> to minimum (0) and * <Start of vertical Retrace> to maximum (incl. overflow) * Result: turn off vertical sync (VSync) pulse. 
*/ if (mode & VESA_VSYNC_SUSPEND) { outb_p(0x10, vga_video_port_reg); /* StartVertRetrace */ outb_p(0xff, vga_video_port_val); /* maximum value */ outb_p(0x11, vga_video_port_reg); /* EndVertRetrace */ outb_p(0x40, vga_video_port_val); /* minimum (bits 0..3) */ outb_p(0x07, vga_video_port_reg); /* Overflow */ outb_p(vga_state.Overflow | 0x84, vga_video_port_val); /* bits 9,10 of vert. retrace */ } if (mode & VESA_HSYNC_SUSPEND) { /* * Set <End of horizontal retrace> to minimum (0) and * <Start of horizontal Retrace> to maximum * Result: turn off horizontal sync (HSync) pulse. */ outb_p(0x04, vga_video_port_reg); /* StartHorizRetrace */ outb_p(0xff, vga_video_port_val); /* maximum */ outb_p(0x05, vga_video_port_reg); /* EndHorizRetrace */ outb_p(0x00, vga_video_port_val); /* minimum (0) */ } /* restore both index registers */ vga_w(state->vgabase, VGA_SEQ_I, vga_state.SeqCtrlIndex); outb_p(vga_state.CrtCtrlIndex, vga_video_port_reg); raw_spin_unlock_irq(&vga_lock); } static void vga_vesa_unblank(struct vgastate *state) { /* restore original values of VGA controller registers */ raw_spin_lock_irq(&vga_lock); vga_w(state->vgabase, VGA_MIS_W, vga_state.CrtMiscIO); outb_p(0x00, vga_video_port_reg); /* HorizontalTotal */ outb_p(vga_state.HorizontalTotal, vga_video_port_val); outb_p(0x01, vga_video_port_reg); /* HorizDisplayEnd */ outb_p(vga_state.HorizDisplayEnd, vga_video_port_val); outb_p(0x04, vga_video_port_reg); /* StartHorizRetrace */ outb_p(vga_state.StartHorizRetrace, vga_video_port_val); outb_p(0x05, vga_video_port_reg); /* EndHorizRetrace */ outb_p(vga_state.EndHorizRetrace, vga_video_port_val); outb_p(0x07, vga_video_port_reg); /* Overflow */ outb_p(vga_state.Overflow, vga_video_port_val); outb_p(0x10, vga_video_port_reg); /* StartVertRetrace */ outb_p(vga_state.StartVertRetrace, vga_video_port_val); outb_p(0x11, vga_video_port_reg); /* EndVertRetrace */ outb_p(vga_state.EndVertRetrace, vga_video_port_val); outb_p(0x17, vga_video_port_reg); /* ModeControl */ outb_p(vga_state.ModeControl, vga_video_port_val); /* ClockingMode */ vga_wseq(state->vgabase, VGA_SEQ_CLOCK_MODE, vga_state.ClockingMode); /* restore index/control registers */ vga_w(state->vgabase, VGA_SEQ_I, vga_state.SeqCtrlIndex); outb_p(vga_state.CrtCtrlIndex, vga_video_port_reg); raw_spin_unlock_irq(&vga_lock); } static void vga_pal_blank(struct vgastate *state) { int i; vga_w(state->vgabase, VGA_PEL_MSK, 0xff); for (i = 0; i < 16; i++) { vga_w(state->vgabase, VGA_PEL_IW, i); vga_w(state->vgabase, VGA_PEL_D, 0); vga_w(state->vgabase, VGA_PEL_D, 0); vga_w(state->vgabase, VGA_PEL_D, 0); } } static bool vgacon_blank(struct vc_data *c, enum vesa_blank_mode blank, bool mode_switch) { switch (blank) { case VESA_NO_BLANKING: /* Unblank */ if (vga_vesa_blanked) { vga_vesa_unblank(&vgastate); vga_vesa_blanked = VESA_NO_BLANKING; } if (vga_palette_blanked) { vga_set_palette(c, color_table); vga_palette_blanked = false; return 0; } vga_is_gfx = false; /* Tell console.c that it has to restore the screen itself */ return 1; case VESA_VSYNC_SUSPEND: /* Normal blanking */ if (!mode_switch && vga_video_type == VIDEO_TYPE_VGAC) { vga_pal_blank(&vgastate); vga_palette_blanked = true; return 0; } vgacon_set_origin(c); scr_memsetw((void *) vga_vram_base, BLANK, c->vc_screenbuf_size); if (mode_switch) vga_is_gfx = true; return 1; default: /* VESA blanking */ if (vga_video_type == VIDEO_TYPE_VGAC) { vga_vesa_blank(&vgastate, blank - 1); vga_vesa_blanked = blank; } return 0; } } /* * PIO_FONT support. 
* * The font loading code goes back to the codepage package by * Joel Hoffman (joel@wam.umd.edu). (He reports that the original * reference is: "From: p. 307 of _Programmer's Guide to PC & PS/2 * Video Systems_ by Richard Wilton. 1987. Microsoft Press".) * * Change for certain monochrome monitors by Yury Shevchuck * (sizif@botik.yaroslavl.su). */ #define colourmap 0xa0000 /* Pauline Middelink <middelin@polyware.iaf.nl> reports that we should use 0xA0000 for the bwmap as well.. */ #define blackwmap 0xa0000 #define cmapsz 8192 static int vgacon_do_font_op(struct vgastate *state, char *arg, int set, bool ch512) { unsigned short video_port_status = vga_video_port_reg + 6; int font_select = 0x00, beg, i; char *charmap; bool clear_attribs = false; if (vga_video_type != VIDEO_TYPE_EGAM) { charmap = (char *) VGA_MAP_MEM(colourmap, 0); beg = 0x0e; } else { charmap = (char *) VGA_MAP_MEM(blackwmap, 0); beg = 0x0a; } /* * All fonts are loaded in slot 0 (0:1 for 512 ch) */ if (!arg) return -EINVAL; /* Return to default font not supported */ font_select = ch512 ? 0x04 : 0x00; raw_spin_lock_irq(&vga_lock); /* First, the Sequencer */ vga_wseq(state->vgabase, VGA_SEQ_RESET, 0x1); /* CPU writes only to map 2 */ vga_wseq(state->vgabase, VGA_SEQ_PLANE_WRITE, 0x04); /* Sequential addressing */ vga_wseq(state->vgabase, VGA_SEQ_MEMORY_MODE, 0x07); /* Clear synchronous reset */ vga_wseq(state->vgabase, VGA_SEQ_RESET, 0x03); /* Now, the graphics controller, select map 2 */ vga_wgfx(state->vgabase, VGA_GFX_PLANE_READ, 0x02); /* disable odd-even addressing */ vga_wgfx(state->vgabase, VGA_GFX_MODE, 0x00); /* map start at A000:0000 */ vga_wgfx(state->vgabase, VGA_GFX_MISC, 0x00); raw_spin_unlock_irq(&vga_lock); if (arg) { if (set) for (i = 0; i < cmapsz; i++) { vga_writeb(arg[i], charmap + i); cond_resched(); } else for (i = 0; i < cmapsz; i++) { arg[i] = vga_readb(charmap + i); cond_resched(); } /* * In 512-character mode, the character map is not contiguous if * we want to remain EGA compatible -- which we do */ if (ch512) { charmap += 2 * cmapsz; arg += cmapsz; if (set) for (i = 0; i < cmapsz; i++) { vga_writeb(arg[i], charmap + i); cond_resched(); } else for (i = 0; i < cmapsz; i++) { arg[i] = vga_readb(charmap + i); cond_resched(); } } } raw_spin_lock_irq(&vga_lock); /* First, the sequencer, Synchronous reset */ vga_wseq(state->vgabase, VGA_SEQ_RESET, 0x01); /* CPU writes to maps 0 and 1 */ vga_wseq(state->vgabase, VGA_SEQ_PLANE_WRITE, 0x03); /* odd-even addressing */ vga_wseq(state->vgabase, VGA_SEQ_MEMORY_MODE, 0x03); /* Character Map Select */ if (set) vga_wseq(state->vgabase, VGA_SEQ_CHARACTER_MAP, font_select); /* clear synchronous reset */ vga_wseq(state->vgabase, VGA_SEQ_RESET, 0x03); /* Now, the graphics controller, select map 0 for CPU */ vga_wgfx(state->vgabase, VGA_GFX_PLANE_READ, 0x00); /* enable even-odd addressing */ vga_wgfx(state->vgabase, VGA_GFX_MODE, 0x10); /* map starts at b800:0 or b000:0 */ vga_wgfx(state->vgabase, VGA_GFX_MISC, beg); /* if 512 char mode is already enabled don't re-enable it. */ if ((set) && (ch512 != vga_512_chars)) { vga_512_chars = ch512; /* 256-char: enable intensity bit 512-char: disable intensity bit */ inb_p(video_port_status); /* clear address flip-flop */ /* color plane enable register */ vga_wattr(state->vgabase, VGA_ATC_PLANE_ENABLE, ch512 ? 
0x07 : 0x0f); /* Wilton (1987) mentions the following; I don't know what it means, but it works, and it appears necessary */ inb_p(video_port_status); vga_wattr(state->vgabase, VGA_AR_ENABLE_DISPLAY, 0); clear_attribs = true; } raw_spin_unlock_irq(&vga_lock); if (clear_attribs) { for (i = 0; i < MAX_NR_CONSOLES; i++) { struct vc_data *c = vc_cons[i].d; if (c && c->vc_sw == &vga_con) { /* force hi font mask to 0, so we always clear the bit on either transition */ c->vc_hi_font_mask = 0x00; clear_buffer_attributes(c); c->vc_hi_font_mask = ch512 ? 0x0800 : 0; } } } return 0; } /* * Adjust the screen to fit a font of a certain height */ static int vgacon_adjust_height(struct vc_data *vc, unsigned fontheight) { unsigned char ovr, vde, fsr; int rows, maxscan, i; rows = vc->vc_scan_lines / fontheight; /* Number of video rows we end up with */ maxscan = rows * fontheight - 1; /* Scan lines to actually display-1 */ /* Reprogram the CRTC for the new font size Note: the attempt to read the overflow register will fail on an EGA, but using 0xff for the previous value appears to be OK for EGA text modes in the range 257-512 scan lines, so I guess we don't need to worry about it. The same applies for the spill bits in the font size and cursor registers; they are write-only on EGA, but it appears that they are all don't care bits on EGA, so I guess it doesn't matter. */ raw_spin_lock_irq(&vga_lock); outb_p(0x07, vga_video_port_reg); /* CRTC overflow register */ ovr = inb_p(vga_video_port_val); outb_p(0x09, vga_video_port_reg); /* Font size register */ fsr = inb_p(vga_video_port_val); raw_spin_unlock_irq(&vga_lock); vde = maxscan & 0xff; /* Vertical display end reg */ ovr = (ovr & 0xbd) + /* Overflow register */ ((maxscan & 0x100) >> 7) + ((maxscan & 0x200) >> 3); fsr = (fsr & 0xe0) + (fontheight - 1); /* Font size register */ raw_spin_lock_irq(&vga_lock); outb_p(0x07, vga_video_port_reg); /* CRTC overflow register */ outb_p(ovr, vga_video_port_val); outb_p(0x09, vga_video_port_reg); /* Font size */ outb_p(fsr, vga_video_port_val); outb_p(0x12, vga_video_port_reg); /* Vertical display limit */ outb_p(vde, vga_video_port_val); raw_spin_unlock_irq(&vga_lock); vga_video_font_height = fontheight; for (i = 0; i < MAX_NR_CONSOLES; i++) { struct vc_data *c = vc_cons[i].d; if (c && c->vc_sw == &vga_con) { if (con_is_visible(c)) { /* void size to cause regs to be rewritten */ cursor_size_lastfrom = 0; cursor_size_lastto = 0; c->vc_sw->con_cursor(c, true); } c->vc_font.height = c->vc_cell_height = fontheight; vc_resize(c, 0, rows); /* Adjust console size */ } } return 0; } static int vgacon_font_set(struct vc_data *c, const struct console_font *font, unsigned int vpitch, unsigned int flags) { unsigned charcount = font->charcount; int rc; if (vga_video_type < VIDEO_TYPE_EGAM) return -EINVAL; if (font->width != VGA_FONTWIDTH || font->height > 32 || vpitch != 32 || (charcount != 256 && charcount != 512)) return -EINVAL; rc = vgacon_do_font_op(&vgastate, font->data, 1, charcount == 512); if (rc) return rc; if (!(flags & KD_FONT_FLAG_DONT_RECALC)) rc = vgacon_adjust_height(c, font->height); return rc; } static int vgacon_font_get(struct vc_data *c, struct console_font *font, unsigned int vpitch) { if (vga_video_type < VIDEO_TYPE_EGAM || vpitch != 32) return -EINVAL; font->width = VGA_FONTWIDTH; font->height = c->vc_font.height; font->charcount = vga_512_chars ? 
512 : 256; if (!font->data) return 0; return vgacon_do_font_op(&vgastate, font->data, 0, vga_512_chars); } static int vgacon_resize(struct vc_data *c, unsigned int width, unsigned int height, bool from_user) { if ((width << 1) * height > vga_vram_size) return -EINVAL; if (from_user) { /* * Ho ho! Someone (svgatextmode, eh?) may have reprogrammed * the video mode! Set the new defaults then and go away. */ vga_si->orig_video_cols = width; vga_si->orig_video_lines = height; vga_default_font_height = c->vc_cell_height; return 0; } if (width % 2 || width > vga_si->orig_video_cols || height > (vga_si->orig_video_lines * vga_default_font_height)/ c->vc_cell_height) return -EINVAL; if (con_is_visible(c) && !vga_is_gfx) /* who knows */ vgacon_doresize(c, width, height); return 0; } static bool vgacon_set_origin(struct vc_data *c) { if (vga_is_gfx || /* We don't play origin tricks in graphic modes */ (console_blanked && !vga_palette_blanked)) /* Nor we write to blanked screens */ return false; c->vc_origin = c->vc_visible_origin = vga_vram_base; vga_set_mem_top(c); vga_rolled_over = 0; return true; } static void vgacon_save_screen(struct vc_data *c) { static int vga_bootup_console = 0; if (!vga_bootup_console) { /* This is a gross hack, but here is the only place we can * set bootup console parameters without messing up generic * console initialization routines. */ vga_bootup_console = 1; c->state.x = vga_si->orig_x; c->state.y = vga_si->orig_y; } /* We can't copy in more than the size of the video buffer, * or we'll be copying in VGA BIOS */ if (!vga_is_gfx) scr_memcpyw((u16 *) c->vc_screenbuf, (u16 *) c->vc_origin, c->vc_screenbuf_size > vga_vram_size ? vga_vram_size : c->vc_screenbuf_size); } static bool vgacon_scroll(struct vc_data *c, unsigned int t, unsigned int b, enum con_scroll dir, unsigned int lines) { unsigned long oldo; unsigned int delta; if (t || b != c->vc_rows || vga_is_gfx || c->vc_mode != KD_TEXT) return false; if (!vga_hardscroll_enabled || lines >= c->vc_rows / 2) return false; vgacon_restore_screen(c); oldo = c->vc_origin; delta = lines * c->vc_size_row; if (dir == SM_UP) { if (c->vc_scr_end + delta >= vga_vram_end) { scr_memcpyw((u16 *) vga_vram_base, (u16 *) (oldo + delta), c->vc_screenbuf_size - delta); c->vc_origin = vga_vram_base; vga_rolled_over = oldo - vga_vram_base; } else c->vc_origin += delta; scr_memsetw((u16 *) (c->vc_origin + c->vc_screenbuf_size - delta), c->vc_video_erase_char, delta); } else { if (oldo - delta < vga_vram_base) { scr_memmovew((u16 *) (vga_vram_end - c->vc_screenbuf_size + delta), (u16 *) oldo, c->vc_screenbuf_size - delta); c->vc_origin = vga_vram_end - c->vc_screenbuf_size; vga_rolled_over = 0; } else c->vc_origin -= delta; c->vc_scr_end = c->vc_origin + c->vc_screenbuf_size; scr_memsetw((u16 *) (c->vc_origin), c->vc_video_erase_char, delta); } c->vc_scr_end = c->vc_origin + c->vc_screenbuf_size; c->vc_visible_origin = c->vc_origin; vga_set_mem_top(c); c->vc_pos = (c->vc_pos - oldo) + c->vc_origin; return true; } /* * The console `switch' structure for the VGA based console */ static void vgacon_clear(struct vc_data *vc, unsigned int sy, unsigned int sx, unsigned int width) { } static void vgacon_putcs(struct vc_data *vc, const u16 *s, unsigned int count, unsigned int ypos, unsigned int xpos) { } const struct consw vga_con = { .owner = THIS_MODULE, .con_startup = vgacon_startup, .con_init = vgacon_init, .con_deinit = vgacon_deinit, .con_clear = vgacon_clear, .con_putcs = vgacon_putcs, .con_cursor = vgacon_cursor, .con_scroll = vgacon_scroll, 
.con_switch = vgacon_switch, .con_blank = vgacon_blank, .con_font_set = vgacon_font_set, .con_font_get = vgacon_font_get, .con_resize = vgacon_resize, .con_set_palette = vgacon_set_palette, .con_scrolldelta = vgacon_scrolldelta, .con_set_origin = vgacon_set_origin, .con_save_screen = vgacon_save_screen, .con_build_attr = vgacon_build_attr, .con_invert_region = vgacon_invert_region, }; EXPORT_SYMBOL(vga_con); void vgacon_register_screen(struct screen_info *si) { if (!si || vga_si) return; conswitchp = &vga_con; vga_si = si; } MODULE_DESCRIPTION("VGA based console driver"); MODULE_LICENSE("GPL");
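The header comment near the top of vgacon.c and the note above write_vga() explain why the driver programs the CRTC with two back-to-back outw() calls instead of four outb_p() calls: each 16-bit write carries the register index in its low byte and the data in its high byte, halving the number of I/O cycles during heavy scrolling. The short stand-alone C program below just prints the two word values that write_vga() would compute for a sample 16-bit value; it performs no port I/O, and the sample register number and value are arbitrary.

/*
 * Stand-alone illustration (not driver code) of the packing used by
 * write_vga() above: index register in the low byte, data in the high byte,
 * so one 16-bit write replaces an outb_p(index) / outb_p(data) pair.
 */
#include <stdio.h>

int main(void)
{
	unsigned char reg = 12;		/* CRTC start address high register */
	unsigned int val = 0x1234;	/* 16-bit value to program */

	/* same arithmetic as write_vga() */
	unsigned int v1 = reg + (val & 0xff00);		/* reg 12 <- high byte of val */
	unsigned int v2 = reg + 1 + ((val << 8) & 0xff00); /* reg 13 <- low byte of val */

	printf("outw(0x%04x) -> index 0x%02x, data 0x%02x\n", v1, v1 & 0xff, v1 >> 8);
	printf("outw(0x%04x) -> index 0x%02x, data 0x%02x\n", v2, v2 & 0xff, v2 >> 8);
	return 0;
}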
// SPDX-License-Identifier: GPL-2.0
#include <linux/types.h>
#include <net/ip.h>
#include <net/tcp.h>
#include <net/netlink.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_synproxy.h>
#include <linux/netfilter/nf_tables.h>
#include <linux/netfilter/nf_synproxy.h>

struct nft_synproxy {
	struct nf_synproxy_info	info;
};

static const struct nla_policy nft_synproxy_policy[NFTA_SYNPROXY_MAX + 1] = {
	[NFTA_SYNPROXY_MSS]		= { .type = NLA_U16 },
	[NFTA_SYNPROXY_WSCALE]		= { .type = NLA_U8 },
	[NFTA_SYNPROXY_FLAGS]		= { .type = NLA_U32 },
};

static void nft_synproxy_tcp_options(struct synproxy_options *opts,
				     const struct tcphdr *tcp,
				     struct synproxy_net *snet,
				     struct nf_synproxy_info *info,
				     const struct nft_synproxy *priv)
{
	this_cpu_inc(snet->stats->syn_received);
	if (tcp->ece && tcp->cwr)
		opts->options |= NF_SYNPROXY_OPT_ECN;

	opts->options &= priv->info.options;
	opts->mss_encode = opts->mss_option;
	opts->mss_option = info->mss;
	if (opts->options & NF_SYNPROXY_OPT_TIMESTAMP)
		synproxy_init_timestamp_cookie(info, opts);
	else
		opts->options &= ~(NF_SYNPROXY_OPT_WSCALE |
				   NF_SYNPROXY_OPT_SACK_PERM |
				   NF_SYNPROXY_OPT_ECN);
}

static void nft_synproxy_eval_v4(const struct nft_synproxy *priv,
				 struct nft_regs *regs,
				 const struct nft_pktinfo *pkt,
				 const struct tcphdr *tcp,
				 struct tcphdr *_tcph,
				 struct synproxy_options *opts)
{
	struct nf_synproxy_info info = priv->info;
	struct net *net = nft_net(pkt);
	struct synproxy_net *snet = synproxy_pernet(net);
	struct sk_buff *skb = pkt->skb;

	if (tcp->syn) {
		/* Initial SYN from client */
		nft_synproxy_tcp_options(opts, tcp, snet, &info, priv);
		synproxy_send_client_synack(net, skb, tcp, opts);
		consume_skb(skb);
		regs->verdict.code = NF_STOLEN;
	} else if (tcp->ack) {
		/* ACK from client */
		if (synproxy_recv_client_ack(net, skb, tcp, opts,
					     ntohl(tcp->seq))) {
			consume_skb(skb);
			regs->verdict.code = NF_STOLEN;
		} else {
			regs->verdict.code
= NF_DROP; } } } #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) static void nft_synproxy_eval_v6(const struct nft_synproxy *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt, const struct tcphdr *tcp, struct tcphdr *_tcph, struct synproxy_options *opts) { struct nf_synproxy_info info = priv->info; struct net *net = nft_net(pkt); struct synproxy_net *snet = synproxy_pernet(net); struct sk_buff *skb = pkt->skb; if (tcp->syn) { /* Initial SYN from client */ nft_synproxy_tcp_options(opts, tcp, snet, &info, priv); synproxy_send_client_synack_ipv6(net, skb, tcp, opts); consume_skb(skb); regs->verdict.code = NF_STOLEN; } else if (tcp->ack) { /* ACK from client */ if (synproxy_recv_client_ack_ipv6(net, skb, tcp, opts, ntohl(tcp->seq))) { consume_skb(skb); regs->verdict.code = NF_STOLEN; } else { regs->verdict.code = NF_DROP; } } } #endif /* CONFIG_NF_TABLES_IPV6*/ static void nft_synproxy_do_eval(const struct nft_synproxy *priv, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct synproxy_options opts = {}; struct sk_buff *skb = pkt->skb; int thoff = nft_thoff(pkt); const struct tcphdr *tcp; struct tcphdr _tcph; if (pkt->tprot != IPPROTO_TCP) { regs->verdict.code = NFT_BREAK; return; } if (nf_ip_checksum(skb, nft_hook(pkt), thoff, IPPROTO_TCP)) { regs->verdict.code = NF_DROP; return; } tcp = skb_header_pointer(skb, thoff, sizeof(struct tcphdr), &_tcph); if (!tcp) { regs->verdict.code = NF_DROP; return; } if (!synproxy_parse_options(skb, thoff, tcp, &opts)) { regs->verdict.code = NF_DROP; return; } switch (skb->protocol) { case htons(ETH_P_IP): nft_synproxy_eval_v4(priv, regs, pkt, tcp, &_tcph, &opts); return; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case htons(ETH_P_IPV6): nft_synproxy_eval_v6(priv, regs, pkt, tcp, &_tcph, &opts); return; #endif } regs->verdict.code = NFT_BREAK; } static int nft_synproxy_do_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_synproxy *priv) { struct synproxy_net *snet = synproxy_pernet(ctx->net); u32 flags; int err; if (tb[NFTA_SYNPROXY_MSS]) priv->info.mss = ntohs(nla_get_be16(tb[NFTA_SYNPROXY_MSS])); if (tb[NFTA_SYNPROXY_WSCALE]) priv->info.wscale = nla_get_u8(tb[NFTA_SYNPROXY_WSCALE]); if (tb[NFTA_SYNPROXY_FLAGS]) { flags = ntohl(nla_get_be32(tb[NFTA_SYNPROXY_FLAGS])); if (flags & ~NF_SYNPROXY_OPT_MASK) return -EOPNOTSUPP; priv->info.options = flags; } err = nf_ct_netns_get(ctx->net, ctx->family); if (err) return err; switch (ctx->family) { case NFPROTO_IPV4: err = nf_synproxy_ipv4_init(snet, ctx->net); if (err) goto nf_ct_failure; break; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: err = nf_synproxy_ipv6_init(snet, ctx->net); if (err) goto nf_ct_failure; break; #endif case NFPROTO_INET: err = nf_synproxy_ipv4_init(snet, ctx->net); if (err) goto nf_ct_failure; err = nf_synproxy_ipv6_init(snet, ctx->net); if (err) { nf_synproxy_ipv4_fini(snet, ctx->net); goto nf_ct_failure; } break; } return 0; nf_ct_failure: nf_ct_netns_put(ctx->net, ctx->family); return err; } static void nft_synproxy_do_destroy(const struct nft_ctx *ctx) { struct synproxy_net *snet = synproxy_pernet(ctx->net); switch (ctx->family) { case NFPROTO_IPV4: nf_synproxy_ipv4_fini(snet, ctx->net); break; #if IS_ENABLED(CONFIG_NF_TABLES_IPV6) case NFPROTO_IPV6: nf_synproxy_ipv6_fini(snet, ctx->net); break; #endif case NFPROTO_INET: nf_synproxy_ipv4_fini(snet, ctx->net); nf_synproxy_ipv6_fini(snet, ctx->net); break; } nf_ct_netns_put(ctx->net, ctx->family); } static int nft_synproxy_do_dump(struct sk_buff *skb, struct nft_synproxy *priv) { if 
(nla_put_be16(skb, NFTA_SYNPROXY_MSS, htons(priv->info.mss)) || nla_put_u8(skb, NFTA_SYNPROXY_WSCALE, priv->info.wscale) || nla_put_be32(skb, NFTA_SYNPROXY_FLAGS, htonl(priv->info.options))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static void nft_synproxy_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_synproxy *priv = nft_expr_priv(expr); nft_synproxy_do_eval(priv, regs, pkt); } static int nft_synproxy_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { if (ctx->family != NFPROTO_IPV4 && ctx->family != NFPROTO_IPV6 && ctx->family != NFPROTO_INET) return -EOPNOTSUPP; return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD)); } static int nft_synproxy_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_synproxy *priv = nft_expr_priv(expr); return nft_synproxy_do_init(ctx, tb, priv); } static void nft_synproxy_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { nft_synproxy_do_destroy(ctx); } static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { struct nft_synproxy *priv = nft_expr_priv(expr); return nft_synproxy_do_dump(skb, priv); } static struct nft_expr_type nft_synproxy_type; static const struct nft_expr_ops nft_synproxy_ops = { .eval = nft_synproxy_eval, .size = NFT_EXPR_SIZE(sizeof(struct nft_synproxy)), .init = nft_synproxy_init, .destroy = nft_synproxy_destroy, .dump = nft_synproxy_dump, .type = &nft_synproxy_type, .validate = nft_synproxy_validate, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_synproxy_type __read_mostly = { .ops = &nft_synproxy_ops, .name = "synproxy", .owner = THIS_MODULE, .policy = nft_synproxy_policy, .maxattr = NFTA_SYNPROXY_MAX, }; static int nft_synproxy_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_synproxy *priv = nft_obj_data(obj); return nft_synproxy_do_init(ctx, tb, priv); } static void nft_synproxy_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { nft_synproxy_do_destroy(ctx); } static int nft_synproxy_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { struct nft_synproxy *priv = nft_obj_data(obj); return nft_synproxy_do_dump(skb, priv); } static void nft_synproxy_obj_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_synproxy *priv = nft_obj_data(obj); nft_synproxy_do_eval(priv, regs, pkt); } static void nft_synproxy_obj_update(struct nft_object *obj, struct nft_object *newobj) { struct nft_synproxy *newpriv = nft_obj_data(newobj); struct nft_synproxy *priv = nft_obj_data(obj); priv->info = newpriv->info; } static struct nft_object_type nft_synproxy_obj_type; static const struct nft_object_ops nft_synproxy_obj_ops = { .type = &nft_synproxy_obj_type, .size = sizeof(struct nft_synproxy), .init = nft_synproxy_obj_init, .destroy = nft_synproxy_obj_destroy, .dump = nft_synproxy_obj_dump, .eval = nft_synproxy_obj_eval, .update = nft_synproxy_obj_update, }; static struct nft_object_type nft_synproxy_obj_type __read_mostly = { .type = NFT_OBJECT_SYNPROXY, .ops = &nft_synproxy_obj_ops, .maxattr = NFTA_SYNPROXY_MAX, .policy = nft_synproxy_policy, .owner = THIS_MODULE, }; static int __init nft_synproxy_module_init(void) { int err; err = nft_register_obj(&nft_synproxy_obj_type); if (err < 0) return err; err = nft_register_expr(&nft_synproxy_type); if (err < 0) goto 
err; return 0; err: nft_unregister_obj(&nft_synproxy_obj_type); return err; } static void __exit nft_synproxy_module_exit(void) { nft_unregister_expr(&nft_synproxy_type); nft_unregister_obj(&nft_synproxy_obj_type); } module_init(nft_synproxy_module_init); module_exit(nft_synproxy_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>"); MODULE_ALIAS_NFT_EXPR("synproxy"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_SYNPROXY); MODULE_DESCRIPTION("nftables SYNPROXY expression support");
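The expression and the stateful object registered above share nft_synproxy_do_init() and nft_synproxy_tcp_options(): the init path only accepts option flags inside NF_SYNPROXY_OPT_MASK, and the SYN handling keeps window scale, SACK-permitted and ECN only when a timestamp cookie is available to carry them. Below is a minimal userspace sketch of that decision logic, assuming only the UAPI constants from <linux/netfilter/nf_synproxy.h>; the helper names (validate_synproxy_flags, effective_options) are illustrative and not part of the module.

/*
 * Minimal sketch (not kernel code): mirrors the flag validation in
 * nft_synproxy_do_init() and the option pruning in nft_synproxy_tcp_options().
 * Helper names are made up for illustration.
 */
#include <stdio.h>
#include <stdint.h>
#include <errno.h>
#include <linux/netfilter/nf_synproxy.h>

/* do_init rejects any bit outside NF_SYNPROXY_OPT_MASK with -EOPNOTSUPP. */
static int validate_synproxy_flags(uint32_t flags)
{
	if (flags & ~NF_SYNPROXY_OPT_MASK)
		return -EOPNOTSUPP;
	return 0;
}

/*
 * tcp_options keeps only the options both the client offered and the rule
 * configured; without a timestamp cookie there is nowhere to encode window
 * scale, SACK-permitted or ECN, so those are stripped as well.
 */
static uint32_t effective_options(uint32_t configured, uint32_t client_opts)
{
	uint32_t opts = client_opts & configured;

	if (!(opts & NF_SYNPROXY_OPT_TIMESTAMP))
		opts &= ~(NF_SYNPROXY_OPT_WSCALE |
			  NF_SYNPROXY_OPT_SACK_PERM |
			  NF_SYNPROXY_OPT_ECN);
	return opts;
}

int main(void)
{
	uint32_t cfg = NF_SYNPROXY_OPT_MSS | NF_SYNPROXY_OPT_WSCALE |
		       NF_SYNPROXY_OPT_SACK_PERM;

	printf("flags ok: %d\n", validate_synproxy_flags(cfg));
	printf("client without timestamps keeps: %#x\n",
	       (unsigned int)effective_options(cfg, cfg));
	return 0;
}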
// SPDX-License-Identifier: GPL-2.0-or-later /* * Linux Socket Filter - Kernel level socket filtering * * Based on the design of the Berkeley Packet Filter. The new * internal format has been designed by PLUMgrid: * * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com * * Authors: * * Jay Schulist <jschlst@samba.org> * Alexei Starovoitov <ast@plumgrid.com> * Daniel Borkmann <dborkman@redhat.com> * * Andi Kleen - Fix a few bad bugs and races.
* Kris Katterjohn - Added many additional checks in bpf_check_classic() */ #include <uapi/linux/btf.h> #include <linux/filter.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/prandom.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/objtool.h> #include <linux/overflow.h> #include <linux/rbtree_latch.h> #include <linux/kallsyms.h> #include <linux/rcupdate.h> #include <linux/perf_event.h> #include <linux/extable.h> #include <linux/log2.h> #include <linux/bpf_verifier.h> #include <linux/nodemask.h> #include <linux/nospec.h> #include <linux/bpf_mem_alloc.h> #include <linux/memcontrol.h> #include <linux/execmem.h> #include <asm/barrier.h> #include <linux/unaligned.h> /* Registers */ #define BPF_R0 regs[BPF_REG_0] #define BPF_R1 regs[BPF_REG_1] #define BPF_R2 regs[BPF_REG_2] #define BPF_R3 regs[BPF_REG_3] #define BPF_R4 regs[BPF_REG_4] #define BPF_R5 regs[BPF_REG_5] #define BPF_R6 regs[BPF_REG_6] #define BPF_R7 regs[BPF_REG_7] #define BPF_R8 regs[BPF_REG_8] #define BPF_R9 regs[BPF_REG_9] #define BPF_R10 regs[BPF_REG_10] /* Named registers */ #define DST regs[insn->dst_reg] #define SRC regs[insn->src_reg] #define FP regs[BPF_REG_FP] #define AX regs[BPF_REG_AX] #define ARG1 regs[BPF_REG_ARG1] #define CTX regs[BPF_REG_CTX] #define OFF insn->off #define IMM insn->imm struct bpf_mem_alloc bpf_global_ma; bool bpf_global_ma_set; /* No hurry in this branch * * Exported for the bpf jit load helper. */ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size) { u8 *ptr = NULL; if (k >= SKF_NET_OFF) { ptr = skb_network_header(skb) + k - SKF_NET_OFF; } else if (k >= SKF_LL_OFF) { if (unlikely(!skb_mac_header_was_set(skb))) return NULL; ptr = skb_mac_header(skb) + k - SKF_LL_OFF; } if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) return ptr; return NULL; } /* tell bpf programs that include vmlinux.h kernel's PAGE_SIZE */ enum page_size_enum { __PAGE_SIZE = PAGE_SIZE }; struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog_aux *aux; struct bpf_prog *fp; size = round_up(size, __PAGE_SIZE); fp = __vmalloc(size, gfp_flags); if (fp == NULL) return NULL; aux = kzalloc(sizeof(*aux), bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); if (aux == NULL) { vfree(fp); return NULL; } fp->active = alloc_percpu_gfp(int, bpf_memcg_flags(GFP_KERNEL | gfp_extra_flags)); if (!fp->active) { vfree(fp); kfree(aux); return NULL; } fp->pages = size / PAGE_SIZE; fp->aux = aux; fp->aux->prog = fp; fp->jit_requested = ebpf_jit_enabled(); fp->blinding_requested = bpf_jit_blinding_enabled(fp); #ifdef CONFIG_CGROUP_BPF aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID; #endif INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode); #ifdef CONFIG_FINEIBT INIT_LIST_HEAD_RCU(&fp->aux->ksym_prefix.lnode); #endif mutex_init(&fp->aux->used_maps_mutex); mutex_init(&fp->aux->ext_mutex); mutex_init(&fp->aux->dst_mutex); #ifdef CONFIG_BPF_SYSCALL bpf_prog_stream_init(fp); #endif return fp; } struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog *prog; int cpu; prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags); if (!prog) return NULL; prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); if (!prog->stats) { free_percpu(prog->active); kfree(prog->aux); vfree(prog); return NULL; } for_each_possible_cpu(cpu) { struct 
bpf_prog_stats *pstats; pstats = per_cpu_ptr(prog->stats, cpu); u64_stats_init(&pstats->syncp); } return prog; } EXPORT_SYMBOL_GPL(bpf_prog_alloc); int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) { if (!prog->aux->nr_linfo || !prog->jit_requested) return 0; prog->aux->jited_linfo = kvcalloc(prog->aux->nr_linfo, sizeof(*prog->aux->jited_linfo), bpf_memcg_flags(GFP_KERNEL | __GFP_NOWARN)); if (!prog->aux->jited_linfo) return -ENOMEM; return 0; } void bpf_prog_jit_attempt_done(struct bpf_prog *prog) { if (prog->aux->jited_linfo && (!prog->jited || !prog->aux->jited_linfo[0])) { kvfree(prog->aux->jited_linfo); prog->aux->jited_linfo = NULL; } kfree(prog->aux->kfunc_tab); prog->aux->kfunc_tab = NULL; } /* The jit engine is responsible to provide an array * for insn_off to the jited_off mapping (insn_to_jit_off). * * The idx to this array is the insn_off. Hence, the insn_off * here is relative to the prog itself instead of the main prog. * This array has one entry for each xlated bpf insn. * * jited_off is the byte off to the end of the jited insn. * * Hence, with * insn_start: * The first bpf insn off of the prog. The insn off * here is relative to the main prog. * e.g. if prog is a subprog, insn_start > 0 * linfo_idx: * The prog's idx to prog->aux->linfo and jited_linfo * * jited_linfo[linfo_idx] = prog->bpf_func * * For i > linfo_idx, * * jited_linfo[i] = prog->bpf_func + * insn_to_jit_off[linfo[i].insn_off - insn_start - 1] */ void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, const u32 *insn_to_jit_off) { u32 linfo_idx, insn_start, insn_end, nr_linfo, i; const struct bpf_line_info *linfo; void **jited_linfo; if (!prog->aux->jited_linfo || prog->aux->func_idx > prog->aux->func_cnt) /* Userspace did not provide linfo */ return; linfo_idx = prog->aux->linfo_idx; linfo = &prog->aux->linfo[linfo_idx]; insn_start = linfo[0].insn_off; insn_end = insn_start + prog->len; jited_linfo = &prog->aux->jited_linfo[linfo_idx]; jited_linfo[0] = prog->bpf_func; nr_linfo = prog->aux->nr_linfo - linfo_idx; for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++) /* The verifier ensures that linfo[i].insn_off is * strictly increasing */ jited_linfo[i] = prog->bpf_func + insn_to_jit_off[linfo[i].insn_off - insn_start - 1]; } struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { gfp_t gfp_flags = bpf_memcg_flags(GFP_KERNEL | __GFP_ZERO | gfp_extra_flags); struct bpf_prog *fp; u32 pages; size = round_up(size, PAGE_SIZE); pages = size / PAGE_SIZE; if (pages <= fp_old->pages) return fp_old; fp = __vmalloc(size, gfp_flags); if (fp) { memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = pages; fp->aux->prog = fp; /* We keep fp->aux from fp_old around in the new * reallocated structure. 
*/ fp_old->aux = NULL; fp_old->stats = NULL; fp_old->active = NULL; __bpf_prog_free(fp_old); } return fp; } void __bpf_prog_free(struct bpf_prog *fp) { if (fp->aux) { mutex_destroy(&fp->aux->used_maps_mutex); mutex_destroy(&fp->aux->dst_mutex); kfree(fp->aux->poke_tab); kfree(fp->aux); } free_percpu(fp->stats); free_percpu(fp->active); vfree(fp); } int bpf_prog_calc_tag(struct bpf_prog *fp) { const u32 bits_offset = SHA1_BLOCK_SIZE - sizeof(__be64); u32 raw_size = bpf_prog_tag_scratch_size(fp); u32 digest[SHA1_DIGEST_WORDS]; u32 ws[SHA1_WORKSPACE_WORDS]; u32 i, bsize, psize, blocks; struct bpf_insn *dst; bool was_ld_map; u8 *raw, *todo; __be32 *result; __be64 *bits; raw = vmalloc(raw_size); if (!raw) return -ENOMEM; sha1_init_raw(digest); memset(ws, 0, sizeof(ws)); /* We need to take out the map fd for the digest calculation * since they are unstable from user space side. */ dst = (void *)raw; for (i = 0, was_ld_map = false; i < fp->len; i++) { dst[i] = fp->insnsi[i]; if (!was_ld_map && dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && (dst[i].src_reg == BPF_PSEUDO_MAP_FD || dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) { was_ld_map = true; dst[i].imm = 0; } else if (was_ld_map && dst[i].code == 0 && dst[i].dst_reg == 0 && dst[i].src_reg == 0 && dst[i].off == 0) { was_ld_map = false; dst[i].imm = 0; } else { was_ld_map = false; } } psize = bpf_prog_insn_size(fp); memset(&raw[psize], 0, raw_size - psize); raw[psize++] = 0x80; bsize = round_up(psize, SHA1_BLOCK_SIZE); blocks = bsize / SHA1_BLOCK_SIZE; todo = raw; if (bsize - psize >= sizeof(__be64)) { bits = (__be64 *)(todo + bsize - sizeof(__be64)); } else { bits = (__be64 *)(todo + bsize + bits_offset); blocks++; } *bits = cpu_to_be64((psize - 1) << 3); while (blocks--) { sha1_transform(digest, todo, ws); todo += SHA1_BLOCK_SIZE; } result = (__force __be32 *)digest; for (i = 0; i < SHA1_DIGEST_WORDS; i++) result[i] = cpu_to_be32(digest[i]); memcpy(fp->tag, result, sizeof(fp->tag)); vfree(raw); return 0; } static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, s32 end_new, s32 curr, const bool probe_pass) { const s64 imm_min = S32_MIN, imm_max = S32_MAX; s32 delta = end_new - end_old; s64 imm = insn->imm; if (curr < pos && curr + imm + 1 >= end_old) imm += delta; else if (curr >= end_new && curr + imm + 1 < end_new) imm -= delta; if (imm < imm_min || imm > imm_max) return -ERANGE; if (!probe_pass) insn->imm = imm; return 0; } static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, s32 end_new, s32 curr, const bool probe_pass) { s64 off_min, off_max, off; s32 delta = end_new - end_old; if (insn->code == (BPF_JMP32 | BPF_JA)) { off = insn->imm; off_min = S32_MIN; off_max = S32_MAX; } else { off = insn->off; off_min = S16_MIN; off_max = S16_MAX; } if (curr < pos && curr + off + 1 >= end_old) off += delta; else if (curr >= end_new && curr + off + 1 < end_new) off -= delta; if (off < off_min || off > off_max) return -ERANGE; if (!probe_pass) { if (insn->code == (BPF_JMP32 | BPF_JA)) insn->imm = off; else insn->off = off; } return 0; } static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old, s32 end_new, const bool probe_pass) { u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0); struct bpf_insn *insn = prog->insnsi; int ret = 0; for (i = 0; i < insn_cnt; i++, insn++) { u8 code; /* In the probing pass we still operate on the original, * unpatched image in order to check overflows before we * do any other adjustments. Therefore skip the patchlet. 
*/ if (probe_pass && i == pos) { i = end_new; insn = prog->insnsi + end_old; } if (bpf_pseudo_func(insn)) { ret = bpf_adj_delta_to_imm(insn, pos, end_old, end_new, i, probe_pass); if (ret) return ret; continue; } code = insn->code; if ((BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) || BPF_OP(code) == BPF_EXIT) continue; /* Adjust offset of jmps if we cross patch boundaries. */ if (BPF_OP(code) == BPF_CALL) { if (insn->src_reg != BPF_PSEUDO_CALL) continue; ret = bpf_adj_delta_to_imm(insn, pos, end_old, end_new, i, probe_pass); } else { ret = bpf_adj_delta_to_off(insn, pos, end_old, end_new, i, probe_pass); } if (ret) break; } return ret; } static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta) { struct bpf_line_info *linfo; u32 i, nr_linfo; nr_linfo = prog->aux->nr_linfo; if (!nr_linfo || !delta) return; linfo = prog->aux->linfo; for (i = 0; i < nr_linfo; i++) if (off < linfo[i].insn_off) break; /* Push all off < linfo[i].insn_off by delta */ for (; i < nr_linfo; i++) linfo[i].insn_off += delta; } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len) { u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; const u32 cnt_max = S16_MAX; struct bpf_prog *prog_adj; int err; /* Since our patchlet doesn't expand the image, we're done. */ if (insn_delta == 0) { memcpy(prog->insnsi + off, patch, sizeof(*patch)); return prog; } insn_adj_cnt = prog->len + insn_delta; /* Reject anything that would potentially let the insn->off * target overflow when we have excessive program expansions. * We need to probe here before we do any reallocation where * we afterwards may not fail anymore. */ if (insn_adj_cnt > cnt_max && (err = bpf_adj_branches(prog, off, off + 1, off + len, true))) return ERR_PTR(err); /* Several new instructions need to be inserted. Make room * for them. Likely, there's no need for a new allocation as * last page could have large enough tailroom. */ prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt), GFP_USER); if (!prog_adj) return ERR_PTR(-ENOMEM); prog_adj->len = insn_adj_cnt; /* Patching happens in 3 steps: * * 1) Move over tail of insnsi from next instruction onwards, * so we can patch the single target insn with one or more * new ones (patching is always from 1 to n insns, n > 0). * 2) Inject new instructions at the target location. * 3) Adjust branch offsets if necessary. */ insn_rest = insn_adj_cnt - off - len; memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1, sizeof(*patch) * insn_rest); memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len); /* We are guaranteed to not fail at this point, otherwise * the ship has sailed to reverse to the original state. An * overflow cannot happen at this point. 
*/ BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false)); bpf_adj_linfo(prog_adj, off, insn_delta); return prog_adj; } int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) { int err; /* Branch offsets can't overflow when program is shrinking, no need * to call bpf_adj_branches(..., true) here */ memmove(prog->insnsi + off, prog->insnsi + off + cnt, sizeof(struct bpf_insn) * (prog->len - off - cnt)); prog->len -= cnt; err = bpf_adj_branches(prog, off, off + cnt, off, false); WARN_ON_ONCE(err); return err; } static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) { int i; for (i = 0; i < fp->aux->real_func_cnt; i++) bpf_prog_kallsyms_del(fp->aux->func[i]); } void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) { bpf_prog_kallsyms_del_subprogs(fp); bpf_prog_kallsyms_del(fp); } #ifdef CONFIG_BPF_JIT /* All BPF JIT sysctl knobs here. */ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); int bpf_jit_harden __read_mostly; long bpf_jit_limit __read_mostly; long bpf_jit_limit_max __read_mostly; static void bpf_prog_ksym_set_addr(struct bpf_prog *prog) { WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); prog->aux->ksym.start = (unsigned long) prog->bpf_func; prog->aux->ksym.end = prog->aux->ksym.start + prog->jited_len; } static void bpf_prog_ksym_set_name(struct bpf_prog *prog) { char *sym = prog->aux->ksym.name; const char *end = sym + KSYM_NAME_LEN; const struct btf_type *type; const char *func_name; BUILD_BUG_ON(sizeof("bpf_prog_") + sizeof(prog->tag) * 2 + /* name has been null terminated. * We should need +1 for the '_' preceding * the name. However, the null character * is double counted between the name and the * sizeof("bpf_prog_") above, so we omit * the +1 here. */ sizeof(prog->aux->name) > KSYM_NAME_LEN); sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); /* prog->aux->name will be ignored if full btf name is available */ if (prog->aux->func_info_cnt && prog->aux->func_idx < prog->aux->func_info_cnt) { type = btf_type_by_id(prog->aux->btf, prog->aux->func_info[prog->aux->func_idx].type_id); func_name = btf_name_by_offset(prog->aux->btf, type->name_off); snprintf(sym, (size_t)(end - sym), "_%s", func_name); return; } if (prog->aux->name[0]) snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name); else *sym = 0; } static unsigned long bpf_get_ksym_start(struct latch_tree_node *n) { return container_of(n, struct bpf_ksym, tnode)->start; } static __always_inline bool bpf_tree_less(struct latch_tree_node *a, struct latch_tree_node *b) { return bpf_get_ksym_start(a) < bpf_get_ksym_start(b); } static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) { unsigned long val = (unsigned long)key; const struct bpf_ksym *ksym; ksym = container_of(n, struct bpf_ksym, tnode); if (val < ksym->start) return -1; /* Ensure that we detect return addresses as part of the program, when * the final instruction is a call for a program part of the stack * trace. Therefore, do val > ksym->end instead of val >= ksym->end. 
*/ if (val > ksym->end) return 1; return 0; } static const struct latch_tree_ops bpf_tree_ops = { .less = bpf_tree_less, .comp = bpf_tree_comp, }; static DEFINE_SPINLOCK(bpf_lock); static LIST_HEAD(bpf_kallsyms); static struct latch_tree_root bpf_tree __cacheline_aligned; void bpf_ksym_add(struct bpf_ksym *ksym) { spin_lock_bh(&bpf_lock); WARN_ON_ONCE(!list_empty(&ksym->lnode)); list_add_tail_rcu(&ksym->lnode, &bpf_kallsyms); latch_tree_insert(&ksym->tnode, &bpf_tree, &bpf_tree_ops); spin_unlock_bh(&bpf_lock); } static void __bpf_ksym_del(struct bpf_ksym *ksym) { if (list_empty(&ksym->lnode)) return; latch_tree_erase(&ksym->tnode, &bpf_tree, &bpf_tree_ops); list_del_rcu(&ksym->lnode); } void bpf_ksym_del(struct bpf_ksym *ksym) { spin_lock_bh(&bpf_lock); __bpf_ksym_del(ksym); spin_unlock_bh(&bpf_lock); } static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) { return fp->jited && !bpf_prog_was_classic(fp); } void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || !bpf_token_capable(fp->aux->token, CAP_BPF)) return; bpf_prog_ksym_set_addr(fp); bpf_prog_ksym_set_name(fp); fp->aux->ksym.prog = true; bpf_ksym_add(&fp->aux->ksym); #ifdef CONFIG_FINEIBT /* * When FineIBT, code in the __cfi_foo() symbols can get executed * and hence unwinder needs help. */ if (cfi_mode != CFI_FINEIBT) return; snprintf(fp->aux->ksym_prefix.name, KSYM_NAME_LEN, "__cfi_%s", fp->aux->ksym.name); fp->aux->ksym_prefix.start = (unsigned long) fp->bpf_func - 16; fp->aux->ksym_prefix.end = (unsigned long) fp->bpf_func; bpf_ksym_add(&fp->aux->ksym_prefix); #endif } void bpf_prog_kallsyms_del(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp)) return; bpf_ksym_del(&fp->aux->ksym); #ifdef CONFIG_FINEIBT if (cfi_mode != CFI_FINEIBT) return; bpf_ksym_del(&fp->aux->ksym_prefix); #endif } static struct bpf_ksym *bpf_ksym_find(unsigned long addr) { struct latch_tree_node *n; n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); return n ? container_of(n, struct bpf_ksym, tnode) : NULL; } int __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { struct bpf_ksym *ksym; int ret = 0; rcu_read_lock(); ksym = bpf_ksym_find(addr); if (ksym) { unsigned long symbol_start = ksym->start; unsigned long symbol_end = ksym->end; ret = strscpy(sym, ksym->name, KSYM_NAME_LEN); if (size) *size = symbol_end - symbol_start; if (off) *off = addr - symbol_start; } rcu_read_unlock(); return ret; } bool is_bpf_text_address(unsigned long addr) { bool ret; rcu_read_lock(); ret = bpf_ksym_find(addr) != NULL; rcu_read_unlock(); return ret; } struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) { struct bpf_ksym *ksym; WARN_ON_ONCE(!rcu_read_lock_held()); ksym = bpf_ksym_find(addr); return ksym && ksym->prog ? 
container_of(ksym, struct bpf_prog_aux, ksym)->prog : NULL; } const struct exception_table_entry *search_bpf_extables(unsigned long addr) { const struct exception_table_entry *e = NULL; struct bpf_prog *prog; rcu_read_lock(); prog = bpf_prog_ksym_find(addr); if (!prog) goto out; if (!prog->aux->num_exentries) goto out; e = search_extable(prog->aux->extable, prog->aux->num_exentries, addr); out: rcu_read_unlock(); return e; } int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { struct bpf_ksym *ksym; unsigned int it = 0; int ret = -ERANGE; if (!bpf_jit_kallsyms_enabled()) return ret; rcu_read_lock(); list_for_each_entry_rcu(ksym, &bpf_kallsyms, lnode) { if (it++ != symnum) continue; strscpy(sym, ksym->name, KSYM_NAME_LEN); *value = ksym->start; *type = BPF_SYM_ELF_TYPE; ret = 0; break; } rcu_read_unlock(); return ret; } int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke) { struct bpf_jit_poke_descriptor *tab = prog->aux->poke_tab; static const u32 poke_tab_max = 1024; u32 slot = prog->aux->size_poke_tab; u32 size = slot + 1; if (size > poke_tab_max) return -ENOSPC; if (poke->tailcall_target || poke->tailcall_target_stable || poke->tailcall_bypass || poke->adj_off || poke->bypass_addr) return -EINVAL; switch (poke->reason) { case BPF_POKE_REASON_TAIL_CALL: if (!poke->tail_call.map) return -EINVAL; break; default: return -EINVAL; } tab = krealloc_array(tab, size, sizeof(*poke), GFP_KERNEL); if (!tab) return -ENOMEM; memcpy(&tab[slot], poke, sizeof(*poke)); prog->aux->size_poke_tab = size; prog->aux->poke_tab = tab; return slot; } /* * BPF program pack allocator. * * Most BPF programs are pretty small. Allocating a hole page for each * program is sometime a waste. Many small bpf program also adds pressure * to instruction TLB. To solve this issue, we introduce a BPF program pack * allocator. The prog_pack allocator uses HPAGE_PMD_SIZE page (2MB on x86) * to host BPF programs. */ #define BPF_PROG_CHUNK_SHIFT 6 #define BPF_PROG_CHUNK_SIZE (1 << BPF_PROG_CHUNK_SHIFT) #define BPF_PROG_CHUNK_MASK (~(BPF_PROG_CHUNK_SIZE - 1)) struct bpf_prog_pack { struct list_head list; void *ptr; unsigned long bitmap[]; }; void bpf_jit_fill_hole_with_zero(void *area, unsigned int size) { memset(area, 0, size); } #define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE) static DEFINE_MUTEX(pack_mutex); static LIST_HEAD(pack_list); /* PMD_SIZE is not available in some special config, e.g. ARCH=arm with * CONFIG_MMU=n. Use PAGE_SIZE in these cases. */ #ifdef PMD_SIZE /* PMD_SIZE is really big for some archs. It doesn't make sense to * reserve too much memory in one allocation. Hardcode BPF_PROG_PACK_SIZE to * 2MiB * num_possible_nodes(). On most architectures PMD_SIZE will be * greater than or equal to 2MB. 
*/ #define BPF_PROG_PACK_SIZE (SZ_2M * num_possible_nodes()) #else #define BPF_PROG_PACK_SIZE PAGE_SIZE #endif #define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE) static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_prog_pack *pack; int err; pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)), GFP_KERNEL); if (!pack) return NULL; pack->ptr = bpf_jit_alloc_exec(BPF_PROG_PACK_SIZE); if (!pack->ptr) goto out; bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE); bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE); set_vm_flush_reset_perms(pack->ptr); err = set_memory_rox((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE); if (err) goto out; list_add_tail(&pack->list, &pack_list); return pack; out: bpf_jit_free_exec(pack->ptr); kfree(pack); return NULL; } void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns) { unsigned int nbits = BPF_PROG_SIZE_TO_NBITS(size); struct bpf_prog_pack *pack; unsigned long pos; void *ptr = NULL; mutex_lock(&pack_mutex); if (size > BPF_PROG_PACK_SIZE) { size = round_up(size, PAGE_SIZE); ptr = bpf_jit_alloc_exec(size); if (ptr) { int err; bpf_fill_ill_insns(ptr, size); set_vm_flush_reset_perms(ptr); err = set_memory_rox((unsigned long)ptr, size / PAGE_SIZE); if (err) { bpf_jit_free_exec(ptr); ptr = NULL; } } goto out; } list_for_each_entry(pack, &pack_list, list) { pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, nbits, 0); if (pos < BPF_PROG_CHUNK_COUNT) goto found_free_area; } pack = alloc_new_pack(bpf_fill_ill_insns); if (!pack) goto out; pos = 0; found_free_area: bitmap_set(pack->bitmap, pos, nbits); ptr = (void *)(pack->ptr) + (pos << BPF_PROG_CHUNK_SHIFT); out: mutex_unlock(&pack_mutex); return ptr; } void bpf_prog_pack_free(void *ptr, u32 size) { struct bpf_prog_pack *pack = NULL, *tmp; unsigned int nbits; unsigned long pos; mutex_lock(&pack_mutex); if (size > BPF_PROG_PACK_SIZE) { bpf_jit_free_exec(ptr); goto out; } list_for_each_entry(tmp, &pack_list, list) { if (ptr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > ptr) { pack = tmp; break; } } if (WARN_ONCE(!pack, "bpf_prog_pack bug\n")) goto out; nbits = BPF_PROG_SIZE_TO_NBITS(size); pos = ((unsigned long)ptr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT; WARN_ONCE(bpf_arch_text_invalidate(ptr, size), "bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n"); bitmap_clear(pack->bitmap, pos, nbits); if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0, BPF_PROG_CHUNK_COUNT, 0) == 0) { list_del(&pack->list); bpf_jit_free_exec(pack->ptr); kfree(pack); } out: mutex_unlock(&pack_mutex); } static atomic_long_t bpf_jit_current; /* Can be overridden by an arch's JIT compiler if it has a custom, * dedicated BPF backend memory area, or if neither of the two * below apply. */ u64 __weak bpf_jit_alloc_exec_limit(void) { #if defined(MODULES_VADDR) return MODULES_END - MODULES_VADDR; #else return VMALLOC_END - VMALLOC_START; #endif } static int __init bpf_jit_charge_init(void) { /* Only used as heuristic here to derive limit. 
*/ bpf_jit_limit_max = bpf_jit_alloc_exec_limit(); bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 1, PAGE_SIZE), LONG_MAX); return 0; } pure_initcall(bpf_jit_charge_init); int bpf_jit_charge_modmem(u32 size) { if (atomic_long_add_return(size, &bpf_jit_current) > READ_ONCE(bpf_jit_limit)) { if (!bpf_capable()) { atomic_long_sub(size, &bpf_jit_current); return -EPERM; } } return 0; } void bpf_jit_uncharge_modmem(u32 size) { atomic_long_sub(size, &bpf_jit_current); } void *__weak bpf_jit_alloc_exec(unsigned long size) { return execmem_alloc(EXECMEM_BPF, size); } void __weak bpf_jit_free_exec(void *addr) { execmem_free(addr); } struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *hdr; u32 size, hole, start; WARN_ON_ONCE(!is_power_of_2(alignment) || alignment > BPF_IMAGE_ALIGNMENT); /* Most of BPF filters are really small, but if some of them * fill a page, allow at least 128 extra bytes to insert a * random section of illegal instructions. */ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); if (bpf_jit_charge_modmem(size)) return NULL; hdr = bpf_jit_alloc_exec(size); if (!hdr) { bpf_jit_uncharge_modmem(size); return NULL; } /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(hdr, size); hdr->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), PAGE_SIZE - sizeof(*hdr)); start = get_random_u32_below(hole) & ~(alignment - 1); /* Leave a random number of instructions before BPF code. */ *image_ptr = &hdr->image[start]; return hdr; } void bpf_jit_binary_free(struct bpf_binary_header *hdr) { u32 size = hdr->size; bpf_jit_free_exec(hdr); bpf_jit_uncharge_modmem(size); } /* Allocate jit binary from bpf_prog_pack allocator. * Since the allocated memory is RO+X, the JIT engine cannot write directly * to the memory. To solve this problem, a RW buffer is also allocated at * as the same time. The JIT engine should calculate offsets based on the * RO memory address, but write JITed program to the RW buffer. Once the * JIT engine finishes, it calls bpf_jit_binary_pack_finalize, which copies * the JITed program to the RO memory. */ struct bpf_binary_header * bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, struct bpf_binary_header **rw_header, u8 **rw_image, bpf_jit_fill_hole_t bpf_fill_ill_insns) { struct bpf_binary_header *ro_header; u32 size, hole, start; WARN_ON_ONCE(!is_power_of_2(alignment) || alignment > BPF_IMAGE_ALIGNMENT); /* add 16 bytes for a random section of illegal instructions */ size = round_up(proglen + sizeof(*ro_header) + 16, BPF_PROG_CHUNK_SIZE); if (bpf_jit_charge_modmem(size)) return NULL; ro_header = bpf_prog_pack_alloc(size, bpf_fill_ill_insns); if (!ro_header) { bpf_jit_uncharge_modmem(size); return NULL; } *rw_header = kvmalloc(size, GFP_KERNEL); if (!*rw_header) { bpf_prog_pack_free(ro_header, size); bpf_jit_uncharge_modmem(size); return NULL; } /* Fill space with illegal/arch-dep instructions. */ bpf_fill_ill_insns(*rw_header, size); (*rw_header)->size = size; hole = min_t(unsigned int, size - (proglen + sizeof(*ro_header)), BPF_PROG_CHUNK_SIZE - sizeof(*ro_header)); start = get_random_u32_below(hole) & ~(alignment - 1); *image_ptr = &ro_header->image[start]; *rw_image = &(*rw_header)->image[start]; return ro_header; } /* Copy JITed text from rw_header to its final location, the ro_header. 
*/ int bpf_jit_binary_pack_finalize(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header) { void *ptr; ptr = bpf_arch_text_copy(ro_header, rw_header, rw_header->size); kvfree(rw_header); if (IS_ERR(ptr)) { bpf_prog_pack_free(ro_header, ro_header->size); return PTR_ERR(ptr); } return 0; } /* bpf_jit_binary_pack_free is called in two different scenarios: * 1) when the program is freed after; * 2) when the JIT engine fails (before bpf_jit_binary_pack_finalize). * For case 2), we need to free both the RO memory and the RW buffer. * * bpf_jit_binary_pack_free requires proper ro_header->size. However, * bpf_jit_binary_pack_alloc does not set it. Therefore, ro_header->size * must be set with either bpf_jit_binary_pack_finalize (normal path) or * bpf_arch_text_copy (when jit fails). */ void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header) { u32 size = ro_header->size; bpf_prog_pack_free(ro_header, size); kvfree(rw_header); bpf_jit_uncharge_modmem(size); } struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp) { unsigned long real_start = (unsigned long)fp->bpf_func; unsigned long addr; addr = real_start & BPF_PROG_CHUNK_MASK; return (void *)addr; } static inline struct bpf_binary_header * bpf_jit_binary_hdr(const struct bpf_prog *fp) { unsigned long real_start = (unsigned long)fp->bpf_func; unsigned long addr; addr = real_start & PAGE_MASK; return (void *)addr; } /* This symbol is only overridden by archs that have different * requirements than the usual eBPF JITs, f.e. when they only * implement cBPF JIT, do not set images read-only, etc. */ void __weak bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) { struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); bpf_jit_binary_free(hdr); WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); } bpf_prog_unlock_free(fp); } int bpf_jit_get_func_addr(const struct bpf_prog *prog, const struct bpf_insn *insn, bool extra_pass, u64 *func_addr, bool *func_addr_fixed) { s16 off = insn->off; s32 imm = insn->imm; u8 *addr; int err; *func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL; if (!*func_addr_fixed) { /* Place-holder address till the last pass has collected * all addresses for JITed subprograms in which case we * can pick them up from prog->aux. */ if (!extra_pass) addr = NULL; else if (prog->aux->func && off >= 0 && off < prog->aux->real_func_cnt) addr = (u8 *)prog->aux->func[off]->bpf_func; else return -EINVAL; } else if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL && bpf_jit_supports_far_kfunc_call()) { err = bpf_get_kfunc_addr(prog, insn->imm, insn->off, &addr); if (err) return err; } else { /* Address of a BPF helper call. Since part of the core * kernel, it's always at a fixed location. __bpf_call_base * and the helper with imm relative to it are both in core * kernel. */ addr = (u8 *)__bpf_call_base + imm; } *func_addr = (unsigned long)addr; return 0; } const char *bpf_jit_get_prog_name(struct bpf_prog *prog) { if (prog->aux->ksym.prog) return prog->aux->ksym.name; return prog->aux->name; } static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff, bool emit_zext) { struct bpf_insn *to = to_buff; u32 imm_rnd = get_random_u32(); s16 off; BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); /* Constraints on AX register: * * AX register is inaccessible from user space. It is mapped in * all JITs, and used here for constant blinding rewrites. 
It is * typically "stateless" meaning its contents are only valid within * the executed instruction, but not across several instructions. * There are a few exceptions however which are further detailed * below. * * Constant blinding is only used by JITs, not in the interpreter. * The interpreter uses AX in some occasions as a local temporary * register e.g. in DIV or MOD instructions. * * In restricted circumstances, the verifier can also use the AX * register for rewrites as long as they do not interfere with * the above cases! */ if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX) goto out; if (from->imm == 0 && (from->code == (BPF_ALU | BPF_MOV | BPF_K) || from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) { *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg); goto out; } switch (from->code) { case BPF_ALU | BPF_ADD | BPF_K: case BPF_ALU | BPF_SUB | BPF_K: case BPF_ALU | BPF_AND | BPF_K: case BPF_ALU | BPF_OR | BPF_K: case BPF_ALU | BPF_XOR | BPF_K: case BPF_ALU | BPF_MUL | BPF_K: case BPF_ALU | BPF_MOV | BPF_K: case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_MOD | BPF_K: *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_ALU32_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off); break; case BPF_ALU64 | BPF_ADD | BPF_K: case BPF_ALU64 | BPF_SUB | BPF_K: case BPF_ALU64 | BPF_AND | BPF_K: case BPF_ALU64 | BPF_OR | BPF_K: case BPF_ALU64 | BPF_XOR | BPF_K: case BPF_ALU64 | BPF_MUL | BPF_K: case BPF_ALU64 | BPF_MOV | BPF_K: case BPF_ALU64 | BPF_DIV | BPF_K: case BPF_ALU64 | BPF_MOD | BPF_K: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_ALU64_REG_OFF(from->code, from->dst_reg, BPF_REG_AX, from->off); break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JLE | BPF_K: case BPF_JMP | BPF_JSGT | BPF_K: case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: case BPF_JMP | BPF_JSLE | BPF_K: case BPF_JMP | BPF_JSET | BPF_K: /* Accommodate for extra offset in case of a backjump. */ off = from->off; if (off < 0) off -= 2; *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; case BPF_JMP32 | BPF_JEQ | BPF_K: case BPF_JMP32 | BPF_JNE | BPF_K: case BPF_JMP32 | BPF_JGT | BPF_K: case BPF_JMP32 | BPF_JLT | BPF_K: case BPF_JMP32 | BPF_JGE | BPF_K: case BPF_JMP32 | BPF_JLE | BPF_K: case BPF_JMP32 | BPF_JSGT | BPF_K: case BPF_JMP32 | BPF_JSLT | BPF_K: case BPF_JMP32 | BPF_JSGE | BPF_K: case BPF_JMP32 | BPF_JSLE | BPF_K: case BPF_JMP32 | BPF_JSET | BPF_K: /* Accommodate for extra offset in case of a backjump. */ off = from->off; if (off < 0) off -= 2; *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX, off); break; case BPF_LD | BPF_IMM | BPF_DW: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX); break; case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. 
*/ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); if (emit_zext) *to++ = BPF_ZEXT_REG(BPF_REG_AX); *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); break; case BPF_ST | BPF_MEM | BPF_DW: case BPF_ST | BPF_MEM | BPF_W: case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_B: *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off); break; } out: return to - to_buff; } static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_extra_flags) { gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags); if (fp != NULL) { /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, * this still needs to be adapted. */ memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE); } return fp; } static void bpf_prog_clone_free(struct bpf_prog *fp) { /* aux was stolen by the other clone, so we cannot free * it from this path! It will be freed eventually by the * other program on release. * * At this point, we don't need a deferred release since * clone is guaranteed to not be locked. */ fp->aux = NULL; fp->stats = NULL; fp->active = NULL; __bpf_prog_free(fp); } void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) { /* We have to repoint aux->prog to self, as we don't * know whether fp here is the clone or the original. */ fp->aux->prog = fp; bpf_prog_clone_free(fp_other); } struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) { struct bpf_insn insn_buff[16], aux[2]; struct bpf_prog *clone, *tmp; int insn_delta, insn_cnt; struct bpf_insn *insn; int i, rewritten; if (!prog->blinding_requested || prog->blinded) return prog; clone = bpf_prog_clone_create(prog, GFP_USER); if (!clone) return ERR_PTR(-ENOMEM); insn_cnt = clone->len; insn = clone->insnsi; for (i = 0; i < insn_cnt; i++, insn++) { if (bpf_pseudo_func(insn)) { /* ld_imm64 with an address of bpf subprog is not * a user controlled constant. Don't randomize it, * since it will conflict with jit_subprogs() logic. */ insn++; i++; continue; } /* We temporarily need to hold the original ld64 insn * so that we can still access the first part in the * second blinding run. */ if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) && insn[1].code == 0) memcpy(aux, insn, sizeof(aux)); rewritten = bpf_jit_blind_insn(insn, aux, insn_buff, clone->aux->verifier_zext); if (!rewritten) continue; tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); if (IS_ERR(tmp)) { /* Patching may have repointed aux->prog during * realloc from the original one, so we need to * fix it up here on error. */ bpf_jit_prog_release_other(prog, clone); return tmp; } clone = tmp; insn_delta = rewritten - 1; /* Walk new program and skip insns we just inserted. */ insn = clone->insnsi + i + insn_delta; insn_cnt += insn_delta; i += insn_delta; } clone->blinded = 1; return clone; } #endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs * anyway later on, so do not let the compiler omit it. This also needs * to go into kallsyms for correlation from e.g. bpftool, so naming * must not change. 
*/ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { return 0; } EXPORT_SYMBOL_GPL(__bpf_call_base); /* All UAPI available opcodes. */ #define BPF_INSN_MAP(INSN_2, INSN_3) \ /* 32 bit ALU operations. */ \ /* Register based. */ \ INSN_3(ALU, ADD, X), \ INSN_3(ALU, SUB, X), \ INSN_3(ALU, AND, X), \ INSN_3(ALU, OR, X), \ INSN_3(ALU, LSH, X), \ INSN_3(ALU, RSH, X), \ INSN_3(ALU, XOR, X), \ INSN_3(ALU, MUL, X), \ INSN_3(ALU, MOV, X), \ INSN_3(ALU, ARSH, X), \ INSN_3(ALU, DIV, X), \ INSN_3(ALU, MOD, X), \ INSN_2(ALU, NEG), \ INSN_3(ALU, END, TO_BE), \ INSN_3(ALU, END, TO_LE), \ /* Immediate based. */ \ INSN_3(ALU, ADD, K), \ INSN_3(ALU, SUB, K), \ INSN_3(ALU, AND, K), \ INSN_3(ALU, OR, K), \ INSN_3(ALU, LSH, K), \ INSN_3(ALU, RSH, K), \ INSN_3(ALU, XOR, K), \ INSN_3(ALU, MUL, K), \ INSN_3(ALU, MOV, K), \ INSN_3(ALU, ARSH, K), \ INSN_3(ALU, DIV, K), \ INSN_3(ALU, MOD, K), \ /* 64 bit ALU operations. */ \ /* Register based. */ \ INSN_3(ALU64, ADD, X), \ INSN_3(ALU64, SUB, X), \ INSN_3(ALU64, AND, X), \ INSN_3(ALU64, OR, X), \ INSN_3(ALU64, LSH, X), \ INSN_3(ALU64, RSH, X), \ INSN_3(ALU64, XOR, X), \ INSN_3(ALU64, MUL, X), \ INSN_3(ALU64, MOV, X), \ INSN_3(ALU64, ARSH, X), \ INSN_3(ALU64, DIV, X), \ INSN_3(ALU64, MOD, X), \ INSN_2(ALU64, NEG), \ INSN_3(ALU64, END, TO_LE), \ /* Immediate based. */ \ INSN_3(ALU64, ADD, K), \ INSN_3(ALU64, SUB, K), \ INSN_3(ALU64, AND, K), \ INSN_3(ALU64, OR, K), \ INSN_3(ALU64, LSH, K), \ INSN_3(ALU64, RSH, K), \ INSN_3(ALU64, XOR, K), \ INSN_3(ALU64, MUL, K), \ INSN_3(ALU64, MOV, K), \ INSN_3(ALU64, ARSH, K), \ INSN_3(ALU64, DIV, K), \ INSN_3(ALU64, MOD, K), \ /* Call instruction. */ \ INSN_2(JMP, CALL), \ /* Exit instruction. */ \ INSN_2(JMP, EXIT), \ /* 32-bit Jump instructions. */ \ /* Register based. */ \ INSN_3(JMP32, JEQ, X), \ INSN_3(JMP32, JNE, X), \ INSN_3(JMP32, JGT, X), \ INSN_3(JMP32, JLT, X), \ INSN_3(JMP32, JGE, X), \ INSN_3(JMP32, JLE, X), \ INSN_3(JMP32, JSGT, X), \ INSN_3(JMP32, JSLT, X), \ INSN_3(JMP32, JSGE, X), \ INSN_3(JMP32, JSLE, X), \ INSN_3(JMP32, JSET, X), \ /* Immediate based. */ \ INSN_3(JMP32, JEQ, K), \ INSN_3(JMP32, JNE, K), \ INSN_3(JMP32, JGT, K), \ INSN_3(JMP32, JLT, K), \ INSN_3(JMP32, JGE, K), \ INSN_3(JMP32, JLE, K), \ INSN_3(JMP32, JSGT, K), \ INSN_3(JMP32, JSLT, K), \ INSN_3(JMP32, JSGE, K), \ INSN_3(JMP32, JSLE, K), \ INSN_3(JMP32, JSET, K), \ /* Jump instructions. */ \ /* Register based. */ \ INSN_3(JMP, JEQ, X), \ INSN_3(JMP, JNE, X), \ INSN_3(JMP, JGT, X), \ INSN_3(JMP, JLT, X), \ INSN_3(JMP, JGE, X), \ INSN_3(JMP, JLE, X), \ INSN_3(JMP, JSGT, X), \ INSN_3(JMP, JSLT, X), \ INSN_3(JMP, JSGE, X), \ INSN_3(JMP, JSLE, X), \ INSN_3(JMP, JSET, X), \ /* Immediate based. */ \ INSN_3(JMP, JEQ, K), \ INSN_3(JMP, JNE, K), \ INSN_3(JMP, JGT, K), \ INSN_3(JMP, JLT, K), \ INSN_3(JMP, JGE, K), \ INSN_3(JMP, JLE, K), \ INSN_3(JMP, JSGT, K), \ INSN_3(JMP, JSLT, K), \ INSN_3(JMP, JSGE, K), \ INSN_3(JMP, JSLE, K), \ INSN_3(JMP, JSET, K), \ INSN_2(JMP, JA), \ INSN_2(JMP32, JA), \ /* Atomic operations. */ \ INSN_3(STX, ATOMIC, B), \ INSN_3(STX, ATOMIC, H), \ INSN_3(STX, ATOMIC, W), \ INSN_3(STX, ATOMIC, DW), \ /* Store instructions. */ \ /* Register based. */ \ INSN_3(STX, MEM, B), \ INSN_3(STX, MEM, H), \ INSN_3(STX, MEM, W), \ INSN_3(STX, MEM, DW), \ /* Immediate based. */ \ INSN_3(ST, MEM, B), \ INSN_3(ST, MEM, H), \ INSN_3(ST, MEM, W), \ INSN_3(ST, MEM, DW), \ /* Load instructions. */ \ /* Register based. 
*/ \ INSN_3(LDX, MEM, B), \ INSN_3(LDX, MEM, H), \ INSN_3(LDX, MEM, W), \ INSN_3(LDX, MEM, DW), \ INSN_3(LDX, MEMSX, B), \ INSN_3(LDX, MEMSX, H), \ INSN_3(LDX, MEMSX, W), \ /* Immediate based. */ \ INSN_3(LD, IMM, DW) bool bpf_opcode_in_insntable(u8 code) { #define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true #define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true static const bool public_insntable[256] = { [0 ... 255] = false, /* Now overwrite non-defaults ... */ BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */ [BPF_LD | BPF_ABS | BPF_B] = true, [BPF_LD | BPF_ABS | BPF_H] = true, [BPF_LD | BPF_ABS | BPF_W] = true, [BPF_LD | BPF_IND | BPF_B] = true, [BPF_LD | BPF_IND | BPF_H] = true, [BPF_LD | BPF_IND | BPF_W] = true, [BPF_JMP | BPF_JCOND] = true, }; #undef BPF_INSN_3_TBL #undef BPF_INSN_2_TBL return public_insntable[code]; } #ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * ___bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers * @insn: is the array of eBPF instructions * * Decode and execute eBPF instructions. * * Return: whatever value is in %BPF_R0 at program exit */ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z static const void * const jumptable[256] __annotate_jump_table = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... */ BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL), /* Non-UAPI available opcodes. */ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, [BPF_ST | BPF_NOSPEC] = &&ST_NOSPEC, [BPF_LDX | BPF_PROBE_MEM | BPF_B] = &&LDX_PROBE_MEM_B, [BPF_LDX | BPF_PROBE_MEM | BPF_H] = &&LDX_PROBE_MEM_H, [BPF_LDX | BPF_PROBE_MEM | BPF_W] = &&LDX_PROBE_MEM_W, [BPF_LDX | BPF_PROBE_MEM | BPF_DW] = &&LDX_PROBE_MEM_DW, [BPF_LDX | BPF_PROBE_MEMSX | BPF_B] = &&LDX_PROBE_MEMSX_B, [BPF_LDX | BPF_PROBE_MEMSX | BPF_H] = &&LDX_PROBE_MEMSX_H, [BPF_LDX | BPF_PROBE_MEMSX | BPF_W] = &&LDX_PROBE_MEMSX_W, }; #undef BPF_INSN_3_LBL #undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; #define CONT ({ insn++; goto select_insn; }) #define CONT_JMP ({ insn++; goto select_insn; }) select_insn: goto *jumptable[insn->code]; /* Explicitly mask the register-based shift amounts with 63 or 31 * to avoid undefined behavior. Normally this won't affect the * generated code, for example, in case of native 64 bit archs such * as x86-64 or arm64, the compiler is optimizing the AND away for * the interpreter. In case of JITs, each of the JIT backends compiles * the BPF shift operations to machine instructions which produce * implementation-defined results in such a case; the resulting * contents of the register may be arbitrary, but program behaviour * as a whole remains defined. In other words, in case of JIT backends, * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation. 
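 * For example, ALU64_LSH_X below computes DST = DST << (SRC & 63) and the 32-bit variant masks with 31, so a runtime shift count of 64 (or 32) behaves as a shift by zero in the interpreter instead of invoking C-level undefined behaviour.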
*/ /* ALU (shifts) */ #define SHT(OPCODE, OP) \ ALU64_##OPCODE##_X: \ DST = DST OP (SRC & 63); \ CONT; \ ALU_##OPCODE##_X: \ DST = (u32) DST OP ((u32) SRC & 31); \ CONT; \ ALU64_##OPCODE##_K: \ DST = DST OP IMM; \ CONT; \ ALU_##OPCODE##_K: \ DST = (u32) DST OP (u32) IMM; \ CONT; /* ALU (rest) */ #define ALU(OPCODE, OP) \ ALU64_##OPCODE##_X: \ DST = DST OP SRC; \ CONT; \ ALU_##OPCODE##_X: \ DST = (u32) DST OP (u32) SRC; \ CONT; \ ALU64_##OPCODE##_K: \ DST = DST OP IMM; \ CONT; \ ALU_##OPCODE##_K: \ DST = (u32) DST OP (u32) IMM; \ CONT; ALU(ADD, +) ALU(SUB, -) ALU(AND, &) ALU(OR, |) ALU(XOR, ^) ALU(MUL, *) SHT(LSH, <<) SHT(RSH, >>) #undef SHT #undef ALU ALU_NEG: DST = (u32) -DST; CONT; ALU64_NEG: DST = -DST; CONT; ALU_MOV_X: switch (OFF) { case 0: DST = (u32) SRC; break; case 8: DST = (u32)(s8) SRC; break; case 16: DST = (u32)(s16) SRC; break; } CONT; ALU_MOV_K: DST = (u32) IMM; CONT; ALU64_MOV_X: switch (OFF) { case 0: DST = SRC; break; case 8: DST = (s8) SRC; break; case 16: DST = (s16) SRC; break; case 32: DST = (s32) SRC; break; } CONT; ALU64_MOV_K: DST = IMM; CONT; LD_IMM_DW: DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; insn++; CONT; ALU_ARSH_X: DST = (u64) (u32) (((s32) DST) >> (SRC & 31)); CONT; ALU_ARSH_K: DST = (u64) (u32) (((s32) DST) >> IMM); CONT; ALU64_ARSH_X: (*(s64 *) &DST) >>= (SRC & 63); CONT; ALU64_ARSH_K: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: switch (OFF) { case 0: div64_u64_rem(DST, SRC, &AX); DST = AX; break; case 1: AX = div64_s64(DST, SRC); DST = DST - AX * SRC; break; } CONT; ALU_MOD_X: switch (OFF) { case 0: AX = (u32) DST; DST = do_div(AX, (u32) SRC); break; case 1: AX = abs((s32)DST); AX = do_div(AX, abs((s32)SRC)); if ((s32)DST < 0) DST = (u32)-AX; else DST = (u32)AX; break; } CONT; ALU64_MOD_K: switch (OFF) { case 0: div64_u64_rem(DST, IMM, &AX); DST = AX; break; case 1: AX = div64_s64(DST, IMM); DST = DST - AX * IMM; break; } CONT; ALU_MOD_K: switch (OFF) { case 0: AX = (u32) DST; DST = do_div(AX, (u32) IMM); break; case 1: AX = abs((s32)DST); AX = do_div(AX, abs((s32)IMM)); if ((s32)DST < 0) DST = (u32)-AX; else DST = (u32)AX; break; } CONT; ALU64_DIV_X: switch (OFF) { case 0: DST = div64_u64(DST, SRC); break; case 1: DST = div64_s64(DST, SRC); break; } CONT; ALU_DIV_X: switch (OFF) { case 0: AX = (u32) DST; do_div(AX, (u32) SRC); DST = (u32) AX; break; case 1: AX = abs((s32)DST); do_div(AX, abs((s32)SRC)); if (((s32)DST < 0) == ((s32)SRC < 0)) DST = (u32)AX; else DST = (u32)-AX; break; } CONT; ALU64_DIV_K: switch (OFF) { case 0: DST = div64_u64(DST, IMM); break; case 1: DST = div64_s64(DST, IMM); break; } CONT; ALU_DIV_K: switch (OFF) { case 0: AX = (u32) DST; do_div(AX, (u32) IMM); DST = (u32) AX; break; case 1: AX = abs((s32)DST); do_div(AX, abs((s32)IMM)); if (((s32)DST < 0) == ((s32)IMM < 0)) DST = (u32)AX; else DST = (u32)-AX; break; } CONT; ALU_END_TO_BE: switch (IMM) { case 16: DST = (__force u16) cpu_to_be16(DST); break; case 32: DST = (__force u32) cpu_to_be32(DST); break; case 64: DST = (__force u64) cpu_to_be64(DST); break; } CONT; ALU_END_TO_LE: switch (IMM) { case 16: DST = (__force u16) cpu_to_le16(DST); break; case 32: DST = (__force u32) cpu_to_le32(DST); break; case 64: DST = (__force u64) cpu_to_le64(DST); break; } CONT; ALU64_END_TO_LE: switch (IMM) { case 16: DST = (__force u16) __swab16(DST); break; case 32: DST = (__force u32) __swab32(DST); break; case 64: DST = (__force u64) __swab64(DST); break; } CONT; /* CALL */ JMP_CALL: /* Function call scratches BPF_R1-BPF_R5 registers, * preserves BPF_R6-BPF_R9, and 
stores return value * into BPF_R0. */ BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, BPF_R4, BPF_R5); CONT; JMP_CALL_ARGS: BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2, BPF_R3, BPF_R4, BPF_R5, insn + insn->off + 1); CONT; JMP_TAIL_CALL: { struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_prog *prog; u32 index = BPF_R3; if (unlikely(index >= array->map.max_entries)) goto out; if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT)) goto out; tail_call_cnt++; prog = READ_ONCE(array->ptrs[index]); if (!prog) goto out; /* ARG1 at this point is guaranteed to point to CTX from * the verifier side due to the fact that the tail call is * handled like a helper, that is, bpf_tail_call_proto, * where arg1_type is ARG_PTR_TO_CTX. */ insn = prog->insnsi; goto select_insn; out: CONT; } JMP_JA: insn += insn->off; CONT; JMP32_JA: insn += insn->imm; CONT; JMP_EXIT: return BPF_R0; /* JMP */ #define COND_JMP(SIGN, OPCODE, CMP_OP) \ JMP_##OPCODE##_X: \ if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; \ JMP32_##OPCODE##_X: \ if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; \ JMP_##OPCODE##_K: \ if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; \ JMP32_##OPCODE##_K: \ if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \ insn += insn->off; \ CONT_JMP; \ } \ CONT; COND_JMP(u, JEQ, ==) COND_JMP(u, JNE, !=) COND_JMP(u, JGT, >) COND_JMP(u, JLT, <) COND_JMP(u, JGE, >=) COND_JMP(u, JLE, <=) COND_JMP(u, JSET, &) COND_JMP(s, JSGT, >) COND_JMP(s, JSLT, <) COND_JMP(s, JSGE, >=) COND_JMP(s, JSLE, <=) #undef COND_JMP /* ST, STX and LDX*/ ST_NOSPEC: /* Speculation barrier for mitigating Speculative Store Bypass, * Bounds-Check Bypass and Type Confusion. In case of arm64, we * rely on the firmware mitigation as controlled via the ssbd * kernel parameter. Whenever the mitigation is enabled, it * works for all of the kernel code with no need to provide any * additional instructions here. In case of x86, we use 'lfence' * insn for mitigation. We reuse preexisting logic from Spectre * v1 mitigation that happens to produce the required code on * x86 for v4 as well. 
*/ barrier_nospec(); CONT; #define LDST(SIZEOP, SIZE) \ STX_MEM_##SIZEOP: \ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ CONT; \ ST_MEM_##SIZEOP: \ *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ CONT; \ LDX_MEM_##SIZEOP: \ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ CONT; \ LDX_PROBE_MEM_##SIZEOP: \ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \ (const void *)(long) (SRC + insn->off)); \ DST = *((SIZE *)&DST); \ CONT; LDST(B, u8) LDST(H, u16) LDST(W, u32) LDST(DW, u64) #undef LDST #define LDSX(SIZEOP, SIZE) \ LDX_MEMSX_##SIZEOP: \ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ CONT; \ LDX_PROBE_MEMSX_##SIZEOP: \ bpf_probe_read_kernel_common(&DST, sizeof(SIZE), \ (const void *)(long) (SRC + insn->off)); \ DST = *((SIZE *)&DST); \ CONT; LDSX(B, s8) LDSX(H, s16) LDSX(W, s32) #undef LDSX #define ATOMIC_ALU_OP(BOP, KOP) \ case BOP: \ if (BPF_SIZE(insn->code) == BPF_W) \ atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \ (DST + insn->off)); \ else if (BPF_SIZE(insn->code) == BPF_DW) \ atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \ (DST + insn->off)); \ else \ goto default_label; \ break; \ case BOP | BPF_FETCH: \ if (BPF_SIZE(insn->code) == BPF_W) \ SRC = (u32) atomic_fetch_##KOP( \ (u32) SRC, \ (atomic_t *)(unsigned long) (DST + insn->off)); \ else if (BPF_SIZE(insn->code) == BPF_DW) \ SRC = (u64) atomic64_fetch_##KOP( \ (u64) SRC, \ (atomic64_t *)(unsigned long) (DST + insn->off)); \ else \ goto default_label; \ break; STX_ATOMIC_DW: STX_ATOMIC_W: STX_ATOMIC_H: STX_ATOMIC_B: switch (IMM) { /* Atomic read-modify-write instructions support only W and DW * size modifiers. */ ATOMIC_ALU_OP(BPF_ADD, add) ATOMIC_ALU_OP(BPF_AND, and) ATOMIC_ALU_OP(BPF_OR, or) ATOMIC_ALU_OP(BPF_XOR, xor) #undef ATOMIC_ALU_OP case BPF_XCHG: if (BPF_SIZE(insn->code) == BPF_W) SRC = (u32) atomic_xchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) SRC); else if (BPF_SIZE(insn->code) == BPF_DW) SRC = (u64) atomic64_xchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) SRC); else goto default_label; break; case BPF_CMPXCHG: if (BPF_SIZE(insn->code) == BPF_W) BPF_R0 = (u32) atomic_cmpxchg( (atomic_t *)(unsigned long) (DST + insn->off), (u32) BPF_R0, (u32) SRC); else if (BPF_SIZE(insn->code) == BPF_DW) BPF_R0 = (u64) atomic64_cmpxchg( (atomic64_t *)(unsigned long) (DST + insn->off), (u64) BPF_R0, (u64) SRC); else goto default_label; break; /* Atomic load and store instructions support all size * modifiers. */ case BPF_LOAD_ACQ: switch (BPF_SIZE(insn->code)) { #define LOAD_ACQUIRE(SIZEOP, SIZE) \ case BPF_##SIZEOP: \ DST = (SIZE)smp_load_acquire( \ (SIZE *)(unsigned long)(SRC + insn->off)); \ break; LOAD_ACQUIRE(B, u8) LOAD_ACQUIRE(H, u16) LOAD_ACQUIRE(W, u32) #ifdef CONFIG_64BIT LOAD_ACQUIRE(DW, u64) #endif #undef LOAD_ACQUIRE default: goto default_label; } break; case BPF_STORE_REL: switch (BPF_SIZE(insn->code)) { #define STORE_RELEASE(SIZEOP, SIZE) \ case BPF_##SIZEOP: \ smp_store_release( \ (SIZE *)(unsigned long)(DST + insn->off), (SIZE)SRC); \ break; STORE_RELEASE(B, u8) STORE_RELEASE(H, u16) STORE_RELEASE(W, u32) #ifdef CONFIG_64BIT STORE_RELEASE(DW, u64) #endif #undef STORE_RELEASE default: goto default_label; } break; default: goto default_label; } CONT; default_label: /* If we ever reach this, we have a bug somewhere. Die hard here * instead of just returning 0; we could be somewhere in a subprog, * so execution could continue otherwise which we do /not/ want. * * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). 
*/ pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n", insn->code, insn->imm); BUG_ON(1); return 0; } #define PROG_NAME(stack_size) __bpf_prog_run##stack_size #define DEFINE_BPF_PROG_RUN(stack_size) \ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG] = {}; \ \ kmsan_unpoison_memory(stack, sizeof(stack)); \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ return ___bpf_prog_run(regs, insn); \ } #define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size #define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ u64 regs[MAX_BPF_EXT_REG]; \ \ kmsan_unpoison_memory(stack, sizeof(stack)); \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ BPF_R1 = r1; \ BPF_R2 = r2; \ BPF_R3 = r3; \ BPF_R4 = r4; \ BPF_R5 = r5; \ return ___bpf_prog_run(regs, insn); \ } #define EVAL1(FN, X) FN(X) #define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) #define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y) #define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y) #define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y) #define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y) EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192); EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192); EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384); EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512); #define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), static unsigned int (*interpreters[])(const void *ctx, const struct bpf_insn *insn) = { EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; #undef PROG_NAME_LIST #define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size), static __maybe_unused u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, const struct bpf_insn *insn) = { EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; #undef PROG_NAME_LIST #ifdef CONFIG_BPF_SYSCALL void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) { stack_depth = max_t(u32, stack_depth, 1); insn->off = (s16) insn->imm; insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - __bpf_call_base_args; insn->code = BPF_JMP | BPF_CALL_ARGS; } #endif #endif static unsigned int __bpf_prog_ret0_warn(const void *ctx, const struct bpf_insn *insn) { /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON * is not working properly, so warn about it! */ WARN_ON_ONCE(1); return 0; } static bool __bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) { enum bpf_prog_type prog_type = resolve_prog_type(fp); struct bpf_prog_aux *aux = fp->aux; enum bpf_cgroup_storage_type i; bool ret = false; u64 cookie; if (fp->kprobe_override) return ret; spin_lock(&map->owner_lock); /* There's no owner yet where we could check for compatibility. 
*/ if (!map->owner) { map->owner = bpf_map_owner_alloc(map); if (!map->owner) goto err; map->owner->type = prog_type; map->owner->jited = fp->jited; map->owner->xdp_has_frags = aux->xdp_has_frags; map->owner->attach_func_proto = aux->attach_func_proto; for_each_cgroup_storage_type(i) { map->owner->storage_cookie[i] = aux->cgroup_storage[i] ? aux->cgroup_storage[i]->cookie : 0; } ret = true; } else { ret = map->owner->type == prog_type && map->owner->jited == fp->jited && map->owner->xdp_has_frags == aux->xdp_has_frags; for_each_cgroup_storage_type(i) { if (!ret) break; cookie = aux->cgroup_storage[i] ? aux->cgroup_storage[i]->cookie : 0; ret = map->owner->storage_cookie[i] == cookie || !cookie; } if (ret && map->owner->attach_func_proto != aux->attach_func_proto) { switch (prog_type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: case BPF_PROG_TYPE_EXT: case BPF_PROG_TYPE_STRUCT_OPS: ret = false; break; default: break; } } } err: spin_unlock(&map->owner_lock); return ret; } bool bpf_prog_map_compatible(struct bpf_map *map, const struct bpf_prog *fp) { /* XDP programs inserted into maps are not guaranteed to run on * a particular netdev (and can run outside driver context entirely * in the case of devmap and cpumap). Until device checks * are implemented, prohibit adding dev-bound programs to program maps. */ if (bpf_prog_is_dev_bound(fp->aux)) return false; return __bpf_prog_map_compatible(map, fp); } static int bpf_check_tail_call(const struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; int i, ret = 0; mutex_lock(&aux->used_maps_mutex); for (i = 0; i < aux->used_map_cnt; i++) { struct bpf_map *map = aux->used_maps[i]; if (!map_type_contains_progs(map)) continue; if (!__bpf_prog_map_compatible(map, fp)) { ret = -EINVAL; goto out; } } out: mutex_unlock(&aux->used_maps_mutex); return ret; } static bool bpf_prog_select_interpreter(struct bpf_prog *fp) { bool select_interpreter = false; #ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); u32 idx = (round_up(stack_depth, 32) / 32) - 1; /* may_goto may cause stack size > 512, leading to idx out-of-bounds. * But for non-JITed programs, we don't need bpf_func, so no bounds * check needed. */ if (idx < ARRAY_SIZE(interpreters)) { fp->bpf_func = interpreters[idx]; select_interpreter = true; } else { fp->bpf_func = __bpf_prog_ret0_warn; } #else fp->bpf_func = __bpf_prog_ret0_warn; #endif return select_interpreter; } /** * bpf_prog_select_runtime - select exec runtime for BPF program * @fp: bpf_prog populated with BPF program * @err: pointer to error variable * * Try to JIT eBPF program, if JIT is not available, use interpreter. * The BPF program will be executed via bpf_prog_run() function. * * Return: the &fp argument along with &err set to 0 for success or * a negative errno code on failure */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { /* In case of BPF to BPF calls, verifier did all the prep * work with regards to JITing, etc. */ bool jit_needed = false; if (fp->bpf_func) goto finalize; if (IS_ENABLED(CONFIG_BPF_JIT_ALWAYS_ON) || bpf_prog_has_kfunc_call(fp)) jit_needed = true; if (!bpf_prog_select_interpreter(fp)) jit_needed = true; /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during * blinding, bpf_int_jit_compile() must always return a * valid program, which in this case would simply not * be JITed, but falls back to the interpreter. 
*/ if (!bpf_prog_is_offloaded(fp->aux)) { *err = bpf_prog_alloc_jited_linfo(fp); if (*err) return fp; fp = bpf_int_jit_compile(fp); bpf_prog_jit_attempt_done(fp); if (!fp->jited && jit_needed) { *err = -ENOTSUPP; return fp; } } else { *err = bpf_prog_offload_compile(fp); if (*err) return fp; } finalize: *err = bpf_prog_lock_ro(fp); if (*err) return fp; /* The tail call compatibility check can only be done at * this late stage as we need to determine, if we deal * with JITed or non JITed program concatenations and not * all eBPF JITs might immediately support all features. */ *err = bpf_check_tail_call(fp); return fp; } EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); static unsigned int __bpf_prog_ret1(const void *ctx, const struct bpf_insn *insn) { return 1; } static struct bpf_prog_dummy { struct bpf_prog prog; } dummy_bpf_prog = { .prog = { .bpf_func = __bpf_prog_ret1, }, }; struct bpf_empty_prog_array bpf_empty_prog_array = { .null_prog = NULL, }; EXPORT_SYMBOL(bpf_empty_prog_array); struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) { struct bpf_prog_array *p; if (prog_cnt) p = kzalloc(struct_size(p, items, prog_cnt + 1), flags); else p = &bpf_empty_prog_array.hdr; return p; } void bpf_prog_array_free(struct bpf_prog_array *progs) { if (!progs || progs == &bpf_empty_prog_array.hdr) return; kfree_rcu(progs, rcu); } static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu) { struct bpf_prog_array *progs; /* If RCU Tasks Trace grace period implies RCU grace period, there is * no need to call kfree_rcu(), just call kfree() directly. */ progs = container_of(rcu, struct bpf_prog_array, rcu); if (rcu_trace_implies_rcu_gp()) kfree(progs); else kfree_rcu(progs, rcu); } void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs) { if (!progs || progs == &bpf_empty_prog_array.hdr) return; call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb); } int bpf_prog_array_length(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; u32 cnt = 0; for (item = array->items; item->prog; item++) if (item->prog != &dummy_bpf_prog.prog) cnt++; return cnt; } bool bpf_prog_array_is_empty(struct bpf_prog_array *array) { struct bpf_prog_array_item *item; for (item = array->items; item->prog; item++) if (item->prog != &dummy_bpf_prog.prog) return false; return true; } static bool bpf_prog_array_copy_core(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt) { struct bpf_prog_array_item *item; int i = 0; for (item = array->items; item->prog; item++) { if (item->prog == &dummy_bpf_prog.prog) continue; prog_ids[i] = item->prog->aux->id; if (++i == request_cnt) { item++; break; } } return !!(item->prog); } int bpf_prog_array_copy_to_user(struct bpf_prog_array *array, __u32 __user *prog_ids, u32 cnt) { unsigned long err = 0; bool nospc; u32 *ids; /* users of this function are doing: * cnt = bpf_prog_array_length(); * if (cnt > 0) * bpf_prog_array_copy_to_user(..., cnt); * so below kcalloc doesn't need extra cnt > 0 check. 
*/ ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); if (!ids) return -ENOMEM; nospc = bpf_prog_array_copy_core(array, ids, cnt); err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); kfree(ids); if (err) return -EFAULT; if (nospc) return -ENOSPC; return 0; } void bpf_prog_array_delete_safe(struct bpf_prog_array *array, struct bpf_prog *old_prog) { struct bpf_prog_array_item *item; for (item = array->items; item->prog; item++) if (item->prog == old_prog) { WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); break; } } /** * bpf_prog_array_delete_safe_at() - Replaces the program at the given * index into the program array with * a dummy no-op program. * @array: a bpf_prog_array * @index: the index of the program to replace * * Skips over dummy programs, by not counting them, when calculating * the position of the program to replace. * * Return: * * 0 - Success * * -EINVAL - Invalid index value. Must be a non-negative integer. * * -ENOENT - Index out of range */ int bpf_prog_array_delete_safe_at(struct bpf_prog_array *array, int index) { return bpf_prog_array_update_at(array, index, &dummy_bpf_prog.prog); } /** * bpf_prog_array_update_at() - Updates the program at the given index * into the program array. * @array: a bpf_prog_array * @index: the index of the program to update * @prog: the program to insert into the array * * Skips over dummy programs, by not counting them, when calculating * the position of the program to update. * * Return: * * 0 - Success * * -EINVAL - Invalid index value. Must be a non-negative integer. * * -ENOENT - Index out of range */ int bpf_prog_array_update_at(struct bpf_prog_array *array, int index, struct bpf_prog *prog) { struct bpf_prog_array_item *item; if (unlikely(index < 0)) return -EINVAL; for (item = array->items; item->prog; item++) { if (item->prog == &dummy_bpf_prog.prog) continue; if (!index) { WRITE_ONCE(item->prog, prog); return 0; } index--; } return -ENOENT; } int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, u64 bpf_cookie, struct bpf_prog_array **new_array) { int new_prog_cnt, carry_prog_cnt = 0; struct bpf_prog_array_item *existing, *new; struct bpf_prog_array *array; bool found_exclude = false; /* Figure out how many existing progs we need to carry over to * the new array. */ if (old_array) { existing = old_array->items; for (; existing->prog; existing++) { if (existing->prog == exclude_prog) { found_exclude = true; continue; } if (existing->prog != &dummy_bpf_prog.prog) carry_prog_cnt++; if (existing->prog == include_prog) return -EEXIST; } } if (exclude_prog && !found_exclude) return -ENOENT; /* How many progs (not NULL) will be in the new array? */ new_prog_cnt = carry_prog_cnt; if (include_prog) new_prog_cnt += 1; /* Do we have any prog (not NULL) in the new array? 
*/ if (!new_prog_cnt) { *new_array = NULL; return 0; } /* +1 as the end of prog_array is marked with NULL */ array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL); if (!array) return -ENOMEM; new = array->items; /* Fill in the new prog array */ if (carry_prog_cnt) { existing = old_array->items; for (; existing->prog; existing++) { if (existing->prog == exclude_prog || existing->prog == &dummy_bpf_prog.prog) continue; new->prog = existing->prog; new->bpf_cookie = existing->bpf_cookie; new++; } } if (include_prog) { new->prog = include_prog; new->bpf_cookie = bpf_cookie; new++; } new->prog = NULL; *new_array = array; return 0; } int bpf_prog_array_copy_info(struct bpf_prog_array *array, u32 *prog_ids, u32 request_cnt, u32 *prog_cnt) { u32 cnt = 0; if (array) cnt = bpf_prog_array_length(array); *prog_cnt = cnt; /* return early if user requested only program count or nothing to copy */ if (!request_cnt || !cnt) return 0; /* this function is called under trace/bpf_trace.c: bpf_event_mutex */ return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? -ENOSPC : 0; } void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len) { struct bpf_map *map; bool sleepable; u32 i; sleepable = aux->prog->sleepable; for (i = 0; i < len; i++) { map = used_maps[i]; if (map->ops->map_poke_untrack) map->ops->map_poke_untrack(map, aux); if (sleepable) atomic64_dec(&map->sleepable_refcnt); bpf_map_put(map); } } static void bpf_free_used_maps(struct bpf_prog_aux *aux) { __bpf_free_used_maps(aux, aux->used_maps, aux->used_map_cnt); kfree(aux->used_maps); } void __bpf_free_used_btfs(struct btf_mod_pair *used_btfs, u32 len) { #ifdef CONFIG_BPF_SYSCALL struct btf_mod_pair *btf_mod; u32 i; for (i = 0; i < len; i++) { btf_mod = &used_btfs[i]; if (btf_mod->module) module_put(btf_mod->module); btf_put(btf_mod->btf); } #endif } static void bpf_free_used_btfs(struct bpf_prog_aux *aux) { __bpf_free_used_btfs(aux->used_btfs, aux->used_btf_cnt); kfree(aux->used_btfs); } static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; int i; aux = container_of(work, struct bpf_prog_aux, work); #ifdef CONFIG_BPF_SYSCALL bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab); bpf_prog_stream_free(aux->prog); #endif #ifdef CONFIG_CGROUP_BPF if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID) bpf_cgroup_atype_put(aux->cgroup_atype); #endif bpf_free_used_maps(aux); bpf_free_used_btfs(aux); if (bpf_prog_is_dev_bound(aux)) bpf_prog_dev_bound_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS if (aux->prog->has_callchain_buf) put_callchain_buffers(); #endif if (aux->dst_trampoline) bpf_trampoline_put(aux->dst_trampoline); for (i = 0; i < aux->real_func_cnt; i++) { /* We can just unlink the subprog poke descriptor table as * it was originally linked to the main program and is also * released along with it. */ aux->func[i]->aux->poke_tab = NULL; bpf_jit_free(aux->func[i]); } if (aux->real_func_cnt) { kfree(aux->func); bpf_prog_unlock_free(aux->prog); } else { bpf_jit_free(aux->prog); } } void bpf_prog_free(struct bpf_prog *fp) { struct bpf_prog_aux *aux = fp->aux; if (aux->dst_prog) bpf_prog_put(aux->dst_prog); bpf_token_put(aux->token); INIT_WORK(&aux->work, bpf_prog_free_deferred); schedule_work(&aux->work); } EXPORT_SYMBOL_GPL(bpf_prog_free); /* RNG for unprivileged user space with separated state from prandom_u32(). 
*/ static DEFINE_PER_CPU(struct rnd_state, bpf_user_rnd_state); void bpf_user_rnd_init_once(void) { prandom_init_once(&bpf_user_rnd_state); } BPF_CALL_0(bpf_user_rnd_u32) { /* Should someone ever have the rather unwise idea to use some * of the registers passed into this function, then note that * this function is called from native eBPF and classic-to-eBPF * transformations. Register assignments from both sides are * different, f.e. classic always sets fn(ctx, A, X) here. */ struct rnd_state *state; u32 res; state = &get_cpu_var(bpf_user_rnd_state); res = prandom_u32_state(state); put_cpu_var(bpf_user_rnd_state); return res; } BPF_CALL_0(bpf_get_raw_cpu_id) { return raw_smp_processor_id(); } /* Weak definitions of helper functions in case we don't have bpf syscall. */ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_map_push_elem_proto __weak; const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; const struct bpf_func_proto bpf_map_lookup_percpu_elem_proto __weak; const struct bpf_func_proto bpf_spin_lock_proto __weak; const struct bpf_func_proto bpf_spin_unlock_proto __weak; const struct bpf_func_proto bpf_jiffies64_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_tai_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; const struct bpf_func_proto bpf_get_current_comm_proto __weak; const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto __weak; const struct bpf_func_proto bpf_get_local_storage_proto __weak; const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_snprintf_btf_proto __weak; const struct bpf_func_proto bpf_seq_printf_btf_proto __weak; const struct bpf_func_proto bpf_set_retval_proto __weak; const struct bpf_func_proto bpf_get_retval_proto __weak; const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) { return NULL; } const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void) { return NULL; } const struct bpf_func_proto * __weak bpf_get_perf_event_read_value_proto(void) { return NULL; } u64 __weak bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { return -ENOTSUPP; } EXPORT_SYMBOL_GPL(bpf_event_output); /* Always built-in helper functions. */ const struct bpf_func_proto bpf_tail_call_proto = { /* func is unused for tail_call, we set it to pass the * get_helper_proto check */ .func = BPF_PTR_POISON, .gpl_only = false, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, }; /* Stub for JITs that only support cBPF. eBPF programs are interpreted. * It is encouraged to implement bpf_int_jit_compile() instead, so that * eBPF and implicitly also cBPF can get JITed! 
*/ struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) { return prog; } /* Stub for JITs that support eBPF. All cBPF code gets transformed into * eBPF by the kernel and is later compiled by bpf_int_jit_compile(). */ void __weak bpf_jit_compile(struct bpf_prog *prog) { } bool __weak bpf_helper_changes_pkt_data(enum bpf_func_id func_id) { return false; } /* Return TRUE if the JIT backend wants verifier to enable sub-register usage * analysis code and wants explicit zero extension inserted by verifier. * Otherwise, return FALSE. * * The verifier inserts an explicit zero extension after BPF_CMPXCHGs even if * you don't override this. JITs that don't want these extra insns can detect * them using insn_is_zext. */ bool __weak bpf_jit_needs_zext(void) { return false; } /* By default, enable the verifier's mitigations against Spectre v1 and v4 for * all archs. The value returned must not change at runtime as there is * currently no support for reloading programs that were loaded without * mitigations. */ bool __weak bpf_jit_bypass_spec_v1(void) { return false; } bool __weak bpf_jit_bypass_spec_v4(void) { return false; } /* Return true if the JIT inlines the call to the helper corresponding to * the imm. * * The verifier will not patch the insn->imm for the call to the helper if * this returns true. */ bool __weak bpf_jit_inlines_helper_call(s32 imm) { return false; } /* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */ bool __weak bpf_jit_supports_subprog_tailcalls(void) { return false; } bool __weak bpf_jit_supports_percpu_insn(void) { return false; } bool __weak bpf_jit_supports_kfunc_call(void) { return false; } bool __weak bpf_jit_supports_far_kfunc_call(void) { return false; } bool __weak bpf_jit_supports_arena(void) { return false; } bool __weak bpf_jit_supports_insn(struct bpf_insn *insn, bool in_arena) { return false; } u64 __weak bpf_arch_uaddress_limit(void) { #if defined(CONFIG_64BIT) && defined(CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE) return TASK_SIZE; #else return 0; #endif } /* Return TRUE if the JIT backend satisfies the following two conditions: * 1) JIT backend supports atomic_xchg() on pointer-sized words. * 2) Under the specific arch, the implementation of xchg() is the same * as atomic_xchg() on pointer-sized words. */ bool __weak bpf_jit_supports_ptr_xchg(void) { return false; } /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. 
*/ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) { return -EFAULT; } int __weak bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *addr1, void *addr2) { return -ENOTSUPP; } void * __weak bpf_arch_text_copy(void *dst, void *src, size_t len) { return ERR_PTR(-ENOTSUPP); } int __weak bpf_arch_text_invalidate(void *dst, size_t len) { return -ENOTSUPP; } bool __weak bpf_jit_supports_exceptions(void) { return false; } bool __weak bpf_jit_supports_private_stack(void) { return false; } void __weak arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie) { } bool __weak bpf_jit_supports_timed_may_goto(void) { return false; } u64 __weak arch_bpf_timed_may_goto(void) { return 0; } static noinline void bpf_prog_report_may_goto_violation(void) { #ifdef CONFIG_BPF_SYSCALL struct bpf_stream_stage ss; struct bpf_prog *prog; prog = bpf_prog_find_from_stack(); if (!prog) return; bpf_stream_stage(ss, prog, BPF_STDERR, ({ bpf_stream_printk(ss, "ERROR: Timeout detected for may_goto instruction\n"); bpf_stream_dump_stack(ss); })); #endif } u64 bpf_check_timed_may_goto(struct bpf_timed_may_goto *p) { u64 time = ktime_get_mono_fast_ns(); /* Populate the timestamp for this stack frame, and refresh count. */ if (!p->timestamp) { p->timestamp = time; return BPF_MAX_TIMED_LOOPS; } /* Check if we've exhausted our time slice, and zero count. */ if (unlikely(time - p->timestamp >= (NSEC_PER_SEC / 4))) { bpf_prog_report_may_goto_violation(); return 0; } /* Refresh the count for the stack frame. */ return BPF_MAX_TIMED_LOOPS; } /* for configs without MMU or 32-bit */ __weak const struct bpf_map_ops arena_map_ops; __weak u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena) { return 0; } __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena) { return 0; } #ifdef CONFIG_BPF_SYSCALL static int __init bpf_global_ma_init(void) { int ret; ret = bpf_mem_alloc_init(&bpf_global_ma, 0, false); bpf_global_ma_set = !ret; return ret; } late_initcall(bpf_global_ma_init); #endif DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); EXPORT_SYMBOL(bpf_stats_enabled_key); /* All definitions of tracepoints related to BPF. */ #define CREATE_TRACE_POINTS #include <linux/bpf_trace.h> EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx); #ifdef CONFIG_BPF_SYSCALL int bpf_prog_get_file_line(struct bpf_prog *prog, unsigned long ip, const char **filep, const char **linep, int *nump) { int idx = -1, insn_start, insn_end, len; struct bpf_line_info *linfo; void **jited_linfo; struct btf *btf; int nr_linfo; btf = prog->aux->btf; linfo = prog->aux->linfo; jited_linfo = prog->aux->jited_linfo; if (!btf || !linfo || !jited_linfo) return -EINVAL; len = prog->aux->func ? prog->aux->func[prog->aux->func_idx]->len : prog->len; linfo = &prog->aux->linfo[prog->aux->linfo_idx]; jited_linfo = &prog->aux->jited_linfo[prog->aux->linfo_idx]; insn_start = linfo[0].insn_off; insn_end = insn_start + len; nr_linfo = prog->aux->nr_linfo - prog->aux->linfo_idx; for (int i = 0; i < nr_linfo && linfo[i].insn_off >= insn_start && linfo[i].insn_off < insn_end; i++) { if (jited_linfo[i] >= (void *)ip) break; idx = i; } if (idx == -1) return -ENOENT; /* Get base component of the file path. */ *filep = btf_name_by_offset(btf, linfo[idx].file_name_off); *filep = kbasename(*filep); /* Obtain the source line, and strip whitespace in prefix. 
*/ *linep = btf_name_by_offset(btf, linfo[idx].line_off); while (isspace(**linep)) *linep += 1; *nump = BPF_LINE_INFO_LINE_NUM(linfo[idx].line_col); return 0; } struct walk_stack_ctx { struct bpf_prog *prog; }; static bool find_from_stack_cb(void *cookie, u64 ip, u64 sp, u64 bp) { struct walk_stack_ctx *ctxp = cookie; struct bpf_prog *prog; /* * The RCU read lock is held to safely traverse the latch tree, but we * don't need its protection when accessing the prog, since it has an * active stack frame on the current stack trace, and won't disappear. */ rcu_read_lock(); prog = bpf_prog_ksym_find(ip); rcu_read_unlock(); if (!prog) return true; if (bpf_is_subprog(prog)) return true; ctxp->prog = prog; return false; } struct bpf_prog *bpf_prog_find_from_stack(void) { struct walk_stack_ctx ctx = {}; arch_bpf_stack_walk(find_from_stack_cb, &ctx); return ctx.prog; } #endif
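/*
 * A minimal illustration (not part of the original file) of how the
 * bpf_prog_array helpers above are typically combined: build a new array that
 * carries over the old entries plus one extra program, publish it, then free
 * the old copy. Error handling is reduced to the essentials and the locking
 * that real callers (e.g. the cgroup attach path) hold around this sequence
 * is assumed; the function and its name are editorial.
 */
static int example_prog_array_attach(struct bpf_prog_array **array,
				     struct bpf_prog *prog, u64 cookie)
{
	struct bpf_prog_array *old = *array, *new;
	int err;

	/* NULL exclude_prog: keep everything; include @prog with its cookie. */
	err = bpf_prog_array_copy(old, NULL, prog, cookie, &new);
	if (err)
		return err;	/* -EEXIST if @prog is already in the array */

	*array = new;			/* publish the new array ... */
	bpf_prog_array_free(old);	/* ... and free the old one via RCU */
	return 0;
}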
/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs */ #ifndef _ASM_X86_STACKTRACE_H #define _ASM_X86_STACKTRACE_H #include <linux/uaccess.h> #include <linux/ptrace.h> #include <asm/cpu_entry_area.h> #include <asm/switch_to.h> enum stack_type { STACK_TYPE_UNKNOWN, STACK_TYPE_TASK, STACK_TYPE_IRQ, STACK_TYPE_SOFTIRQ, STACK_TYPE_ENTRY, STACK_TYPE_EXCEPTION, STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, }; struct stack_info { enum stack_type type; unsigned long *begin, *end, *next_sp; }; bool in_task_stack(unsigned long *stack, struct task_struct *task, struct stack_info *info); bool in_entry_stack(unsigned long *stack, struct stack_info *info); int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); bool get_stack_info_noinstr(unsigned long *stack, struct task_struct *task, struct stack_info *info); static __always_inline bool get_stack_guard_info(unsigned long *stack, struct stack_info *info) { /* make sure it's not in the stack proper */ if (get_stack_info_noinstr(stack, current, info)) return false; /* but if it is in the page below it, we hit a guard */ return get_stack_info_noinstr((void *)stack + PAGE_SIZE, current, info); } const char *stack_type_name(enum stack_type type); static inline bool on_stack(struct stack_info *info, void *addr, size_t len) { void *begin = info->begin; void *end = info->end; return (info->type != STACK_TYPE_UNKNOWN && addr >= begin && addr < end && addr + len > begin && addr + len <= end); } #ifdef CONFIG_X86_32 #define STACKSLOTS_PER_LINE 8 #else #define STACKSLOTS_PER_LINE 4 #endif #ifdef CONFIG_FRAME_POINTER static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) return (unsigned long *)regs->bp; if (task == current) return __builtin_frame_address(0); return &((struct inactive_task_frame *)task->thread.sp)->bp; } #else static inline unsigned long * get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { return NULL; } #endif /* CONFIG_FRAME_POINTER */ static inline unsigned long * get_stack_pointer(struct task_struct *task, struct pt_regs *regs) { if (regs) return (unsigned long *)regs->sp; if (task == current) return __builtin_frame_address(0); return (unsigned long *)task->thread.sp; } /* The form of the top of the frame on the stack */ struct stack_frame { struct stack_frame *next_frame; unsigned long return_address; }; struct stack_frame_ia32 { u32 next_frame; u32 return_address; }; void show_opcodes(struct pt_regs *regs, const char *loglvl); void show_ip(struct pt_regs *regs, const char *loglvl); #endif /* _ASM_X86_STACKTRACE_H */
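/*
 * A minimal illustration (not part of the original header) of how the
 * stack_frame layout and the on_stack() helper above can be combined for a
 * naive frame-pointer walk. It assumes CONFIG_FRAME_POINTER, a stack_info
 * already filled in by get_stack_info(), and a caller-supplied callback; the
 * function and callback names are editorial, and the real unwinder in
 * arch/x86/kernel/unwind_frame.c performs far stricter validation.
 */
static inline void example_walk_frame_pointers(struct stack_info *info,
					       unsigned long *bp,
					       void (*record)(unsigned long addr))
{
	struct stack_frame *frame = (struct stack_frame *)bp;

	while (on_stack(info, frame, sizeof(*frame))) {
		/* return_address is where the current frame will return to */
		record(frame->return_address);
		frame = frame->next_frame;
	}
}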
// SPDX-License-Identifier: GPL-2.0-or-later /* * Synchronous Cryptographic Hash operations.
* * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au> */ #include <crypto/scatterwalk.h> #include <linux/cryptouser.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/string.h> #include <net/netlink.h> #include "hash.h" static inline bool crypto_shash_block_only(struct crypto_shash *tfm) { return crypto_shash_alg(tfm)->base.cra_flags & CRYPTO_AHASH_ALG_BLOCK_ONLY; } static inline bool crypto_shash_final_nonzero(struct crypto_shash *tfm) { return crypto_shash_alg(tfm)->base.cra_flags & CRYPTO_AHASH_ALG_FINAL_NONZERO; } static inline bool crypto_shash_finup_max(struct crypto_shash *tfm) { return crypto_shash_alg(tfm)->base.cra_flags & CRYPTO_AHASH_ALG_FINUP_MAX; } int shash_no_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { return -ENOSYS; } EXPORT_SYMBOL_GPL(shash_no_setkey); static void shash_set_needkey(struct crypto_shash *tfm, struct shash_alg *alg) { if (crypto_shash_alg_needs_key(alg)) crypto_shash_set_flags(tfm, CRYPTO_TFM_NEED_KEY); } int crypto_shash_setkey(struct crypto_shash *tfm, const u8 *key, unsigned int keylen) { struct shash_alg *shash = crypto_shash_alg(tfm); int err; err = shash->setkey(tfm, key, keylen); if (unlikely(err)) { shash_set_needkey(tfm, shash); return err; } crypto_shash_clear_flags(tfm, CRYPTO_TFM_NEED_KEY); return 0; } EXPORT_SYMBOL_GPL(crypto_shash_setkey); static int __crypto_shash_init(struct shash_desc *desc) { struct crypto_shash *tfm = desc->tfm; if (crypto_shash_block_only(tfm)) { u8 *buf = shash_desc_ctx(desc); buf += crypto_shash_descsize(tfm) - 1; *buf = 0; } return crypto_shash_alg(tfm)->init(desc); } int crypto_shash_init(struct shash_desc *desc) { if (crypto_shash_get_flags(desc->tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; return __crypto_shash_init(desc); } EXPORT_SYMBOL_GPL(crypto_shash_init); static int shash_default_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { struct shash_alg *shash = crypto_shash_alg(desc->tfm); return shash->update(desc, data, len) ?: shash->final(desc, out); } static int crypto_shash_op_and_zero( int (*op)(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out), struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { int err; err = op(desc, data, len, out); memset(shash_desc_ctx(desc), 0, crypto_shash_descsize(desc->tfm)); return err; } int crypto_shash_finup(struct shash_desc *restrict desc, const u8 *data, unsigned int len, u8 *restrict out) { struct crypto_shash *tfm = desc->tfm; u8 *blenp = shash_desc_ctx(desc); bool finup_max, nonzero; unsigned int bs; int err; u8 *buf; if (!crypto_shash_block_only(tfm)) { if (out) goto finup; return crypto_shash_alg(tfm)->update(desc, data, len); } finup_max = out && crypto_shash_finup_max(tfm); /* Retain extra block for final nonzero algorithms. */ nonzero = crypto_shash_final_nonzero(tfm); /* * The partial block buffer follows the algorithm desc context. * The byte following that contains the length. 
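 * For example, with a 64-byte block size, blenp below ends up at that trailing count byte (shash_desc_ctx(desc) + descsize - 1) and buf = blenp - bs is the partial-block buffer sitting just in front of it.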
*/ blenp += crypto_shash_descsize(tfm) - 1; bs = crypto_shash_blocksize(tfm); buf = blenp - bs; if (likely(!*blenp && finup_max)) goto finup; while ((*blenp + len) >= bs + nonzero) { unsigned int nbytes = len - nonzero; const u8 *src = data; if (*blenp) { memcpy(buf + *blenp, data, bs - *blenp); nbytes = bs; src = buf; } err = crypto_shash_alg(tfm)->update(desc, src, nbytes); if (err < 0) return err; data += nbytes - err - *blenp; len -= nbytes - err - *blenp; *blenp = 0; } if (*blenp || !out) { memcpy(buf + *blenp, data, len); *blenp += len; if (!out) return 0; data = buf; len = *blenp; } finup: return crypto_shash_op_and_zero(crypto_shash_alg(tfm)->finup, desc, data, len, out); } EXPORT_SYMBOL_GPL(crypto_shash_finup); static int shash_default_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { return __crypto_shash_init(desc) ?: crypto_shash_finup(desc, data, len, out); } int crypto_shash_digest(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { struct crypto_shash *tfm = desc->tfm; if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; return crypto_shash_op_and_zero(crypto_shash_alg(tfm)->digest, desc, data, len, out); } EXPORT_SYMBOL_GPL(crypto_shash_digest); int crypto_shash_tfm_digest(struct crypto_shash *tfm, const u8 *data, unsigned int len, u8 *out) { SHASH_DESC_ON_STACK(desc, tfm); desc->tfm = tfm; return crypto_shash_digest(desc, data, len, out); } EXPORT_SYMBOL_GPL(crypto_shash_tfm_digest); static int __crypto_shash_export(struct shash_desc *desc, void *out, int (*export)(struct shash_desc *desc, void *out)) { struct crypto_shash *tfm = desc->tfm; u8 *buf = shash_desc_ctx(desc); unsigned int plen, ss; plen = crypto_shash_blocksize(tfm) + 1; ss = crypto_shash_statesize(tfm); if (crypto_shash_block_only(tfm)) ss -= plen; if (!export) { memcpy(out, buf, ss); return 0; } return export(desc, out); } int crypto_shash_export_core(struct shash_desc *desc, void *out) { return __crypto_shash_export(desc, out, crypto_shash_alg(desc->tfm)->export_core); } EXPORT_SYMBOL_GPL(crypto_shash_export_core); int crypto_shash_export(struct shash_desc *desc, void *out) { struct crypto_shash *tfm = desc->tfm; if (crypto_shash_block_only(tfm)) { unsigned int plen = crypto_shash_blocksize(tfm) + 1; unsigned int descsize = crypto_shash_descsize(tfm); unsigned int ss = crypto_shash_statesize(tfm); u8 *buf = shash_desc_ctx(desc); memcpy(out + ss - plen, buf + descsize - plen, plen); } return __crypto_shash_export(desc, out, crypto_shash_alg(tfm)->export); } EXPORT_SYMBOL_GPL(crypto_shash_export); static int __crypto_shash_import(struct shash_desc *desc, const void *in, int (*import)(struct shash_desc *desc, const void *in)) { struct crypto_shash *tfm = desc->tfm; unsigned int descsize, plen, ss; u8 *buf = shash_desc_ctx(desc); if (crypto_shash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; ss = crypto_shash_statesize(tfm); if (crypto_shash_block_only(tfm)) { plen = crypto_shash_blocksize(tfm) + 1; ss -= plen; descsize = crypto_shash_descsize(tfm); buf[descsize - 1] = 0; } if (!import) { memcpy(buf, in, ss); return 0; } return import(desc, in); } int crypto_shash_import_core(struct shash_desc *desc, const void *in) { return __crypto_shash_import(desc, in, crypto_shash_alg(desc->tfm)->import_core); } EXPORT_SYMBOL_GPL(crypto_shash_import_core); int crypto_shash_import(struct shash_desc *desc, const void *in) { struct crypto_shash *tfm = desc->tfm; int err; err = __crypto_shash_import(desc, in, crypto_shash_alg(tfm)->import); if 
(crypto_shash_block_only(tfm)) { unsigned int plen = crypto_shash_blocksize(tfm) + 1; unsigned int descsize = crypto_shash_descsize(tfm); unsigned int ss = crypto_shash_statesize(tfm); u8 *buf = shash_desc_ctx(desc); memcpy(buf + descsize - plen, in + ss - plen, plen); if (buf[descsize - 1] >= plen) err = -EOVERFLOW; } return err; } EXPORT_SYMBOL_GPL(crypto_shash_import); static void crypto_shash_exit_tfm(struct crypto_tfm *tfm) { struct crypto_shash *hash = __crypto_shash_cast(tfm); struct shash_alg *alg = crypto_shash_alg(hash); alg->exit_tfm(hash); } static int crypto_shash_init_tfm(struct crypto_tfm *tfm) { struct crypto_shash *hash = __crypto_shash_cast(tfm); struct shash_alg *alg = crypto_shash_alg(hash); shash_set_needkey(hash, alg); if (alg->exit_tfm) tfm->exit = crypto_shash_exit_tfm; if (!alg->init_tfm) return 0; return alg->init_tfm(hash); } static void crypto_shash_free_instance(struct crypto_instance *inst) { struct shash_instance *shash = shash_instance(inst); shash->free(shash); } static int __maybe_unused crypto_shash_report( struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_hash rhash; struct shash_alg *salg = __crypto_shash_alg(alg); memset(&rhash, 0, sizeof(rhash)); strscpy(rhash.type, "shash", sizeof(rhash.type)); rhash.blocksize = alg->cra_blocksize; rhash.digestsize = salg->digestsize; return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(rhash), &rhash); } static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_shash_show(struct seq_file *m, struct crypto_alg *alg) { struct shash_alg *salg = __crypto_shash_alg(alg); seq_printf(m, "type : shash\n"); seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); seq_printf(m, "digestsize : %u\n", salg->digestsize); } const struct crypto_type crypto_shash_type = { .extsize = crypto_alg_extsize, .init_tfm = crypto_shash_init_tfm, .free = crypto_shash_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_shash_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_shash_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_MASK, .type = CRYPTO_ALG_TYPE_SHASH, .tfmsize = offsetof(struct crypto_shash, base), .algsize = offsetof(struct shash_alg, base), }; int crypto_grab_shash(struct crypto_shash_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_shash_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_shash); struct crypto_shash *crypto_alloc_shash(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_shash_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_shash); int crypto_has_shash(const char *alg_name, u32 type, u32 mask) { return crypto_type_has_alg(alg_name, &crypto_shash_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_has_shash); struct crypto_shash *crypto_clone_shash(struct crypto_shash *hash) { struct crypto_tfm *tfm = crypto_shash_tfm(hash); struct shash_alg *alg = crypto_shash_alg(hash); struct crypto_shash *nhash; int err; if (!crypto_shash_alg_has_setkey(alg)) { tfm = crypto_tfm_get(tfm); if (IS_ERR(tfm)) return ERR_CAST(tfm); return hash; } if (!alg->clone_tfm && (alg->init_tfm || alg->base.cra_init)) return ERR_PTR(-ENOSYS); nhash = crypto_clone_tfm(&crypto_shash_type, tfm); if (IS_ERR(nhash)) return nhash; if (alg->clone_tfm) { err = alg->clone_tfm(nhash, hash); if (err) { crypto_free_shash(nhash); return ERR_PTR(err); } } if (alg->exit_tfm) crypto_shash_tfm(nhash)->exit 
= crypto_shash_exit_tfm; return nhash; } EXPORT_SYMBOL_GPL(crypto_clone_shash); int hash_prepare_alg(struct hash_alg_common *alg) { struct crypto_alg *base = &alg->base; if (alg->digestsize > HASH_MAX_DIGESTSIZE) return -EINVAL; /* alignmask is not useful for hashes, so it is not supported. */ if (base->cra_alignmask) return -EINVAL; base->cra_flags &= ~CRYPTO_ALG_TYPE_MASK; return 0; } static int shash_default_export_core(struct shash_desc *desc, void *out) { return -ENOSYS; } static int shash_default_import_core(struct shash_desc *desc, const void *in) { return -ENOSYS; } static int shash_prepare_alg(struct shash_alg *alg) { struct crypto_alg *base = &alg->halg.base; int err; if ((alg->export && !alg->import) || (alg->import && !alg->export)) return -EINVAL; err = hash_prepare_alg(&alg->halg); if (err) return err; base->cra_type = &crypto_shash_type; base->cra_flags |= CRYPTO_ALG_TYPE_SHASH; base->cra_flags |= CRYPTO_ALG_REQ_VIRT; /* * Handle missing optional functions. For each one we can either * install a default here, or we can leave the pointer as NULL and check * the pointer for NULL in crypto_shash_*(), avoiding an indirect call * when the default behavior is desired. For ->finup and ->digest we * install defaults, since for optimal performance algorithms should * implement these anyway. On the other hand, for ->import and * ->export the common case and best performance comes from the simple * memcpy of the shash_desc_ctx, so when those pointers are NULL we * leave them NULL and provide the memcpy with no indirect call. */ if (!alg->finup) alg->finup = shash_default_finup; if (!alg->digest) alg->digest = shash_default_digest; if (!alg->export && !alg->halg.statesize) alg->halg.statesize = alg->descsize; if (!alg->setkey) alg->setkey = shash_no_setkey; if (base->cra_flags & CRYPTO_AHASH_ALG_BLOCK_ONLY) { BUILD_BUG_ON(MAX_ALGAPI_BLOCKSIZE >= 256); alg->descsize += base->cra_blocksize + 1; alg->statesize += base->cra_blocksize + 1; alg->export_core = alg->export; alg->import_core = alg->import; } else if (!alg->export_core || !alg->import_core) { alg->export_core = shash_default_export_core; alg->import_core = shash_default_import_core; base->cra_flags |= CRYPTO_AHASH_ALG_NO_EXPORT_CORE; } if (alg->descsize > HASH_MAX_DESCSIZE) return -EINVAL; if (alg->statesize > HASH_MAX_STATESIZE) return -EINVAL; base->cra_reqsize = sizeof(struct shash_desc) + alg->descsize; return 0; } int crypto_register_shash(struct shash_alg *alg) { struct crypto_alg *base = &alg->base; int err; err = shash_prepare_alg(alg); if (err) return err; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_shash); void crypto_unregister_shash(struct shash_alg *alg) { crypto_unregister_alg(&alg->base); } EXPORT_SYMBOL_GPL(crypto_unregister_shash); int crypto_register_shashes(struct shash_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_shash(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_shash(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_shashes); void crypto_unregister_shashes(struct shash_alg *algs, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_shash(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_shashes); int shash_register_instance(struct crypto_template *tmpl, struct shash_instance *inst) { int err; if (WARN_ON(!inst->free)) return -EINVAL; err = shash_prepare_alg(&inst->alg); if (err) return err; return crypto_register_instance(tmpl, shash_crypto_instance(inst)); } 
EXPORT_SYMBOL_GPL(shash_register_instance);

void shash_free_singlespawn_instance(struct shash_instance *inst)
{
	crypto_drop_spawn(shash_instance_ctx(inst));
	kfree(inst);
}
EXPORT_SYMBOL_GPL(shash_free_singlespawn_instance);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Synchronous cryptographic hash type");
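For reference, below is a minimal sketch (not part of shash.c) of how a kernel caller might drive the shash interface implemented above: the one-shot crypto_shash_tfm_digest() helper and the incremental init/update/finup path. The "sha256" algorithm name, the example_* function names and the surrounding error handling are illustrative assumptions, not taken from this file.

/*
 * Illustrative sketch only: exercising the shash API from a caller.
 * Assumes a "sha256" shash implementation is available.
 */
#include <crypto/hash.h>
#include <linux/err.h>

/* One-shot digest: crypto_shash_tfm_digest() builds a stack descriptor
 * and invokes the algorithm's ->digest() (or the default built from
 * init + finup), as seen above. */
static int example_sha256_oneshot(const u8 *data, unsigned int len, u8 *out)
{
	struct crypto_shash *tfm;
	int err;

	tfm = crypto_alloc_shash("sha256", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	err = crypto_shash_tfm_digest(tfm, data, len, out);

	crypto_free_shash(tfm);
	return err;
}

/* Incremental variant: init, feed data in pieces, then finup. The
 * partial-block buffering for BLOCK_ONLY algorithms happens inside
 * crypto_shash_finup() as implemented above. */
static int example_sha256_incremental(struct crypto_shash *tfm,
				      const u8 *part1, unsigned int len1,
				      const u8 *part2, unsigned int len2,
				      u8 *out)
{
	SHASH_DESC_ON_STACK(desc, tfm);
	int err;

	desc->tfm = tfm;
	err = crypto_shash_init(desc);
	if (!err)
		err = crypto_shash_update(desc, part1, len1);
	if (!err)
		err = crypto_shash_finup(desc, part2, len2, out);
	return err;
}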
871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 /* * Copyright (c) 2007, 2020 Oracle and/or its affiliates. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */ #include "rds.h" /* * XXX * - build with sparse * - should we detect duplicate keys on a socket? hmm. * - an rdma is an mlock, apply rlimit? */ /* * get the number of pages by looking at the page indices that the start and * end addresses fall in. * * Returns 0 if the vec is invalid. It is invalid if the number of bytes * causes the address to wrap or overflows an unsigned int. This comes * from being stored in the 'length' member of 'struct scatterlist'. */ static unsigned int rds_pages_in_vec(struct rds_iovec *vec) { if ((vec->addr + vec->bytes <= vec->addr) || (vec->bytes > (u64)UINT_MAX)) return 0; return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) - (vec->addr >> PAGE_SHIFT); } static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key, struct rds_mr *insert) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct rds_mr *mr; while (*p) { parent = *p; mr = rb_entry(parent, struct rds_mr, r_rb_node); if (key < mr->r_key) p = &(*p)->rb_left; else if (key > mr->r_key) p = &(*p)->rb_right; else return mr; } if (insert) { rb_link_node(&insert->r_rb_node, parent, p); rb_insert_color(&insert->r_rb_node, root); kref_get(&insert->r_kref); } return NULL; } /* * Destroy the transport-specific part of a MR. 
*/ static void rds_destroy_mr(struct rds_mr *mr) { struct rds_sock *rs = mr->r_sock; void *trans_private = NULL; unsigned long flags; rdsdebug("RDS: destroy mr key is %x refcnt %u\n", mr->r_key, kref_read(&mr->r_kref)); spin_lock_irqsave(&rs->rs_rdma_lock, flags); if (!RB_EMPTY_NODE(&mr->r_rb_node)) rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); trans_private = mr->r_trans_private; mr->r_trans_private = NULL; spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (trans_private) mr->r_trans->free_mr(trans_private, mr->r_invalidate); } void __rds_put_mr_final(struct kref *kref) { struct rds_mr *mr = container_of(kref, struct rds_mr, r_kref); rds_destroy_mr(mr); kfree(mr); } /* * By the time this is called we can't have any more ioctls called on * the socket so we don't need to worry about racing with others. */ void rds_rdma_drop_keys(struct rds_sock *rs) { struct rds_mr *mr; struct rb_node *node; unsigned long flags; /* Release any MRs associated with this socket */ spin_lock_irqsave(&rs->rs_rdma_lock, flags); while ((node = rb_first(&rs->rs_rdma_keys))) { mr = rb_entry(node, struct rds_mr, r_rb_node); if (mr->r_trans == rs->rs_transport) mr->r_invalidate = 0; rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); kref_put(&mr->r_kref, __rds_put_mr_final); spin_lock_irqsave(&rs->rs_rdma_lock, flags); } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (rs->rs_transport && rs->rs_transport->flush_mrs) rs->rs_transport->flush_mrs(); } /* * Helper function to pin user pages. */ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages, struct page **pages, int write) { unsigned int gup_flags = FOLL_LONGTERM; int ret; if (write) gup_flags |= FOLL_WRITE; ret = pin_user_pages_fast(user_addr, nr_pages, gup_flags, pages); if (ret >= 0 && ret < nr_pages) { unpin_user_pages(pages, ret); ret = -EFAULT; } return ret; } static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, u64 *cookie_ret, struct rds_mr **mr_ret, struct rds_conn_path *cp) { struct rds_mr *mr = NULL, *found; struct scatterlist *sg = NULL; unsigned int nr_pages; struct page **pages = NULL; void *trans_private; unsigned long flags; rds_rdma_cookie_t cookie; unsigned int nents = 0; int need_odp = 0; long i; int ret; if (ipv6_addr_any(&rs->rs_bound_addr) || !rs->rs_transport) { ret = -ENOTCONN; /* XXX not a great errno */ goto out; } if (!rs->rs_transport->get_mr) { ret = -EOPNOTSUPP; goto out; } /* If the combination of the addr and size requested for this memory * region causes an integer overflow, return error. */ if (((args->vec.addr + args->vec.bytes) < args->vec.addr) || PAGE_ALIGN(args->vec.addr + args->vec.bytes) < (args->vec.addr + args->vec.bytes)) { ret = -EINVAL; goto out; } if (!can_do_mlock()) { ret = -EPERM; goto out; } nr_pages = rds_pages_in_vec(&args->vec); if (nr_pages == 0) { ret = -EINVAL; goto out; } /* Restrict the size of mr irrespective of underlying transport * To account for unaligned mr regions, subtract one from nr_pages */ if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) { ret = -EMSGSIZE; goto out; } rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n", args->vec.addr, args->vec.bytes, nr_pages); /* XXX clamp nr_pages to limit the size of this alloc? 
*/ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; goto out; } mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); if (!mr) { ret = -ENOMEM; goto out; } kref_init(&mr->r_kref); RB_CLEAR_NODE(&mr->r_rb_node); mr->r_trans = rs->rs_transport; mr->r_sock = rs; if (args->flags & RDS_RDMA_USE_ONCE) mr->r_use_once = 1; if (args->flags & RDS_RDMA_INVALIDATE) mr->r_invalidate = 1; if (args->flags & RDS_RDMA_READWRITE) mr->r_write = 1; /* * Pin the pages that make up the user buffer and transfer the page * pointers to the mr's sg array. We check to see if we've mapped * the whole region after transferring the partial page references * to the sg array so that we can have one page ref cleanup path. * * For now we have no flag that tells us whether the mapping is * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to * the zero page. */ ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); if (ret == -EOPNOTSUPP) { need_odp = 1; } else if (ret <= 0) { goto out; } else { nents = ret; sg = kmalloc_array(nents, sizeof(*sg), GFP_KERNEL); if (!sg) { ret = -ENOMEM; goto out; } WARN_ON(!nents); sg_init_table(sg, nents); /* Stick all pages into the scatterlist */ for (i = 0 ; i < nents; i++) sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0); rdsdebug("RDS: trans_private nents is %u\n", nents); } /* Obtain a transport specific MR. If this succeeds, the * s/g list is now owned by the MR. * Note that dma_map() implies that pending writes are * flushed to RAM, so no dma_sync is needed here. */ trans_private = rs->rs_transport->get_mr( sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL, args->vec.addr, args->vec.bytes, need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED); if (IS_ERR(trans_private)) { /* In ODP case, we don't GUP pages, so don't need * to release anything. */ if (!need_odp) { unpin_user_pages(pages, nr_pages); kfree(sg); } ret = PTR_ERR(trans_private); /* Trigger connection so that its ready for the next retry */ if (ret == -ENODEV && cp) rds_conn_connect_if_down(cp->cp_conn); goto out; } mr->r_trans_private = trans_private; rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n", mr->r_key, (void *)(unsigned long) args->cookie_addr); /* The user may pass us an unaligned address, but we can only * map page aligned regions. So we keep the offset, and build * a 64bit cookie containing <R_Key, offset> and pass that * around. */ if (need_odp) cookie = rds_rdma_make_cookie(mr->r_key, 0); else cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK); if (cookie_ret) *cookie_ret = cookie; if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long)args->cookie_addr)) { if (!need_odp) { unpin_user_pages(pages, nr_pages); kfree(sg); } ret = -EFAULT; goto out; } /* Inserting the new MR into the rbtree bumps its * reference count. 
*/ spin_lock_irqsave(&rs->rs_rdma_lock, flags); found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); BUG_ON(found && found != mr); rdsdebug("RDS: get_mr key is %x\n", mr->r_key); if (mr_ret) { kref_get(&mr->r_kref); *mr_ret = mr; } ret = 0; out: kfree(pages); if (mr) kref_put(&mr->r_kref, __rds_put_mr_final); return ret; } int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_get_mr_args args; if (optlen != sizeof(struct rds_get_mr_args)) return -EINVAL; if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_args))) return -EFAULT; return __rds_rdma_map(rs, &args, NULL, NULL, NULL); } int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_get_mr_for_dest_args args; struct rds_get_mr_args new_args; if (optlen != sizeof(struct rds_get_mr_for_dest_args)) return -EINVAL; if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_for_dest_args))) return -EFAULT; /* * Initially, just behave like get_mr(). * TODO: Implement get_mr as wrapper around this * and deprecate it. */ new_args.vec = args.vec; new_args.cookie_addr = args.cookie_addr; new_args.flags = args.flags; return __rds_rdma_map(rs, &new_args, NULL, NULL, NULL); } /* * Free the MR indicated by the given R_Key */ int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen) { struct rds_free_mr_args args; struct rds_mr *mr; unsigned long flags; if (optlen != sizeof(struct rds_free_mr_args)) return -EINVAL; if (copy_from_sockptr(&args, optval, sizeof(struct rds_free_mr_args))) return -EFAULT; /* Special case - a null cookie means flush all unused MRs */ if (args.cookie == 0) { if (!rs->rs_transport || !rs->rs_transport->flush_mrs) return -EINVAL; rs->rs_transport->flush_mrs(); return 0; } /* Look up the MR given its R_key and remove it from the rbtree * so nobody else finds it. * This should also prevent races with rds_rdma_unuse. */ spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL); if (mr) { rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); if (args.flags & RDS_RDMA_INVALIDATE) mr->r_invalidate = 1; } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (!mr) return -EINVAL; kref_put(&mr->r_kref, __rds_put_mr_final); return 0; } /* * This is called when we receive an extension header that * tells us this MR was used. It allows us to implement * use_once semantics */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) { struct rds_mr *mr; unsigned long flags; int zot_me = 0; spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); if (!mr) { pr_debug("rds: trying to unuse MR with unknown r_key %u!\n", r_key); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); return; } /* Get a reference so that the MR won't go away before calling * sync_mr() below. */ kref_get(&mr->r_kref); /* If it is going to be freed, remove it from the tree now so * that no other thread can find it and free it. */ if (mr->r_use_once || force) { rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); zot_me = 1; } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); /* May have to issue a dma_sync on this memory region. * Note we could avoid this if the operation was a RDMA READ, * but at this point we can't tell. */ if (mr->r_trans->sync_mr) mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); /* Release the reference held above. 
*/ kref_put(&mr->r_kref, __rds_put_mr_final); /* If the MR was marked as invalidate, this will * trigger an async flush. */ if (zot_me) kref_put(&mr->r_kref, __rds_put_mr_final); } void rds_rdma_free_op(struct rm_rdma_op *ro) { unsigned int i; if (ro->op_odp_mr) { kref_put(&ro->op_odp_mr->r_kref, __rds_put_mr_final); } else { for (i = 0; i < ro->op_nents; i++) { struct page *page = sg_page(&ro->op_sg[i]); /* Mark page dirty if it was possibly modified, which * is the case for a RDMA_READ which copies from remote * to local memory */ unpin_user_pages_dirty_lock(&page, 1, !ro->op_write); } } kfree(ro->op_notifier); ro->op_notifier = NULL; ro->op_active = 0; ro->op_odp_mr = NULL; } void rds_atomic_free_op(struct rm_atomic_op *ao) { struct page *page = sg_page(ao->op_sg); /* Mark page dirty if it was possibly modified, which * is the case for a RDMA_READ which copies from remote * to local memory */ unpin_user_pages_dirty_lock(&page, 1, true); kfree(ao->op_notifier); ao->op_notifier = NULL; ao->op_active = 0; } /* * Count the number of pages needed to describe an incoming iovec array. */ static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs) { int tot_pages = 0; unsigned int nr_pages; unsigned int i; /* figure out the number of pages in the vector */ for (i = 0; i < nr_iovecs; i++) { nr_pages = rds_pages_in_vec(&iov[i]); if (nr_pages == 0) return -EINVAL; tot_pages += nr_pages; /* * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, * so tot_pages cannot overflow without first going negative. */ if (tot_pages < 0) return -EINVAL; } return tot_pages; } int rds_rdma_extra_size(struct rds_rdma_args *args, struct rds_iov_vector *iov) { struct rds_iovec *vec; struct rds_iovec __user *local_vec; int tot_pages = 0; unsigned int nr_pages; unsigned int i; local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; if (args->nr_local == 0) return -EINVAL; if (args->nr_local > UIO_MAXIOV) return -EMSGSIZE; iov->iov = kcalloc(args->nr_local, sizeof(struct rds_iovec), GFP_KERNEL); if (!iov->iov) return -ENOMEM; vec = &iov->iov[0]; if (copy_from_user(vec, local_vec, args->nr_local * sizeof(struct rds_iovec))) return -EFAULT; iov->len = args->nr_local; /* figure out the number of pages in the vector */ for (i = 0; i < args->nr_local; i++, vec++) { nr_pages = rds_pages_in_vec(vec); if (nr_pages == 0) return -EINVAL; tot_pages += nr_pages; /* * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1, * so tot_pages cannot overflow without first going negative. */ if (tot_pages < 0) return -EINVAL; } return tot_pages * sizeof(struct scatterlist); } /* * The application asks for a RDMA transfer. 
* Extract all arguments and set up the rdma_op */ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg, struct rds_iov_vector *vec) { struct rds_rdma_args *args; struct rm_rdma_op *op = &rm->rdma; int nr_pages; unsigned int nr_bytes; struct page **pages = NULL; struct rds_iovec *iovs; unsigned int i, j; int ret = 0; bool odp_supported = true; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) || rm->rdma.op_active) return -EINVAL; args = CMSG_DATA(cmsg); if (ipv6_addr_any(&rs->rs_bound_addr)) { ret = -ENOTCONN; /* XXX not a great errno */ goto out_ret; } if (args->nr_local > UIO_MAXIOV) { ret = -EMSGSIZE; goto out_ret; } if (vec->len != args->nr_local) { ret = -EINVAL; goto out_ret; } /* odp-mr is not supported for multiple requests within one message */ if (args->nr_local != 1) odp_supported = false; iovs = vec->iov; nr_pages = rds_rdma_pages(iovs, args->nr_local); if (nr_pages < 0) { ret = -EINVAL; goto out_ret; } pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; goto out_ret; } op->op_write = !!(args->flags & RDS_RDMA_READWRITE); op->op_fence = !!(args->flags & RDS_RDMA_FENCE); op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); op->op_silent = !!(args->flags & RDS_RDMA_SILENT); op->op_active = 1; op->op_recverr = rs->rs_recverr; op->op_odp_mr = NULL; WARN_ON(!nr_pages); op->op_sg = rds_message_alloc_sgs(rm, nr_pages); if (IS_ERR(op->op_sg)) { ret = PTR_ERR(op->op_sg); goto out_pages; } if (op->op_notify || op->op_recverr) { /* We allocate an uninitialized notifier here, because * we don't want to do that in the completion handler. We * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. */ op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); if (!op->op_notifier) { ret = -ENOMEM; goto out_pages; } op->op_notifier->n_user_token = args->user_token; op->op_notifier->n_status = RDS_RDMA_SUCCESS; } /* The cookie contains the R_Key of the remote memory region, and * optionally an offset into it. This is how we implement RDMA into * unaligned memory. * When setting up the RDMA, we need to add that offset to the * destination address (which is really an offset into the MR) * FIXME: We may want to move this into ib_rdma.c */ op->op_rkey = rds_rdma_cookie_key(args->cookie); op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); nr_bytes = 0; rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", (unsigned long long)args->nr_local, (unsigned long long)args->remote_vec.addr, op->op_rkey); for (i = 0; i < args->nr_local; i++) { struct rds_iovec *iov = &iovs[i]; /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */ unsigned int nr = rds_pages_in_vec(iov); rs->rs_user_addr = iov->addr; rs->rs_user_bytes = iov->bytes; /* If it's a WRITE operation, we want to pin the pages for reading. * If it's a READ operation, we need to pin the pages for writing. 
*/ ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write); if ((!odp_supported && ret <= 0) || (odp_supported && ret <= 0 && ret != -EOPNOTSUPP)) goto out_pages; if (ret == -EOPNOTSUPP) { struct rds_mr *local_odp_mr; if (!rs->rs_transport->get_mr) { ret = -EOPNOTSUPP; goto out_pages; } local_odp_mr = kzalloc(sizeof(*local_odp_mr), GFP_KERNEL); if (!local_odp_mr) { ret = -ENOMEM; goto out_pages; } RB_CLEAR_NODE(&local_odp_mr->r_rb_node); kref_init(&local_odp_mr->r_kref); local_odp_mr->r_trans = rs->rs_transport; local_odp_mr->r_sock = rs; local_odp_mr->r_trans_private = rs->rs_transport->get_mr( NULL, 0, rs, &local_odp_mr->r_key, NULL, iov->addr, iov->bytes, ODP_VIRTUAL); if (IS_ERR(local_odp_mr->r_trans_private)) { ret = PTR_ERR(local_odp_mr->r_trans_private); rdsdebug("get_mr ret %d %p\"", ret, local_odp_mr->r_trans_private); kfree(local_odp_mr); ret = -EOPNOTSUPP; goto out_pages; } rdsdebug("Need odp; local_odp_mr %p trans_private %p\n", local_odp_mr, local_odp_mr->r_trans_private); op->op_odp_mr = local_odp_mr; op->op_odp_addr = iov->addr; } rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n", nr_bytes, nr, iov->bytes, iov->addr); nr_bytes += iov->bytes; for (j = 0; j < nr; j++) { unsigned int offset = iov->addr & ~PAGE_MASK; struct scatterlist *sg; sg = &op->op_sg[op->op_nents + j]; sg_set_page(sg, pages[j], min_t(unsigned int, iov->bytes, PAGE_SIZE - offset), offset); sg_dma_len(sg) = sg->length; rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n", sg->offset, sg->length, iov->addr, iov->bytes); iov->addr += sg->length; iov->bytes -= sg->length; } op->op_nents += nr; } if (nr_bytes > args->remote_vec.bytes) { rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", nr_bytes, (unsigned int) args->remote_vec.bytes); ret = -EINVAL; goto out_pages; } op->op_bytes = nr_bytes; ret = 0; out_pages: kfree(pages); out_ret: if (ret) rds_rdma_free_op(op); else rds_stats_inc(s_send_rdma); return ret; } /* * The application wants us to pass an RDMA destination (aka MR) * to the remote */ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { unsigned long flags; struct rds_mr *mr; u32 r_key; int err = 0; if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) || rm->m_rdma_cookie != 0) return -EINVAL; memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie)); /* We are reusing a previously mapped MR here. Most likely, the * application has written to the buffer, so we need to explicitly * flush those writes to RAM. Otherwise the HCA may not see them * when doing a DMA from that buffer. */ r_key = rds_rdma_cookie_key(rm->m_rdma_cookie); spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); if (!mr) err = -EINVAL; /* invalid r_key */ else kref_get(&mr->r_kref); spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (mr) { mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); rm->rdma.op_rdma_mr = mr; } return err; } /* * The application passes us an address range it wants to enable RDMA * to/from. We map the area, and save the <R_Key,offset> pair * in rm->m_rdma_cookie. This causes it to be sent along to the peer * in an extension header. 
*/ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) || rm->m_rdma_cookie != 0) return -EINVAL; return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr, rm->m_conn_path); } /* * Fill in rds_message for an atomic request. */ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { struct page *page = NULL; struct rds_atomic_args *args; int ret = 0; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args)) || rm->atomic.op_active) return -EINVAL; args = CMSG_DATA(cmsg); /* Nonmasked & masked cmsg ops converted to masked hw ops */ switch (cmsg->cmsg_type) { case RDS_CMSG_ATOMIC_FADD: rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; rm->atomic.op_m_fadd.add = args->fadd.add; rm->atomic.op_m_fadd.nocarry_mask = 0; break; case RDS_CMSG_MASKED_ATOMIC_FADD: rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; rm->atomic.op_m_fadd.add = args->m_fadd.add; rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask; break; case RDS_CMSG_ATOMIC_CSWP: rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; rm->atomic.op_m_cswp.compare = args->cswp.compare; rm->atomic.op_m_cswp.swap = args->cswp.swap; rm->atomic.op_m_cswp.compare_mask = ~0; rm->atomic.op_m_cswp.swap_mask = ~0; break; case RDS_CMSG_MASKED_ATOMIC_CSWP: rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; rm->atomic.op_m_cswp.compare = args->m_cswp.compare; rm->atomic.op_m_cswp.swap = args->m_cswp.swap; rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask; rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask; break; default: BUG(); /* should never happen */ } rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT); rm->atomic.op_active = 1; rm->atomic.op_recverr = rs->rs_recverr; rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); if (IS_ERR(rm->atomic.op_sg)) { ret = PTR_ERR(rm->atomic.op_sg); goto err; } /* verify 8 byte-aligned */ if (args->local_addr & 0x7) { ret = -EFAULT; goto err; } ret = rds_pin_pages(args->local_addr, 1, &page, 1); if (ret != 1) goto err; ret = 0; sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr)); if (rm->atomic.op_notify || rm->atomic.op_recverr) { /* We allocate an uninitialized notifier here, because * we don't want to do that in the completion handler. We * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. */ rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL); if (!rm->atomic.op_notifier) { ret = -ENOMEM; goto err; } rm->atomic.op_notifier->n_user_token = args->user_token; rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS; } rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie); rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie); return ret; err: if (page) unpin_user_page(page); rm->atomic.op_active = 0; kfree(rm->atomic.op_notifier); return ret; }
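For orientation, here is a hedged userspace-side sketch of the socket options that end up in rds_get_mr() and rds_free_mr() above: the application registers a buffer with RDS_GET_MR via setsockopt(), the kernel writes the <R_Key, offset> cookie to args.cookie_addr, and the region is released later with RDS_FREE_MR. The example_* helpers, the flag choice and the already-bound socket fd are illustrative assumptions; struct layouts and option names are assumed to match the uapi <linux/rds.h> definitions.

/*
 * Illustrative sketch only: userspace path into rds_get_mr()/rds_free_mr().
 * Assumes 'fd' is a bound RDS socket and <linux/rds.h> uapi definitions.
 */
#include <linux/rds.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>

static int example_register_mr(int fd, void *buf, uint64_t len,
			       rds_rdma_cookie_t *cookie)
{
	struct rds_get_mr_args args;

	memset(&args, 0, sizeof(args));
	args.vec.addr = (uint64_t)(unsigned long)buf;
	args.vec.bytes = len;
	/* The kernel put_user()s the <R_Key, offset> cookie here. */
	args.cookie_addr = (uint64_t)(unsigned long)cookie;
	args.flags = RDS_RDMA_READWRITE;

	return setsockopt(fd, SOL_RDS, RDS_GET_MR, &args, sizeof(args));
}

static int example_free_mr(int fd, rds_rdma_cookie_t cookie)
{
	struct rds_free_mr_args args;

	memset(&args, 0, sizeof(args));
	args.cookie = cookie;	/* a zero cookie flushes all unused MRs */
	args.flags = RDS_RDMA_INVALIDATE;

	return setsockopt(fd, SOL_RDS, RDS_FREE_MR, &args, sizeof(args));
}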
2339 2340 2341 2342 2343 // SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/fast_commit.c * * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com> * * Ext4 fast commits routines. */ #include "ext4.h" #include "ext4_jbd2.h" #include "ext4_extents.h" #include "mballoc.h" #include <linux/lockdep.h> /* * Ext4 Fast Commits * ----------------- * * Ext4 fast commits implement fine grained journalling for Ext4. * * Fast commits are organized as a log of tag-length-value (TLV) structs. (See * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by * TLV during the recovery phase. For the scenarios for which we currently * don't have replay code, fast commit falls back to full commits. * Fast commits record delta in one of the following three categories. * * (A) Directory entry updates: * * - EXT4_FC_TAG_UNLINK - records directory entry unlink * - EXT4_FC_TAG_LINK - records directory entry link * - EXT4_FC_TAG_CREAT - records inode and directory entry creation * * (B) File specific data range updates: * * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode * * (C) Inode metadata (mtime / ctime etc): * * - EXT4_FC_TAG_INODE - record the inode that should be replayed * during recovery. Note that iblocks field is * not replayed and instead derived during * replay. * Commit Operation * ---------------- * With fast commits, we maintain all the directory entry operations in the * order in which they are issued in an in-memory queue. This queue is flushed * to disk during the commit operation. We also maintain a list of inodes * that need to be committed during a fast commit in another in memory queue of * inodes. During the commit operation, we commit in the following order: * * [1] Prepare all the inodes to write out their data by setting * "EXT4_STATE_FC_FLUSHING_DATA". This ensures that inode cannot be * deleted while it is being flushed. * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA" * state. * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that * all the exsiting handles finish and no new handles can start. * [4] Mark all the fast commit eligible inodes as undergoing fast commit * by setting "EXT4_STATE_FC_COMMITTING" state. * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows * starting of new handles. If new handles try to start an update on * any of the inodes that are being committed, ext4_fc_track_inode() * will block until those inodes have finished the fast commit. * [6] Commit all the directory entry updates in the fast commit space. * [7] Commit all the changed inodes in the fast commit space and clear * "EXT4_STATE_FC_COMMITTING" for these inodes. * [8] Write tail tag (this tag ensures the atomicity, please read the following * section for more details). * * All the inode updates must be enclosed within jbd2_jounrnal_start() * and jbd2_journal_stop() similar to JBD2 journaling. * * Fast Commit Ineligibility * ------------------------- * * Not all operations are supported by fast commits today (e.g extended * attributes). Fast commit ineligibility is marked by calling * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back * to full commit. * * Atomicity of commits * -------------------- * In order to guarantee atomicity during the commit operation, fast commit * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. 
Tail * tag contains CRC of the contents and TID of the transaction after which * this fast commit should be applied. Recovery code replays fast commit * logs only if there's at least 1 valid tail present. For every fast commit * operation, there is 1 tail. This means, we may end up with multiple tails * in the fast commit space. Here's an example: * * - Create a new file A and remove existing file B * - fsync() * - Append contents to file A * - Truncate file A * - fsync() * * The fast commit space at the end of above operations would look like this: * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL] * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->| * * Replay code should thus check for all the valid tails in the FC area. * * Fast Commit Replay Idempotence * ------------------------------ * * Fast commits tags are idempotent in nature provided the recovery code follows * certain rules. The guiding principle that the commit path follows while * committing is that it stores the result of a particular operation instead of * storing the procedure. * * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a' * was associated with inode 10. During fast commit, instead of storing this * operation as a procedure "rename a to b", we store the resulting file system * state as a "series" of outcomes: * * - Link dirent b to inode 10 * - Unlink dirent a * - Inode <10> with valid refcount * * Now when recovery code runs, it needs "enforce" this state on the file * system. This is what guarantees idempotence of fast commit replay. * * Let's take an example of a procedure that is not idempotent and see how fast * commits make it idempotent. Consider following sequence of operations: * * rm A; mv B A; read A * (x) (y) (z) * * (x), (y) and (z) are the points at which we can crash. If we store this * sequence of operations as is then the replay is not idempotent. Let's say * while in replay, we crash at (z). During the second replay, file A (which was * actually created as a result of "mv B A" operation) would get deleted. Thus, * file named A would be absent when we try to read A. So, this sequence of * operations is not idempotent. However, as mentioned above, instead of storing * the procedure fast commits store the outcome of each procedure. Thus the fast * commit log for above procedure would be as follows: * * (Let's assume dirent A was linked to inode 10 and dirent B was linked to * inode 11 before the replay) * * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11] * (w) (x) (y) (z) * * If we crash at (z), we will have file A linked to inode 11. During the second * replay, we will remove file A (inode 11). But we will create it back and make * it point to inode 11. We won't find B, so we'll just skip that step. At this * point, the refcount for inode 11 is not reliable, but that gets fixed by the * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled * similarly. Thus, by converting a non-idempotent procedure into a series of * idempotent outcomes, fast commits ensured idempotence during the replay. * * Locking * ------- * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit * dentry queue. ei->i_fc_lock protects the fast commit related info in a given * inode. Most of the code avoids acquiring both the locks, but if one must do * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock. 
* * TODOs * ----- * * 0) Fast commit replay path hardening: Fast commit replay code should use * journal handles to make sure all the updates it does during the replay * path are atomic. With that if we crash during fast commit replay, after * trying to do recovery again, we will find a file system where fast commit * area is invalid (because new full commit would be found). In order to deal * with that, fast commit replay code should ensure that the "FC_REPLAY" * superblock state is persisted before starting the replay, so that after * the crash, fast commit recovery code can look at that flag and perform * fast commit recovery even if that area is invalidated by later full * commits. * * 1) Handle more ineligible cases. * * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent * status tree. This would get rid of the need to call ext4_fc_track_inode() * before acquiring i_data_sem. To do that we would need to ensure that * modified extents from the extent status tree are not evicted from memory. */ #include <trace/events/ext4.h> static struct kmem_cache *ext4_fc_dentry_cachep; static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { BUFFER_TRACE(bh, ""); if (uptodate) { ext4_debug("%s: Block %lld up-to-date", __func__, bh->b_blocknr); set_buffer_uptodate(bh); } else { ext4_debug("%s: Block %lld not up-to-date", __func__, bh->b_blocknr); clear_buffer_uptodate(bh); } unlock_buffer(bh); } static inline void ext4_fc_reset_inode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); ei->i_fc_lblk_start = 0; ei->i_fc_lblk_len = 0; } void ext4_fc_init_inode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_fc_reset_inode(inode); ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); INIT_LIST_HEAD(&ei->i_fc_list); INIT_LIST_HEAD(&ei->i_fc_dilist); init_waitqueue_head(&ei->i_fc_wait); } static bool ext4_fc_disabled(struct super_block *sb) { return (!test_opt2(sb, JOURNAL_FAST_COMMIT) || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)); } /* * Remove inode from fast commit list. If the inode is being committed * we wait until inode commit is done. */ void ext4_fc_del(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_fc_dentry_update *fc_dentry; wait_queue_head_t *wq; if (ext4_fc_disabled(inode->i_sb)) return; mutex_lock(&sbi->s_fc_lock); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { mutex_unlock(&sbi->s_fc_lock); return; } /* * Since ext4_fc_del is called from ext4_evict_inode while having a * handle open, there is no need for us to wait here even if a fast * commit is going on. That is because, if this inode is being * committed, ext4_mark_inode_dirty would have waited for inode commit * operation to finish before we come here. So, by the time we come * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So, * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode * here. * * We may come here without any handles open in the "no_delete" case of * ext4_evict_inode as well. However, if that happens, we first mark the * file system as fast commit ineligible anyway. So, even in that case, * it is okay to remove the inode from the fc list. 
*/ WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING) && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)); while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { #if (BITS_PER_LONG < 64) DEFINE_WAIT_BIT(wait, &ei->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA); wq = bit_waitqueue(&ei->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA); #else DEFINE_WAIT_BIT(wait, &ei->i_flags, EXT4_STATE_FC_FLUSHING_DATA); wq = bit_waitqueue(&ei->i_flags, EXT4_STATE_FC_FLUSHING_DATA); #endif prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { mutex_unlock(&sbi->s_fc_lock); schedule(); mutex_lock(&sbi->s_fc_lock); } finish_wait(wq, &wait.wq_entry); } list_del_init(&ei->i_fc_list); /* * Since this inode is getting removed, let's also remove all FC * dentry create references, since it is not needed to log it anyways. */ if (list_empty(&ei->i_fc_dilist)) { mutex_unlock(&sbi->s_fc_lock); return; } fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist); WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); WARN_ON(!list_empty(&ei->i_fc_dilist)); mutex_unlock(&sbi->s_fc_lock); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); } /* * Mark file system as fast commit ineligible, and record latest * ineligible transaction tid. This means until the recorded * transaction, commit operation would result in a full jbd2 commit. */ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle) { struct ext4_sb_info *sbi = EXT4_SB(sb); tid_t tid; bool has_transaction = true; bool is_ineligible; if (ext4_fc_disabled(sb)) return; if (handle && !IS_ERR(handle)) tid = handle->h_transaction->t_tid; else { read_lock(&sbi->s_journal->j_state_lock); if (sbi->s_journal->j_running_transaction) tid = sbi->s_journal->j_running_transaction->t_tid; else has_transaction = false; read_unlock(&sbi->s_journal->j_state_lock); } mutex_lock(&sbi->s_fc_lock); is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid))) sbi->s_fc_ineligible_tid = tid; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); mutex_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } /* * Generic fast commit tracking function. If this is the first time this we are * called after a full commit, we initialize fast commit fields and then call * __fc_track_fn() with update = 0. If we have already been called after a full * commit, we pass update = 1. Based on that, the track function can determine * if it needs to track a field for the first time or if it needs to just * update the previously tracked value. * * If enqueue is set, this function enqueues the inode in fast commit list. 
*/ static int ext4_fc_track_template( handle_t *handle, struct inode *inode, int (*__fc_track_fn)(handle_t *handle, struct inode *, void *, bool), void *args, int enqueue) { bool update = false; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); tid_t tid = 0; int ret; tid = handle->h_transaction->t_tid; spin_lock(&ei->i_fc_lock); if (tid == ei->i_sync_tid) { update = true; } else { ext4_fc_reset_inode(inode); ei->i_sync_tid = tid; } ret = __fc_track_fn(handle, inode, args, update); spin_unlock(&ei->i_fc_lock); if (!enqueue) return ret; mutex_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); mutex_unlock(&sbi->s_fc_lock); return ret; } struct __track_dentry_update_args { struct dentry *dentry; int op; }; /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */ static int __track_dentry_update(handle_t *handle, struct inode *inode, void *arg, bool update) { struct ext4_fc_dentry_update *node; struct ext4_inode_info *ei = EXT4_I(inode); struct __track_dentry_update_args *dentry_update = (struct __track_dentry_update_args *)arg; struct dentry *dentry = dentry_update->dentry; struct inode *dir = dentry->d_parent->d_inode; struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); spin_unlock(&ei->i_fc_lock); if (IS_ENCRYPTED(dir)) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, handle); spin_lock(&ei->i_fc_lock); return -EOPNOTSUPP; } node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); spin_lock(&ei->i_fc_lock); return -ENOMEM; } node->fcd_op = dentry_update->op; node->fcd_parent = dir->i_ino; node->fcd_ino = inode->i_ino; take_dentry_name_snapshot(&node->fcd_name, dentry); INIT_LIST_HEAD(&node->fcd_dilist); INIT_LIST_HEAD(&node->fcd_list); mutex_lock(&sbi->s_fc_lock); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_STAGING]); else list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); /* * This helps us keep a track of all fc_dentry updates which is part of * this ext4 inode. So in case the inode is getting unlinked, before * even we get a chance to fsync, we could remove all fc_dentry * references while evicting the inode in ext4_fc_del(). * Also with this, we don't need to loop over all the inodes in * sbi->s_fc_q to get the corresponding inode in * ext4_fc_commit_dentry_updates(). 
*/ if (dentry_update->op == EXT4_FC_TAG_CREAT) { WARN_ON(!list_empty(&ei->i_fc_dilist)); list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); } mutex_unlock(&sbi->s_fc_lock); spin_lock(&ei->i_fc_lock); return 0; } void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_UNLINK; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_unlink(handle, inode, dentry, ret); } void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; __ext4_fc_track_unlink(handle, inode, dentry); } void __ext4_fc_track_link(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_LINK; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_link(handle, inode, dentry, ret); } void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; __ext4_fc_track_link(handle, inode, dentry); } void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_CREAT; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_create(handle, inode, dentry, ret); } void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; __ext4_fc_track_create(handle, inode, dentry); } /* __track_fn for inode tracking */ static int __track_inode(handle_t *handle, struct inode *inode, void *arg, bool update) { if (update) return -EEXIST; EXT4_I(inode)->i_fc_lblk_len = 0; return 0; } void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); wait_queue_head_t *wq; int ret; if (S_ISDIR(inode->i_mode)) return; if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_should_journal_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); return; } if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; /* * If we come here, we may sleep while waiting for the inode to * commit. We shouldn't be holding i_data_sem when we go to sleep since * the commit path needs to grab the lock while committing the inode. */ lockdep_assert_not_held(&ei->i_data_sem); while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { #if (BITS_PER_LONG < 64) DEFINE_WAIT_BIT(wait, &ei->i_state_flags, EXT4_STATE_FC_COMMITTING); wq = bit_waitqueue(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); #else DEFINE_WAIT_BIT(wait, &ei->i_flags, EXT4_STATE_FC_COMMITTING); wq = bit_waitqueue(&ei->i_flags, EXT4_STATE_FC_COMMITTING); #endif prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) schedule(); finish_wait(wq, &wait.wq_entry); } /* * From this point on, this inode will not be committed either * by fast or full commit as long as the handle is open. 
*/ ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); trace_ext4_fc_track_inode(handle, inode, ret); } struct __track_range_args { ext4_lblk_t start, end; }; /* __track_fn for tracking data updates */ static int __track_range(handle_t *handle, struct inode *inode, void *arg, bool update) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_lblk_t oldstart; struct __track_range_args *__arg = (struct __track_range_args *)arg; if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { ext4_debug("Special inode %ld being modified\n", inode->i_ino); return -ECANCELED; } oldstart = ei->i_fc_lblk_start; if (update && ei->i_fc_lblk_len > 0) { ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start); ei->i_fc_lblk_len = max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) - ei->i_fc_lblk_start + 1; } else { ei->i_fc_lblk_start = __arg->start; ei->i_fc_lblk_len = __arg->end - __arg->start + 1; } return 0; } void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct __track_range_args args; int ret; if (S_ISDIR(inode->i_mode)) return; if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; if (ext4_has_inline_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); return; } args.start = start; args.end = end; ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); trace_ext4_fc_track_range(handle, inode, start, end, ret); } static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) { blk_opf_t write_flags = REQ_SYNC; struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; /* Add REQ_FUA | REQ_PREFLUSH only its tail */ if (test_opt(sb, BARRIER) && is_tail) write_flags |= REQ_FUA | REQ_PREFLUSH; lock_buffer(bh); set_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = ext4_end_buffer_io_sync; submit_bh(REQ_OP_WRITE | write_flags, bh); EXT4_SB(sb)->s_fc_bh = NULL; } /* Ext4 commit path routines */ /* * Allocate len bytes on a fast commit buffer. * * During the commit time this function is used to manage fast commit * block space. We don't split a fast commit log onto different * blocks. So this function makes sure that if there's not enough space * on the current block, the remaining space in the current block is * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, * new block is from jbd2 and CRC is updated to reflect the padding * we added. */ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) { struct ext4_fc_tl tl; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; int bsize = sbi->s_journal->j_blocksize; int ret, off = sbi->s_fc_bytes % bsize; int remaining; u8 *dst; /* * If 'len' is too long to fit in any block alongside a PAD tlv, then we * cannot fulfill the request. */ if (len > bsize - EXT4_FC_TAG_BASE_LEN) return NULL; if (!sbi->s_fc_bh) { ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); if (ret) return NULL; sbi->s_fc_bh = bh; } dst = sbi->s_fc_bh->b_data + off; /* * Allocate the bytes in the current block if we can do so while still * leaving enough space for a PAD tlv. */ remaining = bsize - EXT4_FC_TAG_BASE_LEN - off; if (len <= remaining) { sbi->s_fc_bytes += len; return dst; } /* * Else, terminate the current block with a PAD tlv, then allocate a new * block and allocate the bytes at the start of that new block. 
*/ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); tl.fc_len = cpu_to_le16(remaining); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize); ext4_fc_submit_bh(sb, false); ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); if (ret) return NULL; sbi->s_fc_bh = bh; sbi->s_fc_bytes += bsize - off + len; return sbi->s_fc_bh->b_data; } /* * Complete a fast commit by writing tail tag. * * Writing tail tag marks the end of a fast commit. In order to guarantee * atomicity, after writing tail tag, even if there's space remaining * in the block, next commit shouldn't use it. That's why tail tag * has the length as that of the remaining space on the block. */ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_tl tl; struct ext4_fc_tail tail; int off, bsize = sbi->s_journal->j_blocksize; u8 *dst; /* * ext4_fc_reserve_space takes care of allocating an extra block if * there's no enough space on this block for accommodating this tail. */ dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc); if (!dst) return -ENOSPC; off = sbi->s_fc_bytes % bsize; tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); dst += sizeof(tail.fc_tid); crc = ext4_chksum(crc, sbi->s_fc_bh->b_data, dst - (u8 *)sbi->s_fc_bh->b_data); tail.fc_crc = cpu_to_le32(crc); memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); dst += sizeof(tail.fc_crc); memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */ ext4_fc_submit_bh(sb, true); return 0; } /* * Adds tag, length, value and updates CRC. Returns true if tlv was added. * Returns false if there's not enough space. */ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, u32 *crc) { struct ext4_fc_tl tl; u8 *dst; dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc); if (!dst) return false; tl.fc_tag = cpu_to_le16(tag); tl.fc_len = cpu_to_le16(len); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len); return true; } /* Same as above, but adds dentry tlv. */ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, struct ext4_fc_dentry_update *fc_dentry) { struct ext4_fc_dentry_info fcd; struct ext4_fc_tl tl; int dlen = fc_dentry->fcd_name.name.len; u8 *dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); if (!dst) return false; fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent); fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; memcpy(dst, &fcd, sizeof(fcd)); dst += sizeof(fcd); memcpy(dst, fc_dentry->fcd_name.name.name, dlen); return true; } /* * Writes inode in the fast commit space under TLV with tag @tag. * Returns 0 on success, error on failure. 
*/ static int ext4_fc_write_inode(struct inode *inode, u32 *crc) { struct ext4_inode_info *ei = EXT4_I(inode); int inode_len = EXT4_GOOD_OLD_INODE_SIZE; int ret; struct ext4_iloc iloc; struct ext4_fc_inode fc_inode; struct ext4_fc_tl tl; u8 *dst; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) inode_len = EXT4_INODE_SIZE(inode->i_sb); else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) inode_len += ei->i_extra_isize; fc_inode.fc_ino = cpu_to_le32(inode->i_ino); tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); ret = -ECANCELED; dst = ext4_fc_reserve_space(inode->i_sb, EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc); if (!dst) goto err; memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; memcpy(dst, &fc_inode, sizeof(fc_inode)); dst += sizeof(fc_inode); memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len); ret = 0; err: brelse(iloc.bh); return ret; } /* * Writes updated data ranges for the inode in question. Updates CRC. * Returns 0 on success, error otherwise. */ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) { ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_map_blocks map; struct ext4_fc_add_range fc_ext; struct ext4_fc_del_range lrange; struct ext4_extent *ex; int ret; spin_lock(&ei->i_fc_lock); if (ei->i_fc_lblk_len == 0) { spin_unlock(&ei->i_fc_lock); return 0; } old_blk_size = ei->i_fc_lblk_start; new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; ei->i_fc_lblk_len = 0; spin_unlock(&ei->i_fc_lock); cur_lblk_off = old_blk_size; ext4_debug("will try writing %d to %d for inode %ld\n", cur_lblk_off, new_blk_size, inode->i_ino); while (cur_lblk_off <= new_blk_size) { map.m_lblk = cur_lblk_off; map.m_len = new_blk_size - cur_lblk_off + 1; ret = ext4_map_blocks(NULL, inode, &map, EXT4_GET_BLOCKS_IO_SUBMIT | EXT4_EX_NOCACHE); if (ret < 0) return -ECANCELED; if (map.m_len == 0) { cur_lblk_off++; continue; } if (ret == 0) { lrange.fc_ino = cpu_to_le32(inode->i_ino); lrange.fc_lblk = cpu_to_le32(map.m_lblk); lrange.fc_len = cpu_to_le32(map.m_len); if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, sizeof(lrange), (u8 *)&lrange, crc)) return -ENOSPC; } else { unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ? EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN; /* Limit the number of blocks in one extent */ map.m_len = min(max, map.m_len); fc_ext.fc_ino = cpu_to_le32(inode->i_ino); ex = (struct ext4_extent *)&fc_ext.fc_ex; ex->ee_block = cpu_to_le32(map.m_lblk); ex->ee_len = cpu_to_le16(map.m_len); ext4_ext_store_pblock(ex, map.m_pblk); if (map.m_flags & EXT4_MAP_UNWRITTEN) ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, sizeof(fc_ext), (u8 *)&fc_ext, crc)) return -ENOSPC; } cur_lblk_off += map.m_len; } return 0; } /* Flushes data of all the inodes in the commit queue. 
*/ static int ext4_fc_flush_data(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; int ret = 0; list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ret = jbd2_submit_inode_data(journal, ei->jinode); if (ret) return ret; } list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ret = jbd2_wait_inode_data(journal, ei->jinode); if (ret) return ret; } return 0; } /* Commit all the directory entry updates */ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; struct inode *inode; struct ext4_inode_info *ei; int ret; if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) return 0; list_for_each_entry_safe(fc_dentry, fc_dentry_n, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) return -ENOSPC; continue; } /* * With fcd_dilist we need not loop in sbi->s_fc_q to get the * corresponding inode. Also, the corresponding inode could have been * deleted, in which case, we don't need to do anything. */ if (list_empty(&fc_dentry->fcd_dilist)) continue; ei = list_first_entry(&fc_dentry->fcd_dilist, struct ext4_inode_info, i_fc_dilist); inode = &ei->vfs_inode; WARN_ON(inode->i_ino != fc_dentry->fcd_ino); /* * We first write the inode and then the create dirent. This * allows the recovery code to create an unnamed inode first * and then link it to a directory entry. This allows us * to use namei.c routines almost as is and simplifies * the recovery code. */ ret = ext4_fc_write_inode(inode, crc); if (ret) return ret; ret = ext4_fc_write_inode_data(inode, crc); if (ret) return ret; if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) return -ENOSPC; } return 0; } static int ext4_fc_perform_commit(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *iter; struct ext4_fc_head head; struct inode *inode; struct blk_plug plug; int ret = 0; u32 crc = 0; /* * Step 1: Mark all inodes on s_fc_q[MAIN] with * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being * freed until the data flush is over. */ mutex_lock(&sbi->s_fc_lock); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&iter->vfs_inode, EXT4_STATE_FC_FLUSHING_DATA); } mutex_unlock(&sbi->s_fc_lock); /* Step 2: Flush data for all the eligible inodes. */ ret = ext4_fc_flush_data(journal); /* * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning * any error from step 2. This ensures that waiters waiting on * EXT4_STATE_FC_FLUSHING_DATA can resume. */ mutex_lock(&sbi->s_fc_lock); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_FLUSHING_DATA); #if (BITS_PER_LONG < 64) wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA); #else wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA); #endif } /* * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before * the waiter checks the bit. Pairs with implicit barrier in * prepare_to_wait() in ext4_fc_del(). */ smp_mb(); mutex_unlock(&sbi->s_fc_lock); /* * If we encountered error in Step 2, return it now after clearing * EXT4_STATE_FC_FLUSHING_DATA bit. */ if (ret) return ret; /* Step 4: Mark all inodes as being committed. 
*/ jbd2_journal_lock_updates(journal); /* * The journal is now locked. No more handles can start and all the * previous handles are now drained. We now mark the inodes on the * commit queue as being committed. */ mutex_lock(&sbi->s_fc_lock); list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); } mutex_unlock(&sbi->s_fc_lock); jbd2_journal_unlock_updates(journal); /* * Step 5: If file system device is different from journal device, * issue a cache flush before we start writing fast commit blocks. */ if (journal->j_fs_dev != journal->j_dev) blkdev_issue_flush(journal->j_fs_dev); blk_start_plug(&plug); /* Step 6: Write fast commit blocks to disk. */ if (sbi->s_fc_bytes == 0) { /* * Step 6.1: Add a head tag only if this is the first fast * commit in this TID. */ head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); head.fc_tid = cpu_to_le32( sbi->s_journal->j_running_transaction->t_tid); if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), (u8 *)&head, &crc)) { ret = -ENOSPC; goto out; } } /* Step 6.2: Now write all the dentry updates. */ mutex_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); if (ret) goto out; /* Step 6.3: Now write all the changed inodes to disk. */ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { inode = &iter->vfs_inode; if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; ret = ext4_fc_write_inode_data(inode, &crc); if (ret) goto out; ret = ext4_fc_write_inode(inode, &crc); if (ret) goto out; } /* Step 6.4: Finally write tail tag to conclude this fast commit. */ ret = ext4_fc_write_tail(sb, crc); out: mutex_unlock(&sbi->s_fc_lock); blk_finish_plug(&plug); return ret; } static void ext4_fc_update_stats(struct super_block *sb, int status, u64 commit_time, int nblks, tid_t commit_tid) { struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; ext4_debug("Fast commit ended with status = %d for tid %u", status, commit_tid); if (status == EXT4_FC_STATUS_OK) { stats->fc_num_commits++; stats->fc_numblks += nblks; if (likely(stats->s_fc_avg_commit_time)) stats->s_fc_avg_commit_time = (commit_time + stats->s_fc_avg_commit_time * 3) / 4; else stats->s_fc_avg_commit_time = commit_time; } else if (status == EXT4_FC_STATUS_FAILED || status == EXT4_FC_STATUS_INELIGIBLE) { if (status == EXT4_FC_STATUS_FAILED) stats->fc_failed_commits++; stats->fc_ineligible_commits++; } else { stats->fc_skipped_commits++; } trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid); } /* * The main commit entry point. Performs a fast commit for transaction * commit_tid if needed. If it's not possible to perform a fast commit * due to various reasons, we fall back to full commit. Returns 0 * on success, error otherwise. 
*/ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); int nblks = 0, ret, bsize = journal->j_blocksize; int subtid = atomic_read(&sbi->s_fc_subtid); int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; ktime_t start_time, commit_time; int old_ioprio, journal_ioprio; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return jbd2_complete_transaction(journal, commit_tid); trace_ext4_fc_commit_start(sb, commit_tid); start_time = ktime_get(); old_ioprio = get_current_ioprio(); restart_fc: ret = jbd2_fc_begin_commit(journal, commit_tid); if (ret == -EALREADY) { /* There was an ongoing commit, check if we need to restart */ if (atomic_read(&sbi->s_fc_subtid) <= subtid && tid_gt(commit_tid, journal->j_commit_sequence)) goto restart_fc; ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0, commit_tid); return 0; } else if (ret) { /* * Commit couldn't start. Just update stats and perform a * full commit. */ ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0, commit_tid); return jbd2_complete_transaction(journal, commit_tid); } /* * After establishing journal barrier via jbd2_fc_begin_commit(), check * if we are fast commit ineligible. */ if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) { status = EXT4_FC_STATUS_INELIGIBLE; goto fallback; } /* * Now that we know that this thread is going to do a fast commit, * elevate the priority to match that of the journal thread. */ if (journal->j_task->io_context) journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; else journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; set_task_ioprio(current, journal_ioprio); fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; ret = ext4_fc_perform_commit(journal); if (ret < 0) { status = EXT4_FC_STATUS_FAILED; goto fallback; } nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; ret = jbd2_fc_wait_bufs(journal, nblks); if (ret < 0) { status = EXT4_FC_STATUS_FAILED; goto fallback; } atomic_inc(&sbi->s_fc_subtid); ret = jbd2_fc_end_commit(journal); set_task_ioprio(current, old_ioprio); /* * weight the commit time higher than the average time so we * don't react too strongly to vast changes in the commit time */ commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid); return ret; fallback: set_task_ioprio(current, old_ioprio); ret = jbd2_fc_end_commit_fallback(journal); ext4_fc_update_stats(sb, status, 0, 0, commit_tid); return ret; } /* * Fast commit cleanup routine. This is called after every fast commit and * full commit. full is true if we are called after a full commit. */ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; struct ext4_fc_dentry_update *fc_dentry; if (full && sbi->s_fc_bh) sbi->s_fc_bh = NULL; trace_ext4_fc_cleanup(journal, full, tid); jbd2_fc_release_bufs(journal); mutex_lock(&sbi->s_fc_lock); while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) { ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN], struct ext4_inode_info, i_fc_list); list_del_init(&ei->i_fc_list); ext4_clear_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); if (tid_geq(tid, ei->i_sync_tid)) { ext4_fc_reset_inode(&ei->vfs_inode); } else if (full) { /* * We are called after a full commit, inode has been * modified while the commit was running. Re-enqueue * the inode into STAGING, which will then be splice * back into MAIN. 
This cannot happen during * fastcommit because the journal is locked all the * time in that case (and tid doesn't increase so * tid check above isn't reliable). */ list_add_tail(&ei->i_fc_list, &sbi->s_fc_q[FC_Q_STAGING]); } /* * Make sure clearing of EXT4_STATE_FC_COMMITTING is * visible before we send the wakeup. Pairs with implicit * barrier in prepare_to_wait() in ext4_fc_track_inode(). */ smp_mb(); #if (BITS_PER_LONG < 64) wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); #else wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING); #endif } while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], struct ext4_fc_dentry_update, fcd_list); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); } list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], &sbi->s_fc_dentry_q[FC_Q_MAIN]); list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], &sbi->s_fc_q[FC_Q_MAIN]); if (tid_geq(tid, sbi->s_fc_ineligible_tid)) { sbi->s_fc_ineligible_tid = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); } if (full) sbi->s_fc_bytes = 0; mutex_unlock(&sbi->s_fc_lock); trace_ext4_fc_stats(sb); } /* Ext4 Replay Path Routines */ /* Helper struct for dentry replay routines */ struct dentry_info_args { int parent_ino, dname_len, ino, inode_len; char *dname; }; /* Same as struct ext4_fc_tl, but uses native endianness fields */ struct ext4_fc_tl_mem { u16 fc_tag; u16 fc_len; }; static inline void tl_to_darg(struct dentry_info_args *darg, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_dentry_info fcd; memcpy(&fcd, val, sizeof(fcd)); darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); darg->ino = le32_to_cpu(fcd.fc_ino); darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname); darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); } static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_tl tl_disk; memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN); tl->fc_len = le16_to_cpu(tl_disk.fc_len); tl->fc_tag = le16_to_cpu(tl_disk.fc_tag); } /* Unlink replay function */ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode, *old_parent; struct qstr entry; struct dentry_info_args darg; int ret = 0; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino, darg.parent_ino, darg.dname_len); entry.name = darg.dname; entry.len = darg.dname_len; inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode %d not found", darg.ino); return 0; } old_parent = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(old_parent)) { ext4_debug("Dir with inode %d not found", darg.parent_ino); iput(inode); return 0; } ret = __ext4_unlink(old_parent, &entry, inode, NULL); /* -ENOENT ok coz it might not exist anymore. 
*/ if (ret == -ENOENT) ret = 0; iput(old_parent); iput(inode); return ret; } static int ext4_fc_replay_link_internal(struct super_block *sb, struct dentry_info_args *darg, struct inode *inode) { struct inode *dir = NULL; struct dentry *dentry_dir = NULL, *dentry_inode = NULL; struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); int ret = 0; dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) { ext4_debug("Dir with inode %d not found.", darg->parent_ino); dir = NULL; goto out; } dentry_dir = d_obtain_alias(dir); if (IS_ERR(dentry_dir)) { ext4_debug("Failed to obtain dentry"); dentry_dir = NULL; goto out; } dentry_inode = d_alloc(dentry_dir, &qstr_dname); if (!dentry_inode) { ext4_debug("Inode dentry not created."); ret = -ENOMEM; goto out; } ret = __ext4_link(dir, inode, dentry_inode); /* * It's possible that link already existed since data blocks * for the dir in question got persisted before we crashed OR * we replayed this tag and crashed before the entire replay * could complete. */ if (ret && ret != -EEXIST) { ext4_debug("Failed to link\n"); goto out; } ret = 0; out: if (dentry_dir) { d_drop(dentry_dir); dput(dentry_dir); } else if (dir) { iput(dir); } if (dentry_inode) { d_drop(dentry_inode); dput(dentry_inode); } return ret; } /* Link replay function */ static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode; struct dentry_info_args darg; int ret = 0; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, darg.parent_ino, darg.dname_len); inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return 0; } ret = ext4_fc_replay_link_internal(sb, &darg, inode); iput(inode); return ret; } /* * Record all the modified inodes during replay. We use this later to setup * block bitmaps correctly. 
*/ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) { struct ext4_fc_replay_state *state; int i; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_modified_inodes_used; i++) if (state->fc_modified_inodes[i] == ino) return 0; if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { int *fc_modified_inodes; fc_modified_inodes = krealloc(state->fc_modified_inodes, sizeof(int) * (state->fc_modified_inodes_size + EXT4_FC_REPLAY_REALLOC_INCREMENT), GFP_KERNEL); if (!fc_modified_inodes) return -ENOMEM; state->fc_modified_inodes = fc_modified_inodes; state->fc_modified_inodes_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; } state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; return 0; } /* * Inode replay function */ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_inode fc_inode; struct ext4_inode *raw_inode; struct ext4_inode *raw_fc_inode; struct inode *inode = NULL; struct ext4_iloc iloc; int inode_len, ino, ret, tag = tl->fc_tag; struct ext4_extent_header *eh; size_t off_gen = offsetof(struct ext4_inode, i_generation); memcpy(&fc_inode, val, sizeof(fc_inode)); ino = le32_to_cpu(fc_inode.fc_ino); trace_ext4_fc_replay(sb, tag, ino, 0, 0); inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (!IS_ERR(inode)) { ext4_ext_clear_bb(inode); iput(inode); } inode = NULL; ret = ext4_fc_record_modified_inode(sb, ino); if (ret) goto out; raw_fc_inode = (struct ext4_inode *) (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); ret = ext4_get_fc_inode_loc(sb, ino, &iloc); if (ret) goto out; inode_len = tl->fc_len - sizeof(struct ext4_fc_inode); raw_inode = ext4_raw_inode(&iloc); memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen, inode_len - off_gen); if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); if (eh->eh_magic != EXT4_EXT_MAGIC) { memset(eh, 0, sizeof(*eh)); eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16( (sizeof(raw_inode->i_block) - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent)); } } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { memcpy(raw_inode->i_block, raw_fc_inode->i_block, sizeof(raw_inode->i_block)); } /* Immediately update the inode on disk. */ ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); if (ret) goto out; ret = sync_dirty_buffer(iloc.bh); if (ret) goto out; ret = ext4_mark_inode_used(sb, ino); if (ret) goto out; /* Given that we just wrote the inode on disk, this SHOULD succeed. */ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return -EFSCORRUPTED; } /* * Our allocator could have made different decisions than before * crashing. This should be fixed but until then, we calculate * the number of blocks the inode. */ if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) ext4_ext_replay_set_iblocks(inode); inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); ext4_reset_inode_seed(inode); ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); sync_dirty_buffer(iloc.bh); brelse(iloc.bh); out: iput(inode); if (!ret) blkdev_issue_flush(sb->s_bdev); return 0; } /* * Dentry create replay function. * * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. 
Which means, the * inode for which we are trying to create a dentry here, should already have * been replayed before we start here. */ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { int ret = 0; struct inode *inode = NULL; struct inode *dir = NULL; struct dentry_info_args darg; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino, darg.parent_ino, darg.dname_len); /* This takes care of update group descriptor and other metadata */ ret = ext4_mark_inode_used(sb, darg.ino); if (ret) goto out; inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("inode %d not found.", darg.ino); inode = NULL; ret = -EINVAL; goto out; } if (S_ISDIR(inode->i_mode)) { /* * If we are creating a directory, we need to make sure that the * dot and dot dot dirents are setup properly. */ dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) { ext4_debug("Dir %d not found.", darg.ino); goto out; } ret = ext4_init_new_dir(NULL, dir, inode); iput(dir); if (ret) { ret = 0; goto out; } } ret = ext4_fc_replay_link_internal(sb, &darg, inode); if (ret) goto out; set_nlink(inode, 1); ext4_mark_inode_dirty(NULL, inode); out: iput(inode); return ret; } /* * Record physical disk regions which are in use as per fast commit area, * and used by inodes during replay phase. Our simple replay phase * allocator excludes these regions from allocation. */ int ext4_fc_record_regions(struct super_block *sb, int ino, ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay) { struct ext4_fc_replay_state *state; struct ext4_fc_alloc_region *region; state = &EXT4_SB(sb)->s_fc_replay_state; /* * during replay phase, the fc_regions_valid may not same as * fc_regions_used, update it when do new additions. 
*/ if (replay && state->fc_regions_used != state->fc_regions_valid) state->fc_regions_used = state->fc_regions_valid; if (state->fc_regions_used == state->fc_regions_size) { struct ext4_fc_alloc_region *fc_regions; fc_regions = krealloc(state->fc_regions, sizeof(struct ext4_fc_alloc_region) * (state->fc_regions_size + EXT4_FC_REPLAY_REALLOC_INCREMENT), GFP_KERNEL); if (!fc_regions) return -ENOMEM; state->fc_regions_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; state->fc_regions = fc_regions; } region = &state->fc_regions[state->fc_regions_used++]; region->ino = ino; region->lblk = lblk; region->pblk = pblk; region->len = len; if (replay) state->fc_regions_valid++; return 0; } /* Replay add range tag */ static int ext4_fc_replay_add_range(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_add_range fc_add_ex; struct ext4_extent newex, *ex; struct inode *inode; ext4_lblk_t start, cur; int remaining, len; ext4_fsblk_t start_pblk; struct ext4_map_blocks map; struct ext4_ext_path *path = NULL; int ret; memcpy(&fc_add_ex, val, sizeof(fc_add_ex)); ex = (struct ext4_extent *)&fc_add_ex.fc_ex; trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_get_actual_len(ex)); inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return 0; } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); if (ret) goto out; start = le32_to_cpu(ex->ee_block); start_pblk = ext4_ext_pblock(ex); len = ext4_ext_get_actual_len(ex); cur = start; remaining = len; ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", start, start_pblk, len, ext4_ext_is_unwritten(ex), inode->i_ino); while (remaining > 0) { map.m_lblk = cur; map.m_len = remaining; map.m_pblk = 0; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) goto out; if (ret == 0) { /* Range is not mapped */ path = ext4_find_extent(inode, cur, path, 0); if (IS_ERR(path)) goto out; memset(&newex, 0, sizeof(newex)); newex.ee_block = cpu_to_le32(cur); ext4_ext_store_pblock( &newex, start_pblk + cur - start); newex.ee_len = cpu_to_le16(map.m_len); if (ext4_ext_is_unwritten(ex)) ext4_ext_mark_unwritten(&newex); down_write(&EXT4_I(inode)->i_data_sem); path = ext4_ext_insert_extent(NULL, inode, path, &newex, 0); up_write((&EXT4_I(inode)->i_data_sem)); if (IS_ERR(path)) goto out; goto next; } if (start_pblk + cur - start != map.m_pblk) { /* * Logical to physical mapping changed. This can happen * if this range was removed and then reallocated to * map to new physical blocks during a fast commit. */ ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), start_pblk + cur - start); if (ret) goto out; /* * Mark the old blocks as free since they aren't used * anymore. We maintain an array of all the modified * inodes. In case these blocks are still used at either * a different logical range in the same inode or in * some different inode, we will mark them as allocated * at the end of the FC replay using our array of * modified inodes. */ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); goto next; } /* Range is mapped and needs a state change */ ext4_debug("Converting from %ld to %d %lld", map.m_flags & EXT4_MAP_UNWRITTEN, ext4_ext_is_unwritten(ex), map.m_pblk); ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), map.m_pblk); if (ret) goto out; /* * We may have split the extent tree while toggling the state. 
* Try to shrink the extent tree now. */ ext4_ext_replay_shrink_inode(inode, start + len); next: cur += map.m_len; remaining -= map.m_len; } ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); out: ext4_free_ext_path(path); iput(inode); return 0; } /* Replay DEL_RANGE tag */ static int ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode; struct ext4_fc_del_range lrange; struct ext4_map_blocks map; ext4_lblk_t cur, remaining; int ret; memcpy(&lrange, val, sizeof(lrange)); cur = le32_to_cpu(lrange.fc_lblk); remaining = le32_to_cpu(lrange.fc_len); trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, le32_to_cpu(lrange.fc_ino), cur, remaining); inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino)); return 0; } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); if (ret) goto out; ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n", inode->i_ino, le32_to_cpu(lrange.fc_lblk), le32_to_cpu(lrange.fc_len)); while (remaining > 0) { map.m_lblk = cur; map.m_len = remaining; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) goto out; if (ret > 0) { remaining -= ret; cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); } else { remaining -= map.m_len; cur += map.m_len; } } down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk), le32_to_cpu(lrange.fc_lblk) + le32_to_cpu(lrange.fc_len) - 1); up_write(&EXT4_I(inode)->i_data_sem); if (ret) goto out; ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); ext4_mark_inode_dirty(NULL, inode); out: iput(inode); return 0; } static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) { struct ext4_fc_replay_state *state; struct inode *inode; struct ext4_ext_path *path = NULL; struct ext4_map_blocks map; int i, ret, j; ext4_lblk_t cur, end; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_modified_inodes_used; i++) { inode = ext4_iget(sb, state->fc_modified_inodes[i], EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode %d not found.", state->fc_modified_inodes[i]); continue; } cur = 0; end = EXT_MAX_BLOCKS; if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) { iput(inode); continue; } while (cur < end) { map.m_lblk = cur; map.m_len = end - cur; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) break; if (ret > 0) { path = ext4_find_extent(inode, map.m_lblk, path, 0); if (!IS_ERR(path)) { for (j = 0; j < path->p_depth; j++) ext4_mb_mark_bb(inode->i_sb, path[j].p_block, 1, true); } else { path = NULL; } cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, true); } else { cur = cur + (map.m_len ? map.m_len : 1); } } iput(inode); } ext4_free_ext_path(path); } /* * Check if block is in excluded regions for block allocation. The simple * allocator that runs during replay phase is calls this function to see * if it is okay to use a block. 
*/ bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) { int i; struct ext4_fc_replay_state *state; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_regions_valid; i++) { if (state->fc_regions[i].ino == 0 || state->fc_regions[i].len == 0) continue; if (in_range(blk, state->fc_regions[i].pblk, state->fc_regions[i].len)) return true; } return false; } /* Cleanup function called after replay */ void ext4_fc_replay_cleanup(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); sbi->s_mount_state &= ~EXT4_FC_REPLAY; kfree(sbi->s_fc_replay_state.fc_regions); kfree(sbi->s_fc_replay_state.fc_modified_inodes); } static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi, int tag, int len) { switch (tag) { case EXT4_FC_TAG_ADD_RANGE: return len == sizeof(struct ext4_fc_add_range); case EXT4_FC_TAG_DEL_RANGE: return len == sizeof(struct ext4_fc_del_range); case EXT4_FC_TAG_CREAT: case EXT4_FC_TAG_LINK: case EXT4_FC_TAG_UNLINK: len -= sizeof(struct ext4_fc_dentry_info); return len >= 1 && len <= EXT4_NAME_LEN; case EXT4_FC_TAG_INODE: len -= sizeof(struct ext4_fc_inode); return len >= EXT4_GOOD_OLD_INODE_SIZE && len <= sbi->s_inode_size; case EXT4_FC_TAG_PAD: return true; /* padding can have any length */ case EXT4_FC_TAG_TAIL: return len >= sizeof(struct ext4_fc_tail); case EXT4_FC_TAG_HEAD: return len == sizeof(struct ext4_fc_head); } return false; } /* * Recovery Scan phase handler * * This function is called during the scan phase and is responsible * for doing following things: * - Make sure the fast commit area has valid tags for replay * - Count number of tags that need to be replayed by the replay handler * - Verify CRC * - Create a list of excluded blocks for allocation during replay phase * * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP * to indicate that scan has finished and JBD2 can now start replay phase. * It returns a negative error to indicate that there was an error. At the end * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set * to indicate the number of tags that need to replayed during the replay phase. */ static int ext4_fc_replay_scan(journal_t *journal, struct buffer_head *bh, int off, tid_t expected_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_replay_state *state; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_add_range ext; struct ext4_fc_tl_mem tl; struct ext4_fc_tail tail; __u8 *start, *end, *cur, *val; struct ext4_fc_head head; struct ext4_extent *ex; state = &sbi->s_fc_replay_state; start = (u8 *)bh->b_data; end = start + journal->j_blocksize; if (state->fc_replay_expected_off == 0) { state->fc_cur_tag = 0; state->fc_replay_num_tags = 0; state->fc_crc = 0; state->fc_regions = NULL; state->fc_regions_valid = state->fc_regions_used = state->fc_regions_size = 0; /* Check if we can stop early */ if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag) != EXT4_FC_TAG_HEAD) return 0; } if (off != state->fc_replay_expected_off) { ret = -EFSCORRUPTED; goto out_err; } state->fc_replay_expected_off++; for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; if (tl.fc_len > end - val || !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) { ret = state->fc_replay_num_tags ? 
JBD2_FC_REPLAY_STOP : -ECANCELED; goto out_err; } ext4_debug("Scan phase, tag:%s, blk %lld\n", tag2str(tl.fc_tag), bh->b_blocknr); switch (tl.fc_tag) { case EXT4_FC_TAG_ADD_RANGE: memcpy(&ext, val, sizeof(ext)); ex = (struct ext4_extent *)&ext.fc_ex; ret = ext4_fc_record_regions(sb, le32_to_cpu(ext.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), ext4_ext_get_actual_len(ex), 0); if (ret < 0) break; ret = JBD2_FC_REPLAY_CONTINUE; fallthrough; case EXT4_FC_TAG_DEL_RANGE: case EXT4_FC_TAG_LINK: case EXT4_FC_TAG_UNLINK: case EXT4_FC_TAG_CREAT: case EXT4_FC_TAG_INODE: case EXT4_FC_TAG_PAD: state->fc_cur_tag++; state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; case EXT4_FC_TAG_TAIL: state->fc_cur_tag++; memcpy(&tail, val, sizeof(tail)); state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + offsetof(struct ext4_fc_tail, fc_crc)); if (le32_to_cpu(tail.fc_tid) == expected_tid && le32_to_cpu(tail.fc_crc) == state->fc_crc) { state->fc_replay_num_tags = state->fc_cur_tag; state->fc_regions_valid = state->fc_regions_used; } else { ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -EFSBADCRC; } state->fc_crc = 0; break; case EXT4_FC_TAG_HEAD: memcpy(&head, val, sizeof(head)); if (le32_to_cpu(head.fc_features) & ~EXT4_FC_SUPPORTED_FEATURES) { ret = -EOPNOTSUPP; break; } if (le32_to_cpu(head.fc_tid) != expected_tid) { ret = JBD2_FC_REPLAY_STOP; break; } state->fc_cur_tag++; state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; default: ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -ECANCELED; } if (ret < 0 || ret == JBD2_FC_REPLAY_STOP) break; } out_err: trace_ext4_fc_replay_scan(sb, ret, off); return ret; } /* * Main recovery path entry point. * The meaning of return codes is similar as above. 
*/ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, enum passtype pass, int off, tid_t expected_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_tl_mem tl; __u8 *start, *end, *cur, *val; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; struct ext4_fc_tail tail; if (pass == PASS_SCAN) { state->fc_current_pass = PASS_SCAN; return ext4_fc_replay_scan(journal, bh, off, expected_tid); } if (state->fc_current_pass != pass) { state->fc_current_pass = pass; sbi->s_mount_state |= EXT4_FC_REPLAY; } if (!sbi->s_fc_replay_state.fc_replay_num_tags) { ext4_debug("Replay stops\n"); ext4_fc_set_bitmaps_and_counters(sb); return 0; } #ifdef CONFIG_EXT4_DEBUG if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) { pr_warn("Dropping fc block %d because max_replay set\n", off); return JBD2_FC_REPLAY_STOP; } #endif start = (u8 *)bh->b_data; end = start + journal->j_blocksize; for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; if (state->fc_replay_num_tags == 0) { ret = JBD2_FC_REPLAY_STOP; ext4_fc_set_bitmaps_and_counters(sb); break; } ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag)); state->fc_replay_num_tags--; switch (tl.fc_tag) { case EXT4_FC_TAG_LINK: ret = ext4_fc_replay_link(sb, &tl, val); break; case EXT4_FC_TAG_UNLINK: ret = ext4_fc_replay_unlink(sb, &tl, val); break; case EXT4_FC_TAG_ADD_RANGE: ret = ext4_fc_replay_add_range(sb, &tl, val); break; case EXT4_FC_TAG_CREAT: ret = ext4_fc_replay_create(sb, &tl, val); break; case EXT4_FC_TAG_DEL_RANGE: ret = ext4_fc_replay_del_range(sb, &tl, val); break; case EXT4_FC_TAG_INODE: ret = ext4_fc_replay_inode(sb, &tl, val); break; case EXT4_FC_TAG_PAD: trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, tl.fc_len, 0); break; case EXT4_FC_TAG_TAIL: trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0, tl.fc_len, 0); memcpy(&tail, val, sizeof(tail)); WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid); break; case EXT4_FC_TAG_HEAD: break; default: trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0); ret = -ECANCELED; break; } if (ret < 0) break; ret = JBD2_FC_REPLAY_CONTINUE; } return ret; } void ext4_fc_init(struct super_block *sb, journal_t *journal) { /* * We set replay callback even if fast commit disabled because we may * could still have fast commit blocks that need to be replayed even if * fast commit has now been turned off. 
 */
	journal->j_fc_replay_callback = ext4_fc_replay;

	if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
		return;
	journal->j_fc_cleanup_callback = ext4_fc_cleanup;
}

static const char * const fc_ineligible_reasons[] = {
	[EXT4_FC_REASON_XATTR] = "Extended attributes changed",
	[EXT4_FC_REASON_CROSS_RENAME] = "Cross rename",
	[EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed",
	[EXT4_FC_REASON_NOMEM] = "Insufficient memory",
	[EXT4_FC_REASON_SWAP_BOOT] = "Swap boot",
	[EXT4_FC_REASON_RESIZE] = "Resize",
	[EXT4_FC_REASON_RENAME_DIR] = "Dir renamed",
	[EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op",
	[EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling",
	[EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename",
};

int ext4_fc_info_show(struct seq_file *seq, void *v)
{
	struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private);
	struct ext4_fc_stats *stats = &sbi->s_fc_stats;
	int i;

	if (v != SEQ_START_TOKEN)
		return 0;

	seq_printf(seq,
		"fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n",
		   stats->fc_num_commits, stats->fc_ineligible_commits,
		   stats->fc_numblks,
		   div_u64(stats->s_fc_avg_commit_time, 1000));
	seq_puts(seq, "Ineligible reasons:\n");
	for (i = 0; i < EXT4_FC_REASON_MAX; i++)
		seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i],
			stats->fc_ineligible_reason_count[i]);

	return 0;
}

int __init ext4_fc_init_dentry_cache(void)
{
	ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update,
					   SLAB_RECLAIM_ACCOUNT);

	if (ext4_fc_dentry_cachep == NULL)
		return -ENOMEM;

	return 0;
}

void ext4_fc_destroy_dentry_cache(void)
{
	kmem_cache_destroy(ext4_fc_dentry_cachep);
}
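The tag-length-value packing that ext4_fc_reserve_space(), ext4_fc_add_tlv() and ext4_fc_write_tail() above implement inside journal blocks is easiest to see with a toy example. The following is a minimal, self-contained user-space sketch of that packing only; the block size, tag values, and the names fc_tl/reserve/add_tlv are illustrative stand-ins (they are not the ext4 on-disk definitions), and the real code additionally maintains a crc32c checksum via ext4_chksum() and hands full blocks to jbd2 for submission.

/* Editor's illustration, not part of the kernel source. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define BLK_SIZE 64		/* stand-in for journal->j_blocksize */
#define TAG_PAD  0xfffe		/* stand-in for EXT4_FC_TAG_PAD */

struct fc_tl {			/* tag + length header, like ext4_fc_tl */
	uint16_t tag;
	uint16_t len;
};

static uint8_t blk[BLK_SIZE];
static size_t off;		/* bytes used in the current block */

/*
 * Reserve len bytes. If they do not fit while still leaving room for a
 * PAD header, pad out the remainder of the block and start a new one.
 */
static uint8_t *reserve(size_t len)
{
	size_t remaining = BLK_SIZE - sizeof(struct fc_tl) - off;
	struct fc_tl pad = { TAG_PAD, (uint16_t)remaining };
	uint8_t *dst;

	if (len > BLK_SIZE - sizeof(struct fc_tl))
		return NULL;			/* can never fit in one block */
	if (len <= remaining) {
		dst = blk + off;
		off += len;
		return dst;
	}
	/* Terminate this block with a PAD tlv covering the leftover bytes. */
	memcpy(blk + off, &pad, sizeof(pad));
	memset(blk + off + sizeof(pad), 0, remaining);
	/* The real code would submit 'blk' here and get a fresh buffer. */
	memset(blk, 0, sizeof(blk));
	off = len;
	return blk;
}

/* Append one tag + value record, mirroring the shape of ext4_fc_add_tlv(). */
static int add_tlv(uint16_t tag, uint16_t len, const void *val)
{
	struct fc_tl tl = { tag, len };
	uint8_t *dst = reserve(sizeof(tl) + len);

	if (!dst)
		return -1;
	memcpy(dst, &tl, sizeof(tl));
	memcpy(dst + sizeof(tl), val, len);
	return 0;
}

int main(void)
{
	uint32_t v = 42;

	add_tlv(0x1, sizeof(v), &v);	/* e.g. a HEAD-like record */
	add_tlv(0x2, sizeof(v), &v);	/* e.g. an inode-like record */
	printf("bytes used in current block: %zu\n", off);
	return 0;
}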
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C)2003,2004 USAGI/WIDE Project
 *
 * Authors	Mitsuru KANDA <mk@linux-ipv6.org>
 *		YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
 *
 * Based on net/ipv4/xfrm4_tunnel.c
 */
#include <linux/module.h>
#include <linux/xfrm.h>
#include <linux/slab.h>
#include <linux/rculist.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/ipv6.h>
#include <linux/ipv6.h>
#include <linux/icmpv6.h>
#include <linux/mutex.h>
#include <net/netns/generic.h>

#define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256
#define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256

#define XFRM6_TUNNEL_SPI_MIN	1
#define XFRM6_TUNNEL_SPI_MAX	0xffffffff

struct xfrm6_tunnel_net {
	struct hlist_head spi_byaddr[XFRM6_TUNNEL_SPI_BYADDR_HSIZE];
	struct hlist_head spi_byspi[XFRM6_TUNNEL_SPI_BYSPI_HSIZE];
	u32 spi;
};

static unsigned int xfrm6_tunnel_net_id __read_mostly;
static inline struct xfrm6_tunnel_net *xfrm6_tunnel_pernet(struct net *net)
{
	return net_generic(net, xfrm6_tunnel_net_id);
}

/*
 * xfrm_tunnel_spi things are for allocating unique id ("spi")
 * per xfrm_address_t.
*/ struct xfrm6_tunnel_spi { struct hlist_node list_byaddr; struct hlist_node list_byspi; xfrm_address_t addr; u32 spi; refcount_t refcnt; struct rcu_head rcu_head; }; static DEFINE_SPINLOCK(xfrm6_tunnel_spi_lock); static struct kmem_cache *xfrm6_tunnel_spi_kmem __read_mostly; static inline unsigned int xfrm6_tunnel_spi_hash_byaddr(const xfrm_address_t *addr) { unsigned int h; h = ipv6_addr_hash((const struct in6_addr *)addr); h ^= h >> 16; h ^= h >> 8; h &= XFRM6_TUNNEL_SPI_BYADDR_HSIZE - 1; return h; } static inline unsigned int xfrm6_tunnel_spi_hash_byspi(u32 spi) { return spi % XFRM6_TUNNEL_SPI_BYSPI_HSIZE; } static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; hlist_for_each_entry_rcu(x6spi, &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], list_byaddr, lockdep_is_held(&xfrm6_tunnel_spi_lock)) { if (xfrm6_addr_equal(&x6spi->addr, saddr)) return x6spi; } return NULL; } __be32 xfrm6_tunnel_spi_lookup(struct net *net, const xfrm_address_t *saddr) { struct xfrm6_tunnel_spi *x6spi; u32 spi; rcu_read_lock_bh(); x6spi = __xfrm6_tunnel_spi_lookup(net, saddr); spi = x6spi ? x6spi->spi : 0; rcu_read_unlock_bh(); return htonl(spi); } EXPORT_SYMBOL(xfrm6_tunnel_spi_lookup); static int __xfrm6_tunnel_spi_check(struct net *net, u32 spi) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; int index = xfrm6_tunnel_spi_hash_byspi(spi); hlist_for_each_entry(x6spi, &xfrm6_tn->spi_byspi[index], list_byspi) { if (x6spi->spi == spi) return -1; } return index; } static u32 __xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); u32 spi; struct xfrm6_tunnel_spi *x6spi; int index; if (xfrm6_tn->spi < XFRM6_TUNNEL_SPI_MIN || xfrm6_tn->spi >= XFRM6_TUNNEL_SPI_MAX) xfrm6_tn->spi = XFRM6_TUNNEL_SPI_MIN; else xfrm6_tn->spi++; for (spi = xfrm6_tn->spi; spi <= XFRM6_TUNNEL_SPI_MAX; spi++) { index = __xfrm6_tunnel_spi_check(net, spi); if (index >= 0) goto alloc_spi; if (spi == XFRM6_TUNNEL_SPI_MAX) break; } for (spi = XFRM6_TUNNEL_SPI_MIN; spi < xfrm6_tn->spi; spi++) { index = __xfrm6_tunnel_spi_check(net, spi); if (index >= 0) goto alloc_spi; } spi = 0; goto out; alloc_spi: xfrm6_tn->spi = spi; x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, GFP_ATOMIC); if (!x6spi) goto out; memcpy(&x6spi->addr, saddr, sizeof(x6spi->addr)); x6spi->spi = spi; refcount_set(&x6spi->refcnt, 1); hlist_add_head_rcu(&x6spi->list_byspi, &xfrm6_tn->spi_byspi[index]); index = xfrm6_tunnel_spi_hash_byaddr(saddr); hlist_add_head_rcu(&x6spi->list_byaddr, &xfrm6_tn->spi_byaddr[index]); out: return spi; } __be32 xfrm6_tunnel_alloc_spi(struct net *net, xfrm_address_t *saddr) { struct xfrm6_tunnel_spi *x6spi; u32 spi; spin_lock_bh(&xfrm6_tunnel_spi_lock); x6spi = __xfrm6_tunnel_spi_lookup(net, saddr); if (x6spi) { refcount_inc(&x6spi->refcnt); spi = x6spi->spi; } else spi = __xfrm6_tunnel_alloc_spi(net, saddr); spin_unlock_bh(&xfrm6_tunnel_spi_lock); return htonl(spi); } EXPORT_SYMBOL(xfrm6_tunnel_alloc_spi); static void x6spi_destroy_rcu(struct rcu_head *head) { kmem_cache_free(xfrm6_tunnel_spi_kmem, container_of(head, struct xfrm6_tunnel_spi, rcu_head)); } static void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; struct hlist_node *n; 
spin_lock_bh(&xfrm6_tunnel_spi_lock); hlist_for_each_entry_safe(x6spi, n, &xfrm6_tn->spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], list_byaddr) { if (xfrm6_addr_equal(&x6spi->addr, saddr)) { if (refcount_dec_and_test(&x6spi->refcnt)) { hlist_del_rcu(&x6spi->list_byaddr); hlist_del_rcu(&x6spi->list_byspi); call_rcu(&x6spi->rcu_head, x6spi_destroy_rcu); break; } } } spin_unlock_bh(&xfrm6_tunnel_spi_lock); } static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) { skb_push(skb, -skb_network_offset(skb)); return 0; } static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { return skb_network_header(skb)[IP6CB(skb)->nhoff]; } static int xfrm6_tunnel_rcv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct ipv6hdr *iph = ipv6_hdr(skb); __be32 spi; spi = xfrm6_tunnel_spi_lookup(net, (const xfrm_address_t *)&iph->saddr); return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { /* xfrm6_tunnel native err handling */ switch (type) { case ICMPV6_DEST_UNREACH: switch (code) { case ICMPV6_NOROUTE: case ICMPV6_ADM_PROHIBITED: case ICMPV6_NOT_NEIGHBOUR: case ICMPV6_ADDR_UNREACH: case ICMPV6_PORT_UNREACH: default: break; } break; case ICMPV6_PKT_TOOBIG: break; case ICMPV6_TIME_EXCEED: switch (code) { case ICMPV6_EXC_HOPLIMIT: break; case ICMPV6_EXC_FRAGTIME: default: break; } break; case ICMPV6_PARAMPROB: switch (code) { case ICMPV6_HDR_FIELD: break; case ICMPV6_UNK_NEXTHDR: break; case ICMPV6_UNK_OPTION: break; } break; default: break; } return 0; } static int xfrm6_tunnel_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { if (x->props.mode != XFRM_MODE_TUNNEL) { NL_SET_ERR_MSG(extack, "IPv6 tunnel can only be used with tunnel mode"); return -EINVAL; } if (x->encap) { NL_SET_ERR_MSG(extack, "IPv6 tunnel is not compatible with encapsulation"); return -EINVAL; } x->props.header_len = sizeof(struct ipv6hdr); return 0; } static void xfrm6_tunnel_destroy(struct xfrm_state *x) { struct net *net = xs_net(x); xfrm6_tunnel_free_spi(net, (xfrm_address_t *)&x->props.saddr); } static const struct xfrm_type xfrm6_tunnel_type = { .owner = THIS_MODULE, .proto = IPPROTO_IPV6, .init_state = xfrm6_tunnel_init_state, .destructor = xfrm6_tunnel_destroy, .input = xfrm6_tunnel_input, .output = xfrm6_tunnel_output, }; static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = { .handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, .priority = 3, }; static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = { .handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, .priority = 3, }; static int __net_init xfrm6_tunnel_net_init(struct net *net) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) INIT_HLIST_HEAD(&xfrm6_tn->spi_byaddr[i]); for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) INIT_HLIST_HEAD(&xfrm6_tn->spi_byspi[i]); xfrm6_tn->spi = 0; return 0; } static void __net_exit xfrm6_tunnel_net_exit(struct net *net) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; xfrm_state_flush(net, 0, false); xfrm_flush_gc(); for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byaddr[i])); for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byspi[i])); } static struct pernet_operations xfrm6_tunnel_net_ops = { .init = 
xfrm6_tunnel_net_init, .exit = xfrm6_tunnel_net_exit, .id = &xfrm6_tunnel_net_id, .size = sizeof(struct xfrm6_tunnel_net), }; static int __init xfrm6_tunnel_init(void) { int rv; xfrm6_tunnel_spi_kmem = KMEM_CACHE(xfrm6_tunnel_spi, SLAB_HWCACHE_ALIGN); if (!xfrm6_tunnel_spi_kmem) return -ENOMEM; rv = register_pernet_subsys(&xfrm6_tunnel_net_ops); if (rv < 0) goto out_pernet; rv = xfrm_register_type(&xfrm6_tunnel_type, AF_INET6); if (rv < 0) goto out_type; rv = xfrm6_tunnel_register(&xfrm6_tunnel_handler, AF_INET6); if (rv < 0) goto out_xfrm6; rv = xfrm6_tunnel_register(&xfrm46_tunnel_handler, AF_INET); if (rv < 0) goto out_xfrm46; return 0; out_xfrm46: xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); out_xfrm6: xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); out_type: unregister_pernet_subsys(&xfrm6_tunnel_net_ops); out_pernet: kmem_cache_destroy(xfrm6_tunnel_spi_kmem); return rv; } static void __exit xfrm6_tunnel_fini(void) { xfrm6_tunnel_deregister(&xfrm46_tunnel_handler, AF_INET); xfrm6_tunnel_deregister(&xfrm6_tunnel_handler, AF_INET6); xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); unregister_pernet_subsys(&xfrm6_tunnel_net_ops); /* Someone may still hold a reference to an xfrm6_tunnel_spi, * so wait for pending RCU callbacks before destroying the cache. */ rcu_barrier(); kmem_cache_destroy(xfrm6_tunnel_spi_kmem); } module_init(xfrm6_tunnel_init); module_exit(xfrm6_tunnel_fini); MODULE_DESCRIPTION("IPv6 XFRM tunnel driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET6, XFRM_PROTO_IPV6);
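/*
 * Editor's illustration (not part of the kernel sources): a minimal,
 * self-contained user-space sketch of the wrap-around search order that
 * __xfrm6_tunnel_alloc_spi() above uses -- continue from the last SPI
 * handed out, scan up to XFRM6_TUNNEL_SPI_MAX, then wrap to
 * XFRM6_TUNNEL_SPI_MIN and stop just before the starting point.  The
 * helpers alloc_spi() and in_use() are hypothetical stand-ins for the
 * kernel's per-netns state and __xfrm6_tunnel_spi_check().
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPI_MIN 1u
#define SPI_MAX 0xffffffffu

static bool in_use(uint32_t spi)
{
	/* Stand-in for the spi_byspi hash lookup: pretend SPIs 2..5 are taken. */
	return spi >= 2 && spi <= 5;
}

static uint32_t alloc_spi(uint32_t *last)
{
	uint32_t start, spi;

	if (*last < SPI_MIN || *last >= SPI_MAX)
		*last = SPI_MIN;
	else
		(*last)++;
	start = *last;

	/* First pass: from the cursor up to the top of the SPI space. */
	for (spi = start; spi <= SPI_MAX; spi++) {
		if (!in_use(spi)) {
			*last = spi;
			return spi;
		}
		if (spi == SPI_MAX)
			break;		/* avoid wrapping the u32 loop counter */
	}
	/* Second pass: wrap around, scan up to (but not including) the cursor. */
	for (spi = SPI_MIN; spi < start; spi++) {
		if (!in_use(spi)) {
			*last = spi;
			return spi;
		}
	}
	return 0;			/* SPI space exhausted */
}

int main(void)
{
	uint32_t last = 1;	/* as if SPI 1 had been allocated previously */

	/* With 2..5 marked busy, the next allocation comes out as 6. */
	printf("next spi: %u\n", alloc_spi(&last));
	return 0;
}
/* Keeping the search anchored at the last allocation spreads SPIs across the
 * space instead of always reusing the lowest free value.
 */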
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/if_arp.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_tables_offload.h> #include <net/netfilter/nf_tables.h> struct nft_cmp_expr { struct nft_data data; u8 sreg; u8 len; enum nft_cmp_ops op:8; }; void nft_cmp_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_cmp_expr *priv = nft_expr_priv(expr); int d; d = memcmp(&regs->data[priv->sreg], &priv->data, priv->len); switch (priv->op) { case NFT_CMP_EQ: if (d != 0) goto mismatch; break; case NFT_CMP_NEQ: if (d == 0) goto mismatch; break; case NFT_CMP_LT: if (d == 0) goto mismatch; fallthrough; case NFT_CMP_LTE: if (d > 0) goto mismatch; break; case NFT_CMP_GT: if (d == 0) goto mismatch; fallthrough; case NFT_CMP_GTE: if (d < 0) goto mismatch; break; } return; mismatch: regs->verdict.code = NFT_BREAK; } static const struct nla_policy nft_cmp_policy[NFTA_CMP_MAX + 1] = { [NFTA_CMP_SREG] = { .type = NLA_U32 }, [NFTA_CMP_OP] = { .type = NLA_U32 }, [NFTA_CMP_DATA] = { .type = NLA_NESTED }, }; static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_cmp_expr *priv = nft_expr_priv(expr); struct nft_data_desc desc = { .type = NFT_DATA_VALUE, .size = sizeof(priv->data), }; int err; err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]); if (err < 0) return err; err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; priv->op =
ntohl(nla_get_be32(tb[NFTA_CMP_OP])); priv->len = desc.len; return 0; } static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_cmp_expr *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_CMP_SREG, priv->sreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CMP_OP, htonl(priv->op))) goto nla_put_failure; if (nft_data_dump(skb, NFTA_CMP_DATA, &priv->data, NFT_DATA_VALUE, priv->len) < 0) goto nla_put_failure; return 0; nla_put_failure: return -1; } union nft_cmp_offload_data { u16 val16; u32 val32; u64 val64; }; static void nft_payload_n2h(union nft_cmp_offload_data *data, const u8 *val, u32 len) { switch (len) { case 2: data->val16 = ntohs(*((__be16 *)val)); break; case 4: data->val32 = ntohl(*((__be32 *)val)); break; case 8: data->val64 = be64_to_cpu(*((__be64 *)val)); break; default: WARN_ON_ONCE(1); break; } } static int __nft_cmp_offload(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_cmp_expr *priv) { struct nft_offload_reg *reg = &ctx->regs[priv->sreg]; union nft_cmp_offload_data _data, _datamask; u8 *mask = (u8 *)&flow->match.mask; u8 *key = (u8 *)&flow->match.key; u8 *data, *datamask; if (priv->op != NFT_CMP_EQ || priv->len > reg->len) return -EOPNOTSUPP; if (reg->flags & NFT_OFFLOAD_F_NETWORK2HOST) { nft_payload_n2h(&_data, (u8 *)&priv->data, reg->len); nft_payload_n2h(&_datamask, (u8 *)&reg->mask, reg->len); data = (u8 *)&_data; datamask = (u8 *)&_datamask; } else { data = (u8 *)&priv->data; datamask = (u8 *)&reg->mask; } memcpy(key + reg->offset, data, reg->len); memcpy(mask + reg->offset, datamask, reg->len); flow->match.dissector.used_keys |= BIT_ULL(reg->key); flow->match.dissector.offset[reg->key] = reg->base_offset; if (reg->key == FLOW_DISSECTOR_KEY_META && reg->offset == offsetof(struct nft_flow_key, meta.ingress_iftype) && nft_reg_load16(priv->data.data) != ARPHRD_ETHER) return -EOPNOTSUPP; nft_offload_update_dependency(ctx, &priv->data, reg->len); return 0; } static int nft_cmp_offload(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr) { const struct nft_cmp_expr *priv = nft_expr_priv(expr); return __nft_cmp_offload(ctx, flow, priv); } static const struct nft_expr_ops nft_cmp_ops = { .type = &nft_cmp_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_expr)), .eval = nft_cmp_eval, .init = nft_cmp_init, .dump = nft_cmp_dump, .reduce = NFT_REDUCE_READONLY, .offload = nft_cmp_offload, }; /* Calculate the mask for the nft_cmp_fast expression. On big endian the * mask needs to include the *upper* bytes when interpreting that data as * something smaller than the full u32, therefore a cpu_to_le32 is done. 
*/ static u32 nft_cmp_fast_mask(unsigned int len) { __le32 mask = cpu_to_le32(~0U >> (sizeof_field(struct nft_cmp_fast_expr, data) * BITS_PER_BYTE - len)); return (__force u32)mask; } static int nft_cmp_fast_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); struct nft_data data; struct nft_data_desc desc = { .type = NFT_DATA_VALUE, .size = sizeof(data), }; int err; err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); if (err < 0) return err; err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; desc.len *= BITS_PER_BYTE; priv->mask = nft_cmp_fast_mask(desc.len); priv->data = data.data[0] & priv->mask; priv->len = desc.len; priv->inv = ntohl(nla_get_be32(tb[NFTA_CMP_OP])) != NFT_CMP_EQ; return 0; } static int nft_cmp_fast_offload(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr) { const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); struct nft_cmp_expr cmp = { .data = { .data = { [0] = priv->data, }, }, .sreg = priv->sreg, .len = priv->len / BITS_PER_BYTE, .op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ, }; return __nft_cmp_offload(ctx, flow, &cmp); } static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ; struct nft_data data; if (nft_dump_register(skb, NFTA_CMP_SREG, priv->sreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CMP_OP, htonl(op))) goto nla_put_failure; data.data[0] = priv->data; if (nft_data_dump(skb, NFTA_CMP_DATA, &data, NFT_DATA_VALUE, priv->len / BITS_PER_BYTE) < 0) goto nla_put_failure; return 0; nla_put_failure: return -1; } const struct nft_expr_ops nft_cmp_fast_ops = { .type = &nft_cmp_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp_fast_expr)), .eval = NULL, /* inlined */ .init = nft_cmp_fast_init, .dump = nft_cmp_fast_dump, .reduce = NFT_REDUCE_READONLY, .offload = nft_cmp_fast_offload, }; static u32 nft_cmp_mask(u32 bitlen) { return (__force u32)cpu_to_le32(~0U >> (sizeof(u32) * BITS_PER_BYTE - bitlen)); } static void nft_cmp16_fast_mask(struct nft_data *data, unsigned int bitlen) { int len = bitlen / BITS_PER_BYTE; int i, words = len / sizeof(u32); for (i = 0; i < words; i++) { data->data[i] = 0xffffffff; bitlen -= sizeof(u32) * BITS_PER_BYTE; } if (len % sizeof(u32)) data->data[i++] = nft_cmp_mask(bitlen); for (; i < 4; i++) data->data[i] = 0; } static int nft_cmp16_fast_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr); struct nft_data_desc desc = { .type = NFT_DATA_VALUE, .size = sizeof(priv->data), }; int err; err = nft_data_init(NULL, &priv->data, &desc, tb[NFTA_CMP_DATA]); if (err < 0) return err; err = nft_parse_register_load(ctx, tb[NFTA_CMP_SREG], &priv->sreg, desc.len); if (err < 0) return err; nft_cmp16_fast_mask(&priv->mask, desc.len * BITS_PER_BYTE); priv->inv = ntohl(nla_get_be32(tb[NFTA_CMP_OP])) != NFT_CMP_EQ; priv->len = desc.len; return 0; } static int nft_cmp16_fast_offload(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr) { const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr); struct nft_cmp_expr cmp = { .data = priv->data, .sreg = priv->sreg, .len = priv->len, .op = priv->inv ? 
NFT_CMP_NEQ : NFT_CMP_EQ, }; return __nft_cmp_offload(ctx, flow, &cmp); } static int nft_cmp16_fast_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr); enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ; if (nft_dump_register(skb, NFTA_CMP_SREG, priv->sreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CMP_OP, htonl(op))) goto nla_put_failure; if (nft_data_dump(skb, NFTA_CMP_DATA, &priv->data, NFT_DATA_VALUE, priv->len) < 0) goto nla_put_failure; return 0; nla_put_failure: return -1; } const struct nft_expr_ops nft_cmp16_fast_ops = { .type = &nft_cmp_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_cmp16_fast_expr)), .eval = NULL, /* inlined */ .init = nft_cmp16_fast_init, .dump = nft_cmp16_fast_dump, .reduce = NFT_REDUCE_READONLY, .offload = nft_cmp16_fast_offload, }; static const struct nft_expr_ops * nft_cmp_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_data data; struct nft_data_desc desc = { .type = NFT_DATA_VALUE, .size = sizeof(data), }; enum nft_cmp_ops op; u8 sreg; int err; if (tb[NFTA_CMP_SREG] == NULL || tb[NFTA_CMP_OP] == NULL || tb[NFTA_CMP_DATA] == NULL) return ERR_PTR(-EINVAL); op = ntohl(nla_get_be32(tb[NFTA_CMP_OP])); switch (op) { case NFT_CMP_EQ: case NFT_CMP_NEQ: case NFT_CMP_LT: case NFT_CMP_LTE: case NFT_CMP_GT: case NFT_CMP_GTE: break; default: return ERR_PTR(-EINVAL); } err = nft_data_init(NULL, &data, &desc, tb[NFTA_CMP_DATA]); if (err < 0) return ERR_PTR(err); sreg = ntohl(nla_get_be32(tb[NFTA_CMP_SREG])); if (op == NFT_CMP_EQ || op == NFT_CMP_NEQ) { if (desc.len <= sizeof(u32)) return &nft_cmp_fast_ops; else if (desc.len <= sizeof(data) && ((sreg >= NFT_REG_1 && sreg <= NFT_REG_4) || (sreg >= NFT_REG32_00 && sreg <= NFT_REG32_12 && sreg % 2 == 0))) return &nft_cmp16_fast_ops; } return &nft_cmp_ops; } struct nft_expr_type nft_cmp_type __read_mostly = { .name = "cmp", .select_ops = nft_cmp_select_ops, .policy = nft_cmp_policy, .maxattr = NFTA_CMP_MAX, .owner = THIS_MODULE, };
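/*
 * Editor's illustration (not part of the kernel sources): a small
 * user-space sketch of the value computed by nft_cmp_fast_mask() above,
 * assuming a glibc-style <endian.h>.  htole32() stands in for the
 * kernel's cpu_to_le32(): a no-op on little-endian hosts, a byte swap on
 * big-endian ones, which is what keeps the set bits on the bytes of the
 * register that actually hold the sub-u32 data.
 */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t cmp_fast_mask(unsigned int len_bits)
{
	/* Mirrors cpu_to_le32(~0U >> (32 - len)), where 32 == sizeof(u32) * BITS_PER_BYTE. */
	return htole32(~0U >> (32 - len_bits));
}

int main(void)
{
	unsigned int len;

	/* On a little-endian host this prints 0x000000ff, 0x0000ffff, 0x00ffffff, 0xffffffff. */
	for (len = 8; len <= 32; len += 8)
		printf("len %2u bits -> mask 0x%08x\n", len, (unsigned int)cmp_fast_mask(len));
	return 0;
}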
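/*
 * Editor's illustration (not part of the kernel sources): how
 * nft_cmp16_fast_mask() above builds its up-to-128-bit mask -- whole
 * 32-bit words are filled with 0xffffffff, one partial word (if any)
 * gets the same cpu_to_le32-style mask as the 32-bit fast path, and the
 * remaining words are cleared.  build_mask() is a hypothetical
 * user-space rewrite for demonstration only.
 */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>

static void build_mask(uint32_t mask[4], unsigned int bitlen)
{
	unsigned int len = bitlen / 8;		/* key length in bytes */
	unsigned int i, words = len / 4;	/* number of fully covered u32 words */

	for (i = 0; i < words; i++) {
		mask[i] = 0xffffffff;
		bitlen -= 32;
	}
	if (len % 4)
		mask[i++] = htole32(~0U >> (32 - bitlen));	/* partial word */
	for (; i < 4; i++)
		mask[i] = 0;					/* unused tail */
}

int main(void)
{
	uint32_t mask[4];
	unsigned int i;

	/* A 48-bit key (e.g. an Ethernet address) covers one full word plus 16 bits. */
	build_mask(mask, 48);
	for (i = 0; i < 4; i++)
		printf("mask[%u] = 0x%08x\n", i, (unsigned int)mask[i]);
	return 0;
}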
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * ROUTE - implementation of the IP router. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Linus Torvalds, <Linus.Torvalds@helsinki.fi> * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * Fixes: * Alan Cox : Verify area fixes. * Alan Cox : cli() protects routing changes * Rui Oliveira : ICMP routing table updates * (rco@di.uminho.pt) Routing table insertion and update * Linus Torvalds : Rewrote bits to be sensible * Alan Cox : Added BSD route gw semantics * Alan Cox : Super /proc >4K * Alan Cox : MTU in route table * Alan Cox : MSS actually. Also added the window * clamper. * Sam Lantinga : Fixed route matching in rt_del() * Alan Cox : Routing cache support. * Alan Cox : Removed compatibility cruft. * Alan Cox : RTF_REJECT support. * Alan Cox : TCP irtt support. * Jonathan Naylor : Added Metric support. * Miquel van Smoorenburg : BSD API fixes. * Miquel van Smoorenburg : Metrics. * Alan Cox : Use __u32 properly * Alan Cox : Aligned routing errors more closely with BSD * our system is still very different. * Alan Cox : Faster /proc handling * Alexey Kuznetsov : Massive rework to support tree based routing, * routing caches and better behaviour. * * Olaf Erb : irtt wasn't being copied right. * Bjorn Ekwall : Kerneld route support. * Alan Cox : Multicast fixed (I hope) * Pavel Krauz : Limited broadcast fixed * Mike McLagan : Routing by source * Alexey Kuznetsov : End of old history. Split to fib.c and * route.c and rewritten from scratch. * Andi Kleen : Load-limit warning messages. * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Vitaly E. Lavrov : Race condition in ip_route_input_slow. * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow. * Vladimir V. Ivanov : IP rule info (flowid) is really useful. * Marc Boucher : routing by fwmark * Robert Olsson : Added rt_cache statistics * Arnaldo C.
Melo : Convert proc stuff to seq_file * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes. * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect * Ilia Sotnikov : Removed TOS from hash calculations */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/module.h> #include <linux/bitops.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/memblock.h> #include <linux/socket.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/proc_fs.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/pkt_sched.h> #include <linux/mroute.h> #include <linux/netfilter_ipv4.h> #include <linux/random.h> #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/jhash.h> #include <net/dst.h> #include <net/dst_metadata.h> #include <net/inet_dscp.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/route.h> #include <net/inetpeer.h> #include <net/sock.h> #include <net/ip_fib.h> #include <net/nexthop.h> #include <net/tcp.h> #include <net/icmp.h> #include <net/xfrm.h> #include <net/lwtunnel.h> #include <net/netevent.h> #include <net/rtnetlink.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/secure_seq.h> #include <net/ip_tunnels.h> #include "fib_lookup.h" #define RT_GC_TIMEOUT (300*HZ) #define DEFAULT_MIN_PMTU (512 + 20 + 20) #define DEFAULT_MTU_EXPIRES (10 * 60 * HZ) #define DEFAULT_MIN_ADVMSS 256 static int ip_rt_max_size; static int ip_rt_redirect_number __read_mostly = 9; static int ip_rt_redirect_load __read_mostly = HZ / 50; static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1)); static int ip_rt_error_cost __read_mostly = HZ; static int ip_rt_error_burst __read_mostly = 5 * HZ; static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT; /* * Interface to generic destination cache. 
*/ INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); static unsigned int ipv4_default_advmss(const struct dst_entry *dst); INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst); static void ipv4_negative_advice(struct sock *sk, struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb); static void ipv4_dst_destroy(struct dst_entry *dst); static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old) { WARN_ON(1); return NULL; } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr); static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr); static struct dst_ops ipv4_dst_ops = { .family = AF_INET, .check = ipv4_dst_check, .default_advmss = ipv4_default_advmss, .mtu = ipv4_mtu, .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, .redirect = ip_do_redirect, .local_out = __ip_local_out, .neigh_lookup = ipv4_neigh_lookup, .confirm_neigh = ipv4_confirm_neigh, }; #define ECN_OR_COST(class) TC_PRIO_##class const __u8 ip_tos2prio[16] = { TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BESTEFFORT, ECN_OR_COST(BESTEFFORT), TC_PRIO_BULK, ECN_OR_COST(BULK), TC_PRIO_BULK, ECN_OR_COST(BULK), TC_PRIO_INTERACTIVE, ECN_OR_COST(INTERACTIVE), TC_PRIO_INTERACTIVE, ECN_OR_COST(INTERACTIVE), TC_PRIO_INTERACTIVE_BULK, ECN_OR_COST(INTERACTIVE_BULK), TC_PRIO_INTERACTIVE_BULK, ECN_OR_COST(INTERACTIVE_BULK) }; EXPORT_SYMBOL(ip_tos2prio); static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); #ifndef CONFIG_PREEMPT_RT #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field) #else #define RT_CACHE_STAT_INC(field) this_cpu_inc(rt_cache_stat.field) #endif #ifdef CONFIG_PROC_FS static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) { if (*pos) return NULL; return SEQ_START_TOKEN; } static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return NULL; } static void rt_cache_seq_stop(struct seq_file *seq, void *v) { } static int rt_cache_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" "HHUptod\tSpecDst"); return 0; } static const struct seq_operations rt_cache_seq_ops = { .start = rt_cache_seq_start, .next = rt_cache_seq_next, .stop = rt_cache_seq_stop, .show = rt_cache_seq_show, }; static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) { int cpu; if (*pos == 0) return SEQ_START_TOKEN; for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } return NULL; } static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int cpu; for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; return &per_cpu(rt_cache_stat, cpu); } (*pos)++; return NULL; } static void rt_cpu_seq_stop(struct seq_file *seq, void *v) { } static int rt_cpu_seq_show(struct seq_file *seq, void *v) { struct rt_cache_stat *st = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd 
in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); return 0; } seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x %08x %08x " "%08x %08x %08x %08x\n", dst_entries_get_slow(&ipv4_dst_ops), 0, /* st->in_hit */ st->in_slow_tot, st->in_slow_mc, st->in_no_route, st->in_brd, st->in_martian_dst, st->in_martian_src, 0, /* st->out_hit */ st->out_slow_tot, st->out_slow_mc, 0, /* st->gc_total */ 0, /* st->gc_ignored */ 0, /* st->gc_goal_miss */ 0, /* st->gc_dst_overflow */ 0, /* st->in_hlist_search */ 0 /* st->out_hlist_search */ ); return 0; } static const struct seq_operations rt_cpu_seq_ops = { .start = rt_cpu_seq_start, .next = rt_cpu_seq_next, .stop = rt_cpu_seq_stop, .show = rt_cpu_seq_show, }; #ifdef CONFIG_IP_ROUTE_CLASSID static int rt_acct_proc_show(struct seq_file *m, void *v) { struct ip_rt_acct *dst, *src; unsigned int i, j; dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL); if (!dst) return -ENOMEM; for_each_possible_cpu(i) { src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i); for (j = 0; j < 256; j++) { dst[j].o_bytes += src[j].o_bytes; dst[j].o_packets += src[j].o_packets; dst[j].i_bytes += src[j].i_bytes; dst[j].i_packets += src[j].i_packets; } } seq_write(m, dst, 256 * sizeof(struct ip_rt_acct)); kfree(dst); return 0; } #endif static int __net_init ip_rt_do_proc_init(struct net *net) { struct proc_dir_entry *pde; pde = proc_create_seq("rt_cache", 0444, net->proc_net, &rt_cache_seq_ops); if (!pde) goto err1; pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat, &rt_cpu_seq_ops); if (!pde) goto err2; #ifdef CONFIG_IP_ROUTE_CLASSID pde = proc_create_single("rt_acct", 0, net->proc_net, rt_acct_proc_show); if (!pde) goto err3; #endif return 0; #ifdef CONFIG_IP_ROUTE_CLASSID err3: remove_proc_entry("rt_cache", net->proc_net_stat); #endif err2: remove_proc_entry("rt_cache", net->proc_net); err1: return -ENOMEM; } static void __net_exit ip_rt_do_proc_exit(struct net *net) { remove_proc_entry("rt_cache", net->proc_net_stat); remove_proc_entry("rt_cache", net->proc_net); #ifdef CONFIG_IP_ROUTE_CLASSID remove_proc_entry("rt_acct", net->proc_net); #endif } static struct pernet_operations ip_rt_proc_ops __net_initdata = { .init = ip_rt_do_proc_init, .exit = ip_rt_do_proc_exit, }; static int __init ip_rt_proc_init(void) { return register_pernet_subsys(&ip_rt_proc_ops); } #else static inline int ip_rt_proc_init(void) { return 0; } #endif /* CONFIG_PROC_FS */ static inline bool rt_is_expired(const struct rtable *rth) { bool res; rcu_read_lock(); res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev)); rcu_read_unlock(); return res; } void rt_cache_flush(struct net *net) { rt_genid_bump_ipv4(net); } static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst_dev(dst); struct neighbour *n; rcu_read_lock(); if (likely(rt->rt_gw_family == AF_INET)) { n = ip_neigh_gw4(dev, rt->rt_gw4); } else if (rt->rt_gw_family == AF_INET6) { n = ip_neigh_gw6(dev, &rt->rt_gw6); } else { __be32 pkey; pkey = skb ? 
ip_hdr(skb)->daddr : *((__be32 *) daddr); n = ip_neigh_gw4(dev, pkey); } if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt)) n = NULL; rcu_read_unlock(); return n; } static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct rtable *rt = container_of(dst, struct rtable, dst); struct net_device *dev = dst_dev(dst); const __be32 *pkey = daddr; if (rt->rt_gw_family == AF_INET) { pkey = (const __be32 *)&rt->rt_gw4; } else if (rt->rt_gw_family == AF_INET6) { return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6); } else if (!daddr || (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) { return; } __ipv4_confirm_neigh(dev, *(__force u32 *)pkey); } /* Hash tables of size 2048..262144 depending on RAM size. * Each bucket uses 8 bytes. */ static u32 ip_idents_mask __read_mostly; static atomic_t *ip_idents __read_mostly; static u32 *ip_tstamps __read_mostly; /* In order to protect privacy, we add a perturbation to identifiers * if one generator is seldom used. This makes hard for an attacker * to infer how many packets were sent between two points in time. */ static u32 ip_idents_reserve(u32 hash, int segs) { u32 bucket, old, now = (u32)jiffies; atomic_t *p_id; u32 *p_tstamp; u32 delta = 0; bucket = hash & ip_idents_mask; p_tstamp = ip_tstamps + bucket; p_id = ip_idents + bucket; old = READ_ONCE(*p_tstamp); if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = get_random_u32_below(now - old); /* If UBSAN reports an error there, please make sure your compiler * supports -fno-strict-overflow before reporting it that was a bug * in UBSAN, and it has been fixed in GCC-8. */ return atomic_add_return(segs + delta, p_id) - segs; } void __ip_select_ident(struct net *net, struct iphdr *iph, int segs) { u32 hash, id; /* Note the following code is not safe, but this is okay. */ if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key))) get_random_bytes(&net->ipv4.ip_id_key, sizeof(net->ipv4.ip_id_key)); hash = siphash_3u32((__force u32)iph->daddr, (__force u32)iph->saddr, iph->protocol, &net->ipv4.ip_id_key); id = ip_idents_reserve(hash, segs); iph->id = htons(id); } EXPORT_SYMBOL(__ip_select_ident); static void __build_flow_key(const struct net *net, struct flowi4 *fl4, const struct sock *sk, const struct iphdr *iph, int oif, __u8 tos, u8 prot, u32 mark, int flow_flags) { __u8 scope = RT_SCOPE_UNIVERSE; if (sk) { oif = sk->sk_bound_dev_if; mark = READ_ONCE(sk->sk_mark); tos = ip_sock_rt_tos(sk); scope = ip_sock_rt_scope(sk); prot = inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW : sk->sk_protocol; } flowi4_init_output(fl4, oif, mark, tos & INET_DSCP_MASK, scope, prot, flow_flags, iph->daddr, iph->saddr, 0, 0, sock_net_uid(net, sk)); } static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb, const struct sock *sk) { const struct net *net = dev_net(skb->dev); const struct iphdr *iph = ip_hdr(skb); int oif = skb->dev->ifindex; u8 prot = iph->protocol; u32 mark = skb->mark; __u8 tos = iph->tos; __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0); } static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); const struct ip_options_rcu *inet_opt; __be32 daddr = inet->inet_daddr; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), inet_test_bit(HDRINCL, sk) ? 
IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk), daddr, inet->inet_saddr, 0, 0, sk_uid(sk)); rcu_read_unlock(); } static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk, const struct sk_buff *skb) { if (skb) build_skb_flow_key(fl4, skb, sk); else build_sk_flow_key(fl4, sk); } static DEFINE_SPINLOCK(fnhe_lock); static void fnhe_flush_routes(struct fib_nh_exception *fnhe) { struct rtable *rt; rt = rcu_dereference(fnhe->fnhe_rth_input); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL); dst_dev_put(&rt->dst); dst_release(&rt->dst); } rt = rcu_dereference(fnhe->fnhe_rth_output); if (rt) { RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL); dst_dev_put(&rt->dst); dst_release(&rt->dst); } } static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash) { struct fib_nh_exception __rcu **fnhe_p, **oldest_p; struct fib_nh_exception *fnhe, *oldest = NULL; for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) { fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); if (!fnhe) break; if (!oldest || time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) { oldest = fnhe; oldest_p = fnhe_p; } } fnhe_flush_routes(oldest); *oldest_p = oldest->fnhe_next; kfree_rcu(oldest, rcu); } static u32 fnhe_hashfun(__be32 daddr) { static siphash_aligned_key_t fnhe_hash_key; u64 hval; net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key)); hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key); return hash_64(hval, FNHE_HASH_SHIFT); } static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe) { rt->rt_pmtu = fnhe->fnhe_pmtu; rt->rt_mtu_locked = fnhe->fnhe_mtu_locked; rt->dst.expires = fnhe->fnhe_expires; if (fnhe->fnhe_gw) { rt->rt_flags |= RTCF_REDIRECTED; rt->rt_uses_gateway = 1; rt->rt_gw_family = AF_INET; rt->rt_gw4 = fnhe->fnhe_gw; } } static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, __be32 gw, u32 pmtu, bool lock, unsigned long expires) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe; struct rtable *rt; u32 genid, hval; unsigned int i; int depth; genid = fnhe_genid(dev_net(nhc->nhc_dev)); hval = fnhe_hashfun(daddr); spin_lock_bh(&fnhe_lock); hash = rcu_dereference(nhc->nhc_exceptions); if (!hash) { hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC); if (!hash) goto out_unlock; rcu_assign_pointer(nhc->nhc_exceptions, hash); } hash += hval; depth = 0; for (fnhe = rcu_dereference(hash->chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { if (fnhe->fnhe_daddr == daddr) break; depth++; } if (fnhe) { if (fnhe->fnhe_genid != genid) fnhe->fnhe_genid = genid; if (gw) fnhe->fnhe_gw = gw; if (pmtu) { fnhe->fnhe_pmtu = pmtu; fnhe->fnhe_mtu_locked = lock; } fnhe->fnhe_expires = max(1UL, expires); /* Update all cached dsts too */ rt = rcu_dereference(fnhe->fnhe_rth_input); if (rt) fill_route_from_fnhe(rt, fnhe); rt = rcu_dereference(fnhe->fnhe_rth_output); if (rt) fill_route_from_fnhe(rt, fnhe); } else { /* Randomize max depth to avoid some side channels attacks. 
*/ int max_depth = FNHE_RECLAIM_DEPTH + get_random_u32_below(FNHE_RECLAIM_DEPTH); while (depth > max_depth) { fnhe_remove_oldest(hash); depth--; } fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC); if (!fnhe) goto out_unlock; fnhe->fnhe_next = hash->chain; fnhe->fnhe_genid = genid; fnhe->fnhe_daddr = daddr; fnhe->fnhe_gw = gw; fnhe->fnhe_pmtu = pmtu; fnhe->fnhe_mtu_locked = lock; fnhe->fnhe_expires = max(1UL, expires); rcu_assign_pointer(hash->chain, fnhe); /* Exception created; mark the cached routes for the nexthop * stale, so anyone caching it rechecks if this exception * applies to them. */ rt = rcu_dereference(nhc->nhc_rth_input); if (rt) WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL); for_each_possible_cpu(i) { struct rtable __rcu **prt; prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i); rt = rcu_dereference(*prt); if (rt) WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL); } } fnhe->fnhe_stamp = jiffies; out_unlock: spin_unlock_bh(&fnhe_lock); } static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4, bool kill_route) { __be32 new_gw = icmp_hdr(skb)->un.gateway; __be32 old_gw = ip_hdr(skb)->saddr; struct net_device *dev = skb->dev; struct in_device *in_dev; struct fib_result res; struct neighbour *n; struct net *net; switch (icmp_hdr(skb)->code & 7) { case ICMP_REDIR_NET: case ICMP_REDIR_NETTOS: case ICMP_REDIR_HOST: case ICMP_REDIR_HOSTTOS: break; default: return; } if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw) return; in_dev = __in_dev_get_rcu(dev); if (!in_dev) return; net = dev_net(dev); if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) || ipv4_is_zeronet(new_gw)) goto reject_redirect; if (!IN_DEV_SHARED_MEDIA(in_dev)) { if (!inet_addr_onlink(in_dev, new_gw, old_gw)) goto reject_redirect; if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) goto reject_redirect; } else { if (inet_addr_type(net, new_gw) != RTN_UNICAST) goto reject_redirect; } n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw); if (!n) n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev); if (!IS_ERR(n)) { if (!(READ_ONCE(n->nud_state) & NUD_VALID)) { neigh_event_send(n, NULL); } else { if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh_common *nhc; fib_select_path(net, &res, fl4, skb); nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, new_gw, 0, false, jiffies + ip_rt_gc_timeout); } if (kill_route) WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL); call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); } neigh_release(n); } return; reject_redirect: #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev)) { const struct iphdr *iph = (const struct iphdr *) skb->data; __be32 daddr = iph->daddr; __be32 saddr = iph->saddr; net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n" " Advised path = %pI4 -> %pI4\n", &old_gw, dev->name, &new_gw, &saddr, &daddr); } #endif ; } static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { struct rtable *rt; struct flowi4 fl4; const struct iphdr *iph = (const struct iphdr *) skb->data; struct net *net = dev_net(skb->dev); int oif = skb->dev->ifindex; u8 prot = iph->protocol; u32 mark = skb->mark; __u8 tos = iph->tos; rt = dst_rtable(dst); __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0); __ip_do_redirect(rt, skb, &fl4, true); } static void ipv4_negative_advice(struct sock *sk, struct dst_entry *dst) { struct rtable *rt = dst_rtable(dst); if ((READ_ONCE(dst->obsolete) > 0) || (rt->rt_flags & 
RTCF_REDIRECTED) || READ_ONCE(rt->dst.expires)) sk_dst_reset(sk); } /* * Algorithm: * 1. The first ip_rt_redirect_number redirects are sent * with exponential backoff, then we stop sending them at all, * assuming that the host ignores our redirects. * 2. If we did not see packets requiring redirects * during ip_rt_redirect_silence, we assume that the host * forgot redirected route and start to send redirects again. * * This algorithm is much cheaper and more intelligent than dumb load limiting * in icmp.c. * * NOTE. Do not forget to inhibit load limiting for redirects (redundant) * and "frag. need" (breaks PMTU discovery) in icmp.c. */ void ip_rt_send_redirect(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct in_device *in_dev; struct inet_peer *peer; struct net *net; int log_martians; int vif; rcu_read_lock(); in_dev = __in_dev_get_rcu(rt->dst.dev); if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) { rcu_read_unlock(); return; } log_martians = IN_DEV_LOG_MARTIANS(in_dev); vif = l3mdev_master_ifindex_rcu(rt->dst.dev); net = dev_net(rt->dst.dev); peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif); if (!peer) { rcu_read_unlock(); icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt_nexthop(rt, ip_hdr(skb)->daddr)); return; } /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. */ if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) { peer->rate_tokens = 0; peer->n_redirects = 0; } /* Too many ignored redirects; do not send anything * set dst.rate_last to the last seen redirected packet. */ if (peer->n_redirects >= ip_rt_redirect_number) { peer->rate_last = jiffies; goto out_unlock; } /* Check for load limit; set rate_last to the latest sent * redirect. */ if (peer->n_redirects == 0 || time_after(jiffies, (peer->rate_last + (ip_rt_redirect_load << peer->n_redirects)))) { __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr); icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw); peer->rate_last = jiffies; ++peer->n_redirects; if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians && peer->n_redirects == ip_rt_redirect_number) net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n", &ip_hdr(skb)->saddr, inet_iif(skb), &ip_hdr(skb)->daddr, &gw); } out_unlock: rcu_read_unlock(); } static int ip_error(struct sk_buff *skb) { struct rtable *rt = skb_rtable(skb); struct net_device *dev = skb->dev; struct in_device *in_dev; struct inet_peer *peer; unsigned long now; struct net *net; SKB_DR(reason); bool send; int code; if (netif_is_l3_master(skb->dev)) { dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif); if (!dev) goto out; } in_dev = __in_dev_get_rcu(dev); /* IP on this device is disabled. 
*/ if (!in_dev) goto out; net = dev_net(rt->dst.dev); if (!IN_DEV_FORWARD(in_dev)) { switch (rt->dst.error) { case EHOSTUNREACH: SKB_DR_SET(reason, IP_INADDRERRORS); __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS); break; case ENETUNREACH: SKB_DR_SET(reason, IP_INNOROUTES); __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES); break; } goto out; } switch (rt->dst.error) { case EINVAL: default: goto out; case EHOSTUNREACH: code = ICMP_HOST_UNREACH; break; case ENETUNREACH: code = ICMP_NET_UNREACH; SKB_DR_SET(reason, IP_INNOROUTES); __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES); break; case EACCES: code = ICMP_PKT_FILTERED; break; } rcu_read_lock(); peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, l3mdev_master_ifindex_rcu(skb->dev)); send = true; if (peer) { now = jiffies; peer->rate_tokens += now - peer->rate_last; if (peer->rate_tokens > ip_rt_error_burst) peer->rate_tokens = ip_rt_error_burst; peer->rate_last = now; if (peer->rate_tokens >= ip_rt_error_cost) peer->rate_tokens -= ip_rt_error_cost; else send = false; } rcu_read_unlock(); if (send) icmp_send(skb, ICMP_DEST_UNREACH, code, 0); out: kfree_skb_reason(skb, reason); return 0; } static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) { struct dst_entry *dst = &rt->dst; struct fib_result res; bool lock = false; struct net *net; u32 old_mtu; if (ip_mtu_locked(dst)) return; old_mtu = ipv4_mtu(dst); if (old_mtu < mtu) return; rcu_read_lock(); net = dev_net_rcu(dst_dev(dst)); if (mtu < net->ipv4.ip_rt_min_pmtu) { lock = true; mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); } if (rt->rt_pmtu == mtu && !lock && time_before(jiffies, READ_ONCE(dst->expires) - net->ipv4.ip_rt_mtu_expires / 2)) goto out; if (fib_lookup(net, fl4, &res, 0) == 0) { struct fib_nh_common *nhc; fib_select_path(net, &res, fl4, NULL); #ifdef CONFIG_IP_ROUTE_MULTIPATH if (fib_info_num_path(res.fi) > 1) { int nhsel; for (nhsel = 0; nhsel < fib_info_num_path(res.fi); nhsel++) { nhc = fib_info_nhc(res.fi, nhsel); update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } goto out; } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ nhc = FIB_RES_NHC(res); update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock, jiffies + net->ipv4.ip_rt_mtu_expires); } out: rcu_read_unlock(); } static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { struct rtable *rt = dst_rtable(dst); struct flowi4 fl4; ip_rt_build_flow_key(&fl4, sk, skb); /* Don't make lookup fail for bridged encapsulations */ if (skb && netif_is_any_bridge_port(skb->dev)) fl4.flowi4_oif = 0; __ip_rt_update_pmtu(rt, &fl4, mtu); } void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, int oif, u8 protocol) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; u32 mark = IP4_REPLY_MARK(net, skb->mark); __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, mark, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); ip_rt_put(rt); } } EXPORT_SYMBOL_GPL(ipv4_update_pmtu); static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0); if (!fl4.flowi4_mark) fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark); rt = __ip_route_output_key(sock_net(sk), &fl4); if (!IS_ERR(rt)) { __ip_rt_update_pmtu(rt, &fl4, mtu); ip_rt_put(rt); } 
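/* Descriptive note (editor's sketch, not part of the original file):
 * ipv4_update_pmtu() above and ipv4_sk_update_pmtu() just below are the
 * entry points used when an ICMP "fragmentation needed" error arrives.
 * A protocol error handler that has a socket typically does something
 * roughly like:
 *
 *     if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED)
 *             ipv4_sk_update_pmtu(skb, sk, info);
 *
 * where info carries the advertised MTU, while paths without a socket use
 * ipv4_update_pmtu().  Both funnel into __ip_rt_update_pmtu(), which
 * records the learnt MTU in a nexthop exception (fnhe) so later lookups
 * towards the same destination pick it up; values below
 * net->ipv4.ip_rt_min_pmtu are clamped and the MTU is locked.
 */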
} void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct dst_entry *odst = NULL; bool new = false; struct net *net = sock_net(sk); bh_lock_sock(sk); if (!ip_sk_accept_pmtu(sk)) goto out; odst = sk_dst_get(sk); if (sock_owned_by_user(sk) || !odst) { __ipv4_sk_update_pmtu(skb, sk, mtu); goto out; } __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); rt = dst_rtable(odst); if (READ_ONCE(odst->obsolete) && !odst->ops->check(odst, 0)) { rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; new = true; } __ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt->dst)), &fl4, mtu); if (!dst_check(&rt->dst, 0)) { if (new) dst_release(&rt->dst); rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; new = true; } if (new) sk_dst_set(sk, &rt->dst); out: bh_unlock_sock(sk); dst_release(odst); } EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu); void ipv4_redirect(struct sk_buff *skb, struct net *net, int oif, u8 protocol) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; __build_flow_key(net, &fl4, NULL, iph, oif, iph->tos, protocol, 0, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); } } EXPORT_SYMBOL_GPL(ipv4_redirect); void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct flowi4 fl4; struct rtable *rt; struct net *net = sock_net(sk); __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); rt = __ip_route_output_key(net, &fl4); if (!IS_ERR(rt)) { __ip_do_redirect(rt, skb, &fl4, false); ip_rt_put(rt); } } EXPORT_SYMBOL_GPL(ipv4_sk_redirect); INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { struct rtable *rt = dst_rtable(dst); /* All IPV4 dsts are created with ->obsolete set to the value * DST_OBSOLETE_FORCE_CHK which forces validation calls down * into this function always. * * When a PMTU/redirect information update invalidates a route, * this is indicated by setting obsolete to DST_OBSOLETE_KILL or * DST_OBSOLETE_DEAD. */ if (READ_ONCE(dst->obsolete) != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt)) return NULL; return dst; } EXPORT_INDIRECT_CALLABLE(ipv4_dst_check); static void ipv4_send_dest_unreach(struct sk_buff *skb) { struct net_device *dev; struct ip_options opt; int res; /* Recompile ip options since IPCB may not be valid anymore. * Also check we have a reasonable ipv4 header. */ if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) || ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5) return; memset(&opt, 0, sizeof(opt)); if (ip_hdr(skb)->ihl > 5) { if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4)) return; opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev; res = __ip_options_compile(dev_net(dev), &opt, skb, NULL); rcu_read_unlock(); if (res) return; } __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt); } static void ipv4_link_failure(struct sk_buff *skb) { struct rtable *rt; ipv4_send_dest_unreach(skb); rt = skb_rtable(skb); if (rt) dst_set_expires(&rt->dst, 0); } static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb) { pr_debug("%s: %pI4 -> %pI4, %s\n", __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, skb->dev ? 
skb->dev->name : "?"); kfree_skb(skb); WARN_ON(1); return 0; } /* * We do not cache source address of outgoing interface, * because it is used only by IP RR, TS and SRR options, * so that it out of fast path. * * BTW remember: "addr" is allowed to be not aligned * in IP options! */ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) { __be32 src; if (rt_is_output_route(rt)) src = ip_hdr(skb)->saddr; else { struct fib_result res; struct iphdr *iph = ip_hdr(skb); struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)), .flowi4_oif = rt->dst.dev->ifindex, .flowi4_iif = skb->dev->ifindex, .flowi4_mark = skb->mark, }; rcu_read_lock(); if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0) src = fib_result_prefsrc(dev_net(rt->dst.dev), &res); else src = inet_select_addr(rt->dst.dev, rt_nexthop(rt, iph->daddr), RT_SCOPE_UNIVERSE); rcu_read_unlock(); } memcpy(addr, &src, 4); } #ifdef CONFIG_IP_ROUTE_CLASSID static void set_class_tag(struct rtable *rt, u32 tag) { if (!(rt->dst.tclassid & 0xFFFF)) rt->dst.tclassid |= tag & 0xFFFF; if (!(rt->dst.tclassid & 0xFFFF0000)) rt->dst.tclassid |= tag & 0xFFFF0000; } #endif static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr); unsigned int advmss; struct net *net; rcu_read_lock(); net = dev_net_rcu(dst_dev(dst)); advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, net->ipv4.ip_rt_min_advmss); rcu_read_unlock(); return min(advmss, IPV4_MAX_PMTU - header_size); } INDIRECT_CALLABLE_SCOPE unsigned int ipv4_mtu(const struct dst_entry *dst) { return ip_dst_mtu_maybe_forward(dst, false); } EXPORT_INDIRECT_CALLABLE(ipv4_mtu); static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr) { struct fnhe_hash_bucket *hash; struct fib_nh_exception *fnhe, __rcu **fnhe_p; u32 hval = fnhe_hashfun(daddr); spin_lock_bh(&fnhe_lock); hash = rcu_dereference_protected(nhc->nhc_exceptions, lockdep_is_held(&fnhe_lock)); hash += hval; fnhe_p = &hash->chain; fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); while (fnhe) { if (fnhe->fnhe_daddr == daddr) { rcu_assign_pointer(*fnhe_p, rcu_dereference_protected( fnhe->fnhe_next, lockdep_is_held(&fnhe_lock))); /* set fnhe_daddr to 0 to ensure it won't bind with * new dsts in rt_bind_exception(). */ fnhe->fnhe_daddr = 0; fnhe_flush_routes(fnhe); kfree_rcu(fnhe, rcu); break; } fnhe_p = &fnhe->fnhe_next; fnhe = rcu_dereference_protected(fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)); } spin_unlock_bh(&fnhe_lock); } static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc, __be32 daddr) { struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions); struct fib_nh_exception *fnhe; u32 hval; if (!hash) return NULL; hval = fnhe_hashfun(daddr); for (fnhe = rcu_dereference(hash[hval].chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { if (fnhe->fnhe_daddr == daddr) { if (fnhe->fnhe_expires && time_after(jiffies, fnhe->fnhe_expires)) { ip_del_fnhe(nhc, daddr); break; } return fnhe; } } return NULL; } /* MTU selection: * 1. mtu on route is locked - use it * 2. mtu from nexthop exception * 3. 
mtu from egress device */ u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr) { struct fib_nh_common *nhc = res->nhc; struct net_device *dev = nhc->nhc_dev; struct fib_info *fi = res->fi; u32 mtu = 0; if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) || fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU)) mtu = fi->fib_mtu; if (likely(!mtu)) { struct fib_nh_exception *fnhe; fnhe = find_exception(nhc, daddr); if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires)) mtu = fnhe->fnhe_pmtu; } if (likely(!mtu)) mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU); return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu); } static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, __be32 daddr, const bool do_cache) { bool ret = false; spin_lock_bh(&fnhe_lock); if (daddr == fnhe->fnhe_daddr) { struct rtable __rcu **porig; struct rtable *orig; int genid = fnhe_genid(dev_net(rt->dst.dev)); if (rt_is_input_route(rt)) porig = &fnhe->fnhe_rth_input; else porig = &fnhe->fnhe_rth_output; orig = rcu_dereference(*porig); if (fnhe->fnhe_genid != genid) { fnhe->fnhe_genid = genid; fnhe->fnhe_gw = 0; fnhe->fnhe_pmtu = 0; fnhe->fnhe_expires = 0; fnhe->fnhe_mtu_locked = false; fnhe_flush_routes(fnhe); orig = NULL; } fill_route_from_fnhe(rt, fnhe); if (!rt->rt_gw4) { rt->rt_gw4 = daddr; rt->rt_gw_family = AF_INET; } if (do_cache) { dst_hold(&rt->dst); rcu_assign_pointer(*porig, rt); if (orig) { dst_dev_put(&orig->dst); dst_release(&orig->dst); } ret = true; } fnhe->fnhe_stamp = jiffies; } spin_unlock_bh(&fnhe_lock); return ret; } static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt) { struct rtable *orig, *prev, **p; bool ret = true; if (rt_is_input_route(rt)) { p = (struct rtable **)&nhc->nhc_rth_input; } else { p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output); } orig = *p; /* hold dst before doing cmpxchg() to avoid race condition * on this dst */ dst_hold(&rt->dst); prev = cmpxchg(p, orig, rt); if (prev == orig) { if (orig) { rt_add_uncached_list(orig); dst_release(&orig->dst); } } else { dst_release(&rt->dst); ret = false; } return ret; } struct uncached_list { spinlock_t lock; struct list_head head; }; static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list); void rt_add_uncached_list(struct rtable *rt) { struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list); rt->dst.rt_uncached_list = ul; spin_lock_bh(&ul->lock); list_add_tail(&rt->dst.rt_uncached, &ul->head); spin_unlock_bh(&ul->lock); } void rt_del_uncached_list(struct rtable *rt) { if (!list_empty(&rt->dst.rt_uncached)) { struct uncached_list *ul = rt->dst.rt_uncached_list; spin_lock_bh(&ul->lock); list_del_init(&rt->dst.rt_uncached); spin_unlock_bh(&ul->lock); } } static void ipv4_dst_destroy(struct dst_entry *dst) { ip_dst_metrics_put(dst); rt_del_uncached_list(dst_rtable(dst)); } void rt_flush_dev(struct net_device *dev) { struct rtable *rt, *safe; int cpu; for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); if (list_empty(&ul->head)) continue; spin_lock_bh(&ul->lock); list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) { if (rt->dst.dev != dev) continue; rt->dst.dev = blackhole_netdev; netdev_ref_replace(dev, blackhole_netdev, &rt->dst.dev_tracker, GFP_ATOMIC); list_del_init(&rt->dst.rt_uncached); } spin_unlock_bh(&ul->lock); } } static bool rt_cache_valid(const struct rtable *rt) { return rt && READ_ONCE(rt->dst.obsolete) == DST_OBSOLETE_FORCE_CHK && !rt_is_expired(rt); } static void rt_set_nexthop(struct rtable *rt, 
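/* Descriptive note (editor's summary, not part of the original file):
 * rt_set_nexthop() is where a freshly built rtable gets attached to its
 * nexthop.  Caching happens at one of three levels:
 *   - a nexthop exception (fnhe) via rt_bind_exception(), when a
 *     per-destination exception (redirect/PMTU) exists;
 *   - the per-nexthop cache via rt_cache_route(): nhc_rth_input for input
 *     routes, the per-CPU nhc_pcpu_rth_output for output routes;
 *   - otherwise the route goes on the per-CPU uncached list so that
 *     rt_flush_dev() can still find and retire it when a device goes away.
 */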
__be32 daddr, const struct fib_result *res, struct fib_nh_exception *fnhe, struct fib_info *fi, u16 type, u32 itag, const bool do_cache) { bool cached = false; if (fi) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) { rt->rt_uses_gateway = 1; rt->rt_gw_family = nhc->nhc_gw_family; /* only INET and INET6 are supported */ if (likely(nhc->nhc_gw_family == AF_INET)) rt->rt_gw4 = nhc->nhc_gw.ipv4; else rt->rt_gw6 = nhc->nhc_gw.ipv6; } ip_dst_init_metrics(&rt->dst, fi->fib_metrics); #ifdef CONFIG_IP_ROUTE_CLASSID if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); rt->dst.tclassid = nh->nh_tclassid; } #endif rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); if (unlikely(fnhe)) cached = rt_bind_exception(rt, fnhe, daddr, do_cache); else if (do_cache) cached = rt_cache_route(nhc, rt); if (unlikely(!cached)) { /* Routes we intend to cache in nexthop exception or * FIB nexthop have the DST_NOCACHE bit clear. * However, if we are unsuccessful at storing this * route into the cache we really need to set it. */ if (!rt->rt_gw4) { rt->rt_gw_family = AF_INET; rt->rt_gw4 = daddr; } rt_add_uncached_list(rt); } } else rt_add_uncached_list(rt); #ifdef CONFIG_IP_ROUTE_CLASSID #ifdef CONFIG_IP_MULTIPLE_TABLES set_class_tag(rt, res->tclassid); #endif set_class_tag(rt, itag); #endif } struct rtable *rt_dst_alloc(struct net_device *dev, unsigned int flags, u16 type, bool noxfrm) { struct rtable *rt; rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK, (noxfrm ? DST_NOXFRM : 0)); if (rt) { rt->rt_genid = rt_genid_ipv4(dev_net(dev)); rt->rt_flags = flags; rt->rt_type = type; rt->rt_is_input = 0; rt->rt_iif = 0; rt->rt_pmtu = 0; rt->rt_mtu_locked = 0; rt->rt_uses_gateway = 0; rt->rt_gw_family = 0; rt->rt_gw4 = 0; rt->dst.output = ip_output; if (flags & RTCF_LOCAL) rt->dst.input = ip_local_deliver; } return rt; } EXPORT_SYMBOL(rt_dst_alloc); struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt) { struct rtable *new_rt; new_rt = dst_alloc(&ipv4_dst_ops, dev, DST_OBSOLETE_FORCE_CHK, rt->dst.flags); if (new_rt) { new_rt->rt_genid = rt_genid_ipv4(dev_net(dev)); new_rt->rt_flags = rt->rt_flags; new_rt->rt_type = rt->rt_type; new_rt->rt_is_input = rt->rt_is_input; new_rt->rt_iif = rt->rt_iif; new_rt->rt_pmtu = rt->rt_pmtu; new_rt->rt_mtu_locked = rt->rt_mtu_locked; new_rt->rt_gw_family = rt->rt_gw_family; if (rt->rt_gw_family == AF_INET) new_rt->rt_gw4 = rt->rt_gw4; else if (rt->rt_gw_family == AF_INET6) new_rt->rt_gw6 = rt->rt_gw6; new_rt->dst.input = READ_ONCE(rt->dst.input); new_rt->dst.output = READ_ONCE(rt->dst.output); new_rt->dst.error = rt->dst.error; new_rt->dst.lastuse = jiffies; new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate); } return new_rt; } EXPORT_SYMBOL(rt_dst_clone); /* called in rcu_read_lock() section */ enum skb_drop_reason ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct in_device *in_dev, u32 *itag) { enum skb_drop_reason reason; /* Primary sanity checks. 
*/ if (!in_dev) return SKB_DROP_REASON_NOT_SPECIFIED; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) return SKB_DROP_REASON_IP_INVALID_SOURCE; if (skb->protocol != htons(ETH_P_IP)) return SKB_DROP_REASON_INVALID_PROTO; if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev)) return SKB_DROP_REASON_IP_LOCALNET; if (ipv4_is_zeronet(saddr)) { if (!ipv4_is_local_multicast(daddr) && ip_hdr(skb)->protocol != IPPROTO_IGMP) return SKB_DROP_REASON_IP_INVALID_SOURCE; } else { reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, dev, in_dev, itag); if (reason) return reason; } return SKB_NOT_DROPPED_YET; } /* called in rcu_read_lock() section */ static enum skb_drop_reason ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, int our) { struct in_device *in_dev = __in_dev_get_rcu(dev); unsigned int flags = RTCF_MULTICAST; enum skb_drop_reason reason; struct rtable *rth; u32 itag = 0; reason = ip_mc_validate_source(skb, daddr, saddr, dscp, dev, in_dev, &itag); if (reason) return reason; if (our) flags |= RTCF_LOCAL; if (IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST, false); if (!rth) return SKB_DROP_REASON_NOMEM; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->dst.output = ip_rt_bug; rth->rt_is_input= 1; #ifdef CONFIG_IP_MROUTE if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) rth->dst.input = ip_mr_input; #endif RT_CACHE_STAT_INC(in_slow_mc); skb_dst_drop(skb); skb_dst_set(skb, &rth->dst); return SKB_NOT_DROPPED_YET; } static void ip_handle_martian_source(struct net_device *dev, struct in_device *in_dev, struct sk_buff *skb, __be32 daddr, __be32 saddr) { RT_CACHE_STAT_INC(in_martian_src); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { /* * RFC1812 recommendation, if source is martian, * the only hint is MAC header. */ pr_warn("martian source %pI4 from %pI4, on dev %s\n", &daddr, &saddr, dev->name); if (dev->hard_header_len && skb_mac_header_was_set(skb)) { print_hex_dump(KERN_WARNING, "ll header: ", DUMP_PREFIX_OFFSET, 16, 1, skb_mac_header(skb), dev->hard_header_len, false); } } #endif } /* called in rcu_read_lock() section */ static enum skb_drop_reason __mkroute_input(struct sk_buff *skb, const struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, dscp_t dscp) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct fib_nh_common *nhc = FIB_RES_NHC(*res); struct net_device *dev = nhc->nhc_dev; struct fib_nh_exception *fnhe; struct rtable *rth; int err; struct in_device *out_dev; bool do_cache; u32 itag = 0; /* get a working reference to the output device */ out_dev = __in_dev_get_rcu(dev); if (!out_dev) { net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); return reason; } err = fib_validate_source(skb, saddr, daddr, dscp, FIB_RES_OIF(*res), in_dev->dev, in_dev, &itag); if (err < 0) { reason = -err; ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); goto cleanup; } do_cache = res->fi && !itag; if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && skb->protocol == htons(ETH_P_IP)) { __be32 gw; gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0; if (IN_DEV_SHARED_MEDIA(out_dev) || inet_addr_onlink(out_dev, saddr, gw)) IPCB(skb)->flags |= IPSKB_DOREDIRECT; } if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is * invalid for proxy arp. 
DNAT routes are always valid. * * Proxy arp feature have been extended to allow, ARP * replies back to the same interface, to support * Private VLAN switch technologies. See arp.c. */ if (out_dev == in_dev && IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { reason = SKB_DROP_REASON_ARP_PVLAN_DISABLE; goto cleanup; } } if (IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; fnhe = find_exception(nhc, daddr); if (do_cache) { if (fnhe) rth = rcu_dereference(fnhe->fnhe_rth_input); else rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); goto out; } } rth = rt_dst_alloc(out_dev->dev, 0, res->type, IN_DEV_ORCONF(out_dev, NOXFRM)); if (!rth) { reason = SKB_DROP_REASON_NOMEM; goto cleanup; } rth->rt_is_input = 1; RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag, do_cache); lwtunnel_set_redirect(&rth->dst); skb_dst_set(skb, &rth->dst); out: reason = SKB_NOT_DROPPED_YET; cleanup: return reason; } #ifdef CONFIG_IP_ROUTE_MULTIPATH /* To make ICMP packets follow the right flow, the multipath hash is * calculated from the inner IP addresses. */ static void ip_multipath_l3_keys(const struct sk_buff *skb, struct flow_keys *hash_keys) { const struct iphdr *outer_iph = ip_hdr(skb); const struct iphdr *key_iph = outer_iph; const struct iphdr *inner_iph; const struct icmphdr *icmph; struct iphdr _inner_iph; struct icmphdr _icmph; if (likely(outer_iph->protocol != IPPROTO_ICMP)) goto out; if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0)) goto out; icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph), &_icmph); if (!icmph) goto out; if (!icmp_is_err(icmph->type)) goto out; inner_iph = skb_header_pointer(skb, outer_iph->ihl * 4 + sizeof(_icmph), sizeof(_inner_iph), &_inner_iph); if (!inner_iph) goto out; key_iph = inner_iph; out: hash_keys->addrs.v4addrs.src = key_iph->saddr; hash_keys->addrs.v4addrs.dst = key_iph->daddr; } static u32 fib_multipath_custom_hash_outer(const struct net *net, const struct sk_buff *skb, bool *p_has_inner) { u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys keys, hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, FLOW_DISSECTOR_F_STOP_AT_ENCAP); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = keys.ports.dst; *p_has_inner = !!(keys.control.flags & FLOW_DIS_ENCAPSULATION); return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 fib_multipath_custom_hash_inner(const struct net *net, const struct sk_buff *skb, bool has_inner) { u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys keys, hash_keys; /* We assume the packet carries an encapsulation, but if none was * encountered during dissection of the outer flow, then there is no * point in calling the flow dissector again. 
*/ if (!has_inner) return 0; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); skb_flow_dissect_flow_keys(skb, &keys, 0); if (!(keys.control.flags & FLOW_DIS_ENCAPSULATION)) return 0; if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP) hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP) hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL) hash_keys.tags.flow_label = keys.tags.flow_label; } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO) hash_keys.basic.ip_proto = keys.basic.ip_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT) hash_keys.ports.src = keys.ports.src; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT) hash_keys.ports.dst = keys.ports.dst; return fib_multipath_hash_from_keys(net, &hash_keys); } static u32 fib_multipath_custom_hash_skb(const struct net *net, const struct sk_buff *skb) { u32 mhash, mhash_inner; bool has_inner = true; mhash = fib_multipath_custom_hash_outer(net, skb, &has_inner); mhash_inner = fib_multipath_custom_hash_inner(net, skb, has_inner); return jhash_2words(mhash, mhash_inner, 0); } static u32 fib_multipath_custom_hash_fl4(const struct net *net, const struct flowi4 *fl4) { u32 hash_fields = READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_fields); struct flow_keys hash_keys; if (!(hash_fields & FIB_MULTIPATH_HASH_FIELD_OUTER_MASK)) return 0; memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_IP) hash_keys.addrs.v4addrs.src = fl4->saddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_IP) hash_keys.addrs.v4addrs.dst = fl4->daddr; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_IP_PROTO) hash_keys.basic.ip_proto = fl4->flowi4_proto; if (hash_fields & FIB_MULTIPATH_HASH_FIELD_SRC_PORT) { if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) hash_keys.ports.src = (__force __be16)get_random_u16(); else hash_keys.ports.src = fl4->fl4_sport; } if (hash_fields & FIB_MULTIPATH_HASH_FIELD_DST_PORT) hash_keys.ports.dst = fl4->fl4_dport; return fib_multipath_hash_from_keys(net, &hash_keys); } /* if skb is set it will be used and fl4 can be NULL */ int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys) { u32 multipath_hash = fl4 ? 
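/* Descriptive note (editor's summary, not part of the original file): the
 * switch in fib_multipath_hash() below keys off
 * net->ipv4.sysctl_fib_multipath_hash_policy:
 *   0 - L3 hash on source/destination address (ICMP errors hash on the
 *       inner header via ip_multipath_l3_keys());
 *   1 - L4 five-tuple hash;
 *   2 - L3 hash taken from the inner header when an encapsulation is
 *       present, otherwise as policy 0;
 *   3 - custom hash over the fields selected by
 *       fib_multipath_hash_fields.
 * The result is mixed with any precomputed fl4->flowi4_multipath_hash and
 * shifted right by one, so the returned hash never has the top bit set.
 */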
fl4->flowi4_multipath_hash : 0; struct flow_keys hash_keys; u32 mhash = 0; switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) { case 0: memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; if (skb) { ip_multipath_l3_keys(skb, &hash_keys); } else { hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 1: /* skb is currently provided only when forwarding */ if (skb) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; struct flow_keys keys; /* short-circuit if we already have L4 hash present */ if (skb->l4_hash) return skb_get_hash_raw(skb) >> 1; memset(&hash_keys, 0, sizeof(hash_keys)); if (!flkeys) { skb_flow_dissect_flow_keys(skb, &keys, flag); flkeys = &keys; } hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src; hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst; hash_keys.ports.src = flkeys->ports.src; hash_keys.ports.dst = flkeys->ports.dst; hash_keys.basic.ip_proto = flkeys->basic.ip_proto; } else { memset(&hash_keys, 0, sizeof(hash_keys)); hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; if (fl4->flowi4_flags & FLOWI_FLAG_ANY_SPORT) hash_keys.ports.src = (__force __be16)get_random_u16(); else hash_keys.ports.src = fl4->fl4_sport; hash_keys.ports.dst = fl4->fl4_dport; hash_keys.basic.ip_proto = fl4->flowi4_proto; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 2: memset(&hash_keys, 0, sizeof(hash_keys)); /* skb is currently provided only when forwarding */ if (skb) { struct flow_keys keys; skb_flow_dissect_flow_keys(skb, &keys, 0); /* Inner can be v4 or v6 */ if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) { hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src; hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst; hash_keys.tags.flow_label = keys.tags.flow_label; hash_keys.basic.ip_proto = keys.basic.ip_proto; } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; ip_multipath_l3_keys(skb, &hash_keys); } } else { /* Same as case 0 */ hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; hash_keys.addrs.v4addrs.src = fl4->saddr; hash_keys.addrs.v4addrs.dst = fl4->daddr; } mhash = fib_multipath_hash_from_keys(net, &hash_keys); break; case 3: if (skb) mhash = fib_multipath_custom_hash_skb(net, skb); else mhash = fib_multipath_custom_hash_fl4(net, fl4); break; } if (multipath_hash) mhash = jhash_2words(mhash, multipath_hash, 0); return mhash >> 1; } #endif /* CONFIG_IP_ROUTE_MULTIPATH */ static enum skb_drop_reason ip_mkroute_input(struct sk_buff *skb, struct fib_result *res, struct in_device *in_dev, __be32 daddr, __be32 saddr, dscp_t dscp, struct flow_keys *hkeys) { #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && fib_info_num_path(res->fi) > 1) { int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys); fib_select_multipath(res, h, NULL); IPCB(skb)->flags |= IPSKB_MULTIPATH; } #endif /* create a routing cache entry */ return __mkroute_input(skb, res, in_dev, daddr, saddr, dscp); } /* 
Implements all the saddr-related checks as ip_route_input_slow(), * assuming daddr is valid and the destination is not a local broadcast one. * Uses the provided hint instead of performing a route lookup. */ enum skb_drop_reason ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, const struct sk_buff *hint) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); struct rtable *rt = skb_rtable(hint); struct net *net = dev_net(dev); u32 tag = 0; if (!in_dev) return reason; if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; } if (ipv4_is_zeronet(saddr)) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; } if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_source; } if (rt->rt_type != RTN_LOCAL) goto skip_validate_source; reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev, in_dev, &tag); if (reason) goto martian_source; skip_validate_source: skb_dst_copy(skb, hint); return SKB_NOT_DROPPED_YET; martian_source: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); return reason; } /* get device for dst_alloc with local routes */ static struct net_device *ip_rt_get_dev(struct net *net, const struct fib_result *res) { struct fib_nh_common *nhc = res->fi ? res->nhc : NULL; struct net_device *dev = NULL; if (nhc) dev = l3mdev_master_dev_rcu(nhc->nhc_dev); return dev ? : net->loopback_dev; } /* * NOTE. We drop all the packets that has local source * addresses, because every properly looped back packet * must have correct destination already attached by output routine. * Changes in the enforced policies must be applied also to * ip_route_use_hint(). * * Such approach solves two big problems: * 1. Not simplex devices are handled properly. * 2. IP spoofing attempts are filtered with 100% of guarantee. * called with rcu_read_lock() */ static enum skb_drop_reason ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct fib_result *res) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); struct flow_keys *flkeys = NULL, _flkeys; struct net *net = dev_net(dev); struct ip_tunnel_info *tun_info; int err = -EINVAL; unsigned int flags = 0; u32 itag = 0; struct rtable *rth; struct flowi4 fl4; bool do_cache = true; /* IP on this device is disabled. */ if (!in_dev) goto out; /* Check for the most weird martians, which can be not detected * by fib_lookup. */ tun_info = skb_tunnel_info(skb); if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; else fl4.flowi4_tun_key.tun_id = 0; skb_dst_drop(skb); if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; } res->fi = NULL; res->table = NULL; if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; /* Accept zero addresses only to limited broadcast; * I even do not know to fix it or not. 
Waiting for complains :-) */ if (ipv4_is_zeronet(saddr)) { reason = SKB_DROP_REASON_IP_INVALID_SOURCE; goto martian_source; } if (ipv4_is_zeronet(daddr)) { reason = SKB_DROP_REASON_IP_INVALID_DEST; goto martian_destination; } /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(), * and call it once if daddr or/and saddr are loopback addresses */ if (ipv4_is_loopback(daddr)) { if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_destination; } } else if (ipv4_is_loopback(saddr)) { if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) { reason = SKB_DROP_REASON_IP_LOCALNET; goto martian_source; } } /* * Now we are ready to route packet. */ fl4.flowi4_l3mdev = 0; fl4.flowi4_oif = 0; fl4.flowi4_iif = dev->ifindex; fl4.flowi4_mark = skb->mark; fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; fl4.daddr = daddr; fl4.saddr = saddr; fl4.flowi4_uid = sock_net_uid(net, NULL); fl4.flowi4_multipath_hash = 0; if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) { flkeys = &_flkeys; } else { fl4.flowi4_proto = 0; fl4.fl4_sport = 0; fl4.fl4_dport = 0; } err = fib_lookup(net, &fl4, res, 0); if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) err = -EHOSTUNREACH; goto no_route; } if (res->type == RTN_BROADCAST) { if (IN_DEV_BFORWARD(in_dev)) goto make_route; /* not do cache if bc_forwarding is enabled */ if (IPV4_DEVCONF_ALL_RO(net, BC_FORWARDING)) do_cache = false; goto brd_input; } err = -EINVAL; if (res->type == RTN_LOCAL) { reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev, in_dev, &itag); if (reason) goto martian_source; goto local_input; } if (!IN_DEV_FORWARD(in_dev)) { err = -EHOSTUNREACH; goto no_route; } if (res->type != RTN_UNICAST) { reason = SKB_DROP_REASON_IP_INVALID_DEST; goto martian_destination; } make_route: reason = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp, flkeys); out: return reason; brd_input: if (skb->protocol != htons(ETH_P_IP)) { reason = SKB_DROP_REASON_INVALID_PROTO; goto out; } if (!ipv4_is_zeronet(saddr)) { reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0, dev, in_dev, &itag); if (reason) goto martian_source; } flags |= RTCF_BROADCAST; res->type = RTN_BROADCAST; RT_CACHE_STAT_INC(in_brd); local_input: if (IN_DEV_ORCONF(in_dev, NOPOLICY)) IPCB(skb)->flags |= IPSKB_NOPOLICY; do_cache &= res->fi && !itag; if (do_cache) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); rth = rcu_dereference(nhc->nhc_rth_input); if (rt_cache_valid(rth)) { skb_dst_set_noref(skb, &rth->dst); reason = SKB_NOT_DROPPED_YET; goto out; } } rth = rt_dst_alloc(ip_rt_get_dev(net, res), flags | RTCF_LOCAL, res->type, false); if (!rth) goto e_nobufs; rth->dst.output= ip_rt_bug; #ifdef CONFIG_IP_ROUTE_CLASSID rth->dst.tclassid = itag; #endif rth->rt_is_input = 1; RT_CACHE_STAT_INC(in_slow_tot); if (res->type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; rth->rt_flags &= ~RTCF_LOCAL; } if (do_cache) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); if (lwtunnel_input_redirect(rth->dst.lwtstate)) { WARN_ON(rth->dst.input == lwtunnel_input); rth->dst.lwtstate->orig_input = rth->dst.input; rth->dst.input = lwtunnel_input; } if (unlikely(!rt_cache_route(nhc, rth))) rt_add_uncached_list(rth); } skb_dst_set(skb, &rth->dst); reason = SKB_NOT_DROPPED_YET; goto out; no_route: RT_CACHE_STAT_INC(in_no_route); res->type = RTN_UNREACHABLE; res->fi = NULL; res->table = NULL; goto local_input; /* * Do not cache 
martian addresses: they should be logged (RFC1812) */ martian_destination: RT_CACHE_STAT_INC(in_martian_dst); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev)) net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", &daddr, &saddr, dev->name); #endif goto out; e_nobufs: reason = SKB_DROP_REASON_NOMEM; goto out; martian_source: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); goto out; } /* called with rcu_read_lock held */ static enum skb_drop_reason ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev, struct fib_result *res) { /* Multicast recognition logic is moved from route cache to here. * The problem was that too many Ethernet cards have broken/missing * hardware multicast filters :-( As result the host on multicasting * network acquires a lot of useless route cache entries, sort of * SDR messages from all the world. Now we try to get rid of them. * Really, provided software IP multicast filter is organized * reasonably (at least, hashed), it does not result in a slowdown * comparing with route cache reject entries. * Note, that multicast routers are not affected, because * route cache entry is created eventually. */ if (ipv4_is_multicast(daddr)) { enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; struct in_device *in_dev = __in_dev_get_rcu(dev); int our = 0; if (!in_dev) return reason; our = ip_check_mc_rcu(in_dev, daddr, saddr, ip_hdr(skb)->protocol); /* check l3 master if no match yet */ if (!our && netif_is_l3_slave(dev)) { struct in_device *l3_in_dev; l3_in_dev = __in_dev_get_rcu(skb->dev); if (l3_in_dev) our = ip_check_mc_rcu(l3_in_dev, daddr, saddr, ip_hdr(skb)->protocol); } if (our #ifdef CONFIG_IP_MROUTE || (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev)) #endif ) { reason = ip_route_input_mc(skb, daddr, saddr, dscp, dev, our); } return reason; } return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res); } enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, dscp_t dscp, struct net_device *dev) { enum skb_drop_reason reason; struct fib_result res; rcu_read_lock(); reason = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &res); rcu_read_unlock(); return reason; } EXPORT_SYMBOL(ip_route_input_noref); /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, const struct flowi4 *fl4, int orig_oif, struct net_device *dev_out, unsigned int flags) { struct fib_info *fi = res->fi; struct fib_nh_exception *fnhe; struct in_device *in_dev; u16 type = res->type; struct rtable *rth; bool do_cache; in_dev = __in_dev_get_rcu(dev_out); if (!in_dev) return ERR_PTR(-EINVAL); if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK) && !netif_is_l3_master(dev_out)) return ERR_PTR(-EINVAL); if (ipv4_is_lbcast(fl4->daddr)) { type = RTN_BROADCAST; /* reset fi to prevent gateway resolution */ fi = NULL; } else if (ipv4_is_multicast(fl4->daddr)) { type = RTN_MULTICAST; } else if (ipv4_is_zeronet(fl4->daddr)) { return ERR_PTR(-EINVAL); } if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; do_cache = true; if (type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; } else if (type == RTN_MULTICAST) { flags |= RTCF_MULTICAST | RTCF_LOCAL; if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, fl4->flowi4_proto)) flags &= ~RTCF_LOCAL; else do_cache = false; /* If multicast route do not exist use * default one, but do not gateway in this case. 
* Yes, it is hack. */ if (fi && res->prefixlen < 4) fi = NULL; } else if ((type == RTN_LOCAL) && (orig_oif != 0) && (orig_oif != dev_out->ifindex)) { /* For local routes that require a particular output interface * we do not want to cache the result. Caching the result * causes incorrect behaviour when there are multiple source * addresses on the interface, the end result being that if the * intended recipient is waiting on that interface for the * packet he won't receive it because it will be delivered on * the loopback interface and the IP_PKTINFO ipi_ifindex will * be set to the loopback interface as well. */ do_cache = false; } fnhe = NULL; do_cache &= fi != NULL; if (fi) { struct fib_nh_common *nhc = FIB_RES_NHC(*res); struct rtable __rcu **prth; fnhe = find_exception(nhc, fl4->daddr); if (!do_cache) goto add; if (fnhe) { prth = &fnhe->fnhe_rth_output; } else { if (unlikely(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH && !(nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK))) { do_cache = false; goto add; } prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output); } rth = rcu_dereference(*prth); if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) return rth; } add: rth = rt_dst_alloc(dev_out, flags, type, IN_DEV_ORCONF(in_dev, NOXFRM)); if (!rth) return ERR_PTR(-ENOBUFS); rth->rt_iif = orig_oif; RT_CACHE_STAT_INC(out_slow_tot); if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { if (flags & RTCF_LOCAL && !(dev_out->flags & IFF_LOOPBACK)) { rth->dst.output = ip_mc_output; RT_CACHE_STAT_INC(out_slow_mc); } #ifdef CONFIG_IP_MROUTE if (type == RTN_MULTICAST) { if (IN_DEV_MFORWARD(in_dev) && !ipv4_is_local_multicast(fl4->daddr)) { rth->dst.input = ip_mr_input; rth->dst.output = ip_mr_output; } } #endif } rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); lwtunnel_set_redirect(&rth->dst); return rth; } /* * Major route resolver routine. */ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, const struct sk_buff *skb) { struct fib_result res = { .type = RTN_UNSPEC, .fi = NULL, .table = NULL, .tclassid = 0, }; struct rtable *rth; fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_tos &= INET_DSCP_MASK; rcu_read_lock(); rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); rcu_read_unlock(); return rth; } EXPORT_SYMBOL_GPL(ip_route_output_key_hash); struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, struct fib_result *res, const struct sk_buff *skb) { struct net_device *dev_out = NULL; int orig_oif = fl4->flowi4_oif; unsigned int flags = 0; struct rtable *rth; int err; if (fl4->saddr) { if (ipv4_is_multicast(fl4->saddr) || ipv4_is_lbcast(fl4->saddr)) { rth = ERR_PTR(-EINVAL); goto out; } rth = ERR_PTR(-ENETUNREACH); /* I removed check for oif == dev_out->oif here. * It was wrong for two reasons: * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr * is assigned to multiple interfaces. * 2. Moreover, we are allowed to send packets with saddr * of another iface. --ANK */ if (fl4->flowi4_oif == 0 && (ipv4_is_multicast(fl4->daddr) || ipv4_is_lbcast(fl4->daddr))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = __ip_dev_find(net, fl4->saddr, false); if (!dev_out) goto out; /* Special hack: user can direct multicasts * and limited broadcast via necessary interface * without fiddling with IP_MULTICAST_IF or IP_PKTINFO. * This hack is not just for fun, it allows * vic,vat and friends to work. * They bind socket to loopback, set ttl to zero * and expect that it will work. 
* From the viewpoint of routing cache they are broken, * because we are not allowed to build multicast path * with loopback source addr (look, routing cache * cannot know, that ttl is zero, so that packet * will not leave this host and route is valid). * Luckily, this hack is good workaround. */ fl4->flowi4_oif = dev_out->ifindex; goto make_route; } if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ if (!__ip_dev_find(net, fl4->saddr, false)) goto out; } } if (fl4->flowi4_oif) { dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); rth = ERR_PTR(-ENODEV); if (!dev_out) goto out; /* RACE: Check return value of inet_select_addr instead. */ if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { rth = ERR_PTR(-ENETUNREACH); goto out; } if (ipv4_is_local_multicast(fl4->daddr) || ipv4_is_lbcast(fl4->daddr) || fl4->flowi4_proto == IPPROTO_IGMP) { if (!fl4->saddr) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); goto make_route; } if (!fl4->saddr) { if (ipv4_is_multicast(fl4->daddr)) fl4->saddr = inet_select_addr(dev_out, 0, fl4->flowi4_scope); else if (!fl4->daddr) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_HOST); } } if (!fl4->daddr) { fl4->daddr = fl4->saddr; if (!fl4->daddr) fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); dev_out = net->loopback_dev; fl4->flowi4_oif = LOOPBACK_IFINDEX; res->type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } err = fib_lookup(net, fl4, res, 0); if (err) { res->fi = NULL; res->table = NULL; if (fl4->flowi4_oif && (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) { /* Apparently, routing tables are wrong. Assume, * that the destination is on link. * * WHY? DW. * Because we are allowed to send to iface * even if it has NO routes and NO assigned * addresses. When oif is specified, routing * tables are looked up with only one purpose: * to catch if destination is gatewayed, rather than * direct. Moreover, if MSG_DONTROUTE is set, * we send packet, ignoring both routing tables * and ifaddr state. --ANK * * * We could make it even if oif is unknown, * likely IPv6, but we do not. */ if (fl4->saddr == 0) fl4->saddr = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); res->type = RTN_UNICAST; goto make_route; } rth = ERR_PTR(err); goto out; } if (res->type == RTN_LOCAL) { if (!fl4->saddr) { if (res->fi->fib_prefsrc) fl4->saddr = res->fi->fib_prefsrc; else fl4->saddr = fl4->daddr; } /* L3 master device is the loopback for that domain */ dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? 
: net->loopback_dev; /* make sure orig_oif points to fib result device even * though packet rx/tx happens over loopback or l3mdev */ orig_oif = FIB_RES_OIF(*res); fl4->flowi4_oif = dev_out->ifindex; flags |= RTCF_LOCAL; goto make_route; } fib_select_path(net, res, fl4, skb); dev_out = FIB_RES_DEV(*res); make_route: rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags); out: return rth; } static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, .default_advmss = ipv4_default_advmss, .neigh_lookup = ipv4_neigh_lookup, .check = dst_blackhole_check, .cow_metrics = dst_blackhole_cow_metrics, .update_pmtu = dst_blackhole_update_pmtu, .redirect = dst_blackhole_redirect, .mtu = dst_blackhole_mtu, }; struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) { struct rtable *ort = dst_rtable(dst_orig); struct rtable *rt; rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0); if (rt) { struct dst_entry *new = &rt->dst; new->__use = 1; new->input = dst_discard; new->output = dst_discard_out; new->dev = net->loopback_dev; netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC); rt->rt_is_input = ort->rt_is_input; rt->rt_iif = ort->rt_iif; rt->rt_pmtu = ort->rt_pmtu; rt->rt_mtu_locked = ort->rt_mtu_locked; rt->rt_genid = rt_genid_ipv4(net); rt->rt_flags = ort->rt_flags; rt->rt_type = ort->rt_type; rt->rt_uses_gateway = ort->rt_uses_gateway; rt->rt_gw_family = ort->rt_gw_family; if (rt->rt_gw_family == AF_INET) rt->rt_gw4 = ort->rt_gw4; else if (rt->rt_gw_family == AF_INET6) rt->rt_gw6 = ort->rt_gw6; } dst_release(dst_orig); return rt ? &rt->dst : ERR_PTR(-ENOMEM); } struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, const struct sock *sk) { struct rtable *rt = __ip_route_output_key(net, flp4); if (IS_ERR(rt)) return rt; if (flp4->flowi4_proto) { flp4->flowi4_oif = rt->dst.dev->ifindex; rt = dst_rtable(xfrm_lookup_route(net, &rt->dst, flowi4_to_flowi(flp4), sk, 0)); } return rt; } EXPORT_SYMBOL_GPL(ip_route_output_flow); /* called with rcu_read_lock held */ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, struct rtable *rt, u32 table_id, dscp_t dscp, struct flowi4 *fl4, struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags) { struct rtmsg *r; struct nlmsghdr *nlh; unsigned long expires = 0; u32 error; u32 metrics[RTAX_MAX]; nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags); if (!nlh) return -EMSGSIZE; r = nlmsg_data(nlh); r->rtm_family = AF_INET; r->rtm_dst_len = 32; r->rtm_src_len = 0; r->rtm_tos = inet_dscp_to_dsfield(dscp); r->rtm_table = table_id < 256 ? 
table_id : RT_TABLE_COMPAT; if (nla_put_u32(skb, RTA_TABLE, table_id)) goto nla_put_failure; r->rtm_type = rt->rt_type; r->rtm_scope = RT_SCOPE_UNIVERSE; r->rtm_protocol = RTPROT_UNSPEC; r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; if (rt->rt_flags & RTCF_NOTIFY) r->rtm_flags |= RTM_F_NOTIFY; if (IPCB(skb)->flags & IPSKB_DOREDIRECT) r->rtm_flags |= RTCF_DOREDIRECT; if (nla_put_in_addr(skb, RTA_DST, dst)) goto nla_put_failure; if (src) { r->rtm_src_len = 32; if (nla_put_in_addr(skb, RTA_SRC, src)) goto nla_put_failure; } if (rt->dst.dev && nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) goto nla_put_failure; if (lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; #ifdef CONFIG_IP_ROUTE_CLASSID if (rt->dst.tclassid && nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) goto nla_put_failure; #endif if (fl4 && !rt_is_input_route(rt) && fl4->saddr != src) { if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) goto nla_put_failure; } if (rt->rt_uses_gateway) { if (rt->rt_gw_family == AF_INET && nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { goto nla_put_failure; } else if (rt->rt_gw_family == AF_INET6) { int alen = sizeof(struct in6_addr); struct nlattr *nla; struct rtvia *via; nla = nla_reserve(skb, RTA_VIA, alen + 2); if (!nla) goto nla_put_failure; via = nla_data(nla); via->rtvia_family = AF_INET6; memcpy(via->rtvia_addr, &rt->rt_gw6, alen); } } expires = READ_ONCE(rt->dst.expires); if (expires) { unsigned long now = jiffies; if (time_before(now, expires)) expires -= now; else expires = 0; } memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); if (rt->rt_pmtu && expires) metrics[RTAX_MTU - 1] = rt->rt_pmtu; if (rt->rt_mtu_locked && expires) metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU); if (rtnetlink_put_metrics(skb, metrics) < 0) goto nla_put_failure; if (fl4) { if (fl4->flowi4_mark && nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) goto nla_put_failure; if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && nla_put_u32(skb, RTA_UID, from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) goto nla_put_failure; if (rt_is_input_route(rt)) { #ifdef CONFIG_IP_MROUTE if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && IPV4_DEVCONF_ALL_RO(net, MC_FORWARDING)) { int err = ipmr_get_route(net, skb, fl4->saddr, fl4->daddr, r, portid); if (err <= 0) { if (err == 0) return 0; goto nla_put_failure; } } else #endif if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) goto nla_put_failure; } } error = rt->dst.error; if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fnhe_hash_bucket *bucket, int genid, int *fa_index, int fa_start, unsigned int flags) { int i; for (i = 0; i < FNHE_HASH_SIZE; i++) { struct fib_nh_exception *fnhe; for (fnhe = rcu_dereference(bucket[i].chain); fnhe; fnhe = rcu_dereference(fnhe->fnhe_next)) { struct rtable *rt; int err; if (*fa_index < fa_start) goto next; if (fnhe->fnhe_genid != genid) goto next; if (fnhe->fnhe_expires && time_after(jiffies, fnhe->fnhe_expires)) goto next; rt = rcu_dereference(fnhe->fnhe_rth_input); if (!rt) rt = rcu_dereference(fnhe->fnhe_rth_output); if (!rt) goto next; err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt, table_id, 0, NULL, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err) return err; next: (*fa_index)++; } } return 0; } int 
fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, u32 table_id, struct fib_info *fi, int *fa_index, int fa_start, unsigned int flags) { struct net *net = sock_net(cb->skb->sk); int nhsel, genid = fnhe_genid(net); for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); struct fnhe_hash_bucket *bucket; int err; if (nhc->nhc_flags & RTNH_F_DEAD) continue; rcu_read_lock(); bucket = rcu_dereference(nhc->nhc_exceptions); err = 0; if (bucket) err = fnhe_dump_bucket(net, skb, cb, table_id, bucket, genid, fa_index, fa_start, flags); rcu_read_unlock(); if (err) return err; } return 0; } static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, u8 ip_proto, __be16 sport, __be16 dport) { struct sk_buff *skb; struct iphdr *iph; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return NULL; /* Reserve room for dummy headers, this skb can pass * through good chunk of routing engine. */ skb_reset_mac_header(skb); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IP); iph = skb_put(skb, sizeof(struct iphdr)); iph->protocol = ip_proto; iph->saddr = src; iph->daddr = dst; iph->version = 0x4; iph->frag_off = 0; iph->ihl = 0x5; skb_set_transport_header(skb, skb->len); switch (iph->protocol) { case IPPROTO_UDP: { struct udphdr *udph; udph = skb_put_zero(skb, sizeof(struct udphdr)); udph->source = sport; udph->dest = dport; udph->len = htons(sizeof(struct udphdr)); udph->check = 0; break; } case IPPROTO_TCP: { struct tcphdr *tcph; tcph = skb_put_zero(skb, sizeof(struct tcphdr)); tcph->source = sport; tcph->dest = dport; tcph->doff = sizeof(struct tcphdr) / 4; tcph->rst = 1; tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), src, dst, 0); break; } case IPPROTO_ICMP: { struct icmphdr *icmph; icmph = skb_put_zero(skb, sizeof(struct icmphdr)); icmph->type = ICMP_ECHO; icmph->code = 0; } } return skb; } static int inet_rtm_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int i, err; rtm = nlmsg_payload(nlh, sizeof(*rtm)); if (!rtm) { NL_SET_ERR_MSG(extack, "ipv4: Invalid header for route get request"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type) { NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request"); return -EINVAL; } if (rtm->rtm_flags & ~(RTM_F_NOTIFY | RTM_F_LOOKUP_TABLE | RTM_F_FIB_MATCH)) { NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, extack); if (err) return err; if ((tb[RTA_SRC] && !rtm->rtm_src_len) || (tb[RTA_DST] && !rtm->rtm_dst_len)) { NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); return -EINVAL; } for (i = 0; i <= RTA_MAX; i++) { if (!tb[i]) continue; switch (i) { case RTA_IIF: case RTA_OIF: case RTA_SRC: case RTA_DST: case RTA_IP_PROTO: case RTA_SPORT: case RTA_DPORT: case RTA_MARK: case RTA_UID: break; default: NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request"); return -EINVAL; } } return 0; } static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = 
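/* Descriptive note (editor's sketch, not part of the original file):
 * inet_rtm_getroute() answers RTM_GETROUTE requests, i.e. what
 * "ip route get" issues from userspace.  A typical request carries
 * RTA_DST (and optionally RTA_SRC, RTA_IIF, RTA_OIF, RTA_MARK, RTA_UID,
 * RTA_IP_PROTO, RTA_SPORT, RTA_DPORT); the handler builds a dummy skb,
 * resolves the route the same way the data path would, and replies with a
 * single RTM_NEWROUTE message.  With RTM_F_FIB_MATCH set
 * ("ip route get fibmatch ...") the reply describes the matching FIB
 * entry rather than the resolved route.
 */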
sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; u32 table_id = RT_TABLE_MAIN; __be16 sport = 0, dport = 0; struct fib_result res = {}; u8 ip_proto = IPPROTO_UDP; struct rtable *rt = NULL; struct sk_buff *skb; struct rtmsg *rtm; struct flowi4 fl4 = {}; __be32 dst = 0; __be32 src = 0; dscp_t dscp; kuid_t uid; u32 iif; int err; int mark; err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) return err; rtm = nlmsg_data(nlh); src = nla_get_in_addr_default(tb[RTA_SRC], 0); dst = nla_get_in_addr_default(tb[RTA_DST], 0); iif = nla_get_u32_default(tb[RTA_IIF], 0); mark = nla_get_u32_default(tb[RTA_MARK], 0); dscp = inet_dsfield_to_dscp(rtm->rtm_tos); if (tb[RTA_UID]) uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); else uid = (iif ? INVALID_UID : current_uid()); if (tb[RTA_IP_PROTO]) { err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], &ip_proto, AF_INET, extack); if (err) return err; } if (tb[RTA_SPORT]) sport = nla_get_be16(tb[RTA_SPORT]); if (tb[RTA_DPORT]) dport = nla_get_be16(tb[RTA_DPORT]); skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport); if (!skb) return -ENOBUFS; fl4.daddr = dst; fl4.saddr = src; fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0); fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; if (sport) fl4.fl4_sport = sport; if (dport) fl4.fl4_dport = dport; fl4.flowi4_proto = ip_proto; rcu_read_lock(); if (iif) { struct net_device *dev; dev = dev_get_by_index_rcu(net, iif); if (!dev) { err = -ENODEV; goto errout_rcu; } fl4.flowi4_iif = iif; /* for rt_fill_info */ skb->dev = dev; skb->mark = mark; err = ip_route_input_rcu(skb, dst, src, dscp, dev, &res) ? -EINVAL : 0; rt = skb_rtable(skb); if (err == 0 && rt->dst.error) err = -rt->dst.error; } else { fl4.flowi4_iif = LOOPBACK_IFINDEX; skb->dev = net->loopback_dev; rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); err = 0; if (IS_ERR(rt)) err = PTR_ERR(rt); else skb_dst_set(skb, &rt->dst); } if (err) goto errout_rcu; if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) table_id = res.table ? 
res.table->tb_id : 0; /* reset skb for netlink reply msg */ skb_trim(skb, 0); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb_reset_mac_header(skb); if (rtm->rtm_flags & RTM_F_FIB_MATCH) { struct fib_rt_info fri; if (!res.fi) { err = fib_props[res.type].error; if (!err) err = -EHOSTUNREACH; goto errout_rcu; } fri.fi = res.fi; fri.tb_id = table_id; fri.dst = res.prefix; fri.dst_len = res.prefixlen; fri.dscp = res.dscp; fri.type = rt->rt_type; fri.offload = 0; fri.trap = 0; fri.offload_failed = 0; if (res.fa_head) { struct fib_alias *fa; hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) { u8 slen = 32 - fri.dst_len; if (fa->fa_slen == slen && fa->tb_id == fri.tb_id && fa->fa_dscp == fri.dscp && fa->fa_info == res.fi && fa->fa_type == fri.type) { fri.offload = READ_ONCE(fa->offload); fri.trap = READ_ONCE(fa->trap); fri.offload_failed = READ_ONCE(fa->offload_failed); break; } } } err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0); } else { err = rt_fill_info(net, dst, src, rt, table_id, res.dscp, &fl4, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0); } if (err < 0) goto errout_rcu; rcu_read_unlock(); err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout_free: return err; errout_rcu: rcu_read_unlock(); kfree_skb(skb); goto errout_free; } void ip_rt_multicast_event(struct in_device *in_dev) { rt_cache_flush(dev_net(in_dev->dev)); } #ifdef CONFIG_SYSCTL static int ip_rt_gc_interval __read_mostly = 60 * HZ; static int ip_rt_gc_min_interval __read_mostly = HZ / 2; static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; static int ipv4_sysctl_rtcache_flush(const struct ctl_table *__ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)__ctl->extra1; if (write) { rt_cache_flush(net); fnhe_genid_bump(net); return 0; } return -EINVAL; } static struct ctl_table ipv4_route_table[] = { { .procname = "gc_thresh", .data = &ipv4_dst_ops.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "max_size", .data = &ip_rt_max_size, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { /* Deprecated. 
Use gc_min_interval_ms */ .procname = "gc_min_interval", .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_min_interval_ms", .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, }, { .procname = "gc_timeout", .data = &ip_rt_gc_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "gc_interval", .data = &ip_rt_gc_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "redirect_load", .data = &ip_rt_redirect_load, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "redirect_number", .data = &ip_rt_redirect_number, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "redirect_silence", .data = &ip_rt_redirect_silence, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "error_cost", .data = &ip_rt_error_cost, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "error_burst", .data = &ip_rt_error_burst, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "gc_elasticity", .data = &ip_rt_gc_elasticity, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static const char ipv4_route_flush_procname[] = "flush"; static struct ctl_table ipv4_route_netns_table[] = { { .procname = ipv4_route_flush_procname, .maxlen = sizeof(int), .mode = 0200, .proc_handler = ipv4_sysctl_rtcache_flush, }, { .procname = "min_pmtu", .data = &init_net.ipv4.ip_rt_min_pmtu, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &ip_min_valid_pmtu, }, { .procname = "mtu_expires", .data = &init_net.ipv4.ip_rt_mtu_expires, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "min_adv_mss", .data = &init_net.ipv4.ip_rt_min_advmss, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, }; static __net_init int sysctl_route_net_init(struct net *net) { struct ctl_table *tbl; size_t table_size = ARRAY_SIZE(ipv4_route_netns_table); tbl = ipv4_route_netns_table; if (!net_eq(net, &init_net)) { int i; tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL); if (!tbl) goto err_dup; /* Don't export non-whitelisted sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { if (tbl[0].procname != ipv4_route_flush_procname) table_size = 0; } /* Update the variables to point into the current struct net * except for the first element flush */ for (i = 1; i < table_size; i++) tbl[i].data += (void *)net - (void *)&init_net; } tbl[0].extra1 = net; net->ipv4.route_hdr = register_net_sysctl_sz(net, "net/ipv4/route", tbl, table_size); if (!net->ipv4.route_hdr) goto err_reg; return 0; err_reg: if (tbl != ipv4_route_netns_table) kfree(tbl); err_dup: return -ENOMEM; } static __net_exit void sysctl_route_net_exit(struct net *net) { const struct ctl_table *tbl; tbl = net->ipv4.route_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.route_hdr); BUG_ON(tbl == ipv4_route_netns_table); kfree(tbl); } static __net_initdata struct pernet_operations sysctl_route_ops = { .init = sysctl_route_net_init, .exit = sysctl_route_net_exit, }; #endif static __net_init int netns_ip_rt_init(struct net *net) { /* Set default value for namespaceified sysctls */ net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU; net->ipv4.ip_rt_mtu_expires = 
DEFAULT_MTU_EXPIRES; net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS; return 0; } static struct pernet_operations __net_initdata ip_rt_ops = { .init = netns_ip_rt_init, }; static __net_init int rt_genid_init(struct net *net) { atomic_set(&net->ipv4.rt_genid, 0); atomic_set(&net->fnhe_genid, 0); atomic_set(&net->ipv4.dev_addr_genid, get_random_u32()); return 0; } static __net_initdata struct pernet_operations rt_genid_ops = { .init = rt_genid_init, }; static int __net_init ipv4_inetpeer_init(struct net *net) { struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); if (!bp) return -ENOMEM; inet_peer_base_init(bp); net->ipv4.peers = bp; return 0; } static void __net_exit ipv4_inetpeer_exit(struct net *net) { struct inet_peer_base *bp = net->ipv4.peers; net->ipv4.peers = NULL; inetpeer_invalidate_tree(bp); kfree(bp); } static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { .init = ipv4_inetpeer_init, .exit = ipv4_inetpeer_exit, }; #ifdef CONFIG_IP_ROUTE_CLASSID struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; #endif /* CONFIG_IP_ROUTE_CLASSID */ static const struct rtnl_msg_handler ip_rt_rtnl_msg_handlers[] __initconst = { {.protocol = PF_INET, .msgtype = RTM_GETROUTE, .doit = inet_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; int __init ip_rt_init(void) { void *idents_hash; int cpu; /* For modern hosts, this will use 2 MB of memory */ idents_hash = alloc_large_system_hash("IP idents", sizeof(*ip_idents) + sizeof(*ip_tstamps), 0, 16, /* one bucket per 64 KB */ HASH_ZERO, NULL, &ip_idents_mask, 2048, 256*1024); ip_idents = idents_hash; get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents)); ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents); for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); INIT_LIST_HEAD(&ul->head); spin_lock_init(&ul->lock); } #ifdef CONFIG_IP_ROUTE_CLASSID ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); #endif ipv4_dst_ops.kmem_cachep = KMEM_CACHE(rtable, SLAB_HWCACHE_ALIGN | SLAB_PANIC); ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; if (dst_entries_init(&ipv4_dst_ops) < 0) panic("IP: failed to allocate ipv4_dst_ops counter\n"); if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); ipv4_dst_ops.gc_thresh = ~0; ip_rt_max_size = INT_MAX; devinet_init(); ip_fib_init(); if (ip_rt_proc_init()) pr_err("Unable to create route proc files\n"); #ifdef CONFIG_XFRM xfrm_init(); xfrm4_init(); #endif rtnl_register_many(ip_rt_rtnl_msg_handlers); #ifdef CONFIG_SYSCTL register_pernet_subsys(&sysctl_route_ops); #endif register_pernet_subsys(&ip_rt_ops); register_pernet_subsys(&rt_genid_ops); register_pernet_subsys(&ipv4_inetpeer_ops); return 0; } #ifdef CONFIG_SYSCTL /* * We really need to sanitize the damn ipv4 init order, then all * this nonsense will go away. */ void __init ip_static_sysctl_init(void) { register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table); } #endif
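The per-netns sysctl setup above (sysctl_route_net_init) duplicates the template table with kmemdup() and then shifts every .data pointer by the byte distance between the new struct net and init_net, so the copied entries address the new namespace's fields. Below is a minimal user-space sketch of that pointer-rebasing idiom; the struct and field names (fake_ns, fake_ctl) are hypothetical and only the arithmetic mirrors the kernel code.

#include <stdio.h>
#include <string.h>

struct fake_ns {
	int min_pmtu;
	int mtu_expires;
};

struct fake_ctl {
	const char *procname;
	void *data;
};

static struct fake_ns init_ns = { 552, 600 };

/* Template table: data pointers refer to the "init" instance. */
static const struct fake_ctl template_tbl[] = {
	{ "min_pmtu",    &init_ns.min_pmtu },
	{ "mtu_expires", &init_ns.mtu_expires },
};

int main(void)
{
	struct fake_ns other_ns = { 0, 0 };
	struct fake_ctl tbl[2];
	size_t i;

	/* Duplicate the template, as kmemdup() does in the kernel. */
	memcpy(tbl, template_tbl, sizeof(tbl));

	/* Rebase each data pointer by the distance between the two structs,
	 * mirroring "tbl[i].data += (void *)net - (void *)&init_net;".
	 * (Subtracting unrelated object addresses is the kernel idiom, not
	 * strictly portable ISO C.) */
	for (i = 0; i < 2; i++)
		tbl[i].data = (char *)tbl[i].data +
			      ((char *)&other_ns - (char *)&init_ns);

	*(int *)tbl[0].data = 1024;	/* now writes other_ns.min_pmtu */
	printf("other_ns.min_pmtu = %d\n", other_ns.min_pmtu);
	return 0;
}

Keeping a single template table and rebasing it per instance avoids hand-writing a second table for every namespace; the kernel loop starts at i = 1 because the first entry, the flush handler, keeps its per-net context in extra1 rather than in data.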
// SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2017 Facebook */ #include <linux/bpf.h> #include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/vmalloc.h> #include <linux/etherdevice.h> #include <linux/filter.h> #include <linux/rcupdate_trace.h> #include <linux/sched/signal.h> #include <net/bpf_sk_storage.h> #include <net/hotdata.h> #include <net/sock.h> #include <net/tcp.h> #include <net/net_namespace.h> #include <net/page_pool/helpers.h> #include <linux/error-injection.h> #include <linux/smp.h> #include <linux/sock_diag.h> #include <linux/netfilter.h> #include <net/netdev_rx_queue.h> #include <net/xdp.h> #include <net/netfilter/nf_bpf_link.h> #define CREATE_TRACE_POINTS #include <trace/events/bpf_test_run.h> struct bpf_test_timer { enum { NO_PREEMPT, NO_MIGRATE } mode; u32 i; u64 time_start, time_spent; }; static void bpf_test_timer_enter(struct bpf_test_timer *t) __acquires(rcu) { rcu_read_lock(); if (t->mode == NO_PREEMPT) preempt_disable(); else migrate_disable(); t->time_start = ktime_get_ns(); } static void bpf_test_timer_leave(struct bpf_test_timer *t) __releases(rcu) { t->time_start = 0; if (t->mode == NO_PREEMPT) preempt_enable(); else migrate_enable(); rcu_read_unlock(); } static bool bpf_test_timer_continue(struct bpf_test_timer *t, int iterations, u32 repeat, int *err, u32 *duration) __must_hold(rcu) { t->i += iterations; if (t->i >= repeat) { /* We're done. */ t->time_spent += ktime_get_ns() - t->time_start; do_div(t->time_spent, t->i); *duration = t->time_spent > U32_MAX ? U32_MAX : (u32)t->time_spent; *err = 0; goto reset; } if (signal_pending(current)) { /* During iteration: we've been cancelled, abort. */ *err = -EINTR; goto reset; } if (need_resched()) { /* During iteration: we need to reschedule between runs. */ t->time_spent += ktime_get_ns() - t->time_start; bpf_test_timer_leave(t); cond_resched(); bpf_test_timer_enter(t); } /* Do another round. */ return true; reset: t->i = 0; return false; } /* We put this struct at the head of each page with a context and frame * initialised when the page is allocated, so we don't have to do this on each * repetition of the test run.
*/ struct xdp_page_head { struct xdp_buff orig_ctx; struct xdp_buff ctx; union { /* ::data_hard_start starts here */ DECLARE_FLEX_ARRAY(struct xdp_frame, frame); DECLARE_FLEX_ARRAY(u8, data); }; }; struct xdp_test_data { struct xdp_buff *orig_ctx; struct xdp_rxq_info rxq; struct net_device *dev; struct page_pool *pp; struct xdp_frame **frames; struct sk_buff **skbs; struct xdp_mem_info mem; u32 batch_size; u32 frame_cnt; }; /* tools/testing/selftests/bpf/prog_tests/xdp_do_redirect.c:%MAX_PKT_SIZE * must be updated accordingly this gets changed, otherwise BPF selftests * will fail. */ #define TEST_XDP_FRAME_SIZE (PAGE_SIZE - sizeof(struct xdp_page_head)) #define TEST_XDP_MAX_BATCH 256 static void xdp_test_run_init_page(netmem_ref netmem, void *arg) { struct xdp_page_head *head = phys_to_virt(page_to_phys(netmem_to_page(netmem))); struct xdp_buff *new_ctx, *orig_ctx; u32 headroom = XDP_PACKET_HEADROOM; struct xdp_test_data *xdp = arg; size_t frm_len, meta_len; struct xdp_frame *frm; void *data; orig_ctx = xdp->orig_ctx; frm_len = orig_ctx->data_end - orig_ctx->data_meta; meta_len = orig_ctx->data - orig_ctx->data_meta; headroom -= meta_len; new_ctx = &head->ctx; frm = head->frame; data = head->data; memcpy(data + headroom, orig_ctx->data_meta, frm_len); xdp_init_buff(new_ctx, TEST_XDP_FRAME_SIZE, &xdp->rxq); xdp_prepare_buff(new_ctx, data, headroom, frm_len, true); new_ctx->data = new_ctx->data_meta + meta_len; xdp_update_frame_from_buff(new_ctx, frm); frm->mem_type = new_ctx->rxq->mem.type; memcpy(&head->orig_ctx, new_ctx, sizeof(head->orig_ctx)); } static int xdp_test_run_setup(struct xdp_test_data *xdp, struct xdp_buff *orig_ctx) { struct page_pool *pp; int err = -ENOMEM; struct page_pool_params pp_params = { .order = 0, .flags = 0, .pool_size = xdp->batch_size, .nid = NUMA_NO_NODE, .init_callback = xdp_test_run_init_page, .init_arg = xdp, }; xdp->frames = kvmalloc_array(xdp->batch_size, sizeof(void *), GFP_KERNEL); if (!xdp->frames) return -ENOMEM; xdp->skbs = kvmalloc_array(xdp->batch_size, sizeof(void *), GFP_KERNEL); if (!xdp->skbs) goto err_skbs; pp = page_pool_create(&pp_params); if (IS_ERR(pp)) { err = PTR_ERR(pp); goto err_pp; } /* will copy 'mem.id' into pp->xdp_mem_id */ err = xdp_reg_mem_model(&xdp->mem, MEM_TYPE_PAGE_POOL, pp); if (err) goto err_mmodel; xdp->pp = pp; /* We create a 'fake' RXQ referencing the original dev, but with an * xdp_mem_info pointing to our page_pool */ xdp_rxq_info_reg(&xdp->rxq, orig_ctx->rxq->dev, 0, 0); xdp->rxq.mem.type = MEM_TYPE_PAGE_POOL; xdp->rxq.mem.id = pp->xdp_mem_id; xdp->dev = orig_ctx->rxq->dev; xdp->orig_ctx = orig_ctx; return 0; err_mmodel: page_pool_destroy(pp); err_pp: kvfree(xdp->skbs); err_skbs: kvfree(xdp->frames); return err; } static void xdp_test_run_teardown(struct xdp_test_data *xdp) { xdp_unreg_mem_model(&xdp->mem); page_pool_destroy(xdp->pp); kfree(xdp->frames); kfree(xdp->skbs); } static bool frame_was_changed(const struct xdp_page_head *head) { /* xdp_scrub_frame() zeroes the data pointer, flags is the last field, * i.e. has the highest chances to be overwritten. If those two are * untouched, it's most likely safe to skip the context reset. 
*/ return head->frame->data != head->orig_ctx.data || head->frame->flags != head->orig_ctx.flags; } static bool ctx_was_changed(struct xdp_page_head *head) { return head->orig_ctx.data != head->ctx.data || head->orig_ctx.data_meta != head->ctx.data_meta || head->orig_ctx.data_end != head->ctx.data_end; } static void reset_ctx(struct xdp_page_head *head) { if (likely(!frame_was_changed(head) && !ctx_was_changed(head))) return; head->ctx.data = head->orig_ctx.data; head->ctx.data_meta = head->orig_ctx.data_meta; head->ctx.data_end = head->orig_ctx.data_end; xdp_update_frame_from_buff(&head->ctx, head->frame); head->frame->mem_type = head->orig_ctx.rxq->mem.type; } static int xdp_recv_frames(struct xdp_frame **frames, int nframes, struct sk_buff **skbs, struct net_device *dev) { gfp_t gfp = __GFP_ZERO | GFP_ATOMIC; int i, n; LIST_HEAD(list); n = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, gfp, nframes, (void **)skbs); if (unlikely(n == 0)) { for (i = 0; i < nframes; i++) xdp_return_frame(frames[i]); return -ENOMEM; } for (i = 0; i < nframes; i++) { struct xdp_frame *xdpf = frames[i]; struct sk_buff *skb = skbs[i]; skb = __xdp_build_skb_from_frame(xdpf, skb, dev); if (!skb) { xdp_return_frame(xdpf); continue; } list_add_tail(&skb->list, &list); } netif_receive_skb_list(&list); return 0; } static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog, u32 repeat) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int err = 0, act, ret, i, nframes = 0, batch_sz; struct xdp_frame **frames = xdp->frames; struct bpf_redirect_info *ri; struct xdp_page_head *head; struct xdp_frame *frm; bool redirect = false; struct xdp_buff *ctx; struct page *page; batch_sz = min_t(u32, repeat, xdp->batch_size); local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); ri = bpf_net_ctx_get_ri(); xdp_set_return_frame_no_direct(); for (i = 0; i < batch_sz; i++) { page = page_pool_dev_alloc_pages(xdp->pp); if (!page) { err = -ENOMEM; goto out; } head = phys_to_virt(page_to_phys(page)); reset_ctx(head); ctx = &head->ctx; frm = head->frame; xdp->frame_cnt++; act = bpf_prog_run_xdp(prog, ctx); /* if program changed pkt bounds we need to update the xdp_frame */ if (unlikely(ctx_was_changed(head))) { ret = xdp_update_frame_from_buff(ctx, frm); if (ret) { xdp_return_buff(ctx); continue; } } switch (act) { case XDP_TX: /* we can't do a real XDP_TX since we're not in the * driver, so turn it into a REDIRECT back to the same * index */ ri->tgt_index = xdp->dev->ifindex; ri->map_id = INT_MAX; ri->map_type = BPF_MAP_TYPE_UNSPEC; fallthrough; case XDP_REDIRECT: redirect = true; ret = xdp_do_redirect_frame(xdp->dev, ctx, frm, prog); if (ret) xdp_return_buff(ctx); break; case XDP_PASS: frames[nframes++] = frm; break; default: bpf_warn_invalid_xdp_action(NULL, prog, act); fallthrough; case XDP_DROP: xdp_return_buff(ctx); break; } } out: if (redirect) xdp_do_flush(); if (nframes) { ret = xdp_recv_frames(frames, nframes, xdp->skbs, xdp->dev); if (ret) err = ret; } xdp_clear_return_frame_no_direct(); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); return err; } static int bpf_test_run_xdp_live(struct bpf_prog *prog, struct xdp_buff *ctx, u32 repeat, u32 batch_size, u32 *time) { struct xdp_test_data xdp = { .batch_size = batch_size }; struct bpf_test_timer t = { .mode = NO_MIGRATE }; int ret; if (!repeat) repeat = 1; ret = xdp_test_run_setup(&xdp, ctx); if (ret) return ret; bpf_test_timer_enter(&t); do { xdp.frame_cnt = 0; ret = xdp_test_run_batch(&xdp, prog, repeat - t.i); if (unlikely(ret < 0)) 
break; } while (bpf_test_timer_continue(&t, xdp.frame_cnt, repeat, &ret, time)); bpf_test_timer_leave(&t); xdp_test_run_teardown(&xdp); return ret; } static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *retval, u32 *time, bool xdp) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct bpf_prog_array_item item = {.prog = prog}; struct bpf_run_ctx *old_ctx; struct bpf_cg_run_ctx run_ctx; struct bpf_test_timer t = { NO_MIGRATE }; enum bpf_cgroup_storage_type stype; int ret; for_each_cgroup_storage_type(stype) { item.cgroup_storage[stype] = bpf_cgroup_storage_alloc(prog, stype); if (IS_ERR(item.cgroup_storage[stype])) { item.cgroup_storage[stype] = NULL; for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(item.cgroup_storage[stype]); return -ENOMEM; } } if (!repeat) repeat = 1; bpf_test_timer_enter(&t); old_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); do { run_ctx.prog_item = &item; local_bh_disable(); bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); if (xdp) *retval = bpf_prog_run_xdp(prog, ctx); else *retval = bpf_prog_run(prog, ctx); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); } while (bpf_test_timer_continue(&t, 1, repeat, &ret, time)); bpf_reset_run_ctx(old_ctx); bpf_test_timer_leave(&t); for_each_cgroup_storage_type(stype) bpf_cgroup_storage_free(item.cgroup_storage[stype]); return ret; } static int bpf_test_finish(const union bpf_attr *kattr, union bpf_attr __user *uattr, const void *data, struct skb_shared_info *sinfo, u32 size, u32 retval, u32 duration) { void __user *data_out = u64_to_user_ptr(kattr->test.data_out); int err = -EFAULT; u32 copy_size = size; /* Clamp copy if the user has provided a size hint, but copy the full * buffer if not to retain old behaviour. */ if (kattr->test.data_size_out && copy_size > kattr->test.data_size_out) { copy_size = kattr->test.data_size_out; err = -ENOSPC; } if (data_out) { int len = sinfo ? copy_size - sinfo->xdp_frags_size : copy_size; if (len < 0) { err = -ENOSPC; goto out; } if (copy_to_user(data_out, data, len)) goto out; if (sinfo) { int i, offset = len; u32 data_len; for (i = 0; i < sinfo->nr_frags; i++) { skb_frag_t *frag = &sinfo->frags[i]; if (offset >= copy_size) { err = -ENOSPC; break; } data_len = min_t(u32, copy_size - offset, skb_frag_size(frag)); if (copy_to_user(data_out + offset, skb_frag_address(frag), data_len)) goto out; offset += data_len; } } } if (copy_to_user(&uattr->test.data_size_out, &size, sizeof(size))) goto out; if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval))) goto out; if (copy_to_user(&uattr->test.duration, &duration, sizeof(duration))) goto out; if (err != -ENOSPC) err = 0; out: trace_bpf_test_finish(&err); return err; } /* Integer types of various sizes and pointer combinations cover variety of * architecture dependent calling conventions. 7+ can be supported in the * future. 
*/ __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_fentry_test1(int a) { return a + 1; } EXPORT_SYMBOL_GPL(bpf_fentry_test1); int noinline bpf_fentry_test2(int a, u64 b) { return a + b; } int noinline bpf_fentry_test3(char a, int b, u64 c) { return a + b + c; } int noinline bpf_fentry_test4(void *a, char b, int c, u64 d) { return (long)a + b + c + d; } int noinline bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e) { return a + (long)b + c + d + e; } int noinline bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f) { return a + (long)b + c + d + (long)e + f; } struct bpf_fentry_test_t { struct bpf_fentry_test_t *a; }; int noinline bpf_fentry_test7(struct bpf_fentry_test_t *arg) { asm volatile ("": "+r"(arg)); return (long)arg; } int noinline bpf_fentry_test8(struct bpf_fentry_test_t *arg) { return (long)arg->a; } __bpf_kfunc u32 bpf_fentry_test9(u32 *a) { return *a; } int noinline bpf_fentry_test10(const void *a) { return (long)a; } void noinline bpf_fentry_test_sinfo(struct skb_shared_info *sinfo) { } __bpf_kfunc int bpf_modify_return_test(int a, int *b) { *b += 1; return a + *b; } __bpf_kfunc int bpf_modify_return_test2(int a, int *b, short c, int d, void *e, char f, int g) { *b += 1; return a + *b + c + d + (long)e + f + g; } __bpf_kfunc int bpf_modify_return_test_tp(int nonce) { trace_bpf_trigger_tp(nonce); return nonce; } int noinline bpf_fentry_shadow_test(int a) { return a + 1; } struct prog_test_member1 { int a; }; struct prog_test_member { struct prog_test_member1 m; int c; }; struct prog_test_ref_kfunc { int a; int b; struct prog_test_member memb; struct prog_test_ref_kfunc *next; refcount_t cnt; }; __bpf_kfunc void bpf_kfunc_call_test_release(struct prog_test_ref_kfunc *p) { refcount_dec(&p->cnt); } __bpf_kfunc void bpf_kfunc_call_test_release_dtor(void *p) { bpf_kfunc_call_test_release(p); } CFI_NOSEAL(bpf_kfunc_call_test_release_dtor); __bpf_kfunc void bpf_kfunc_call_memb_release(struct prog_test_member *p) { } __bpf_kfunc void bpf_kfunc_call_memb_release_dtor(void *p) { } CFI_NOSEAL(bpf_kfunc_call_memb_release_dtor); __bpf_kfunc_end_defs(); BTF_KFUNCS_START(bpf_test_modify_return_ids) BTF_ID_FLAGS(func, bpf_modify_return_test) BTF_ID_FLAGS(func, bpf_modify_return_test2) BTF_ID_FLAGS(func, bpf_modify_return_test_tp) BTF_ID_FLAGS(func, bpf_fentry_test1, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_test_modify_return_ids) static const struct btf_kfunc_id_set bpf_test_modify_return_set = { .owner = THIS_MODULE, .set = &bpf_test_modify_return_ids, }; BTF_KFUNCS_START(test_sk_check_kfunc_ids) BTF_ID_FLAGS(func, bpf_kfunc_call_test_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_kfunc_call_memb_release, KF_RELEASE) BTF_KFUNCS_END(test_sk_check_kfunc_ids) static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size, u32 size, u32 headroom, u32 tailroom) { void __user *data_in = u64_to_user_ptr(kattr->test.data_in); void *data; if (user_size < ETH_HLEN || user_size > PAGE_SIZE - headroom - tailroom) return ERR_PTR(-EINVAL); size = SKB_DATA_ALIGN(size); data = kzalloc(size + headroom + tailroom, GFP_USER); if (!data) return ERR_PTR(-ENOMEM); if (copy_from_user(data + headroom, data_in, user_size)) { kfree(data); return ERR_PTR(-EFAULT); } return data; } int bpf_prog_test_run_tracing(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { struct bpf_fentry_test_t arg = {}; u16 side_effect = 0, ret = 0; int b = 2, err = -EFAULT; u32 retval = 0; if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; switch 
(prog->expected_attach_type) { case BPF_TRACE_FENTRY: case BPF_TRACE_FEXIT: if (bpf_fentry_test1(1) != 2 || bpf_fentry_test2(2, 3) != 5 || bpf_fentry_test3(4, 5, 6) != 15 || bpf_fentry_test4((void *)7, 8, 9, 10) != 34 || bpf_fentry_test5(11, (void *)12, 13, 14, 15) != 65 || bpf_fentry_test6(16, (void *)17, 18, 19, (void *)20, 21) != 111 || bpf_fentry_test7((struct bpf_fentry_test_t *)0) != 0 || bpf_fentry_test8(&arg) != 0 || bpf_fentry_test9(&retval) != 0 || bpf_fentry_test10((void *)0) != 0) goto out; break; case BPF_MODIFY_RETURN: ret = bpf_modify_return_test(1, &b); if (b != 2) side_effect++; b = 2; ret += bpf_modify_return_test2(1, &b, 3, 4, (void *)5, 6, 7); if (b != 2) side_effect++; break; default: goto out; } retval = ((u32)side_effect << 16) | ret; if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval))) goto out; err = 0; out: trace_bpf_test_finish(&err); return err; } struct bpf_raw_tp_test_run_info { struct bpf_prog *prog; void *ctx; u32 retval; }; static void __bpf_prog_test_run_raw_tp(void *data) { struct bpf_raw_tp_test_run_info *info = data; struct bpf_trace_run_ctx run_ctx = {}; struct bpf_run_ctx *old_run_ctx; old_run_ctx = bpf_set_run_ctx(&run_ctx.run_ctx); rcu_read_lock(); info->retval = bpf_prog_run(info->prog, info->ctx); rcu_read_unlock(); bpf_reset_run_ctx(old_run_ctx); } int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in); __u32 ctx_size_in = kattr->test.ctx_size_in; struct bpf_raw_tp_test_run_info info; int cpu = kattr->test.cpu, err = 0; int current_cpu; /* doesn't support data_in/out, ctx_out, duration, or repeat */ if (kattr->test.data_in || kattr->test.data_out || kattr->test.ctx_out || kattr->test.duration || kattr->test.repeat || kattr->test.batch_size) return -EINVAL; if (ctx_size_in < prog->aux->max_ctx_offset || ctx_size_in > MAX_BPF_FUNC_ARGS * sizeof(u64)) return -EINVAL; if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0) return -EINVAL; if (ctx_size_in) { info.ctx = memdup_user(ctx_in, ctx_size_in); if (IS_ERR(info.ctx)) return PTR_ERR(info.ctx); } else { info.ctx = NULL; } info.prog = prog; current_cpu = get_cpu(); if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 || cpu == current_cpu) { __bpf_prog_test_run_raw_tp(&info); } else if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { /* smp_call_function_single() also checks cpu_online() * after csd_lock(). However, since cpu is from user * space, let's do an extra quick check to filter out * invalid value before smp_call_function_single(). 
*/ err = -ENXIO; } else { err = smp_call_function_single(cpu, __bpf_prog_test_run_raw_tp, &info, 1); } put_cpu(); if (!err && copy_to_user(&uattr->test.retval, &info.retval, sizeof(u32))) err = -EFAULT; kfree(info.ctx); return err; } static void *bpf_ctx_init(const union bpf_attr *kattr, u32 max_size) { void __user *data_in = u64_to_user_ptr(kattr->test.ctx_in); void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); u32 size = kattr->test.ctx_size_in; void *data; int err; if (!data_in && !data_out) return NULL; data = kzalloc(max_size, GFP_USER); if (!data) return ERR_PTR(-ENOMEM); if (data_in) { err = bpf_check_uarg_tail_zero(USER_BPFPTR(data_in), max_size, size); if (err) { kfree(data); return ERR_PTR(err); } size = min_t(u32, max_size, size); if (copy_from_user(data, data_in, size)) { kfree(data); return ERR_PTR(-EFAULT); } } return data; } static int bpf_ctx_finish(const union bpf_attr *kattr, union bpf_attr __user *uattr, const void *data, u32 size) { void __user *data_out = u64_to_user_ptr(kattr->test.ctx_out); int err = -EFAULT; u32 copy_size = size; if (!data || !data_out) return 0; if (copy_size > kattr->test.ctx_size_out) { copy_size = kattr->test.ctx_size_out; err = -ENOSPC; } if (copy_to_user(data_out, data, copy_size)) goto out; if (copy_to_user(&uattr->test.ctx_size_out, &size, sizeof(size))) goto out; if (err != -ENOSPC) err = 0; out: return err; } /** * range_is_zero - test whether buffer is initialized * @buf: buffer to check * @from: check from this position * @to: check up until (excluding) this position * * This function returns true if the there is a non-zero byte * in the buf in the range [from,to). */ static inline bool range_is_zero(void *buf, size_t from, size_t to) { return !memchr_inv((u8 *)buf + from, 0, to - from); } static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) { struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; if (!__skb) return 0; /* make sure the fields we don't use are zeroed */ if (!range_is_zero(__skb, 0, offsetof(struct __sk_buff, mark))) return -EINVAL; /* mark is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, mark), offsetof(struct __sk_buff, priority))) return -EINVAL; /* priority is allowed */ /* ingress_ifindex is allowed */ /* ifindex is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, ifindex), offsetof(struct __sk_buff, cb))) return -EINVAL; /* cb is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, cb), offsetof(struct __sk_buff, tstamp))) return -EINVAL; /* tstamp is allowed */ /* wire_len is allowed */ /* gso_segs is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_segs), offsetof(struct __sk_buff, gso_size))) return -EINVAL; /* gso_size is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, gso_size), offsetof(struct __sk_buff, hwtstamp))) return -EINVAL; /* hwtstamp is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, hwtstamp), sizeof(struct __sk_buff))) return -EINVAL; skb->mark = __skb->mark; skb->priority = __skb->priority; skb->skb_iif = __skb->ingress_ifindex; skb->tstamp = __skb->tstamp; memcpy(&cb->data, __skb->cb, QDISC_CB_PRIV_LEN); if (__skb->wire_len == 0) { cb->pkt_len = skb->len; } else { if (__skb->wire_len < skb->len || __skb->wire_len > GSO_LEGACY_MAX_SIZE) return -EINVAL; cb->pkt_len = __skb->wire_len; } if (__skb->gso_segs > GSO_MAX_SEGS) return -EINVAL; skb_shinfo(skb)->gso_segs = __skb->gso_segs; skb_shinfo(skb)->gso_size = __skb->gso_size; 
skb_shinfo(skb)->hwtstamps.hwtstamp = __skb->hwtstamp; return 0; } static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb) { struct qdisc_skb_cb *cb = (struct qdisc_skb_cb *)skb->cb; if (!__skb) return; __skb->mark = skb->mark; __skb->priority = skb->priority; __skb->ingress_ifindex = skb->skb_iif; __skb->ifindex = skb->dev->ifindex; __skb->tstamp = skb->tstamp; memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN); __skb->wire_len = cb->pkt_len; __skb->gso_segs = skb_shinfo(skb)->gso_segs; __skb->hwtstamp = skb_shinfo(skb)->hwtstamps.hwtstamp; } static struct proto bpf_dummy_proto = { .name = "bpf_dummy", .owner = THIS_MODULE, .obj_size = sizeof(struct sock), }; int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { bool is_l2 = false, is_direct_pkt_access = false; struct net *net = current->nsproxy->net_ns; struct net_device *dev = net->loopback_dev; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; struct __sk_buff *ctx = NULL; u32 retval, duration; int hh_len = ETH_HLEN; struct sk_buff *skb; struct sock *sk; void *data; int ret; if ((kattr->test.flags & ~BPF_F_TEST_SKB_CHECKSUM_COMPLETE) || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; data = bpf_test_init(kattr, kattr->test.data_size_in, size, NET_SKB_PAD + NET_IP_ALIGN, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); if (IS_ERR(data)) return PTR_ERR(data); ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff)); if (IS_ERR(ctx)) { kfree(data); return PTR_ERR(ctx); } switch (prog->type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: is_l2 = true; fallthrough; case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_CGROUP_SKB: is_direct_pkt_access = true; break; default: break; } sk = sk_alloc(net, AF_UNSPEC, GFP_USER, &bpf_dummy_proto, 1); if (!sk) { kfree(data); kfree(ctx); return -ENOMEM; } sock_init_data(NULL, sk); skb = slab_build_skb(data); if (!skb) { kfree(data); kfree(ctx); sk_free(sk); return -ENOMEM; } skb->sk = sk; skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); __skb_put(skb, size); if (ctx && ctx->ifindex > 1) { dev = dev_get_by_index(net, ctx->ifindex); if (!dev) { ret = -ENODEV; goto out; } } skb->protocol = eth_type_trans(skb, dev); skb_reset_network_header(skb); switch (skb->protocol) { case htons(ETH_P_IP): sk->sk_family = AF_INET; if (sizeof(struct iphdr) <= skb_headlen(skb)) { sk->sk_rcv_saddr = ip_hdr(skb)->saddr; sk->sk_daddr = ip_hdr(skb)->daddr; } break; #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): sk->sk_family = AF_INET6; if (sizeof(struct ipv6hdr) <= skb_headlen(skb)) { sk->sk_v6_rcv_saddr = ipv6_hdr(skb)->saddr; sk->sk_v6_daddr = ipv6_hdr(skb)->daddr; } break; #endif default: break; } if (is_l2) __skb_push(skb, hh_len); if (is_direct_pkt_access) bpf_compute_data_pointers(skb); ret = convert___skb_to_skb(skb, ctx); if (ret) goto out; if (kattr->test.flags & BPF_F_TEST_SKB_CHECKSUM_COMPLETE) { const int off = skb_network_offset(skb); int len = skb->len - off; skb->csum = skb_checksum(skb, off, len, 0); skb->ip_summed = CHECKSUM_COMPLETE; } ret = bpf_test_run(prog, skb, repeat, &retval, &duration, false); if (ret) goto out; if (!is_l2) { if (skb_headroom(skb) < hh_len) { int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); if (pskb_expand_head(skb, nhead, 0, GFP_USER)) { ret = -ENOMEM; goto out; } } memset(__skb_push(skb, hh_len), 0, hh_len); } if (kattr->test.flags & BPF_F_TEST_SKB_CHECKSUM_COMPLETE) { const int off = 
skb_network_offset(skb); int len = skb->len - off; __wsum csum; csum = skb_checksum(skb, off, len, 0); if (csum_fold(skb->csum) != csum_fold(csum)) { ret = -EBADMSG; goto out; } } convert_skb_to___skb(skb, ctx); size = skb->len; /* bpf program can never convert linear skb to non-linear */ if (WARN_ON_ONCE(skb_is_nonlinear(skb))) size = skb_headlen(skb); ret = bpf_test_finish(kattr, uattr, skb->data, NULL, size, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, ctx, sizeof(struct __sk_buff)); out: if (dev && dev != net->loopback_dev) dev_put(dev); kfree_skb(skb); sk_free(sk); kfree(ctx); return ret; } static int xdp_convert_md_to_buff(struct xdp_md *xdp_md, struct xdp_buff *xdp) { unsigned int ingress_ifindex, rx_queue_index; struct netdev_rx_queue *rxqueue; struct net_device *device; if (!xdp_md) return 0; if (xdp_md->egress_ifindex != 0) return -EINVAL; ingress_ifindex = xdp_md->ingress_ifindex; rx_queue_index = xdp_md->rx_queue_index; if (!ingress_ifindex && rx_queue_index) return -EINVAL; if (ingress_ifindex) { device = dev_get_by_index(current->nsproxy->net_ns, ingress_ifindex); if (!device) return -ENODEV; if (rx_queue_index >= device->real_num_rx_queues) goto free_dev; rxqueue = __netif_get_rx_queue(device, rx_queue_index); if (!xdp_rxq_info_is_reg(&rxqueue->xdp_rxq)) goto free_dev; xdp->rxq = &rxqueue->xdp_rxq; /* The device is now tracked in the xdp->rxq for later * dev_put() */ } xdp->data = xdp->data_meta + xdp_md->data; return 0; free_dev: dev_put(device); return -EINVAL; } static void xdp_convert_buff_to_md(struct xdp_buff *xdp, struct xdp_md *xdp_md) { if (!xdp_md) return; xdp_md->data = xdp->data - xdp->data_meta; xdp_md->data_end = xdp->data_end - xdp->data_meta; if (xdp_md->ingress_ifindex) dev_put(xdp->rxq->dev); } int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { bool do_live = (kattr->test.flags & BPF_F_TEST_XDP_LIVE_FRAMES); u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); u32 batch_size = kattr->test.batch_size; u32 retval = 0, duration, max_data_sz; u32 size = kattr->test.data_size_in; u32 headroom = XDP_PACKET_HEADROOM; u32 repeat = kattr->test.repeat; struct netdev_rx_queue *rxqueue; struct skb_shared_info *sinfo; struct xdp_buff xdp = {}; int i, ret = -EINVAL; struct xdp_md *ctx; void *data; if (prog->expected_attach_type == BPF_XDP_DEVMAP || prog->expected_attach_type == BPF_XDP_CPUMAP) return -EINVAL; if (kattr->test.flags & ~BPF_F_TEST_XDP_LIVE_FRAMES) return -EINVAL; if (bpf_prog_is_dev_bound(prog->aux)) return -EINVAL; if (do_live) { if (!batch_size) batch_size = NAPI_POLL_WEIGHT; else if (batch_size > TEST_XDP_MAX_BATCH) return -E2BIG; headroom += sizeof(struct xdp_page_head); } else if (batch_size) { return -EINVAL; } ctx = bpf_ctx_init(kattr, sizeof(struct xdp_md)); if (IS_ERR(ctx)) return PTR_ERR(ctx); if (ctx) { /* There can't be user provided data before the meta data */ if (ctx->data_meta || ctx->data_end != size || ctx->data > ctx->data_end || unlikely(xdp_metalen_invalid(ctx->data)) || (do_live && (kattr->test.data_out || kattr->test.ctx_out))) goto free_ctx; /* Meta data is allocated from the headroom */ headroom -= ctx->data; } max_data_sz = PAGE_SIZE - headroom - tailroom; if (size > max_data_sz) { /* disallow live data mode for jumbo frames */ if (do_live) goto free_ctx; size = max_data_sz; } data = bpf_test_init(kattr, size, max_data_sz, headroom, tailroom); if (IS_ERR(data)) { ret = PTR_ERR(data); goto free_ctx; } rxqueue = 
__netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); rxqueue->xdp_rxq.frag_size = headroom + max_data_sz + tailroom; xdp_init_buff(&xdp, rxqueue->xdp_rxq.frag_size, &rxqueue->xdp_rxq); xdp_prepare_buff(&xdp, data, headroom, size, true); sinfo = xdp_get_shared_info_from_buff(&xdp); ret = xdp_convert_md_to_buff(ctx, &xdp); if (ret) goto free_data; if (unlikely(kattr->test.data_size_in > size)) { void __user *data_in = u64_to_user_ptr(kattr->test.data_in); while (size < kattr->test.data_size_in) { struct page *page; skb_frag_t *frag; u32 data_len; if (sinfo->nr_frags == MAX_SKB_FRAGS) { ret = -ENOMEM; goto out; } page = alloc_page(GFP_KERNEL); if (!page) { ret = -ENOMEM; goto out; } frag = &sinfo->frags[sinfo->nr_frags++]; data_len = min_t(u32, kattr->test.data_size_in - size, PAGE_SIZE); skb_frag_fill_page_desc(frag, page, 0, data_len); if (copy_from_user(page_address(page), data_in + size, data_len)) { ret = -EFAULT; goto out; } sinfo->xdp_frags_size += data_len; size += data_len; } xdp_buff_set_frags_flag(&xdp); } if (repeat > 1) bpf_prog_change_xdp(NULL, prog); if (do_live) ret = bpf_test_run_xdp_live(prog, &xdp, repeat, batch_size, &duration); else ret = bpf_test_run(prog, &xdp, repeat, &retval, &duration, true); /* We convert the xdp_buff back to an xdp_md before checking the return * code so the reference count of any held netdevice will be decremented * even if the test run failed. */ xdp_convert_buff_to_md(&xdp, ctx); if (ret) goto out; size = xdp.data_end - xdp.data_meta + sinfo->xdp_frags_size; ret = bpf_test_finish(kattr, uattr, xdp.data_meta, sinfo, size, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, ctx, sizeof(struct xdp_md)); out: if (repeat > 1) bpf_prog_change_xdp(prog, NULL); free_data: for (i = 0; i < sinfo->nr_frags; i++) __free_page(skb_frag_page(&sinfo->frags[i])); kfree(data); free_ctx: kfree(ctx); return ret; } static int verify_user_bpf_flow_keys(struct bpf_flow_keys *ctx) { /* make sure the fields we don't use are zeroed */ if (!range_is_zero(ctx, 0, offsetof(struct bpf_flow_keys, flags))) return -EINVAL; /* flags is allowed */ if (!range_is_zero(ctx, offsetofend(struct bpf_flow_keys, flags), sizeof(struct bpf_flow_keys))) return -EINVAL; return 0; } int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { struct bpf_test_timer t = { NO_PREEMPT }; u32 size = kattr->test.data_size_in; struct bpf_flow_dissector ctx = {}; u32 repeat = kattr->test.repeat; struct bpf_flow_keys *user_ctx; struct bpf_flow_keys flow_keys; const struct ethhdr *eth; unsigned int flags = 0; u32 retval, duration; void *data; int ret; if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; if (size < ETH_HLEN) return -EINVAL; data = bpf_test_init(kattr, kattr->test.data_size_in, size, 0, 0); if (IS_ERR(data)) return PTR_ERR(data); eth = (struct ethhdr *)data; if (!repeat) repeat = 1; user_ctx = bpf_ctx_init(kattr, sizeof(struct bpf_flow_keys)); if (IS_ERR(user_ctx)) { kfree(data); return PTR_ERR(user_ctx); } if (user_ctx) { ret = verify_user_bpf_flow_keys(user_ctx); if (ret) goto out; flags = user_ctx->flags; } ctx.flow_keys = &flow_keys; ctx.data = data; ctx.data_end = (__u8 *)data + size; bpf_test_timer_enter(&t); do { retval = bpf_flow_dissect(prog, &ctx, eth->h_proto, ETH_HLEN, size, flags); } while (bpf_test_timer_continue(&t, 1, repeat, &ret, &duration)); bpf_test_timer_leave(&t); if (ret < 0) goto out; ret = bpf_test_finish(kattr, uattr, &flow_keys, NULL, 
sizeof(flow_keys), retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(struct bpf_flow_keys)); out: kfree(user_ctx); kfree(data); return ret; } int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { struct bpf_test_timer t = { NO_PREEMPT }; struct bpf_prog_array *progs = NULL; struct bpf_sk_lookup_kern ctx = {}; u32 repeat = kattr->test.repeat; struct bpf_sk_lookup *user_ctx; u32 retval, duration; int ret = -EINVAL; if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; if (kattr->test.data_in || kattr->test.data_size_in || kattr->test.data_out || kattr->test.data_size_out) return -EINVAL; if (!repeat) repeat = 1; user_ctx = bpf_ctx_init(kattr, sizeof(*user_ctx)); if (IS_ERR(user_ctx)) return PTR_ERR(user_ctx); if (!user_ctx) return -EINVAL; if (user_ctx->sk) goto out; if (!range_is_zero(user_ctx, offsetofend(typeof(*user_ctx), local_port), sizeof(*user_ctx))) goto out; if (user_ctx->local_port > U16_MAX) { ret = -ERANGE; goto out; } ctx.family = (u16)user_ctx->family; ctx.protocol = (u16)user_ctx->protocol; ctx.dport = (u16)user_ctx->local_port; ctx.sport = user_ctx->remote_port; switch (ctx.family) { case AF_INET: ctx.v4.daddr = (__force __be32)user_ctx->local_ip4; ctx.v4.saddr = (__force __be32)user_ctx->remote_ip4; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: ctx.v6.daddr = (struct in6_addr *)user_ctx->local_ip6; ctx.v6.saddr = (struct in6_addr *)user_ctx->remote_ip6; break; #endif default: ret = -EAFNOSUPPORT; goto out; } progs = bpf_prog_array_alloc(1, GFP_KERNEL); if (!progs) { ret = -ENOMEM; goto out; } progs->items[0].prog = prog; bpf_test_timer_enter(&t); do { ctx.selected_sk = NULL; retval = BPF_PROG_SK_LOOKUP_RUN_ARRAY(progs, ctx, bpf_prog_run); } while (bpf_test_timer_continue(&t, 1, repeat, &ret, &duration)); bpf_test_timer_leave(&t); if (ret < 0) goto out; user_ctx->cookie = 0; if (ctx.selected_sk) { if (ctx.selected_sk->sk_reuseport && !ctx.no_reuseport) { ret = -EOPNOTSUPP; goto out; } user_ctx->cookie = sock_gen_cookie(ctx.selected_sk); } ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx)); out: bpf_prog_array_free(progs); kfree(user_ctx); return ret; } int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in); __u32 ctx_size_in = kattr->test.ctx_size_in; void *ctx = NULL; u32 retval; int err = 0; /* doesn't support data_in/out, ctx_out, duration, or repeat or flags */ if (kattr->test.data_in || kattr->test.data_out || kattr->test.ctx_out || kattr->test.duration || kattr->test.repeat || kattr->test.flags || kattr->test.batch_size) return -EINVAL; if (ctx_size_in < prog->aux->max_ctx_offset || ctx_size_in > U16_MAX) return -EINVAL; if (ctx_size_in) { ctx = memdup_user(ctx_in, ctx_size_in); if (IS_ERR(ctx)) return PTR_ERR(ctx); } rcu_read_lock_trace(); retval = bpf_prog_run_pin_on_cpu(prog, ctx); rcu_read_unlock_trace(); if (copy_to_user(&uattr->test.retval, &retval, sizeof(u32))) { err = -EFAULT; goto out; } if (ctx_size_in) if (copy_to_user(ctx_in, ctx, ctx_size_in)) err = -EFAULT; out: kfree(ctx); return err; } static int verify_and_copy_hook_state(struct nf_hook_state *state, const struct nf_hook_state *user, struct net_device *dev) { if (user->in || user->out) return -EINVAL; if (user->net || user->sk || user->okfn) return -EINVAL; switch 
(user->pf) { case NFPROTO_IPV4: case NFPROTO_IPV6: switch (state->hook) { case NF_INET_PRE_ROUTING: state->in = dev; break; case NF_INET_LOCAL_IN: state->in = dev; break; case NF_INET_FORWARD: state->in = dev; state->out = dev; break; case NF_INET_LOCAL_OUT: state->out = dev; break; case NF_INET_POST_ROUTING: state->out = dev; break; } break; default: return -EINVAL; } state->pf = user->pf; state->hook = user->hook; return 0; } static __be16 nfproto_eth(int nfproto) { switch (nfproto) { case NFPROTO_IPV4: return htons(ETH_P_IP); case NFPROTO_IPV6: break; } return htons(ETH_P_IPV6); } int bpf_prog_test_run_nf(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { struct net *net = current->nsproxy->net_ns; struct net_device *dev = net->loopback_dev; struct nf_hook_state *user_ctx, hook_state = { .pf = NFPROTO_IPV4, .hook = NF_INET_LOCAL_OUT, }; u32 size = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; struct bpf_nf_ctx ctx = { .state = &hook_state, }; struct sk_buff *skb = NULL; u32 retval, duration; void *data; int ret; if (kattr->test.flags || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; if (size < sizeof(struct iphdr)) return -EINVAL; data = bpf_test_init(kattr, kattr->test.data_size_in, size, NET_SKB_PAD + NET_IP_ALIGN, SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); if (IS_ERR(data)) return PTR_ERR(data); if (!repeat) repeat = 1; user_ctx = bpf_ctx_init(kattr, sizeof(struct nf_hook_state)); if (IS_ERR(user_ctx)) { kfree(data); return PTR_ERR(user_ctx); } if (user_ctx) { ret = verify_and_copy_hook_state(&hook_state, user_ctx, dev); if (ret) goto out; } skb = slab_build_skb(data); if (!skb) { ret = -ENOMEM; goto out; } data = NULL; /* data released via kfree_skb */ skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); __skb_put(skb, size); ret = -EINVAL; if (hook_state.hook != NF_INET_LOCAL_OUT) { if (size < ETH_HLEN + sizeof(struct iphdr)) goto out; skb->protocol = eth_type_trans(skb, dev); switch (skb->protocol) { case htons(ETH_P_IP): if (hook_state.pf == NFPROTO_IPV4) break; goto out; case htons(ETH_P_IPV6): if (size < ETH_HLEN + sizeof(struct ipv6hdr)) goto out; if (hook_state.pf == NFPROTO_IPV6) break; goto out; default: ret = -EPROTO; goto out; } skb_reset_network_header(skb); } else { skb->protocol = nfproto_eth(hook_state.pf); } ctx.skb = skb; ret = bpf_test_run(prog, &ctx, repeat, &retval, &duration, false); if (ret) goto out; ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration); out: kfree(user_ctx); kfree_skb(skb); kfree(data); return ret; } static const struct btf_kfunc_id_set bpf_prog_test_kfunc_set = { .owner = THIS_MODULE, .set = &test_sk_check_kfunc_ids, }; BTF_ID_LIST(bpf_prog_test_dtor_kfunc_ids) BTF_ID(struct, prog_test_ref_kfunc) BTF_ID(func, bpf_kfunc_call_test_release_dtor) BTF_ID(struct, prog_test_member) BTF_ID(func, bpf_kfunc_call_memb_release_dtor) static int __init bpf_prog_test_run_init(void) { const struct btf_id_dtor_kfunc bpf_prog_test_dtor_kfunc[] = { { .btf_id = bpf_prog_test_dtor_kfunc_ids[0], .kfunc_btf_id = bpf_prog_test_dtor_kfunc_ids[1] }, { .btf_id = bpf_prog_test_dtor_kfunc_ids[2], .kfunc_btf_id = bpf_prog_test_dtor_kfunc_ids[3], }, }; int ret; ret = register_btf_fmodret_id_set(&bpf_test_modify_return_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_prog_test_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_prog_test_kfunc_set); return ret ?: 
register_btf_id_dtor_kfuncs(bpf_prog_test_dtor_kfunc, ARRAY_SIZE(bpf_prog_test_dtor_kfunc), THIS_MODULE); } late_initcall(bpf_prog_test_run_init);
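Several helpers above (convert___skb_to_skb(), verify_user_bpf_flow_keys(), and the sk_lookup runner) share one validation pattern: every byte of the user-supplied context that lies past the last field the kernel understands must be zero, which range_is_zero() checks with memchr_inv() between offsetofend() of one allowed field and the offset of the next. A small user-space sketch of the same pattern, with a hypothetical fake_ctx struct and a plain loop standing in for memchr_inv(), looks like this:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* offsetofend() as used in the kernel: offset of the first byte after MEMBER. */
#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))

/* Hypothetical user-visible context: only mark and priority are supported. */
struct fake_ctx {
	unsigned int mark;
	unsigned int priority;
	unsigned int reserved[4];
};

/* true if every byte of buf in [from, to) is zero; plays the role of
 * !memchr_inv(buf + from, 0, to - from) in the kernel helper. */
static bool range_is_zero(const void *buf, size_t from, size_t to)
{
	const unsigned char *p = (const unsigned char *)buf + from;
	size_t i;

	for (i = 0; i < to - from; i++)
		if (p[i])
			return false;
	return true;
}

int main(void)
{
	struct fake_ctx ctx;

	memset(&ctx, 0, sizeof(ctx));
	ctx.mark = 42;		/* supported field, may be non-zero */
	ctx.reserved[1] = 1;	/* unsupported field, must stay zero */

	if (!range_is_zero(&ctx, offsetofend(struct fake_ctx, priority),
			   sizeof(ctx)))
		puts("rejected: byte set past the last supported field");
	else
		puts("accepted");
	return 0;
}

Refusing any non-zero byte past the supported fields is what lets new fields be appended to the user-visible struct later: a kernel that does not understand a field rejects the request instead of silently ignoring the value.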
// SPDX-License-Identifier: GPL-2.0-only /* * scsi.c Copyright (C) 1992 Drew Eckhardt * Copyright (C) 1993, 1994, 1995, 1999 Eric Youngdale * Copyright (C) 2002, 2003 Christoph Hellwig * * generic mid-level SCSI driver * Initial versions: Drew Eckhardt * Subsequent revisions: Eric Youngdale * * <drew@colorado.edu> * * Bug correction thanks go to : * Rik Faith <faith@cs.unc.edu> * Tommy Thorn <tthorn> * Thomas Wuensche <tw@fgb1.fgb.mw.tu-muenchen.de> * * Modified by Eric Youngdale eric@andante.org or ericy@gnu.ai.mit.edu to * add scatter-gather, multiple outstanding request, and other * enhancements. * * Native multichannel, wide scsi, /proc/scsi and hot plugging * support added by Michael Neuffer <mike@i-connect.net> * * Added request_module("scsi_hostadapter") for kerneld: * (Put an "alias scsi_hostadapter your_hostadapter" in /etc/modprobe.conf) * Bjorn Ekwall <bj0rn@blox.se> * (changed to kmod) * * Major improvements to the timeout, abort, and reset processing, * as well as performance modifications for large queue depths by * Leonard N. Zubkoff <lnz@dandelion.com> * * Converted cli() code to spinlocks, Ingo Molnar * * Jiffies wrap fixes (host->resetting), 3 Dec 1998 Andrea Arcangeli * * out_of_space hacks, D. Gilbert (dpg) 990608 */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/delay.h> #include <linux/init.h> #include <linux/completion.h> #include <linux/unistd.h> #include <linux/spinlock.h> #include <linux/kmod.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/unaligned.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_device.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_host.h> #include <scsi/scsi_tcq.h> #include "scsi_priv.h" #include "scsi_logging.h" #define CREATE_TRACE_POINTS #include <trace/events/scsi.h> /* * Definitions and constants. */ /* * Note - the initial logging level can be set here to log events at boot time. * After the system is up, you may enable logging via the /proc interface.
*/ unsigned int scsi_logging_level; #if defined(CONFIG_SCSI_LOGGING) EXPORT_SYMBOL(scsi_logging_level); #endif #ifdef CONFIG_SCSI_LOGGING void scsi_log_send(struct scsi_cmnd *cmd) { unsigned int level; /* * If ML QUEUE log level is greater than or equal to: * * 1: nothing (match completion) * * 2: log opcode + command of all commands + cmd address * * 3: same as 2 * * 4: same as 3 */ if (unlikely(scsi_logging_level)) { level = SCSI_LOG_LEVEL(SCSI_LOG_MLQUEUE_SHIFT, SCSI_LOG_MLQUEUE_BITS); if (level > 1) { scmd_printk(KERN_INFO, cmd, "Send: scmd 0x%p\n", cmd); scsi_print_command(cmd); } } } void scsi_log_completion(struct scsi_cmnd *cmd, int disposition) { unsigned int level; /* * If ML COMPLETE log level is greater than or equal to: * * 1: log disposition, result, opcode + command, and conditionally * sense data for failures or non SUCCESS dispositions. * * 2: same as 1 but for all command completions. * * 3: same as 2 * * 4: same as 3 plus dump extra junk */ if (unlikely(scsi_logging_level)) { level = SCSI_LOG_LEVEL(SCSI_LOG_MLCOMPLETE_SHIFT, SCSI_LOG_MLCOMPLETE_BITS); if (((level > 0) && (cmd->result || disposition != SUCCESS)) || (level > 1)) { scsi_print_result(cmd, "Done", disposition); scsi_print_command(cmd); if (scsi_status_is_check_condition(cmd->result)) scsi_print_sense(cmd); if (level > 3) scmd_printk(KERN_INFO, cmd, "scsi host busy %d failed %d\n", scsi_host_busy(cmd->device->host), cmd->device->host->host_failed); } } } #endif /** * scsi_finish_command - cleanup and pass command back to upper layer * @cmd: the command * * Description: Pass command off to upper layer for finishing of I/O * request, waking processes that are waiting on results, * etc. */ void scsi_finish_command(struct scsi_cmnd *cmd) { struct scsi_device *sdev = cmd->device; struct scsi_target *starget = scsi_target(sdev); struct Scsi_Host *shost = sdev->host; struct scsi_driver *drv; unsigned int good_bytes; scsi_device_unbusy(sdev, cmd); /* * Clear the flags that say that the device/target/host is no longer * capable of accepting new commands. */ if (atomic_read(&shost->host_blocked)) atomic_set(&shost->host_blocked, 0); if (atomic_read(&starget->target_blocked)) atomic_set(&starget->target_blocked, 0); if (atomic_read(&sdev->device_blocked)) atomic_set(&sdev->device_blocked, 0); SCSI_LOG_MLCOMPLETE(4, sdev_printk(KERN_INFO, sdev, "Notifying upper driver of completion " "(result %x)\n", cmd->result)); good_bytes = scsi_bufflen(cmd); if (!blk_rq_is_passthrough(scsi_cmd_to_rq(cmd))) { int old_good_bytes = good_bytes; drv = scsi_cmd_to_driver(cmd); if (drv->done) good_bytes = drv->done(cmd); /* * USB may not give sense identifying bad sector and * simply return a residue instead, so subtract off the * residue if drv->done() error processing indicates no * change to the completion length. */ if (good_bytes == old_good_bytes) good_bytes -= scsi_get_resid(cmd); } scsi_io_completion(cmd, good_bytes); } /* * 4096 is big enough for saturating fast SCSI LUNs. */ int scsi_device_max_queue_depth(struct scsi_device *sdev) { return min_t(int, sdev->host->can_queue, 4096); } /** * scsi_change_queue_depth - change a device's queue depth * @sdev: SCSI Device in question * @depth: number of commands allowed to be queued to the driver * * Sets the device queue depth and returns the new value. 
*/ int scsi_change_queue_depth(struct scsi_device *sdev, int depth) { depth = min_t(int, depth, scsi_device_max_queue_depth(sdev)); if (depth > 0) { sdev->queue_depth = depth; wmb(); } if (sdev->request_queue) blk_set_queue_depth(sdev->request_queue, depth); sbitmap_resize(&sdev->budget_map, sdev->queue_depth); return sdev->queue_depth; } EXPORT_SYMBOL(scsi_change_queue_depth); /** * scsi_track_queue_full - track QUEUE_FULL events to adjust queue depth * @sdev: SCSI Device in question * @depth: Current number of outstanding SCSI commands on this device, * not counting the one returned as QUEUE_FULL. * * Description: This function will track successive QUEUE_FULL events on a * specific SCSI device to determine if and when there is a * need to adjust the queue depth on the device. * * Returns: * * 0 - No change needed * * >0 - Adjust queue depth to this new depth, * * -1 - Drop back to untagged operation using host->cmd_per_lun as the * untagged command depth * * Lock Status: None held on entry * * Notes: Low level drivers may call this at any time and we will do * "The Right Thing." We are interrupt context safe. */ int scsi_track_queue_full(struct scsi_device *sdev, int depth) { /* * Don't let QUEUE_FULLs on the same * jiffies count, they could all be from * same event. */ if ((jiffies >> 4) == (sdev->last_queue_full_time >> 4)) return 0; sdev->last_queue_full_time = jiffies; if (sdev->last_queue_full_depth != depth) { sdev->last_queue_full_count = 1; sdev->last_queue_full_depth = depth; } else { sdev->last_queue_full_count++; } if (sdev->last_queue_full_count <= 10) return 0; return scsi_change_queue_depth(sdev, depth); } EXPORT_SYMBOL(scsi_track_queue_full); /** * scsi_vpd_inquiry - Request a device provide us with a VPD page * @sdev: The device to ask * @buffer: Where to put the result * @page: Which Vital Product Data to return * @len: The length of the buffer * * This is an internal helper function. You probably want to use * scsi_get_vpd_page instead. * * Returns size of the vpd page on success or a negative error number. */ static int scsi_vpd_inquiry(struct scsi_device *sdev, unsigned char *buffer, u8 page, unsigned len) { int result; unsigned char cmd[16]; if (len < 4) return -EINVAL; cmd[0] = INQUIRY; cmd[1] = 1; /* EVPD */ cmd[2] = page; cmd[3] = len >> 8; cmd[4] = len & 0xff; cmd[5] = 0; /* Control byte */ /* * I'm not convinced we need to try quite this hard to get VPD, but * all the existing users tried this hard. */ result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, buffer, len, 30 * HZ, 3, NULL); if (result) return -EIO; /* * Sanity check that we got the page back that we asked for and that * the page size is not 0. */ if (buffer[1] != page) return -EIO; result = get_unaligned_be16(&buffer[2]); if (!result) return -EIO; return result + 4; } enum scsi_vpd_parameters { SCSI_VPD_HEADER_SIZE = 4, SCSI_VPD_LIST_SIZE = 36, }; static int scsi_get_vpd_size(struct scsi_device *sdev, u8 page) { unsigned char vpd[SCSI_VPD_LIST_SIZE] __aligned(4); int result; if (sdev->no_vpd_size) return SCSI_DEFAULT_VPD_LEN; /* * Fetch the supported pages VPD and validate that the requested page * number is present. 
*/ if (page != 0) { result = scsi_vpd_inquiry(sdev, vpd, 0, sizeof(vpd)); if (result < SCSI_VPD_HEADER_SIZE) return 0; if (result > sizeof(vpd)) { dev_warn_once(&sdev->sdev_gendev, "%s: long VPD page 0 length: %d bytes\n", __func__, result); result = sizeof(vpd); } result -= SCSI_VPD_HEADER_SIZE; if (!memchr(&vpd[SCSI_VPD_HEADER_SIZE], page, result)) return 0; } /* * Fetch the VPD page header to find out how big the page * is. This is done to prevent problems on legacy devices * which can not handle allocation lengths as large as * potentially requested by the caller. */ result = scsi_vpd_inquiry(sdev, vpd, page, SCSI_VPD_HEADER_SIZE); if (result < 0) return 0; if (result < SCSI_VPD_HEADER_SIZE) { dev_warn_once(&sdev->sdev_gendev, "%s: short VPD page 0x%02x length: %d bytes\n", __func__, page, result); return 0; } return result; } /** * scsi_get_vpd_page - Get Vital Product Data from a SCSI device * @sdev: The device to ask * @page: Which Vital Product Data to return * @buf: where to store the VPD * @buf_len: number of bytes in the VPD buffer area * * SCSI devices may optionally supply Vital Product Data. Each 'page' * of VPD is defined in the appropriate SCSI document (eg SPC, SBC). * If the device supports this VPD page, this routine fills @buf * with the data from that page and return 0. If the VPD page is not * supported or its content cannot be retrieved, -EINVAL is returned. */ int scsi_get_vpd_page(struct scsi_device *sdev, u8 page, unsigned char *buf, int buf_len) { int result, vpd_len; if (!scsi_device_supports_vpd(sdev)) return -EINVAL; vpd_len = scsi_get_vpd_size(sdev, page); if (vpd_len <= 0) return -EINVAL; vpd_len = min(vpd_len, buf_len); /* * Fetch the actual page. Since the appropriate size was reported * by the device it is now safe to ask for something bigger. */ memset(buf, 0, buf_len); result = scsi_vpd_inquiry(sdev, buf, page, vpd_len); if (result < 0) return -EINVAL; else if (result > vpd_len) dev_warn_once(&sdev->sdev_gendev, "%s: VPD page 0x%02x result %d > %d bytes\n", __func__, page, result, vpd_len); return 0; } EXPORT_SYMBOL_GPL(scsi_get_vpd_page); /** * scsi_get_vpd_buf - Get Vital Product Data from a SCSI device * @sdev: The device to ask * @page: Which Vital Product Data to return * * Returns %NULL upon failure. */ static struct scsi_vpd *scsi_get_vpd_buf(struct scsi_device *sdev, u8 page) { struct scsi_vpd *vpd_buf; int vpd_len, result; vpd_len = scsi_get_vpd_size(sdev, page); if (vpd_len <= 0) return NULL; retry_pg: /* * Fetch the actual page. Since the appropriate size was reported * by the device it is now safe to ask for something bigger. 
*/ vpd_buf = kmalloc(sizeof(*vpd_buf) + vpd_len, GFP_KERNEL); if (!vpd_buf) return NULL; result = scsi_vpd_inquiry(sdev, vpd_buf->data, page, vpd_len); if (result < 0) { kfree(vpd_buf); return NULL; } if (result > vpd_len) { dev_warn_once(&sdev->sdev_gendev, "%s: VPD page 0x%02x result %d > %d bytes\n", __func__, page, result, vpd_len); vpd_len = result; kfree(vpd_buf); goto retry_pg; } vpd_buf->len = result; return vpd_buf; } static void scsi_update_vpd_page(struct scsi_device *sdev, u8 page, struct scsi_vpd __rcu **sdev_vpd_buf) { struct scsi_vpd *vpd_buf; vpd_buf = scsi_get_vpd_buf(sdev, page); if (!vpd_buf) return; mutex_lock(&sdev->inquiry_mutex); vpd_buf = rcu_replace_pointer(*sdev_vpd_buf, vpd_buf, lockdep_is_held(&sdev->inquiry_mutex)); mutex_unlock(&sdev->inquiry_mutex); if (vpd_buf) kfree_rcu(vpd_buf, rcu); } /** * scsi_attach_vpd - Attach Vital Product Data to a SCSI device structure * @sdev: The device to ask * * Attach the 'Device Identification' VPD page (0x83) and the * 'Unit Serial Number' VPD page (0x80) to a SCSI device * structure. This information can be used to identify the device * uniquely. */ void scsi_attach_vpd(struct scsi_device *sdev) { int i; struct scsi_vpd *vpd_buf; if (!scsi_device_supports_vpd(sdev)) return; /* Ask for all the pages supported by this device */ vpd_buf = scsi_get_vpd_buf(sdev, 0); if (!vpd_buf) return; for (i = 4; i < vpd_buf->len; i++) { switch (vpd_buf->data[i]) { case 0x0: scsi_update_vpd_page(sdev, 0x0, &sdev->vpd_pg0); break; case 0x80: scsi_update_vpd_page(sdev, 0x80, &sdev->vpd_pg80); break; case 0x83: scsi_update_vpd_page(sdev, 0x83, &sdev->vpd_pg83); break; case 0x89: scsi_update_vpd_page(sdev, 0x89, &sdev->vpd_pg89); break; case 0xb0: scsi_update_vpd_page(sdev, 0xb0, &sdev->vpd_pgb0); break; case 0xb1: scsi_update_vpd_page(sdev, 0xb1, &sdev->vpd_pgb1); break; case 0xb2: scsi_update_vpd_page(sdev, 0xb2, &sdev->vpd_pgb2); break; case 0xb7: scsi_update_vpd_page(sdev, 0xb7, &sdev->vpd_pgb7); break; default: break; } } kfree(vpd_buf); } /** * scsi_report_opcode - Find out if a given command is supported * @sdev: scsi device to query * @buffer: scratch buffer (must be at least 20 bytes long) * @len: length of buffer * @opcode: opcode for the command to look up * @sa: service action for the command to look up * * Uses the REPORT SUPPORTED OPERATION CODES to check support for the * command identified with @opcode and @sa. If the command does not * have a service action, @sa must be 0. Returns -EINVAL if RSOC fails, * 0 if the command is not supported and 1 if the device claims to * support the command. 
*/ int scsi_report_opcode(struct scsi_device *sdev, unsigned char *buffer, unsigned int len, unsigned char opcode, unsigned short sa) { unsigned char cmd[16]; struct scsi_sense_hdr sshdr; int result, request_len; const struct scsi_exec_args exec_args = { .sshdr = &sshdr, }; if (sdev->no_report_opcodes || sdev->scsi_level < SCSI_SPC_3) return -EINVAL; /* RSOC header + size of command we are asking about */ request_len = 4 + COMMAND_SIZE(opcode); if (request_len > len) { dev_warn_once(&sdev->sdev_gendev, "%s: len %u bytes, opcode 0x%02x needs %u\n", __func__, len, opcode, request_len); return -EINVAL; } memset(cmd, 0, 16); cmd[0] = MAINTENANCE_IN; cmd[1] = MI_REPORT_SUPPORTED_OPERATION_CODES; if (!sa) { cmd[2] = 1; /* One command format */ cmd[3] = opcode; } else { cmd[2] = 3; /* One command format with service action */ cmd[3] = opcode; put_unaligned_be16(sa, &cmd[4]); } put_unaligned_be32(request_len, &cmd[6]); memset(buffer, 0, len); result = scsi_execute_cmd(sdev, cmd, REQ_OP_DRV_IN, buffer, request_len, 30 * HZ, 3, &exec_args); if (result < 0) return result; if (result && scsi_sense_valid(&sshdr) && sshdr.sense_key == ILLEGAL_REQUEST && (sshdr.asc == 0x20 || sshdr.asc == 0x24) && sshdr.ascq == 0x00) return -EINVAL; if ((buffer[1] & 3) == 3) /* Command supported */ return 1; return 0; } EXPORT_SYMBOL(scsi_report_opcode); #define SCSI_CDL_CHECK_BUF_LEN 64 static bool scsi_cdl_check_cmd(struct scsi_device *sdev, u8 opcode, u16 sa, unsigned char *buf) { int ret; u8 cdlp; /* Check operation code */ ret = scsi_report_opcode(sdev, buf, SCSI_CDL_CHECK_BUF_LEN, opcode, sa); if (ret <= 0) return false; if ((buf[1] & 0x03) != 0x03) return false; /* * See SPC-6, One_command parameter data format for * REPORT SUPPORTED OPERATION CODES. We have the following cases * depending on rwcdlp (buf[0] & 0x01) value: * - rwcdlp == 0: then cdlp indicates support for the A mode page when * it is equal to 1 and for the B mode page when it is * equal to 2. * - rwcdlp == 1: then cdlp indicates support for the T2A mode page * when it is equal to 1 and for the T2B mode page when * it is equal to 2. * Overall, to detect support for command duration limits, we only need * to check that cdlp is 1 or 2. */ cdlp = (buf[1] & 0x18) >> 3; return cdlp == 0x01 || cdlp == 0x02; } /** * scsi_cdl_check - Check if a SCSI device supports Command Duration Limits * @sdev: The device to check */ void scsi_cdl_check(struct scsi_device *sdev) { bool cdl_supported; unsigned char *buf; /* * Support for CDL was defined in SPC-5. Ignore devices reporting an * lower SPC version. This also avoids problems with old drives choking * on MAINTENANCE_IN / MI_REPORT_SUPPORTED_OPERATION_CODES with a * service action specified, as done in scsi_cdl_check_cmd(). */ if (sdev->scsi_level < SCSI_SPC_5) { sdev->cdl_supported = 0; return; } buf = kmalloc(SCSI_CDL_CHECK_BUF_LEN, GFP_KERNEL); if (!buf) { sdev->cdl_supported = 0; return; } /* Check support for READ_16, WRITE_16, READ_32 and WRITE_32 commands */ cdl_supported = scsi_cdl_check_cmd(sdev, READ_16, 0, buf) || scsi_cdl_check_cmd(sdev, WRITE_16, 0, buf) || scsi_cdl_check_cmd(sdev, VARIABLE_LENGTH_CMD, READ_32, buf) || scsi_cdl_check_cmd(sdev, VARIABLE_LENGTH_CMD, WRITE_32, buf); if (cdl_supported) { /* * We have CDL support: force the use of READ16/WRITE16. * READ32 and WRITE32 will be used for devices that support * the T10_PI_TYPE2_PROTECTION protection type. 
*/ sdev->use_16_for_rw = 1; sdev->use_10_for_rw = 0; sdev->cdl_supported = 1; /* * If the device supports CDL, make sure that the current drive * feature status is consistent with the user controlled * cdl_enable state. */ scsi_cdl_enable(sdev, sdev->cdl_enable); } else { sdev->cdl_supported = 0; } kfree(buf); } /** * scsi_cdl_enable - Enable or disable a SCSI device supports for Command * Duration Limits * @sdev: The target device * @enable: the target state */ int scsi_cdl_enable(struct scsi_device *sdev, bool enable) { char buf[64]; int ret; if (!sdev->cdl_supported) return -EOPNOTSUPP; /* * For ATA devices, CDL needs to be enabled with a SET FEATURES command. */ if (sdev->is_ata) { struct scsi_mode_data data; struct scsi_sense_hdr sshdr; char *buf_data; int len; ret = scsi_mode_sense(sdev, 0x08, 0x0a, 0xf2, buf, sizeof(buf), 5 * HZ, 3, &data, NULL); if (ret) return -EINVAL; /* Enable or disable CDL using the ATA feature page */ len = min_t(size_t, sizeof(buf), data.length - data.header_length - data.block_descriptor_length); buf_data = buf + data.header_length + data.block_descriptor_length; /* * If we want to enable CDL and CDL is already enabled on the * device, do nothing. This avoids needlessly resetting the CDL * statistics on the device as that is implied by the CDL enable * action. Similar to this, there is no need to do anything if * we want to disable CDL and CDL is already disabled. */ if (enable) { if ((buf_data[4] & 0x03) == 0x02) goto out; buf_data[4] &= ~0x03; buf_data[4] |= 0x02; } else { if ((buf_data[4] & 0x03) == 0x00) goto out; buf_data[4] &= ~0x03; } ret = scsi_mode_select(sdev, 1, 0, buf_data, len, 5 * HZ, 3, &data, &sshdr); if (ret) { if (ret > 0 && scsi_sense_valid(&sshdr)) scsi_print_sense_hdr(sdev, dev_name(&sdev->sdev_gendev), &sshdr); return ret; } } out: sdev->cdl_enable = enable; return 0; } /** * scsi_device_get - get an additional reference to a scsi_device * @sdev: device to get a reference to * * Description: Gets a reference to the scsi_device and increments the use count * of the underlying LLDD module. You must hold host_lock of the * parent Scsi_Host or already have a reference when calling this. * * This will fail if a device is deleted or cancelled, or when the LLD module * is in the process of being unloaded. */ int scsi_device_get(struct scsi_device *sdev) { if (sdev->sdev_state == SDEV_DEL || sdev->sdev_state == SDEV_CANCEL) goto fail; if (!try_module_get(sdev->host->hostt->module)) goto fail; if (!get_device(&sdev->sdev_gendev)) goto fail_put_module; return 0; fail_put_module: module_put(sdev->host->hostt->module); fail: return -ENXIO; } EXPORT_SYMBOL(scsi_device_get); /** * scsi_device_put - release a reference to a scsi_device * @sdev: device to release a reference on. * * Description: Release a reference to the scsi_device and decrements the use * count of the underlying LLDD module. The device is freed once the last * user vanishes. */ void scsi_device_put(struct scsi_device *sdev) { struct module *mod = sdev->host->hostt->module; put_device(&sdev->sdev_gendev); module_put(mod); } EXPORT_SYMBOL(scsi_device_put); /* helper for shost_for_each_device, see that for documentation */ struct scsi_device *__scsi_iterate_devices(struct Scsi_Host *shost, struct scsi_device *prev) { struct list_head *list = (prev ? 
&prev->siblings : &shost->__devices); struct scsi_device *next = NULL; unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); while (list->next != &shost->__devices) { next = list_entry(list->next, struct scsi_device, siblings); /* skip devices that we can't get a reference to */ if (!scsi_device_get(next)) break; next = NULL; list = list->next; } spin_unlock_irqrestore(shost->host_lock, flags); if (prev) scsi_device_put(prev); return next; } EXPORT_SYMBOL(__scsi_iterate_devices); /** * starget_for_each_device - helper to walk all devices of a target * @starget: target whose devices we want to iterate over. * @data: Opaque passed to each function call. * @fn: Function to call on each device * * This traverses over each device of @starget. The devices have * a reference that must be released by scsi_host_put when breaking * out of the loop. */ void starget_for_each_device(struct scsi_target *starget, void *data, void (*fn)(struct scsi_device *, void *)) { struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); struct scsi_device *sdev; shost_for_each_device(sdev, shost) { if ((sdev->channel == starget->channel) && (sdev->id == starget->id)) fn(sdev, data); } } EXPORT_SYMBOL(starget_for_each_device); /** * __starget_for_each_device - helper to walk all devices of a target (UNLOCKED) * @starget: target whose devices we want to iterate over. * @data: parameter for callback @fn() * @fn: callback function that is invoked for each device * * This traverses over each device of @starget. It does _not_ * take a reference on the scsi_device, so the whole loop must be * protected by shost->host_lock. * * Note: The only reason why drivers would want to use this is because * they need to access the device list in irq context. Otherwise you * really want to use starget_for_each_device instead. **/ void __starget_for_each_device(struct scsi_target *starget, void *data, void (*fn)(struct scsi_device *, void *)) { struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); struct scsi_device *sdev; __shost_for_each_device(sdev, shost) { if ((sdev->channel == starget->channel) && (sdev->id == starget->id)) fn(sdev, data); } } EXPORT_SYMBOL(__starget_for_each_device); /** * __scsi_device_lookup_by_target - find a device given the target (UNLOCKED) * @starget: SCSI target pointer * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @lun for a given * @starget. The returned scsi_device does not have an additional * reference. You must hold the host's host_lock over this call and * any access to the returned scsi_device. A scsi_device in state * SDEV_DEL is skipped. * * Note: The only reason why drivers should use this is because * they need to access the device list in irq context. Otherwise you * really want to use scsi_device_lookup_by_target instead. **/ struct scsi_device *__scsi_device_lookup_by_target(struct scsi_target *starget, u64 lun) { struct scsi_device *sdev; list_for_each_entry(sdev, &starget->devices, same_target_siblings) { if (sdev->sdev_state == SDEV_DEL) continue; if (sdev->lun ==lun) return sdev; } return NULL; } EXPORT_SYMBOL(__scsi_device_lookup_by_target); /** * scsi_device_lookup_by_target - find a device given the target * @starget: SCSI target pointer * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @lun for a given * @starget. The returned scsi_device has an additional reference that * needs to be released with scsi_device_put once you're done with it. 
**/ struct scsi_device *scsi_device_lookup_by_target(struct scsi_target *starget, u64 lun) { struct scsi_device *sdev; struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); sdev = __scsi_device_lookup_by_target(starget, lun); if (sdev && scsi_device_get(sdev)) sdev = NULL; spin_unlock_irqrestore(shost->host_lock, flags); return sdev; } EXPORT_SYMBOL(scsi_device_lookup_by_target); /** * __scsi_device_lookup - find a device given the host (UNLOCKED) * @shost: SCSI host pointer * @channel: SCSI channel (zero if only one channel) * @id: SCSI target number (physical unit number) * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @channel, @id, @lun * for a given host. The returned scsi_device does not have an additional * reference. You must hold the host's host_lock over this call and any access * to the returned scsi_device. * * Note: The only reason why drivers would want to use this is because * they need to access the device list in irq context. Otherwise you * really want to use scsi_device_lookup instead. **/ struct scsi_device *__scsi_device_lookup(struct Scsi_Host *shost, uint channel, uint id, u64 lun) { struct scsi_device *sdev; list_for_each_entry(sdev, &shost->__devices, siblings) { if (sdev->sdev_state == SDEV_DEL) continue; if (sdev->channel == channel && sdev->id == id && sdev->lun ==lun) return sdev; } return NULL; } EXPORT_SYMBOL(__scsi_device_lookup); /** * scsi_device_lookup - find a device given the host * @shost: SCSI host pointer * @channel: SCSI channel (zero if only one channel) * @id: SCSI target number (physical unit number) * @lun: SCSI Logical Unit Number * * Description: Looks up the scsi_device with the specified @channel, @id, @lun * for a given host. The returned scsi_device has an additional reference that * needs to be released with scsi_device_put once you're done with it. **/ struct scsi_device *scsi_device_lookup(struct Scsi_Host *shost, uint channel, uint id, u64 lun) { struct scsi_device *sdev; unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); sdev = __scsi_device_lookup(shost, channel, id, lun); if (sdev && scsi_device_get(sdev)) sdev = NULL; spin_unlock_irqrestore(shost->host_lock, flags); return sdev; } EXPORT_SYMBOL(scsi_device_lookup); MODULE_DESCRIPTION("SCSI core"); MODULE_LICENSE("GPL"); module_param(scsi_logging_level, int, S_IRUGO|S_IWUSR); MODULE_PARM_DESC(scsi_logging_level, "a bit mask of logging levels"); static int __init init_scsi(void) { int error; error = scsi_init_procfs(); if (error) goto cleanup_queue; error = scsi_init_devinfo(); if (error) goto cleanup_procfs; error = scsi_init_hosts(); if (error) goto cleanup_devlist; error = scsi_init_sysctl(); if (error) goto cleanup_hosts; error = scsi_sysfs_register(); if (error) goto cleanup_sysctl; scsi_netlink_init(); printk(KERN_NOTICE "SCSI subsystem initialized\n"); return 0; cleanup_sysctl: scsi_exit_sysctl(); cleanup_hosts: scsi_exit_hosts(); cleanup_devlist: scsi_exit_devinfo(); cleanup_procfs: scsi_exit_procfs(); cleanup_queue: scsi_exit_queue(); printk(KERN_ERR "SCSI subsystem failed to initialize, error = %d\n", -error); return error; } static void __exit exit_scsi(void) { scsi_netlink_exit(); scsi_sysfs_unregister(); scsi_exit_sysctl(); scsi_exit_hosts(); scsi_exit_devinfo(); scsi_exit_procfs(); scsi_exit_queue(); } subsys_initcall(init_scsi); module_exit(exit_scsi);
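The queue-depth helpers above, scsi_change_queue_depth() and scsi_track_queue_full(), are meant to be driven from a low-level driver. Below is a minimal sketch of how a hypothetical LLDD might wire them up; the example_* names and the hardware limit of 64 are illustrative assumptions, not part of this file.

/* Sketch only: a hypothetical low-level driver's queue-depth hooks. */
#include <scsi/scsi_device.h>
#include <scsi/scsi_host.h>

static int example_change_queue_depth(struct scsi_device *sdev, int qdepth)
{
	/* Clamp to the (assumed) hardware limit, then let the midlayer
	 * update sdev->queue_depth and resize the budget map. */
	if (qdepth > 64)
		qdepth = 64;
	return scsi_change_queue_depth(sdev, qdepth);
}

static void example_on_queue_full(struct scsi_device *sdev, int outstanding)
{
	/* Report the depth seen when a command came back with QUEUE_FULL,
	 * excluding that command itself; the midlayer only lowers the
	 * queue depth after repeated events in distinct jiffies windows. */
	scsi_track_queue_full(sdev, outstanding - 1);
}

Since scsi_track_queue_full() returns 0 until it has seen more than ten QUEUE_FULL events for the same depth, calling it on every such completion is cheap and is the intended usage.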
// SPDX-License-Identifier: GPL-2.0-only /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@netfilter.org> */ /* Kernel module implementing an IP set type: the hash:ip,port,net type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Range as input support for IPv4 added */ /* 3 nomatch flag support added */ /* 4 Counters support added */ /* 5 Comments support added */ /* 6 Forceadd support added */ /* 7 skbinfo support added */ #define IPSET_TYPE_REV_MAX 8 /* bucketsize, initval support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@netfilter.org>"); IP_SET_MODULE_DESC("hash:ip,port,net", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,net"); /* Type specific function prefix */ #define HTYPE hash_ipportnet /* We squeeze the "nomatch" flag into cidr: we don't support cidr == 0 * However this way we have to store internally cidr -
1, * dancing back and forth. */ #define IP_SET_HASH_WITH_NETS_PACKED #define IP_SET_HASH_WITH_PROTO #define IP_SET_HASH_WITH_NETS /* IPv4 variant */ /* Member elements */ struct hash_ipportnet4_elem { __be32 ip; __be32 ip2; __be16 port; u8 cidr:7; u8 nomatch:1; u8 proto; }; /* Common functions */ static bool hash_ipportnet4_data_equal(const struct hash_ipportnet4_elem *ip1, const struct hash_ipportnet4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->ip2 == ip2->ip2 && ip1->cidr == ip2->cidr && ip1->port == ip2->port && ip1->proto == ip2->proto; } static int hash_ipportnet4_do_data_match(const struct hash_ipportnet4_elem *elem) { return elem->nomatch ? -ENOTEMPTY : 1; } static void hash_ipportnet4_data_set_flags(struct hash_ipportnet4_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static void hash_ipportnet4_data_reset_flags(struct hash_ipportnet4_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_ipportnet4_data_netmask(struct hash_ipportnet4_elem *elem, u8 cidr) { elem->ip2 &= ip_set_netmask(cidr); elem->cidr = cidr - 1; } static bool hash_ipportnet4_data_list(struct sk_buff *skb, const struct hash_ipportnet4_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipportnet4_data_next(struct hash_ipportnet4_elem *next, const struct hash_ipportnet4_elem *d) { next->ip = d->ip; next->port = d->port; next->ip2 = d->ip2; } #define MTYPE hash_ipportnet4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_ipportnet4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ipportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet4_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (adt == IPSET_TEST) e.cidr = HOST_MASK - 1; if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); e.ip2 &= ip_set_netmask(e.cidr + 1); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { struct hash_ipportnet4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, p = 0, port, port_to; u32 ip2_from = 0, ip2_to = 0, ip2, i = 0; bool with_ports = false; u8 cidr; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP], &ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) 
return ret; ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2], &ip2_from); if (ret) return ret; if (tb[IPSET_ATTR_CIDR2]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; e.cidr = cidr - 1; } e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } with_ports = with_ports && tb[IPSET_ATTR_PORT_TO]; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_IP_TO] || with_ports || tb[IPSET_ATTR_IP2_TO])) { e.ip = htonl(ip); e.ip2 = htonl(ip2_from & ip_set_hostmask(e.cidr + 1)); ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip; if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip > ip_to) swap(ip, ip_to); } else if (tb[IPSET_ATTR_CIDR]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } port_to = port = ntohs(e.port); if (tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); } ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); if (ret) return ret; if (ip2_from > ip2_to) swap(ip2_from, ip2_to); if (ip2_from + UINT_MAX == ip2_to) return -IPSET_ERR_HASH_RANGE; } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr + 1); } if (retried) { ip = ntohl(h->next.ip); p = ntohs(h->next.port); ip2 = ntohl(h->next.ip2); } else { p = port; ip2 = ip2_from; } for (; ip <= ip_to; ip++) { e.ip = htonl(ip); for (; p <= port_to; p++) { e.port = htons(p); do { i++; e.ip2 = htonl(ip2); ip2 = ip_set_range_to_cidr(ip2, ip2_to, &cidr); e.cidr = cidr - 1; if (i > IPSET_MAX_RANGE) { hash_ipportnet4_data_next(&h->next, &e); return -ERANGE; } ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } while (ip2++ < ip2_to); ip2 = ip2_from; } p = port; } return ret; } /* IPv6 variant */ struct hash_ipportnet6_elem { union nf_inet_addr ip; union nf_inet_addr ip2; __be16 port; u8 cidr:7; u8 nomatch:1; u8 proto; }; /* Common functions */ static bool hash_ipportnet6_data_equal(const struct hash_ipportnet6_elem *ip1, const struct hash_ipportnet6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) && ip1->cidr == ip2->cidr && ip1->port == ip2->port && ip1->proto == ip2->proto; } static int hash_ipportnet6_do_data_match(const struct hash_ipportnet6_elem *elem) { return elem->nomatch ? 
-ENOTEMPTY : 1; } static void hash_ipportnet6_data_set_flags(struct hash_ipportnet6_elem *elem, u32 flags) { elem->nomatch = !!((flags >> 16) & IPSET_FLAG_NOMATCH); } static void hash_ipportnet6_data_reset_flags(struct hash_ipportnet6_elem *elem, u8 *flags) { swap(*flags, elem->nomatch); } static void hash_ipportnet6_data_netmask(struct hash_ipportnet6_elem *elem, u8 cidr) { ip6_netmask(&elem->ip2, cidr); elem->cidr = cidr - 1; } static bool hash_ipportnet6_data_list(struct sk_buff *skb, const struct hash_ipportnet6_elem *data) { u32 flags = data->nomatch ? IPSET_FLAG_NOMATCH : 0; if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_CIDR2, data->cidr + 1) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto) || (flags && nla_put_net32(skb, IPSET_ATTR_CADT_FLAGS, htonl(flags)))) goto nla_put_failure; return false; nla_put_failure: return true; } static void hash_ipportnet6_data_next(struct hash_ipportnet6_elem *next, const struct hash_ipportnet6_elem *d) { next->port = d->port; } #undef MTYPE #undef HOST_MASK #define MTYPE hash_ipportnet6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ipportnet6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { const struct hash_ipportnet6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet6_elem e = { .cidr = INIT_CIDR(h->nets[0].cidr[0], HOST_MASK), }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (adt == IPSET_TEST) e.cidr = HOST_MASK - 1; if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); ip6_netmask(&e.ip2, e.cidr + 1); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportnet6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_ipportnet6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportnet6_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; u8 cidr; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO) || !ip_set_optattr_netorder(tb, IPSET_ATTR_CADT_FLAGS))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if (unlikely(tb[IPSET_ATTR_CIDR])) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; if (tb[IPSET_ATTR_CIDR2]) { cidr = nla_get_u8(tb[IPSET_ATTR_CIDR2]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; e.cidr = cidr - 1; } ip6_netmask(&e.ip2, e.cidr + 1); e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else 
{ return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); if (cadt_flags & IPSET_FLAG_NOMATCH) flags |= (IPSET_FLAG_NOMATCH << 16); } if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_enomatch(ret, flags, adt, set) ? -ret : ip_set_eexist(ret, flags) ? 0 : ret; } port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { e.port = htons(port); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } static struct ip_set_type hash_ipportnet_type __read_mostly = { .name = "hash:ip,port,net", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2 | IPSET_TYPE_NOMATCH, .dimension = IPSET_DIM_THREE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create_flags[IPSET_TYPE_REV_MAX] = IPSET_CREATE_FLAG_BUCKETSIZE, .create = hash_ipportnet_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_INITVAL] = { .type = NLA_U32 }, [IPSET_ATTR_BUCKETSIZE] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, [IPSET_ATTR_IP2_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_CIDR2] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ipportnet_init(void) { return ip_set_type_register(&hash_ipportnet_type); } static void __exit hash_ipportnet_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_ipportnet_type); } module_init(hash_ipportnet_init); module_exit(hash_ipportnet_fini);
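The comment at the top of this type explains that the "nomatch" flag is squeezed into the cidr byte, so cidr == 0 is unsupported and the value is stored internally as cidr - 1 (hash_ipportnet4_data_netmask() stores it that way and the listing path adds the 1 back). The following is a small userspace illustration of that packing convention only; it assumes nothing beyond standard C and is not kernel code.

/* Illustration: prefix lengths 1..128 fit in 7 bits when stored as cidr - 1,
 * leaving one bit in the same byte for the nomatch flag. */
#include <stdio.h>
#include <stdint.h>

struct packed_elem {
	uint8_t cidr:7;		/* stored as real cidr - 1 */
	uint8_t nomatch:1;
};

int main(void)
{
	struct packed_elem e = { .cidr = 24 - 1, .nomatch = 1 };

	/* Unpack the same way the kernel reports the element back out. */
	printf("cidr=%d nomatch=%d\n", e.cidr + 1, e.nomatch);
	return 0;
}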
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org> */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_offload.h> #include <net/netfilter/nf_dup_netdev.h> #include <net/neighbour.h> #include <net/ip.h> struct nft_fwd_netdev { u8 sreg_dev; }; static void nft_fwd_netdev_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_fwd_netdev *priv = nft_expr_priv(expr); int oif = regs->data[priv->sreg_dev]; struct sk_buff *skb = pkt->skb; /* This is used by ifb only.
*/ skb->skb_iif = skb->dev->ifindex; skb_set_redirected(skb, nft_hook(pkt) == NF_NETDEV_INGRESS); nf_fwd_netdev_egress(pkt, oif); regs->verdict.code = NF_STOLEN; } static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = { [NFTA_FWD_SREG_DEV] = { .type = NLA_U32 }, [NFTA_FWD_SREG_ADDR] = { .type = NLA_U32 }, [NFTA_FWD_NFPROTO] = NLA_POLICY_MAX(NLA_BE32, 255), }; static int nft_fwd_netdev_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_fwd_netdev *priv = nft_expr_priv(expr); if (tb[NFTA_FWD_SREG_DEV] == NULL) return -EINVAL; return nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev, sizeof(int)); } static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { struct nft_fwd_netdev *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_FWD_SREG_DEV, priv->sreg_dev)) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int nft_fwd_netdev_offload(struct nft_offload_ctx *ctx, struct nft_flow_rule *flow, const struct nft_expr *expr) { const struct nft_fwd_netdev *priv = nft_expr_priv(expr); int oif = ctx->regs[priv->sreg_dev].data.data[0]; return nft_fwd_dup_netdev_offload(ctx, flow, FLOW_ACTION_REDIRECT, oif); } static bool nft_fwd_netdev_offload_action(const struct nft_expr *expr) { return true; } struct nft_fwd_neigh { u8 sreg_dev; u8 sreg_addr; u8 nfproto; }; static void nft_fwd_neigh_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nft_fwd_neigh *priv = nft_expr_priv(expr); void *addr = &regs->data[priv->sreg_addr]; int oif = regs->data[priv->sreg_dev]; unsigned int verdict = NF_STOLEN; struct sk_buff *skb = pkt->skb; struct net_device *dev; int neigh_table; switch (priv->nfproto) { case NFPROTO_IPV4: { struct iphdr *iph; if (skb->protocol != htons(ETH_P_IP)) { verdict = NFT_BREAK; goto out; } if (skb_try_make_writable(skb, sizeof(*iph))) { verdict = NF_DROP; goto out; } iph = ip_hdr(skb); ip_decrease_ttl(iph); neigh_table = NEIGH_ARP_TABLE; break; } case NFPROTO_IPV6: { struct ipv6hdr *ip6h; if (skb->protocol != htons(ETH_P_IPV6)) { verdict = NFT_BREAK; goto out; } if (skb_try_make_writable(skb, sizeof(*ip6h))) { verdict = NF_DROP; goto out; } ip6h = ipv6_hdr(skb); ip6h->hop_limit--; neigh_table = NEIGH_ND_TABLE; break; } default: verdict = NFT_BREAK; goto out; } dev = dev_get_by_index_rcu(nft_net(pkt), oif); if (dev == NULL) return; skb->dev = dev; skb_clear_tstamp(skb); neigh_xmit(neigh_table, dev, addr, skb); out: regs->verdict.code = verdict; } static int nft_fwd_neigh_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_fwd_neigh *priv = nft_expr_priv(expr); unsigned int addr_len; int err; if (!tb[NFTA_FWD_SREG_DEV] || !tb[NFTA_FWD_SREG_ADDR] || !tb[NFTA_FWD_NFPROTO]) return -EINVAL; priv->nfproto = ntohl(nla_get_be32(tb[NFTA_FWD_NFPROTO])); switch (priv->nfproto) { case NFPROTO_IPV4: addr_len = sizeof(struct in_addr); break; case NFPROTO_IPV6: addr_len = sizeof(struct in6_addr); break; default: return -EOPNOTSUPP; } err = nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_DEV], &priv->sreg_dev, sizeof(int)); if (err < 0) return err; return nft_parse_register_load(ctx, tb[NFTA_FWD_SREG_ADDR], &priv->sreg_addr, addr_len); } static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { struct nft_fwd_neigh *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_FWD_SREG_DEV, priv->sreg_dev) || 
nft_dump_register(skb, NFTA_FWD_SREG_ADDR, priv->sreg_addr) || nla_put_be32(skb, NFTA_FWD_NFPROTO, htonl(priv->nfproto))) goto nla_put_failure; return 0; nla_put_failure: return -1; } static int nft_fwd_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { return nft_chain_validate_hooks(ctx->chain, (1 << NF_NETDEV_INGRESS) | (1 << NF_NETDEV_EGRESS)); } static struct nft_expr_type nft_fwd_netdev_type; static const struct nft_expr_ops nft_fwd_neigh_netdev_ops = { .type = &nft_fwd_netdev_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_fwd_neigh)), .eval = nft_fwd_neigh_eval, .init = nft_fwd_neigh_init, .dump = nft_fwd_neigh_dump, .validate = nft_fwd_validate, .reduce = NFT_REDUCE_READONLY, }; static const struct nft_expr_ops nft_fwd_netdev_ops = { .type = &nft_fwd_netdev_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_fwd_netdev)), .eval = nft_fwd_netdev_eval, .init = nft_fwd_netdev_init, .dump = nft_fwd_netdev_dump, .validate = nft_fwd_validate, .reduce = NFT_REDUCE_READONLY, .offload = nft_fwd_netdev_offload, .offload_action = nft_fwd_netdev_offload_action, }; static const struct nft_expr_ops * nft_fwd_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { if (tb[NFTA_FWD_SREG_ADDR]) return &nft_fwd_neigh_netdev_ops; if (tb[NFTA_FWD_SREG_DEV]) return &nft_fwd_netdev_ops; return ERR_PTR(-EOPNOTSUPP); } static struct nft_expr_type nft_fwd_netdev_type __read_mostly = { .family = NFPROTO_NETDEV, .name = "fwd", .select_ops = nft_fwd_select_ops, .policy = nft_fwd_netdev_policy, .maxattr = NFTA_FWD_MAX, .owner = THIS_MODULE, }; static int __init nft_fwd_netdev_module_init(void) { return nft_register_expr(&nft_fwd_netdev_type); } static void __exit nft_fwd_netdev_module_exit(void) { nft_unregister_expr(&nft_fwd_netdev_type); } module_init(nft_fwd_netdev_module_init); module_exit(nft_fwd_netdev_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); MODULE_DESCRIPTION("nftables netdev packet forwarding support"); MODULE_ALIAS_NFT_AF_EXPR(5, "fwd");
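nft_fwd_neigh_eval() above makes the IPv4 header writable and calls ip_decrease_ttl() before handing the packet to neigh_xmit(). The snippet below is a standalone userspace sketch of what that helper does, namely decrementing the TTL while patching the header checksum incrementally instead of recomputing it; the stub struct and the sample values are illustrative assumptions.

/* Illustration: incremental IPv4 checksum update for a TTL decrement,
 * mirroring the behaviour of the kernel's ip_decrease_ttl() helper. */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

struct iphdr_stub {
	uint8_t  ttl;
	uint16_t check;		/* header checksum, network byte order */
};

static void decrease_ttl(struct iphdr_stub *iph)
{
	/* Subtracting 1 from the TTL byte is compensated by adding
	 * 0x0100 (in network order) to the one's-complement checksum,
	 * folding a possible carry back in. */
	uint32_t check = iph->check;

	check += htons(0x0100);
	iph->check = (uint16_t)(check + (check >= 0xFFFF));
	iph->ttl--;
}

int main(void)
{
	struct iphdr_stub iph = { .ttl = 64, .check = htons(0xb1e6) };

	decrease_ttl(&iph);
	printf("ttl=%u check=0x%04x\n", (unsigned)iph.ttl,
	       (unsigned)ntohs(iph.check));
	return 0;
}

The IPv6 branch of the same eval function has no header checksum to maintain, so it simply decrements hop_limit.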
// SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/errno.h> #include <linux/socket.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/udp.h> #include <linux/types.h> #include <linux/kernel.h> #include <net/genetlink.h> #include <net/gro.h> #include <net/gue.h> #include <net/fou.h> #include <net/ip.h> #include <net/protocol.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <uapi/linux/fou.h> #include <uapi/linux/genetlink.h> #include "fou_nl.h" struct fou { struct socket *sock; u8 protocol; u8 flags; __be16 port; u8 family; u16 type; struct list_head list; struct rcu_head rcu; }; #define FOU_F_REMCSUM_NOPARTIAL BIT(0) struct fou_cfg { u16 type; u8 protocol; u8 flags; struct udp_port_cfg udp_config; }; static unsigned int fou_net_id; struct fou_net { struct list_head fou_list; struct mutex fou_lock; }; static inline struct fou *fou_from_sock(struct sock *sk) { return rcu_dereference_sk_user_data(sk); } static int fou_recv_pull(struct sk_buff *skb, struct fou *fou, size_t len) { /* Remove 'len' bytes from the packet (UDP header and * FOU header if present.
*/ if (fou->family == AF_INET) ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len); else ipv6_hdr(skb)->payload_len = htons(ntohs(ipv6_hdr(skb)->payload_len) - len); __skb_pull(skb, len); skb_postpull_rcsum(skb, udp_hdr(skb), len); skb_reset_transport_header(skb); return iptunnel_pull_offloads(skb); } static int fou_udp_recv(struct sock *sk, struct sk_buff *skb) { struct fou *fou = fou_from_sock(sk); if (!fou) return 1; if (fou_recv_pull(skb, fou, sizeof(struct udphdr))) goto drop; return -fou->protocol; drop: kfree_skb(skb); return 0; } static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr, void *data, size_t hdrlen, u8 ipproto, bool nopartial) { __be16 *pd = data; size_t start = ntohs(pd[0]); size_t offset = ntohs(pd[1]); size_t plen = sizeof(struct udphdr) + hdrlen + max_t(size_t, offset + sizeof(u16), start); if (skb->remcsum_offload) return guehdr; if (!pskb_may_pull(skb, plen)) return NULL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; skb_remcsum_process(skb, (void *)guehdr + hdrlen, start, offset, nopartial); return guehdr; } static int gue_control_message(struct sk_buff *skb, struct guehdr *guehdr) { /* No support yet */ kfree_skb(skb); return 0; } static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) { struct fou *fou = fou_from_sock(sk); size_t len, optlen, hdrlen; struct guehdr *guehdr; void *data; u16 doffset = 0; u8 proto_ctype; if (!fou) return 1; len = sizeof(struct udphdr) + sizeof(struct guehdr); if (!pskb_may_pull(skb, len)) goto drop; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; switch (guehdr->version) { case 0: /* Full GUE header present */ break; case 1: { /* Direct encapsulation of IPv4 or IPv6 */ int prot; switch (((struct iphdr *)guehdr)->version) { case 4: prot = IPPROTO_IPIP; break; case 6: prot = IPPROTO_IPV6; break; default: goto drop; } if (fou_recv_pull(skb, fou, sizeof(struct udphdr))) goto drop; return -prot; } default: /* Undefined version */ goto drop; } optlen = guehdr->hlen << 2; len += optlen; if (!pskb_may_pull(skb, len)) goto drop; /* guehdr may change after pull */ guehdr = (struct guehdr *)&udp_hdr(skb)[1]; if (validate_gue_flags(guehdr, optlen)) goto drop; hdrlen = sizeof(struct guehdr) + optlen; if (fou->family == AF_INET) ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(skb)->tot_len) - len); else ipv6_hdr(skb)->payload_len = htons(ntohs(ipv6_hdr(skb)->payload_len) - len); /* Pull csum through the guehdr now . This can be used if * there is a remote checksum offload. 
*/ skb_postpull_rcsum(skb, udp_hdr(skb), len); data = &guehdr[1]; if (guehdr->flags & GUE_FLAG_PRIV) { __be32 flags = *(__be32 *)(data + doffset); doffset += GUE_LEN_PRIV; if (flags & GUE_PFLAG_REMCSUM) { guehdr = gue_remcsum(skb, guehdr, data + doffset, hdrlen, guehdr->proto_ctype, !!(fou->flags & FOU_F_REMCSUM_NOPARTIAL)); if (!guehdr) goto drop; data = &guehdr[1]; doffset += GUE_PLEN_REMCSUM; } } if (unlikely(guehdr->control)) return gue_control_message(skb, guehdr); proto_ctype = guehdr->proto_ctype; __skb_pull(skb, sizeof(struct udphdr) + hdrlen); skb_reset_transport_header(skb); if (iptunnel_pull_offloads(skb)) goto drop; return -proto_ctype; drop: kfree_skb(skb); return 0; } static struct sk_buff *fou_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { const struct net_offload __rcu **offloads; struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; struct sk_buff *pp = NULL; u8 proto; if (!fou) goto out; proto = fou->protocol; /* We can clear the encap_mark for FOU as we are essentially doing * one of two possible things. We are either adding an L4 tunnel * header to the outer L3 tunnel header, or we are simply * treating the GRE tunnel header as though it is a UDP protocol * specific header such as VXLAN or GENEVE. */ NAPI_GRO_CB(skb)->encap_mark = 0; /* Flag this frame as already having an outer encap header */ NAPI_GRO_CB(skb)->is_fou = 1; offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); if (!ops || !ops->callbacks.gro_receive) goto out; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); out: return pp; } static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { const struct net_offload __rcu **offloads; struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; u8 proto; int err; if (!fou) { err = -ENOENT; goto out; } proto = fou->protocol; offloads = NAPI_GRO_CB(skb)->is_ipv6 ? 
inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) { err = -ENOSYS; goto out; } err = ops->callbacks.gro_complete(skb, nhoff); skb_set_inner_mac_header(skb, nhoff); out: return err; } static struct guehdr *gue_gro_remcsum(struct sk_buff *skb, unsigned int off, struct guehdr *guehdr, void *data, size_t hdrlen, struct gro_remcsum *grc, bool nopartial) { __be16 *pd = data; size_t start = ntohs(pd[0]); size_t offset = ntohs(pd[1]); if (skb->remcsum_offload) return guehdr; if (!NAPI_GRO_CB(skb)->csum_valid) return NULL; guehdr = skb_gro_remcsum_process(skb, (void *)guehdr, off, hdrlen, start, offset, grc, nopartial); skb->remcsum_offload = 1; return guehdr; } static struct sk_buff *gue_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { const struct net_offload __rcu **offloads; const struct net_offload *ops; struct sk_buff *pp = NULL; struct sk_buff *p; struct guehdr *guehdr; size_t len, optlen, hdrlen, off; void *data; u16 doffset = 0; int flush = 1; struct fou *fou = fou_from_sock(sk); struct gro_remcsum grc; u8 proto; skb_gro_remcsum_init(&grc); if (!fou) goto out; off = skb_gro_offset(skb); len = off + sizeof(*guehdr); guehdr = skb_gro_header(skb, len, off); if (unlikely(!guehdr)) goto out; switch (guehdr->version) { case 0: break; case 1: switch (((struct iphdr *)guehdr)->version) { case 4: proto = IPPROTO_IPIP; break; case 6: proto = IPPROTO_IPV6; break; default: goto out; } goto next_proto; default: goto out; } optlen = guehdr->hlen << 2; len += optlen; if (!skb_gro_may_pull(skb, len)) { guehdr = skb_gro_header_slow(skb, len, off); if (unlikely(!guehdr)) goto out; } if (unlikely(guehdr->control) || guehdr->version != 0 || validate_gue_flags(guehdr, optlen)) goto out; hdrlen = sizeof(*guehdr) + optlen; /* Adjust NAPI_GRO_CB(skb)->csum to account for guehdr, * this is needed if there is a remote checkcsum offload. */ skb_gro_postpull_rcsum(skb, guehdr, hdrlen); data = &guehdr[1]; if (guehdr->flags & GUE_FLAG_PRIV) { __be32 flags = *(__be32 *)(data + doffset); doffset += GUE_LEN_PRIV; if (flags & GUE_PFLAG_REMCSUM) { guehdr = gue_gro_remcsum(skb, off, guehdr, data + doffset, hdrlen, &grc, !!(fou->flags & FOU_F_REMCSUM_NOPARTIAL)); if (!guehdr) goto out; data = &guehdr[1]; doffset += GUE_PLEN_REMCSUM; } } skb_gro_pull(skb, hdrlen); list_for_each_entry(p, head, list) { const struct guehdr *guehdr2; if (!NAPI_GRO_CB(p)->same_flow) continue; guehdr2 = (struct guehdr *)(p->data + off); /* Compare base GUE header to be equal (covers * hlen, version, proto_ctype, and flags. */ if (guehdr->word != guehdr2->word) { NAPI_GRO_CB(p)->same_flow = 0; continue; } /* Compare optional fields are the same. */ if (guehdr->hlen && memcmp(&guehdr[1], &guehdr2[1], guehdr->hlen << 2)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } proto = guehdr->proto_ctype; next_proto: /* We can clear the encap_mark for GUE as we are essentially doing * one of two possible things. We are either adding an L4 tunnel * header to the outer L3 tunnel header, or we are simply * treating the GRE tunnel header as though it is a UDP protocol * specific header such as VXLAN or GENEVE. */ NAPI_GRO_CB(skb)->encap_mark = 0; /* Flag this frame as already having an outer encap header */ NAPI_GRO_CB(skb)->is_fou = 1; offloads = NAPI_GRO_CB(skb)->is_ipv6 ? 
inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); if (!ops || !ops->callbacks.gro_receive) goto out; pp = call_gro_receive(ops->callbacks.gro_receive, head, skb); flush = 0; out: skb_gro_flush_final_remcsum(skb, pp, flush, &grc); return pp; } static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); const struct net_offload __rcu **offloads; const struct net_offload *ops; unsigned int guehlen = 0; u8 proto; int err = -ENOENT; switch (guehdr->version) { case 0: proto = guehdr->proto_ctype; guehlen = sizeof(*guehdr) + (guehdr->hlen << 2); break; case 1: switch (((struct iphdr *)guehdr)->version) { case 4: proto = IPPROTO_IPIP; break; case 6: proto = IPPROTO_IPV6; break; default: return err; } break; default: return err; } offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; ops = rcu_dereference(offloads[proto]); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out; err = ops->callbacks.gro_complete(skb, nhoff + guehlen); skb_set_inner_mac_header(skb, nhoff + guehlen); out: return err; } static bool fou_cfg_cmp(struct fou *fou, struct fou_cfg *cfg) { struct sock *sk = fou->sock->sk; struct udp_port_cfg *udp_cfg = &cfg->udp_config; if (fou->family != udp_cfg->family || fou->port != udp_cfg->local_udp_port || sk->sk_dport != udp_cfg->peer_udp_port || sk->sk_bound_dev_if != udp_cfg->bind_ifindex) return false; if (fou->family == AF_INET) { if (sk->sk_rcv_saddr != udp_cfg->local_ip.s_addr || sk->sk_daddr != udp_cfg->peer_ip.s_addr) return false; else return true; #if IS_ENABLED(CONFIG_IPV6) } else { if (ipv6_addr_cmp(&sk->sk_v6_rcv_saddr, &udp_cfg->local_ip6) || ipv6_addr_cmp(&sk->sk_v6_daddr, &udp_cfg->peer_ip6)) return false; else return true; #endif } return false; } static int fou_add_to_port_list(struct net *net, struct fou *fou, struct fou_cfg *cfg) { struct fou_net *fn = net_generic(net, fou_net_id); struct fou *fout; mutex_lock(&fn->fou_lock); list_for_each_entry(fout, &fn->fou_list, list) { if (fou_cfg_cmp(fout, cfg)) { mutex_unlock(&fn->fou_lock); return -EALREADY; } } list_add(&fou->list, &fn->fou_list); mutex_unlock(&fn->fou_lock); return 0; } static void fou_release(struct fou *fou) { struct socket *sock = fou->sock; list_del(&fou->list); udp_tunnel_sock_release(sock); kfree_rcu(fou, rcu); } static int fou_create(struct net *net, struct fou_cfg *cfg, struct socket **sockp) { struct socket *sock = NULL; struct fou *fou = NULL; struct sock *sk; struct udp_tunnel_sock_cfg tunnel_cfg; int err; /* Open UDP socket */ err = udp_sock_create(net, &cfg->udp_config, &sock); if (err < 0) goto error; /* Allocate FOU port structure */ fou = kzalloc(sizeof(*fou), GFP_KERNEL); if (!fou) { err = -ENOMEM; goto error; } sk = sock->sk; fou->port = cfg->udp_config.local_udp_port; fou->family = cfg->udp_config.family; fou->flags = cfg->flags; fou->type = cfg->type; fou->sock = sock; memset(&tunnel_cfg, 0, sizeof(tunnel_cfg)); tunnel_cfg.encap_type = 1; tunnel_cfg.sk_user_data = fou; tunnel_cfg.encap_destroy = NULL; /* Initial for fou type */ switch (cfg->type) { case FOU_ENCAP_DIRECT: tunnel_cfg.encap_rcv = fou_udp_recv; tunnel_cfg.gro_receive = fou_gro_receive; tunnel_cfg.gro_complete = fou_gro_complete; fou->protocol = cfg->protocol; break; case FOU_ENCAP_GUE: tunnel_cfg.encap_rcv = gue_udp_recv; tunnel_cfg.gro_receive = gue_gro_receive; tunnel_cfg.gro_complete = gue_gro_complete; break; default: err = -EINVAL; goto error; } setup_udp_tunnel_sock(net, sock, &tunnel_cfg); 
sk->sk_allocation = GFP_ATOMIC; err = fou_add_to_port_list(net, fou, cfg); if (err) goto error; if (sockp) *sockp = sock; return 0; error: kfree(fou); if (sock) udp_tunnel_sock_release(sock); return err; } static int fou_destroy(struct net *net, struct fou_cfg *cfg) { struct fou_net *fn = net_generic(net, fou_net_id); int err = -EINVAL; struct fou *fou; mutex_lock(&fn->fou_lock); list_for_each_entry(fou, &fn->fou_list, list) { if (fou_cfg_cmp(fou, cfg)) { fou_release(fou); err = 0; break; } } mutex_unlock(&fn->fou_lock); return err; } static struct genl_family fou_nl_family; static int parse_nl_config(struct genl_info *info, struct fou_cfg *cfg) { bool has_local = false, has_peer = false; struct nlattr *attr; int ifindex; __be16 port; memset(cfg, 0, sizeof(*cfg)); cfg->udp_config.family = AF_INET; if (info->attrs[FOU_ATTR_AF]) { u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]); switch (family) { case AF_INET: break; case AF_INET6: cfg->udp_config.ipv6_v6only = 1; break; default: return -EAFNOSUPPORT; } cfg->udp_config.family = family; } if (info->attrs[FOU_ATTR_PORT]) { port = nla_get_be16(info->attrs[FOU_ATTR_PORT]); cfg->udp_config.local_udp_port = port; } if (info->attrs[FOU_ATTR_IPPROTO]) cfg->protocol = nla_get_u8(info->attrs[FOU_ATTR_IPPROTO]); if (info->attrs[FOU_ATTR_TYPE]) cfg->type = nla_get_u8(info->attrs[FOU_ATTR_TYPE]); if (info->attrs[FOU_ATTR_REMCSUM_NOPARTIAL]) cfg->flags |= FOU_F_REMCSUM_NOPARTIAL; if (cfg->udp_config.family == AF_INET) { if (info->attrs[FOU_ATTR_LOCAL_V4]) { attr = info->attrs[FOU_ATTR_LOCAL_V4]; cfg->udp_config.local_ip.s_addr = nla_get_in_addr(attr); has_local = true; } if (info->attrs[FOU_ATTR_PEER_V4]) { attr = info->attrs[FOU_ATTR_PEER_V4]; cfg->udp_config.peer_ip.s_addr = nla_get_in_addr(attr); has_peer = true; } #if IS_ENABLED(CONFIG_IPV6) } else { if (info->attrs[FOU_ATTR_LOCAL_V6]) { attr = info->attrs[FOU_ATTR_LOCAL_V6]; cfg->udp_config.local_ip6 = nla_get_in6_addr(attr); has_local = true; } if (info->attrs[FOU_ATTR_PEER_V6]) { attr = info->attrs[FOU_ATTR_PEER_V6]; cfg->udp_config.peer_ip6 = nla_get_in6_addr(attr); has_peer = true; } #endif } if (has_peer) { if (info->attrs[FOU_ATTR_PEER_PORT]) { port = nla_get_be16(info->attrs[FOU_ATTR_PEER_PORT]); cfg->udp_config.peer_udp_port = port; } else { return -EINVAL; } } if (info->attrs[FOU_ATTR_IFINDEX]) { if (!has_local) return -EINVAL; ifindex = nla_get_s32(info->attrs[FOU_ATTR_IFINDEX]); cfg->udp_config.bind_ifindex = ifindex; } return 0; } int fou_nl_add_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct fou_cfg cfg; int err; err = parse_nl_config(info, &cfg); if (err) return err; return fou_create(net, &cfg, NULL); } int fou_nl_del_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct fou_cfg cfg; int err; err = parse_nl_config(info, &cfg); if (err) return err; return fou_destroy(net, &cfg); } static int fou_fill_info(struct fou *fou, struct sk_buff *msg) { struct sock *sk = fou->sock->sk; if (nla_put_u8(msg, FOU_ATTR_AF, fou->sock->sk->sk_family) || nla_put_be16(msg, FOU_ATTR_PORT, fou->port) || nla_put_be16(msg, FOU_ATTR_PEER_PORT, sk->sk_dport) || nla_put_u8(msg, FOU_ATTR_IPPROTO, fou->protocol) || nla_put_u8(msg, FOU_ATTR_TYPE, fou->type) || nla_put_s32(msg, FOU_ATTR_IFINDEX, sk->sk_bound_dev_if)) return -1; if (fou->flags & FOU_F_REMCSUM_NOPARTIAL) if (nla_put_flag(msg, FOU_ATTR_REMCSUM_NOPARTIAL)) return -1; if (fou->sock->sk->sk_family == AF_INET) { if (nla_put_in_addr(msg, FOU_ATTR_LOCAL_V4, 
sk->sk_rcv_saddr)) return -1; if (nla_put_in_addr(msg, FOU_ATTR_PEER_V4, sk->sk_daddr)) return -1; #if IS_ENABLED(CONFIG_IPV6) } else { if (nla_put_in6_addr(msg, FOU_ATTR_LOCAL_V6, &sk->sk_v6_rcv_saddr)) return -1; if (nla_put_in6_addr(msg, FOU_ATTR_PEER_V6, &sk->sk_v6_daddr)) return -1; #endif } return 0; } static int fou_dump_info(struct fou *fou, u32 portid, u32 seq, u32 flags, struct sk_buff *skb, u8 cmd) { void *hdr; hdr = genlmsg_put(skb, portid, seq, &fou_nl_family, flags, cmd); if (!hdr) return -ENOMEM; if (fou_fill_info(fou, skb) < 0) goto nla_put_failure; genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } int fou_nl_get_doit(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct fou_net *fn = net_generic(net, fou_net_id); struct sk_buff *msg; struct fou_cfg cfg; struct fou *fout; __be16 port; u8 family; int ret; ret = parse_nl_config(info, &cfg); if (ret) return ret; port = cfg.udp_config.local_udp_port; if (port == 0) return -EINVAL; family = cfg.udp_config.family; if (family != AF_INET && family != AF_INET6) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; ret = -ESRCH; mutex_lock(&fn->fou_lock); list_for_each_entry(fout, &fn->fou_list, list) { if (fou_cfg_cmp(fout, &cfg)) { ret = fou_dump_info(fout, info->snd_portid, info->snd_seq, 0, msg, info->genlhdr->cmd); break; } } mutex_unlock(&fn->fou_lock); if (ret < 0) goto out_free; return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); return ret; } int fou_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct fou_net *fn = net_generic(net, fou_net_id); struct fou *fout; int idx = 0, ret; mutex_lock(&fn->fou_lock); list_for_each_entry(fout, &fn->fou_list, list) { if (idx++ < cb->args[0]) continue; ret = fou_dump_info(fout, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, skb, FOU_CMD_GET); if (ret) break; } mutex_unlock(&fn->fou_lock); cb->args[0] = idx; return skb->len; } static struct genl_family fou_nl_family __ro_after_init = { .hdrsize = 0, .name = FOU_GENL_NAME, .version = FOU_GENL_VERSION, .maxattr = FOU_ATTR_MAX, .policy = fou_nl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = fou_nl_ops, .n_small_ops = ARRAY_SIZE(fou_nl_ops), .resv_start_op = FOU_CMD_GET + 1, }; size_t fou_encap_hlen(struct ip_tunnel_encap *e) { return sizeof(struct udphdr); } EXPORT_SYMBOL(fou_encap_hlen); size_t gue_encap_hlen(struct ip_tunnel_encap *e) { size_t len; bool need_priv = false; len = sizeof(struct udphdr) + sizeof(struct guehdr); if (e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) { len += GUE_PLEN_REMCSUM; need_priv = true; } len += need_priv ? GUE_LEN_PRIV : 0; return len; } EXPORT_SYMBOL(gue_encap_hlen); int __fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, __be16 *sport, int type) { int err; err = iptunnel_handle_offloads(skb, type); if (err) return err; *sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), skb, 0, 0, false); return 0; } EXPORT_SYMBOL(__fou_build_header); int __gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, __be16 *sport, int type) { struct guehdr *guehdr; size_t hdrlen, optlen = 0; void *data; bool need_priv = false; int err; if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) && skb->ip_summed == CHECKSUM_PARTIAL) { optlen += GUE_PLEN_REMCSUM; type |= SKB_GSO_TUNNEL_REMCSUM; need_priv = true; } optlen += need_priv ? 
GUE_LEN_PRIV : 0; err = iptunnel_handle_offloads(skb, type); if (err) return err; /* Get source port (based on flow hash) before skb_push */ *sport = e->sport ? : udp_flow_src_port(dev_net(skb->dev), skb, 0, 0, false); hdrlen = sizeof(struct guehdr) + optlen; skb_push(skb, hdrlen); guehdr = (struct guehdr *)skb->data; guehdr->control = 0; guehdr->version = 0; guehdr->hlen = optlen >> 2; guehdr->flags = 0; guehdr->proto_ctype = *protocol; data = &guehdr[1]; if (need_priv) { __be32 *flags = data; guehdr->flags |= GUE_FLAG_PRIV; *flags = 0; data += GUE_LEN_PRIV; if (type & SKB_GSO_TUNNEL_REMCSUM) { u16 csum_start = skb_checksum_start_offset(skb); __be16 *pd = data; if (csum_start < hdrlen) return -EINVAL; csum_start -= hdrlen; pd[0] = htons(csum_start); pd[1] = htons(csum_start + skb->csum_offset); if (!skb_is_gso(skb)) { skb->ip_summed = CHECKSUM_NONE; skb->encapsulation = 0; } *flags |= GUE_PFLAG_REMCSUM; data += GUE_PLEN_REMCSUM; } } return 0; } EXPORT_SYMBOL(__gue_build_header); #ifdef CONFIG_NET_FOU_IP_TUNNELS static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e, struct flowi4 *fl4, u8 *protocol, __be16 sport) { struct udphdr *uh; skb_push(skb, sizeof(struct udphdr)); skb_reset_transport_header(skb); uh = udp_hdr(skb); uh->dest = e->dport; uh->source = sport; uh->len = htons(skb->len); udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb, fl4->saddr, fl4->daddr, skb->len); *protocol = IPPROTO_UDP; } static int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; __be16 sport; int err; err = __fou_build_header(skb, e, protocol, &sport, type); if (err) return err; fou_build_udp(skb, e, fl4, protocol, sport); return 0; } static int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e, u8 *protocol, struct flowi4 *fl4) { int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? 
SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL; __be16 sport; int err; err = __gue_build_header(skb, e, protocol, &sport, type); if (err) return err; fou_build_udp(skb, e, fl4, protocol, sport); return 0; } static int gue_err_proto_handler(int proto, struct sk_buff *skb, u32 info) { const struct net_protocol *ipprot = rcu_dereference(inet_protos[proto]); if (ipprot && ipprot->err_handler) { if (!ipprot->err_handler(skb, info)) return 0; } return -ENOENT; } static int gue_err(struct sk_buff *skb, u32 info) { int transport_offset = skb_transport_offset(skb); struct guehdr *guehdr; size_t len, optlen; int ret; len = sizeof(struct udphdr) + sizeof(struct guehdr); if (!pskb_may_pull(skb, transport_offset + len)) return -EINVAL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; switch (guehdr->version) { case 0: /* Full GUE header present */ break; case 1: { /* Direct encapsulation of IPv4 or IPv6 */ skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); switch (((struct iphdr *)guehdr)->version) { case 4: ret = gue_err_proto_handler(IPPROTO_IPIP, skb, info); goto out; #if IS_ENABLED(CONFIG_IPV6) case 6: ret = gue_err_proto_handler(IPPROTO_IPV6, skb, info); goto out; #endif default: ret = -EOPNOTSUPP; goto out; } } default: /* Undefined version */ return -EOPNOTSUPP; } if (guehdr->control) return -ENOENT; optlen = guehdr->hlen << 2; if (!pskb_may_pull(skb, transport_offset + len + optlen)) return -EINVAL; guehdr = (struct guehdr *)&udp_hdr(skb)[1]; if (validate_gue_flags(guehdr, optlen)) return -EINVAL; /* Handling exceptions for direct UDP encapsulation in GUE would lead to * recursion. Besides, this kind of encapsulation can't even be * configured currently. Discard this. */ if (guehdr->proto_ctype == IPPROTO_UDP || guehdr->proto_ctype == IPPROTO_UDPLITE) return -EOPNOTSUPP; skb_set_transport_header(skb, -(int)sizeof(struct icmphdr)); ret = gue_err_proto_handler(guehdr->proto_ctype, skb, info); out: skb_set_transport_header(skb, transport_offset); return ret; } static const struct ip_tunnel_encap_ops fou_iptun_ops = { .encap_hlen = fou_encap_hlen, .build_header = fou_build_header, .err_handler = gue_err, }; static const struct ip_tunnel_encap_ops gue_iptun_ops = { .encap_hlen = gue_encap_hlen, .build_header = gue_build_header, .err_handler = gue_err, }; static int ip_tunnel_encap_add_fou_ops(void) { int ret; ret = ip_tunnel_encap_add_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU); if (ret < 0) { pr_err("can't add fou ops\n"); return ret; } ret = ip_tunnel_encap_add_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE); if (ret < 0) { pr_err("can't add gue ops\n"); ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU); return ret; } return 0; } static void ip_tunnel_encap_del_fou_ops(void) { ip_tunnel_encap_del_ops(&fou_iptun_ops, TUNNEL_ENCAP_FOU); ip_tunnel_encap_del_ops(&gue_iptun_ops, TUNNEL_ENCAP_GUE); } #else static int ip_tunnel_encap_add_fou_ops(void) { return 0; } static void ip_tunnel_encap_del_fou_ops(void) { } #endif static __net_init int fou_init_net(struct net *net) { struct fou_net *fn = net_generic(net, fou_net_id); INIT_LIST_HEAD(&fn->fou_list); mutex_init(&fn->fou_lock); return 0; } static __net_exit void fou_exit_net(struct net *net) { struct fou_net *fn = net_generic(net, fou_net_id); struct fou *fou, *next; /* Close all the FOU sockets */ mutex_lock(&fn->fou_lock); list_for_each_entry_safe(fou, next, &fn->fou_list, list) fou_release(fou); mutex_unlock(&fn->fou_lock); } static struct pernet_operations fou_net_ops = { .init = fou_init_net, .exit = fou_exit_net, .id = &fou_net_id, .size = 
sizeof(struct fou_net), }; static int __init fou_init(void) { int ret; ret = register_pernet_device(&fou_net_ops); if (ret) goto exit; ret = genl_register_family(&fou_nl_family); if (ret < 0) goto unregister; ret = register_fou_bpf(); if (ret < 0) goto kfunc_failed; ret = ip_tunnel_encap_add_fou_ops(); if (ret == 0) return 0; kfunc_failed: genl_unregister_family(&fou_nl_family); unregister: unregister_pernet_device(&fou_net_ops); exit: return ret; } static void __exit fou_fini(void) { ip_tunnel_encap_del_fou_ops(); genl_unregister_family(&fou_nl_family); unregister_pernet_device(&fou_net_ops); } module_init(fou_init); module_exit(fou_fini); MODULE_AUTHOR("Tom Herbert <therbert@google.com>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Foo over UDP");
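
The receive and build paths above are easier to follow with the on-the-wire layout in mind. The sketch below is illustrative only: it mirrors the struct guehdr fields this file dereferences (hlen, control, version, proto_ctype, flags) and assumes the little-endian bitfield case; the authoritative definition is in include/net/gue.h, which this file includes.

/* Illustrative sketch, not part of this file: the 4-byte GUE base header
 * that gue_udp_recv() parses and __gue_build_header() emits.  Bitfield
 * order shown assumes little-endian bitfields.
 */
#include <linux/types.h>

struct gue_base_hdr {
	__u8	hlen:5,		/* optional fields length in 32-bit words (guehdr->hlen << 2 above) */
		control:1,	/* 1 = control message (gue_control_message()), 0 = data */
		version:2;	/* 0 = full GUE header, 1 = direct IPv4/IPv6 encapsulation */
	__u8	proto_ctype;	/* inner IP protocol for data messages */
	__be16	flags;		/* e.g. GUE_FLAG_PRIV: a __be32 private-flags word follows,
				 * which may carry GUE_PFLAG_REMCSUM and its 4-byte option */
};

/* Encapsulation overhead as computed by gue_encap_hlen() above:
 * 8 (UDP) + 4 (GUE base) + 4 (private flags word) + 4 (remcsum option)
 * = 20 bytes with remote checksum offload requested, 12 bytes otherwise.
 */
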
// SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/module.h> #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/smp.h> #include "blk.h" #include "blk-mq.h" static void blk_mq_sysfs_release(struct kobject *kobj) { struct blk_mq_ctxs *ctxs = container_of(kobj, struct blk_mq_ctxs, kobj); free_percpu(ctxs->queue_ctx); kfree(ctxs); } static void blk_mq_ctx_sysfs_release(struct kobject *kobj) { struct blk_mq_ctx *ctx = container_of(kobj, struct blk_mq_ctx, kobj); /* ctx->ctxs won't be released until all ctx are freed */ kobject_put(&ctx->ctxs->kobj); } static void blk_mq_hw_sysfs_release(struct kobject *kobj) { struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); blk_free_flush_queue(hctx->fq); sbitmap_free(&hctx->ctx_map); free_cpumask_var(hctx->cpumask); kfree(hctx->ctxs); kfree(hctx); } struct blk_mq_hw_ctx_sysfs_entry { struct attribute attr; ssize_t (*show)(struct blk_mq_hw_ctx *, char *); }; static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, struct attribute *attr, char *page) { struct blk_mq_hw_ctx_sysfs_entry *entry; struct blk_mq_hw_ctx *hctx; struct request_queue *q; ssize_t res; entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); q = hctx->queue; if (!entry->show) return -EIO; mutex_lock(&q->elevator_lock); res = entry->show(hctx, page); mutex_unlock(&q->elevator_lock); return res; } static ssize_t blk_mq_hw_sysfs_nr_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { return sprintf(page, "%u\n", hctx->tags->nr_tags); } static ssize_t blk_mq_hw_sysfs_nr_reserved_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { return sprintf(page, "%u\n", hctx->tags->nr_reserved_tags); } static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) { const size_t size = PAGE_SIZE - 1; unsigned int i, first = 1; int ret = 0, pos = 0; for_each_cpu(i, hctx->cpumask) { if (first) ret = snprintf(pos + page, size - pos, "%u", i); else ret = snprintf(pos + page, size - pos, ", %u", i); if (ret >= size - pos) break; first = 0; pos += ret; } ret = snprintf(pos + page, size + 1 - pos, "\n"); return pos + ret; } static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = { .attr = {.name = "nr_tags", .mode = 0444 },
.show = blk_mq_hw_sysfs_nr_tags_show, }; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = { .attr = {.name = "nr_reserved_tags", .mode = 0444 }, .show = blk_mq_hw_sysfs_nr_reserved_tags_show, }; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .attr = {.name = "cpu_list", .mode = 0444 }, .show = blk_mq_hw_sysfs_cpus_show, }; static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_nr_tags.attr, &blk_mq_hw_sysfs_nr_reserved_tags.attr, &blk_mq_hw_sysfs_cpus.attr, NULL, }; ATTRIBUTE_GROUPS(default_hw_ctx); static const struct sysfs_ops blk_mq_hw_sysfs_ops = { .show = blk_mq_hw_sysfs_show, }; static const struct kobj_type blk_mq_ktype = { .release = blk_mq_sysfs_release, }; static const struct kobj_type blk_mq_ctx_ktype = { .release = blk_mq_ctx_sysfs_release, }; static const struct kobj_type blk_mq_hw_ktype = { .sysfs_ops = &blk_mq_hw_sysfs_ops, .default_groups = default_hw_ctx_groups, .release = blk_mq_hw_sysfs_release, }; static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) { struct blk_mq_ctx *ctx; int i; if (!hctx->nr_ctx) return; hctx_for_each_ctx(hctx, ctx, i) kobject_del(&ctx->kobj); kobject_del(&hctx->kobj); } static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct blk_mq_ctx *ctx; int i, j, ret; if (!hctx->nr_ctx) return 0; ret = kobject_add(&hctx->kobj, q->mq_kobj, "%u", hctx->queue_num); if (ret) return ret; hctx_for_each_ctx(hctx, ctx, i) { ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); if (ret) goto out; } return 0; out: hctx_for_each_ctx(hctx, ctx, j) { if (j < i) kobject_del(&ctx->kobj); } kobject_del(&hctx->kobj); return ret; } void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) { kobject_init(&hctx->kobj, &blk_mq_hw_ktype); } void blk_mq_sysfs_deinit(struct request_queue *q) { struct blk_mq_ctx *ctx; int cpu; for_each_possible_cpu(cpu) { ctx = per_cpu_ptr(q->queue_ctx, cpu); kobject_put(&ctx->kobj); } kobject_put(q->mq_kobj); } void blk_mq_sysfs_init(struct request_queue *q) { struct blk_mq_ctx *ctx; int cpu; kobject_init(q->mq_kobj, &blk_mq_ktype); for_each_possible_cpu(cpu) { ctx = per_cpu_ptr(q->queue_ctx, cpu); kobject_get(q->mq_kobj); kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); } } int blk_mq_sysfs_register(struct gendisk *disk) { struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; unsigned long i, j; int ret; ret = kobject_add(q->mq_kobj, &disk_to_dev(disk)->kobj, "mq"); if (ret < 0) return ret; kobject_uevent(q->mq_kobj, KOBJ_ADD); mutex_lock(&q->tag_set->tag_list_lock); queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) goto out_unreg; } mutex_unlock(&q->tag_set->tag_list_lock); return 0; out_unreg: queue_for_each_hw_ctx(q, hctx, j) { if (j < i) blk_mq_unregister_hctx(hctx); } mutex_unlock(&q->tag_set->tag_list_lock); kobject_uevent(q->mq_kobj, KOBJ_REMOVE); kobject_del(q->mq_kobj); return ret; } void blk_mq_sysfs_unregister(struct gendisk *disk) { struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; unsigned long i; mutex_lock(&q->tag_set->tag_list_lock); queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); mutex_unlock(&q->tag_set->tag_list_lock); kobject_uevent(q->mq_kobj, KOBJ_REMOVE); kobject_del(q->mq_kobj); } void blk_mq_sysfs_unregister_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; if (!blk_queue_registered(q)) return; queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); } int blk_mq_sysfs_register_hctxs(struct 
request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; int ret = 0; if (!blk_queue_registered(q)) goto out; queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); if (ret) break; } out: return ret; }
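
The attribute entries and kobject names registered above ("mq" under the disk device, one directory per hctx number, and "cpu%u" per software context) result in a small read-only sysfs tree per hardware queue. The sketch below is a hypothetical userspace reader, not part of this file; the disk name "vda" and hctx index 0 are example values only.

/* Illustrative userspace sketch: read the per-hctx attributes that
 * blk_mq_sysfs_register() exposes under /sys/block/<disk>/mq/<hctx>/.
 * The disk name "vda" and hctx index 0 are hypothetical examples.
 */
#include <stdio.h>

static void show(const char *path)
{
	char buf[256];
	FILE *f = fopen(path, "r");

	if (!f)
		return;
	if (fgets(buf, sizeof(buf), f))
		printf("%s: %s", path, buf);	/* value already ends in '\n' */
	fclose(f);
}

int main(void)
{
	show("/sys/block/vda/mq/0/nr_tags");		/* blk_mq_hw_sysfs_nr_tags_show() */
	show("/sys/block/vda/mq/0/nr_reserved_tags");	/* blk_mq_hw_sysfs_nr_reserved_tags_show() */
	show("/sys/block/vda/mq/0/cpu_list");		/* "0, 1, 2, ..." per blk_mq_hw_sysfs_cpus_show() */
	return 0;
}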
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/namei.c
 *
 * Copyright (C) 1991, 1992 Linus Torvalds
 */

/*
 * Some corrections by tytso.
 */

/* [Feb 1997 T. Schoebel-Theuer] Complete rewrite of the pathname
 * lookup logic.
 */
/* [Feb-Apr 2000, AV] Rewrite to the new namespace architecture.
 */

#include <linux/init.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/wordpart.h>
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/sched/mm.h>
#include <linux/fsnotify.h>
#include <linux/personality.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/mount.h>
#include <linux/audit.h>
#include <linux/capability.h>
#include <linux/file.h>
#include <linux/fcntl.h>
#include <linux/device_cgroup.h>
#include <linux/fs_struct.h>
#include <linux/posix_acl.h>
#include <linux/hash.h>
#include <linux/bitops.h>
#include <linux/init_task.h>
#include <linux/uaccess.h>

#include "internal.h"
#include "mount.h"

/* [Feb-1997 T. Schoebel-Theuer]
 * Fundamental changes in the pathname lookup mechanisms (namei)
 * were necessary because of omirr. The reason is that omirr needs
 * to know the _real_ pathname, not the user-supplied one, in case
 * of symlinks (and also when transname replacements occur).
 *
 * The new code replaces the old recursive symlink resolution with
 * an iterative one (in case of non-nested symlink chains). It does
 * this with calls to <fs>_follow_link().
 * As a side effect, dir_namei(), _namei() and follow_link() are now
 * replaced with a single function lookup_dentry() that can handle all
 * the special cases of the former code.
 *
 * With the new dcache, the pathname is stored at each inode, at least as
 * long as the refcount of the inode is positive. As a side effect, the
 * size of the dcache depends on the inode cache and thus is dynamic.
 *
 * [29-Apr-1998 C. Scott Ananian] Updated above description of symlink
 * resolution to correspond with current state of the code.
 *
 * Note that the symlink resolution is not *completely* iterative.
 * There is still a significant amount of tail- and mid- recursion in
 * the algorithm. Also, note that <fs>_readlink() is not used in
 * lookup_dentry(): lookup_dentry() on the result of <fs>_readlink()
 * may return different results than <fs>_follow_link().
Many virtual * filesystems (including /proc) exhibit this behavior. */ /* [24-Feb-97 T. Schoebel-Theuer] Side effects caused by new implementation: * New symlink semantics: when open() is called with flags O_CREAT | O_EXCL * and the name already exists in form of a symlink, try to create the new * name indicated by the symlink. The old code always complained that the * name already exists, due to not following the symlink even if its target * is nonexistent. The new semantics affects also mknod() and link() when * the name is a symlink pointing to a non-existent name. * * I don't know which semantics is the right one, since I have no access * to standards. But I found by trial that HP-UX 9.0 has the full "new" * semantics implemented, while SunOS 4.1.1 and Solaris (SunOS 5.4) have the * "old" one. Personally, I think the new semantics is much more logical. * Note that "ln old new" where "new" is a symlink pointing to a non-existing * file does succeed in both HP-UX and SunOs, but not in Solaris * and in the old Linux semantics. */ /* [16-Dec-97 Kevin Buhr] For security reasons, we change some symlink * semantics. See the comments in "open_namei" and "do_link" below. * * [10-Sep-98 Alan Modra] Another symlink change. */ /* [Feb-Apr 2000 AV] Complete rewrite. Rules for symlinks: * inside the path - always follow. * in the last component in creation/removal/renaming - never follow. * if LOOKUP_FOLLOW passed - follow. * if the pathname has trailing slashes - follow. * otherwise - don't follow. * (applied in that order). * * [Jun 2000 AV] Inconsistent behaviour of open() in case if flags==O_CREAT * restored for 2.4. This is the last surviving part of old 4.2BSD bug. * During the 2.4 we need to fix the userland stuff depending on it - * hopefully we will be able to get rid of that wart in 2.5. So far only * XEmacs seems to be relying on it... */ /* * [Sep 2001 AV] Single-semaphore locking scheme (kudos to David Holland) * implemented. Let's see if raised priority of ->s_vfs_rename_mutex gives * any extra contention... */ /* In order to reduce some races, while at the same time doing additional * checking and hopefully speeding things up, we copy filenames to the * kernel data space before using them.. * * POSIX.1 2.4: an empty pathname is invalid (ENOENT). * PATH_MAX includes the nul terminator --RR. */ #define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) static inline void initname(struct filename *name, const char __user *uptr) { name->uptr = uptr; name->aname = NULL; atomic_set(&name->refcnt, 1); } struct filename * getname_flags(const char __user *filename, int flags) { struct filename *result; char *kname; int len; result = audit_reusename(filename); if (result) return result; result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); /* * First, try to embed the struct filename inside the names_cache * allocation */ kname = (char *)result->iname; result->name = kname; len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX); /* * Handle both empty path and copy failure in one go. */ if (unlikely(len <= 0)) { if (unlikely(len < 0)) { __putname(result); return ERR_PTR(len); } /* The empty path is special. */ if (!(flags & LOOKUP_EMPTY)) { __putname(result); return ERR_PTR(-ENOENT); } } /* * Uh-oh. We have a name that's approaching PATH_MAX. Allocate a * separate struct filename so we can dedicate the entire * names_cache allocation for the pathname, and re-do the copy from * userland. 
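 *
 * (Illustrative sketch, not part of the original file: after this
 * fallback the two possible layouts are roughly
 *
 *	short name:	result->name == result->iname,
 *			one __getname() buffer holds both
 *	long name:	result->name -> the whole __getname() buffer,
 *			result itself is a small, separately
 *			kzalloc()ed struct filename
 *
 * so in either case the pathname and its struct filename need at most
 * one names_cache allocation plus one small object.)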
*/ if (unlikely(len == EMBEDDED_NAME_MAX)) { const size_t size = offsetof(struct filename, iname[1]); kname = (char *)result; /* * size is chosen that way we to guarantee that * result->iname[0] is within the same object and that * kname can't be equal to result->iname, no matter what. */ result = kzalloc(size, GFP_KERNEL); if (unlikely(!result)) { __putname(kname); return ERR_PTR(-ENOMEM); } result->name = kname; len = strncpy_from_user(kname, filename, PATH_MAX); if (unlikely(len < 0)) { __putname(kname); kfree(result); return ERR_PTR(len); } /* The empty path is special. */ if (unlikely(!len) && !(flags & LOOKUP_EMPTY)) { __putname(kname); kfree(result); return ERR_PTR(-ENOENT); } if (unlikely(len == PATH_MAX)) { __putname(kname); kfree(result); return ERR_PTR(-ENAMETOOLONG); } } initname(result, filename); audit_getname(result); return result; } struct filename *getname_uflags(const char __user *filename, int uflags) { int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; return getname_flags(filename, flags); } struct filename *__getname_maybe_null(const char __user *pathname) { struct filename *name; char c; /* try to save on allocations; loss on um, though */ if (get_user(c, pathname)) return ERR_PTR(-EFAULT); if (!c) return NULL; name = getname_flags(pathname, LOOKUP_EMPTY); if (!IS_ERR(name) && !(name->name[0])) { putname(name); name = NULL; } return name; } struct filename *getname_kernel(const char * filename) { struct filename *result; int len = strlen(filename) + 1; result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); if (len <= EMBEDDED_NAME_MAX) { result->name = (char *)result->iname; } else if (len <= PATH_MAX) { const size_t size = offsetof(struct filename, iname[1]); struct filename *tmp; tmp = kmalloc(size, GFP_KERNEL); if (unlikely(!tmp)) { __putname(result); return ERR_PTR(-ENOMEM); } tmp->name = (char *)result; result = tmp; } else { __putname(result); return ERR_PTR(-ENAMETOOLONG); } memcpy((char *)result->name, filename, len); initname(result, NULL); audit_getname(result); return result; } EXPORT_SYMBOL(getname_kernel); void putname(struct filename *name) { int refcnt; if (IS_ERR_OR_NULL(name)) return; refcnt = atomic_read(&name->refcnt); if (refcnt != 1) { if (WARN_ON_ONCE(!refcnt)) return; if (!atomic_dec_and_test(&name->refcnt)) return; } if (name->name != name->iname) { __putname(name->name); kfree(name); } else __putname(name); } EXPORT_SYMBOL(putname); /** * check_acl - perform ACL permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * This function performs the ACL permission checking. Since this function * retrieve POSIX acls it needs to know whether it is called from a blocking or * non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ static int check_acl(struct mnt_idmap *idmap, struct inode *inode, int mask) { #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *acl; if (mask & MAY_NOT_BLOCK) { acl = get_cached_acl_rcu(inode, ACL_TYPE_ACCESS); if (!acl) return -EAGAIN; /* no ->get_inode_acl() calls in RCU mode... 
*/ if (is_uncached_acl(acl)) return -ECHILD; return posix_acl_permission(idmap, inode, acl, mask); } acl = get_inode_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { int error = posix_acl_permission(idmap, inode, acl, mask); posix_acl_release(acl); return error; } #endif return -EAGAIN; } /* * Very quick optimistic "we know we have no ACL's" check. * * Note that this is purely for ACL_TYPE_ACCESS, and purely * for the "we have cached that there are no ACLs" case. * * If this returns true, we know there are no ACLs. But if * it returns false, we might still not have ACLs (it could * be the is_uncached_acl() case). */ static inline bool no_acl_inode(struct inode *inode) { #ifdef CONFIG_FS_POSIX_ACL return likely(!READ_ONCE(inode->i_acl)); #else return true; #endif } /** * acl_permission_check - perform basic UNIX permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * This function performs the basic UNIX permission checking. Since this * function may retrieve POSIX acls it needs to know whether it is called from a * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ static int acl_permission_check(struct mnt_idmap *idmap, struct inode *inode, int mask) { unsigned int mode = inode->i_mode; vfsuid_t vfsuid; /* * Common cheap case: everybody has the requested * rights, and there are no ACLs to check. No need * to do any owner/group checks in that case. * * - 'mask&7' is the requested permission bit set * - multiplying by 0111 spreads them out to all of ugo * - '& ~mode' looks for missing inode permission bits * - the '!' is for "no missing permissions" * * After that, we just need to check that there are no * ACL's on the inode - do the 'IS_POSIXACL()' check last * because it will dereference the ->i_sb pointer and we * want to avoid that if at all possible. */ if (!((mask & 7) * 0111 & ~mode)) { if (no_acl_inode(inode)) return 0; if (!IS_POSIXACL(inode)) return 0; } /* Are we the owner? If so, ACL's don't matter */ vfsuid = i_uid_into_vfsuid(idmap, inode); if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) { mask &= 7; mode >>= 6; return (mask & ~mode) ? -EACCES : 0; } /* Do we have ACL's? */ if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { int error = check_acl(idmap, inode, mask); if (error != -EAGAIN) return error; } /* Only RWX matters for group/other mode bits */ mask &= 7; /* * Are the group permissions different from * the other permissions in the bits we care * about? Need to check group ownership if so. */ if (mask & (mode ^ (mode >> 3))) { vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); if (vfsgid_in_group_p(vfsgid)) mode >>= 3; } /* Bits in 'mode' clear that we require? */ return (mask & ~mode) ? -EACCES : 0; } /** * generic_permission - check for access rights on a Posix-like filesystem * @idmap: idmap of the mount the inode was found from * @inode: inode to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, * %MAY_NOT_BLOCK ...) * * Used to check for read/write/execute permissions on a file. 
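 *
 * (Worked example of the cheap-case test in acl_permission_check() above:
 * with mask = MAY_READ (04) and mode = 0644, (mask & 7) * 0111 = 0444 and
 * 0444 & ~0644 = 0, so every class already has read permission and, if
 * the inode has no ACLs, the owner/group checks are skipped entirely.
 * With mask = MAY_WRITE (02) and the same mode, 0222 & ~0644 = 0022 != 0,
 * so the function falls through to the owner, ACL and group checks.)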
* We use "fsuid" for this, letting us set arbitrary permissions * for filesystem access without changing the "normal" uids which * are used for other things. * * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk * request cannot be satisfied (eg. requires blocking or too much complexity). * It would then be called again in ref-walk mode. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int generic_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int ret; /* * Do the basic permission checks. */ ret = acl_permission_check(idmap, inode, mask); if (ret != -EACCES) return ret; if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ if (!(mask & MAY_WRITE)) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; } /* * Searching includes executable on directories, else just read. */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_READ_SEARCH)) return 0; /* * Read/write DACs are always overridable. * Executable DACs are overridable when there is * at least one exec bit set. */ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) if (capable_wrt_inode_uidgid(idmap, inode, CAP_DAC_OVERRIDE)) return 0; return -EACCES; } EXPORT_SYMBOL(generic_permission); /** * do_inode_permission - UNIX permission checking * @idmap: idmap of the mount the inode was found from * @inode: inode to check permissions on * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) * * We _really_ want to just do "generic_permission()" without * even looking at the inode->i_op values. So we keep a cache * flag in inode->i_opflags, that says "this has not special * permission function, use the fast case". */ static inline int do_inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { if (likely(inode->i_op->permission)) return inode->i_op->permission(idmap, inode, mask); /* This gets set once for the inode lifetime */ spin_lock(&inode->i_lock); inode->i_opflags |= IOP_FASTPERM; spin_unlock(&inode->i_lock); } return generic_permission(idmap, inode, mask); } /** * sb_permission - Check superblock-level permissions * @sb: Superblock of inode to check permission on * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Separate out file-system wide checks from inode-specific permission checks. */ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) { if (unlikely(mask & MAY_WRITE)) { umode_t mode = inode->i_mode; /* Nobody gets write access to a read-only fs. */ if (sb_rdonly(sb) && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) return -EROFS; } return 0; } /** * inode_permission - Check for access rights to a given inode * @idmap: idmap of the mount the inode was found from * @inode: Inode to check permission on * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Check for read/write/execute permissions on an inode. 
We use fs[ug]id for * this, letting us set arbitrary permissions for filesystem access without * changing the "normal" UIDs which are used for other things. * * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. */ int inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { int retval; retval = sb_permission(inode->i_sb, inode, mask); if (unlikely(retval)) return retval; if (unlikely(mask & MAY_WRITE)) { /* * Nobody gets write access to an immutable file. */ if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; /* * Updating mtime will likely cause i_uid and i_gid to be * written back improperly if their true value is unknown * to the vfs. */ if (unlikely(HAS_UNMAPPED_ID(idmap, inode))) return -EACCES; } retval = do_inode_permission(idmap, inode, mask); if (unlikely(retval)) return retval; retval = devcgroup_inode_permission(inode, mask); if (unlikely(retval)) return retval; return security_inode_permission(inode, mask); } EXPORT_SYMBOL(inode_permission); /** * path_get - get a reference to a path * @path: path to get the reference to * * Given a path increment the reference count to the dentry and the vfsmount. */ void path_get(const struct path *path) { mntget(path->mnt); dget(path->dentry); } EXPORT_SYMBOL(path_get); /** * path_put - put a reference to a path * @path: path to put the reference to * * Given a path decrement the reference count to the dentry and the vfsmount. */ void path_put(const struct path *path) { dput(path->dentry); mntput(path->mnt); } EXPORT_SYMBOL(path_put); #define EMBEDDED_LEVELS 2 struct nameidata { struct path path; struct qstr last; struct path root; struct inode *inode; /* path.dentry.d_inode */ unsigned int flags, state; unsigned seq, next_seq, m_seq, r_seq; int last_type; unsigned depth; int total_link_count; struct saved { struct path link; struct delayed_call done; const char *name; unsigned seq; } *stack, internal[EMBEDDED_LEVELS]; struct filename *name; const char *pathname; struct nameidata *saved; unsigned root_seq; int dfd; vfsuid_t dir_vfsuid; umode_t dir_mode; } __randomize_layout; #define ND_ROOT_PRESET 1 #define ND_ROOT_GRABBED 2 #define ND_JUMPED 4 static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name) { struct nameidata *old = current->nameidata; p->stack = p->internal; p->depth = 0; p->dfd = dfd; p->name = name; p->pathname = likely(name) ? name->name : ""; p->path.mnt = NULL; p->path.dentry = NULL; p->total_link_count = old ? old->total_link_count : 0; p->saved = old; current->nameidata = p; } static inline void set_nameidata(struct nameidata *p, int dfd, struct filename *name, const struct path *root) { __set_nameidata(p, dfd, name); p->state = 0; if (unlikely(root)) { p->state = ND_ROOT_PRESET; p->root = *root; } } static void restore_nameidata(void) { struct nameidata *now = current->nameidata, *old = now->saved; current->nameidata = old; if (old) old->total_link_count = now->total_link_count; if (now->stack != now->internal) kfree(now->stack); } static bool nd_alloc_stack(struct nameidata *nd) { struct saved *p; p= kmalloc_array(MAXSYMLINKS, sizeof(struct saved), nd->flags & LOOKUP_RCU ? GFP_ATOMIC : GFP_KERNEL); if (unlikely(!p)) return false; memcpy(p, nd->internal, sizeof(nd->internal)); nd->stack = p; return true; } /** * path_connected - Verify that a dentry is below mnt.mnt_root * @mnt: The mountpoint to check. * @dentry: The dentry to check. * * Rename can sometimes move a file or directory outside of a bind * mount, path_connected allows those cases to be detected. 
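 *
 * (Sketch of how the path_get()/path_put() helpers above are meant to be
 * paired by a caller; "p" is a hypothetical struct path the caller owns:
 *
 *	path_get(&p);		// pin p.mnt and p.dentry
 *	...use p.dentry / p.mnt...
 *	path_put(&p);		// drop both references again
 *
 * path_connected() below is independent of that refcounting; it only
 * answers whether @dentry is still reachable below @mnt's root.)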
*/ static bool path_connected(struct vfsmount *mnt, struct dentry *dentry) { struct super_block *sb = mnt->mnt_sb; /* Bind mounts can have disconnected paths */ if (mnt->mnt_root == sb->s_root) return true; return is_subdir(dentry, mnt->mnt_root); } static void drop_links(struct nameidata *nd) { int i = nd->depth; while (i--) { struct saved *last = nd->stack + i; do_delayed_call(&last->done); clear_delayed_call(&last->done); } } static void leave_rcu(struct nameidata *nd) { nd->flags &= ~LOOKUP_RCU; nd->seq = nd->next_seq = 0; rcu_read_unlock(); } static void terminate_walk(struct nameidata *nd) { drop_links(nd); if (!(nd->flags & LOOKUP_RCU)) { int i; path_put(&nd->path); for (i = 0; i < nd->depth; i++) path_put(&nd->stack[i].link); if (nd->state & ND_ROOT_GRABBED) { path_put(&nd->root); nd->state &= ~ND_ROOT_GRABBED; } } else { leave_rcu(nd); } nd->depth = 0; nd->path.mnt = NULL; nd->path.dentry = NULL; } /* path_put is needed afterwards regardless of success or failure */ static bool __legitimize_path(struct path *path, unsigned seq, unsigned mseq) { int res = __legitimize_mnt(path->mnt, mseq); if (unlikely(res)) { if (res > 0) path->mnt = NULL; path->dentry = NULL; return false; } if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) { path->dentry = NULL; return false; } return !read_seqcount_retry(&path->dentry->d_seq, seq); } static inline bool legitimize_path(struct nameidata *nd, struct path *path, unsigned seq) { return __legitimize_path(path, seq, nd->m_seq); } static bool legitimize_links(struct nameidata *nd) { int i; if (unlikely(nd->flags & LOOKUP_CACHED)) { drop_links(nd); nd->depth = 0; return false; } for (i = 0; i < nd->depth; i++) { struct saved *last = nd->stack + i; if (unlikely(!legitimize_path(nd, &last->link, last->seq))) { drop_links(nd); nd->depth = i + 1; return false; } } return true; } static bool legitimize_root(struct nameidata *nd) { /* Nothing to do if nd->root is zero or is managed by the VFS user. */ if (!nd->root.mnt || (nd->state & ND_ROOT_PRESET)) return true; nd->state |= ND_ROOT_GRABBED; return legitimize_path(nd, &nd->root, nd->root_seq); } /* * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab * normal reference counts on dentries and vfsmounts to transition to ref-walk * mode. Refcounts are grabbed at the last known good point before rcu-walk * got stuck, so ref-walk may continue from there. If this is not successful * (eg. a seqcount has changed), then failure is returned and it's up to caller * to restart the path walk from the beginning in ref-walk mode. */ /** * try_to_unlazy - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * Returns: true on success, false on failure * * try_to_unlazy attempts to legitimize the current nd->path and nd->root * for ref-walk mode. * Must be called from rcu-walk context. * Nothing should touch nameidata between try_to_unlazy() failure and * terminate_walk(). 
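 *
 * (A typical caller in this file therefore follows the sketch below: try
 * to drop to ref-walk, and fail with -ECHILD, forcing a restart of the
 * whole walk in ref-walk mode, if even that does not work:
 *
 *	if (nd->flags & LOOKUP_RCU) {
 *		if (!try_to_unlazy(nd))
 *			return -ECHILD;
 *	}
 *
 * After that, nd->path and nd->root hold normal, counted references.)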
*/ static bool try_to_unlazy(struct nameidata *nd) { struct dentry *parent = nd->path.dentry; BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(!legitimize_links(nd))) goto out1; if (unlikely(!legitimize_path(nd, &nd->path, nd->seq))) goto out; if (unlikely(!legitimize_root(nd))) goto out; leave_rcu(nd); BUG_ON(nd->inode != parent->d_inode); return true; out1: nd->path.mnt = NULL; nd->path.dentry = NULL; out: leave_rcu(nd); return false; } /** * try_to_unlazy_next - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * @dentry: next dentry to step into * Returns: true on success, false on failure * * Similar to try_to_unlazy(), but here we have the next dentry already * picked by rcu-walk and want to legitimize that in addition to the current * nd->path and nd->root for ref-walk mode. Must be called from rcu-walk context. * Nothing should touch nameidata between try_to_unlazy_next() failure and * terminate_walk(). */ static bool try_to_unlazy_next(struct nameidata *nd, struct dentry *dentry) { int res; BUG_ON(!(nd->flags & LOOKUP_RCU)); if (unlikely(!legitimize_links(nd))) goto out2; res = __legitimize_mnt(nd->path.mnt, nd->m_seq); if (unlikely(res)) { if (res > 0) goto out2; goto out1; } if (unlikely(!lockref_get_not_dead(&nd->path.dentry->d_lockref))) goto out1; /* * We need to move both the parent and the dentry from the RCU domain * to be properly refcounted. And the sequence number in the dentry * validates *both* dentry counters, since we checked the sequence * number of the parent after we got the child sequence number. So we * know the parent must still be valid if the child sequence number is */ if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) goto out; if (read_seqcount_retry(&dentry->d_seq, nd->next_seq)) goto out_dput; /* * Sequence counts matched. Now make sure that the root is * still valid and get it if required. */ if (unlikely(!legitimize_root(nd))) goto out_dput; leave_rcu(nd); return true; out2: nd->path.mnt = NULL; out1: nd->path.dentry = NULL; out: leave_rcu(nd); return false; out_dput: leave_rcu(nd); dput(dentry); return false; } static inline int d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) return dentry->d_op->d_revalidate(dir, name, dentry, flags); else return 1; } /** * complete_walk - successful completion of path walk * @nd: pointer nameidata * * If we had been in RCU mode, drop out of it and legitimize nd->path. * Revalidate the final result, unless we'd already done that during * the path walk or the filesystem doesn't ask for it. Return 0 on * success, -error on failure. In case of failure caller does not * need to drop nd->path. */ static int complete_walk(struct nameidata *nd) { struct dentry *dentry = nd->path.dentry; int status; if (nd->flags & LOOKUP_RCU) { /* * We don't want to zero nd->root for scoped-lookups or * externally-managed nd->root. */ if (!(nd->state & ND_ROOT_PRESET)) if (!(nd->flags & LOOKUP_IS_SCOPED)) nd->root.mnt = NULL; nd->flags &= ~LOOKUP_CACHED; if (!try_to_unlazy(nd)) return -ECHILD; } if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { /* * While the guarantee of LOOKUP_IS_SCOPED is (roughly) "don't * ever step outside the root during lookup" and should already * be guaranteed by the rest of namei, we want to avoid a namei * BUG resulting in userspace being given a path that was not * scoped within the root at some point during the lookup. 
* * So, do a final sanity-check to make sure that in the * worst-case scenario (a complete bypass of LOOKUP_IS_SCOPED) * we won't silently return an fd completely outside of the * requested root to userspace. * * Userspace could move the path outside the root after this * check, but as discussed elsewhere this is not a concern (the * resolved file was inside the root at some point). */ if (!path_is_under(&nd->path, &nd->root)) return -EXDEV; } if (likely(!(nd->state & ND_JUMPED))) return 0; if (likely(!(dentry->d_flags & DCACHE_OP_WEAK_REVALIDATE))) return 0; status = dentry->d_op->d_weak_revalidate(dentry, nd->flags); if (status > 0) return 0; if (!status) status = -ESTALE; return status; } static int set_root(struct nameidata *nd) { struct fs_struct *fs = current->fs; /* * Jumping to the real root in a scoped-lookup is a BUG in namei, but we * still have to ensure it doesn't happen because it will cause a breakout * from the dirfd. */ if (WARN_ON(nd->flags & LOOKUP_IS_SCOPED)) return -ENOTRECOVERABLE; if (nd->flags & LOOKUP_RCU) { unsigned seq; do { seq = read_seqbegin(&fs->seq); nd->root = fs->root; nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); } while (read_seqretry(&fs->seq, seq)); } else { get_fs_root(fs, &nd->root); nd->state |= ND_ROOT_GRABBED; } return 0; } static int nd_jump_root(struct nameidata *nd) { if (unlikely(nd->flags & LOOKUP_BENEATH)) return -EXDEV; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { /* Absolute path arguments to path_init() are allowed. */ if (nd->path.mnt != NULL && nd->path.mnt != nd->root.mnt) return -EXDEV; } if (!nd->root.mnt) { int error = set_root(nd); if (error) return error; } if (nd->flags & LOOKUP_RCU) { struct dentry *d; nd->path = nd->root; d = nd->path.dentry; nd->inode = d->d_inode; nd->seq = nd->root_seq; if (read_seqcount_retry(&d->d_seq, nd->seq)) return -ECHILD; } else { path_put(&nd->path); nd->path = nd->root; path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } nd->state |= ND_JUMPED; return 0; } /* * Helper to directly jump to a known parsed path from ->get_link, * caller must have taken a reference to path beforehand. */ int nd_jump_link(const struct path *path) { int error = -ELOOP; struct nameidata *nd = current->nameidata; if (unlikely(nd->flags & LOOKUP_NO_MAGICLINKS)) goto err; error = -EXDEV; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) { if (nd->path.mnt != path->mnt) goto err; } /* Not currently safe for scoped-lookups. 
*/ if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) goto err; path_put(&nd->path); nd->path = *path; nd->inode = nd->path.dentry->d_inode; nd->state |= ND_JUMPED; return 0; err: path_put(path); return error; } static inline void put_link(struct nameidata *nd) { struct saved *last = nd->stack + --nd->depth; do_delayed_call(&last->done); if (!(nd->flags & LOOKUP_RCU)) path_put(&last->link); } static int sysctl_protected_symlinks __read_mostly; static int sysctl_protected_hardlinks __read_mostly; static int sysctl_protected_fifos __read_mostly; static int sysctl_protected_regular __read_mostly; #ifdef CONFIG_SYSCTL static const struct ctl_table namei_sysctls[] = { { .procname = "protected_symlinks", .data = &sysctl_protected_symlinks, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "protected_hardlinks", .data = &sysctl_protected_hardlinks, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "protected_fifos", .data = &sysctl_protected_fifos, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "protected_regular", .data = &sysctl_protected_regular, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, }; static int __init init_fs_namei_sysctls(void) { register_sysctl_init("fs", namei_sysctls); return 0; } fs_initcall(init_fs_namei_sysctls); #endif /* CONFIG_SYSCTL */ /** * may_follow_link - Check symlink following for unsafe situations * @nd: nameidata pathwalk data * @inode: Used for idmapping. * * In the case of the sysctl_protected_symlinks sysctl being enabled, * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is * in a sticky world-writable directory. This is to protect privileged * processes from failing races against path names that may change out * from under them by way of other users creating malicious symlinks. * It will permit symlinks to be followed only when outside a sticky * world-writable directory, or when the uid of the symlink and follower * match, or when the directory owner matches the symlink's owner. * * Returns 0 if following the symlink is allowed, -ve on error. */ static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { struct mnt_idmap *idmap; vfsuid_t vfsuid; if (!sysctl_protected_symlinks) return 0; idmap = mnt_idmap(nd->path.mnt); vfsuid = i_uid_into_vfsuid(idmap, inode); /* Allowed if owner and follower match. */ if (vfsuid_eq_kuid(vfsuid, current_fsuid())) return 0; /* Allowed if parent directory not sticky and world-writable. */ if ((nd->dir_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) return 0; /* Allowed if parent directory and link owner match. */ if (vfsuid_valid(nd->dir_vfsuid) && vfsuid_eq(nd->dir_vfsuid, vfsuid)) return 0; if (nd->flags & LOOKUP_RCU) return -ECHILD; audit_inode(nd->name, nd->stack[0].link.dentry, 0); audit_log_path_denied(AUDIT_ANOM_LINK, "follow_link"); return -EACCES; } /** * safe_hardlink_source - Check for safe hardlink conditions * @idmap: idmap of the mount the inode was found from * @inode: the source inode to hardlink from * * Return false if at least one of the following conditions: * - inode is not a regular file * - inode is setuid * - inode is setgid and group-exec * - access failure for read and write * * Otherwise returns true. 
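 *
 * (Worked scenario for may_follow_link() above, assuming the
 * fs.protected_symlinks sysctl is 1: for a symlink created by user A in
 * /tmp (mode 01777, owned by root), following the link as A succeeds
 * because owner and follower match, while following it as any other user,
 * including root, since CAP_DAC_OVERRIDE is deliberately not consulted
 * here, fails with -EACCES: /tmp is sticky and world-writable, and its
 * owner does not match the link's owner.)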
*/ static bool safe_hardlink_source(struct mnt_idmap *idmap, struct inode *inode) { umode_t mode = inode->i_mode; /* Special files should not get pinned to the filesystem. */ if (!S_ISREG(mode)) return false; /* Setuid files should not get pinned to the filesystem. */ if (mode & S_ISUID) return false; /* Executable setgid files should not get pinned to the filesystem. */ if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) return false; /* Hardlinking to unreadable or unwritable sources is dangerous. */ if (inode_permission(idmap, inode, MAY_READ | MAY_WRITE)) return false; return true; } /** * may_linkat - Check permissions for creating a hardlink * @idmap: idmap of the mount the inode was found from * @link: the source to hardlink from * * Block hardlink when all of: * - sysctl_protected_hardlinks enabled * - fsuid does not match inode * - hardlink source is unsafe (see safe_hardlink_source() above) * - not CAP_FOWNER in a namespace with the inode owner uid mapped * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if successful, -ve on error. */ int may_linkat(struct mnt_idmap *idmap, const struct path *link) { struct inode *inode = link->dentry->d_inode; /* Inode writeback is not safe when the uid or gid are invalid. */ if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) || !vfsgid_valid(i_gid_into_vfsgid(idmap, inode))) return -EOVERFLOW; if (!sysctl_protected_hardlinks) return 0; /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ if (safe_hardlink_source(idmap, inode) || inode_owner_or_capable(idmap, inode)) return 0; audit_log_path_denied(AUDIT_ANOM_LINK, "linkat"); return -EPERM; } /** * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory * should be allowed, or not, on files that already * exist. * @idmap: idmap of the mount the inode was found from * @nd: nameidata pathwalk data * @inode: the inode of the file to open * * Block an O_CREAT open of a FIFO (or a regular file) when: * - sysctl_protected_fifos (or sysctl_protected_regular) is enabled * - the file already exists * - we are in a sticky directory * - we don't own the file * - the owner of the directory doesn't own the file * - the directory is world writable * If the sysctl_protected_fifos (or sysctl_protected_regular) is set to 2 * the directory doesn't have to be world writable: being group writable will * be enough. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * Returns 0 if the open is allowed, -ve on error. 
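 *
 * (Illustrative scenario, assuming the fs.protected_regular sysctl is 1:
 * with /tmp mode 01777 owned by root and /tmp/victim an existing regular
 * file owned by somebody else, an unprivileged
 *
 *	fd = open("/tmp/victim", O_CREAT | O_WRONLY, 0600);
 *
 * fails here with -EACCES, whereas a plain open("/tmp/victim", O_WRONLY)
 * is unaffected, because this check is only reached when O_CREAT was
 * requested and the file turned out to already exist.)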
*/ static int may_create_in_sticky(struct mnt_idmap *idmap, struct nameidata *nd, struct inode *const inode) { umode_t dir_mode = nd->dir_mode; vfsuid_t dir_vfsuid = nd->dir_vfsuid, i_vfsuid; if (likely(!(dir_mode & S_ISVTX))) return 0; if (S_ISREG(inode->i_mode) && !sysctl_protected_regular) return 0; if (S_ISFIFO(inode->i_mode) && !sysctl_protected_fifos) return 0; i_vfsuid = i_uid_into_vfsuid(idmap, inode); if (vfsuid_eq(i_vfsuid, dir_vfsuid)) return 0; if (vfsuid_eq_kuid(i_vfsuid, current_fsuid())) return 0; if (likely(dir_mode & 0002)) { audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create"); return -EACCES; } if (dir_mode & 0020) { if (sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) { audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create_fifo"); return -EACCES; } if (sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode)) { audit_log_path_denied(AUDIT_ANOM_CREAT, "sticky_create_regular"); return -EACCES; } } return 0; } /* * follow_up - Find the mountpoint of path's vfsmount * * Given a path, find the mountpoint of its source file system. * Replace @path with the path of the mountpoint in the parent mount. * Up is towards /. * * Return 1 if we went up a level and 0 if we were already at the * root. */ int follow_up(struct path *path) { struct mount *mnt = real_mount(path->mnt); struct mount *parent; struct dentry *mountpoint; read_seqlock_excl(&mount_lock); parent = mnt->mnt_parent; if (parent == mnt) { read_sequnlock_excl(&mount_lock); return 0; } mntget(&parent->mnt); mountpoint = dget(mnt->mnt_mountpoint); read_sequnlock_excl(&mount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); path->mnt = &parent->mnt; return 1; } EXPORT_SYMBOL(follow_up); static bool choose_mountpoint_rcu(struct mount *m, const struct path *root, struct path *path, unsigned *seqp) { while (mnt_has_parent(m)) { struct dentry *mountpoint = m->mnt_mountpoint; m = m->mnt_parent; if (unlikely(root->dentry == mountpoint && root->mnt == &m->mnt)) break; if (mountpoint != m->mnt.mnt_root) { path->mnt = &m->mnt; path->dentry = mountpoint; *seqp = read_seqcount_begin(&mountpoint->d_seq); return true; } } return false; } static bool choose_mountpoint(struct mount *m, const struct path *root, struct path *path) { bool found; rcu_read_lock(); while (1) { unsigned seq, mseq = read_seqbegin(&mount_lock); found = choose_mountpoint_rcu(m, root, path, &seq); if (unlikely(!found)) { if (!read_seqretry(&mount_lock, mseq)) break; } else { if (likely(__legitimize_path(path, seq, mseq))) break; rcu_read_unlock(); path_put(path); rcu_read_lock(); } } rcu_read_unlock(); return found; } /* * Perform an automount * - return -EISDIR to tell follow_managed() to stop and return the path we * were called with. */ static int follow_automount(struct path *path, int *count, unsigned lookup_flags) { struct dentry *dentry = path->dentry; /* We don't want to mount if someone's just doing a stat - * unless they're stat'ing a directory and appended a '/' to * the name. * * We do, however, want to mount if someone wants to open or * create a file of any type under the mountpoint, wants to * traverse through the mountpoint or wants to open the * mounted directory. Also, autofs may mark negative dentries * as being automount points. These will need the attentions * of the daemon to instantiate them before they can be used. 
*/ if (!(lookup_flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && dentry->d_inode) return -EISDIR; if (count && (*count)++ >= MAXSYMLINKS) return -ELOOP; return finish_automount(dentry->d_op->d_automount(path), path); } /* * mount traversal - out-of-line part. One note on ->d_flags accesses - * dentries are pinned but not locked here, so negative dentry can go * positive right under us. Use of smp_load_acquire() provides a barrier * sufficient for ->d_inode and ->d_flags consistency. */ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, int *count, unsigned lookup_flags) { struct vfsmount *mnt = path->mnt; bool need_mntput = false; int ret = 0; while (flags & DCACHE_MANAGED_DENTRY) { /* Allow the filesystem to manage the transit without i_rwsem * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { ret = path->dentry->d_op->d_manage(path, false); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } if (flags & DCACHE_MOUNTED) { // something's mounted on it.. struct vfsmount *mounted = lookup_mnt(path); if (mounted) { // ... in our namespace dput(path->dentry); if (need_mntput) mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); // here we know it's positive flags = path->dentry->d_flags; need_mntput = true; continue; } } if (!(flags & DCACHE_NEED_AUTOMOUNT)) break; // uncovered automount point ret = follow_automount(path, count, lookup_flags); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) break; } if (ret == -EISDIR) ret = 0; // possible if you race with several mount --move if (need_mntput && path->mnt == mnt) mntput(path->mnt); if (!ret && unlikely(d_flags_negative(flags))) ret = -ENOENT; *jumped = need_mntput; return ret; } static inline int traverse_mounts(struct path *path, bool *jumped, int *count, unsigned lookup_flags) { unsigned flags = smp_load_acquire(&path->dentry->d_flags); /* fastpath */ if (likely(!(flags & DCACHE_MANAGED_DENTRY))) { *jumped = false; if (unlikely(d_flags_negative(flags))) return -ENOENT; return 0; } return __traverse_mounts(path, flags, jumped, count, lookup_flags); } int follow_down_one(struct path *path) { struct vfsmount *mounted; mounted = lookup_mnt(path); if (mounted) { dput(path->dentry); mntput(path->mnt); path->mnt = mounted; path->dentry = dget(mounted->mnt_root); return 1; } return 0; } EXPORT_SYMBOL(follow_down_one); /* * Follow down to the covering mount currently visible to userspace. At each * point, the filesystem owning that dentry may be queried as to whether the * caller is permitted to proceed or not. */ int follow_down(struct path *path, unsigned int flags) { struct vfsmount *mnt = path->mnt; bool jumped; int ret = traverse_mounts(path, &jumped, NULL, flags); if (path->mnt != mnt) mntput(mnt); return ret; } EXPORT_SYMBOL(follow_down); /* * Try to skip to top of mountpoint pile in rcuwalk mode. Fail if * we meet a managed dentry that would need blocking. */ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path) { struct dentry *dentry = path->dentry; unsigned int flags = dentry->d_flags; if (likely(!(flags & DCACHE_MANAGED_DENTRY))) return true; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return false; for (;;) { /* * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. 
*/ if (unlikely(flags & DCACHE_MANAGE_TRANSIT)) { int res = dentry->d_op->d_manage(path, true); if (res) return res == -EISDIR; flags = dentry->d_flags; } if (flags & DCACHE_MOUNTED) { struct mount *mounted = __lookup_mnt(path->mnt, dentry); if (mounted) { path->mnt = &mounted->mnt; dentry = path->dentry = mounted->mnt.mnt_root; nd->state |= ND_JUMPED; nd->next_seq = read_seqcount_begin(&dentry->d_seq); flags = dentry->d_flags; // makes sure that non-RCU pathwalk could reach // this state. if (read_seqretry(&mount_lock, nd->m_seq)) return false; continue; } if (read_seqretry(&mount_lock, nd->m_seq)) return false; } return !(flags & DCACHE_NEED_AUTOMOUNT); } } static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, struct path *path) { bool jumped; int ret; path->mnt = nd->path.mnt; path->dentry = dentry; if (nd->flags & LOOKUP_RCU) { unsigned int seq = nd->next_seq; if (likely(__follow_mount_rcu(nd, path))) return 0; // *path and nd->next_seq might've been clobbered path->mnt = nd->path.mnt; path->dentry = dentry; nd->next_seq = seq; if (!try_to_unlazy_next(nd, dentry)) return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); if (jumped) { if (unlikely(nd->flags & LOOKUP_NO_XDEV)) ret = -EXDEV; else nd->state |= ND_JUMPED; } if (unlikely(ret)) { dput(path->dentry); if (path->mnt != nd->path.mnt) mntput(path->mnt); } return ret; } /* * This looks up the name in dcache and possibly revalidates the found dentry. * NULL is returned if the dentry does not exist in the cache. */ static struct dentry *lookup_dcache(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct dentry *dentry = d_lookup(dir, name); if (dentry) { int error = d_revalidate(dir->d_inode, name, dentry, flags); if (unlikely(error <= 0)) { if (!error) d_invalidate(dentry); dput(dentry); return ERR_PTR(error); } } return dentry; } /* * Parent directory has inode locked exclusive. This is one * and only case when ->lookup() gets called on non in-lookup * dentries - as the matter of fact, this only gets called * when directory is guaranteed to have no in-lookup children * at all. * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed. * Will return -EEXIST if name is found and LOOKUP_EXCL was passed. */ struct dentry *lookup_one_qstr_excl(const struct qstr *name, struct dentry *base, unsigned int flags) { struct dentry *dentry; struct dentry *old; struct inode *dir; dentry = lookup_dcache(name, base, flags); if (dentry) goto found; /* Don't create child dentry for a dead directory. */ dir = base->d_inode; if (unlikely(IS_DEADDIR(dir))) return ERR_PTR(-ENOENT); dentry = d_alloc(base, name); if (unlikely(!dentry)) return ERR_PTR(-ENOMEM); old = dir->i_op->lookup(dir, dentry, flags); if (unlikely(old)) { dput(dentry); dentry = old; } found: if (IS_ERR(dentry)) return dentry; if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) { dput(dentry); return ERR_PTR(-ENOENT); } if (d_is_positive(dentry) && (flags & LOOKUP_EXCL)) { dput(dentry); return ERR_PTR(-EEXIST); } return dentry; } EXPORT_SYMBOL(lookup_one_qstr_excl); /** * lookup_fast - do fast lockless (but racy) lookup of a dentry * @nd: current nameidata * * Do a fast, but racy lookup in the dcache for the given dentry, and * revalidate it. Returns a valid dentry pointer or NULL if one wasn't * found. On error, an ERR_PTR will be returned. * * If this function returns a valid dentry and the walk is no longer * lazy, the dentry will carry a reference that must later be put. 
If * RCU mode is still in force, then this is not the case and the dentry * must be legitimized before use. If this returns NULL, then the walk * will no longer be in RCU mode. */ static struct dentry *lookup_fast(struct nameidata *nd) { struct dentry *dentry, *parent = nd->path.dentry; int status = 1; /* * Rename seqlock is not required here because in the off chance * of a false negative due to a concurrent rename, the caller is * going to fall back to non-racy lookup. */ if (nd->flags & LOOKUP_RCU) { dentry = __d_lookup_rcu(parent, &nd->last, &nd->next_seq); if (unlikely(!dentry)) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); return NULL; } /* * This sequence count validates that the parent had no * changes while we did the lookup of the dentry above. */ if (read_seqcount_retry(&parent->d_seq, nd->seq)) return ERR_PTR(-ECHILD); status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); if (likely(status > 0)) return dentry; if (!try_to_unlazy_next(nd, dentry)) return ERR_PTR(-ECHILD); if (status == -ECHILD) /* we'd been told to redo it in non-rcu mode */ status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); } else { dentry = __d_lookup(parent, &nd->last); if (unlikely(!dentry)) return NULL; status = d_revalidate(nd->inode, &nd->last, dentry, nd->flags); } if (unlikely(status <= 0)) { if (!status) d_invalidate(dentry); dput(dentry); return ERR_PTR(status); } return dentry; } /* Fast lookup failed, do it the slow way */ static struct dentry *__lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct dentry *dentry, *old; struct inode *inode = dir->d_inode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); /* Don't go there if it's already dead */ if (unlikely(IS_DEADDIR(inode))) return ERR_PTR(-ENOENT); again: dentry = d_alloc_parallel(dir, name, &wq); if (IS_ERR(dentry)) return dentry; if (unlikely(!d_in_lookup(dentry))) { int error = d_revalidate(inode, name, dentry, flags); if (unlikely(error <= 0)) { if (!error) { d_invalidate(dentry); dput(dentry); goto again; } dput(dentry); dentry = ERR_PTR(error); } } else { old = inode->i_op->lookup(inode, dentry, flags); d_lookup_done(dentry); if (unlikely(old)) { dput(dentry); dentry = old; } } return dentry; } static struct dentry *lookup_slow(const struct qstr *name, struct dentry *dir, unsigned int flags) { struct inode *inode = dir->d_inode; struct dentry *res; inode_lock_shared(inode); res = __lookup_slow(name, dir, flags); inode_unlock_shared(inode); return res; } static inline int may_lookup(struct mnt_idmap *idmap, struct nameidata *restrict nd) { int err, mask; mask = nd->flags & LOOKUP_RCU ? MAY_NOT_BLOCK : 0; err = inode_permission(idmap, nd->inode, mask | MAY_EXEC); if (likely(!err)) return 0; // If we failed, and we weren't in LOOKUP_RCU, it's final if (!(nd->flags & LOOKUP_RCU)) return err; // Drop out of RCU mode to make sure it wasn't transient if (!try_to_unlazy(nd)) return -ECHILD; // redo it all non-lazy if (err != -ECHILD) // hard error return err; return inode_permission(idmap, nd->inode, MAY_EXEC); } static int reserve_stack(struct nameidata *nd, struct path *link) { if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) return -ELOOP; if (likely(nd->depth != EMBEDDED_LEVELS)) return 0; if (likely(nd->stack != nd->internal)) return 0; if (likely(nd_alloc_stack(nd))) return 0; if (nd->flags & LOOKUP_RCU) { // we need to grab link before we do unlazy. 
And we can't skip // unlazy even if we fail to grab the link - cleanup needs it bool grabbed_link = legitimize_path(nd, link, nd->next_seq); if (!try_to_unlazy(nd) || !grabbed_link) return -ECHILD; if (nd_alloc_stack(nd)) return 0; } return -ENOMEM; } enum {WALK_TRAILING = 1, WALK_MORE = 2, WALK_NOFOLLOW = 4}; static const char *pick_link(struct nameidata *nd, struct path *link, struct inode *inode, int flags) { struct saved *last; const char *res; int error = reserve_stack(nd, link); if (unlikely(error)) { if (!(nd->flags & LOOKUP_RCU)) path_put(link); return ERR_PTR(error); } last = nd->stack + nd->depth++; last->link = *link; clear_delayed_call(&last->done); last->seq = nd->next_seq; if (flags & WALK_TRAILING) { error = may_follow_link(nd, inode); if (unlikely(error)) return ERR_PTR(error); } if (unlikely(nd->flags & LOOKUP_NO_SYMLINKS) || unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW)) return ERR_PTR(-ELOOP); if (unlikely(atime_needs_update(&last->link, inode))) { if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); } touch_atime(&last->link); cond_resched(); } error = security_inode_follow_link(link->dentry, inode, nd->flags & LOOKUP_RCU); if (unlikely(error)) return ERR_PTR(error); res = READ_ONCE(inode->i_link); if (!res) { const char * (*get)(struct dentry *, struct inode *, struct delayed_call *); get = inode->i_op->get_link; if (nd->flags & LOOKUP_RCU) { res = get(NULL, inode, &last->done); if (res == ERR_PTR(-ECHILD) && try_to_unlazy(nd)) res = get(link->dentry, inode, &last->done); } else { res = get(link->dentry, inode, &last->done); } if (!res) goto all_done; if (IS_ERR(res)) return res; } if (*res == '/') { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); while (unlikely(*++res == '/')) ; } if (*res) return res; all_done: // pure jump put_link(nd); return NULL; } /* * Do we need to follow links? We _really_ want to be able * to do this check without having to look at inode->i_op, * so we keep a cache of "no, this doesn't need follow_link" * for the common case. * * NOTE: dentry must be what nd->next_seq had been sampled from. 
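 *
 * Sketch of the ->get_link() contract that pick_link() above relies on;
 * the "foo_" name is hypothetical. A filesystem that keeps the target
 * string in the inode can use the equivalent of simple_get_link():
 *
 *	static const char *foo_get_link(struct dentry *dentry,
 *					struct inode *inode,
 *					struct delayed_call *done)
 *	{
 *		return inode->i_link;	// dentry may be NULL in RCU mode
 *	}
 *
 * A filesystem that has to allocate or map a buffer instead returns
 * ERR_PTR(-ECHILD) when called with a NULL dentry, so that pick_link()
 * unlazies and calls it again, and registers a cleanup through
 * set_delayed_call(done, ...), which put_link()/drop_links() later run
 * via do_delayed_call().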
*/ static const char *step_into(struct nameidata *nd, int flags, struct dentry *dentry) { struct path path; struct inode *inode; int err = handle_mounts(nd, dentry, &path); if (err < 0) return ERR_PTR(err); inode = path.dentry->d_inode; if (likely(!d_is_symlink(path.dentry)) || ((flags & WALK_TRAILING) && !(nd->flags & LOOKUP_FOLLOW)) || (flags & WALK_NOFOLLOW)) { /* not a symlink or should not follow */ if (nd->flags & LOOKUP_RCU) { if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); if (unlikely(!inode)) return ERR_PTR(-ENOENT); } else { dput(nd->path.dentry); if (nd->path.mnt != path.mnt) mntput(nd->path.mnt); } nd->path = path; nd->inode = inode; nd->seq = nd->next_seq; return NULL; } if (nd->flags & LOOKUP_RCU) { /* make sure that d_is_symlink above matches inode */ if (read_seqcount_retry(&path.dentry->d_seq, nd->next_seq)) return ERR_PTR(-ECHILD); } else { if (path.mnt == nd->path.mnt) mntget(path.mnt); } return pick_link(nd, &path, inode, flags); } static struct dentry *follow_dotdot_rcu(struct nameidata *nd) { struct dentry *parent, *old; if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { struct path path; unsigned seq; if (!choose_mountpoint_rcu(real_mount(nd->path.mnt), &nd->root, &path, &seq)) goto in_root; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-ECHILD); nd->path = path; nd->inode = path.dentry->d_inode; nd->seq = seq; // makes sure that non-RCU pathwalk could reach this state if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); /* we know that mountpoint was pinned */ } old = nd->path.dentry; parent = old->d_parent; nd->next_seq = read_seqcount_begin(&parent->d_seq); // makes sure that non-RCU pathwalk could reach this state if (read_seqcount_retry(&old->d_seq, nd->seq)) return ERR_PTR(-ECHILD); if (unlikely(!path_connected(nd->path.mnt, parent))) return ERR_PTR(-ECHILD); return parent; in_root: if (read_seqretry(&mount_lock, nd->m_seq)) return ERR_PTR(-ECHILD); if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-ECHILD); nd->next_seq = nd->seq; return nd->path.dentry; } static struct dentry *follow_dotdot(struct nameidata *nd) { struct dentry *parent; if (path_equal(&nd->path, &nd->root)) goto in_root; if (unlikely(nd->path.dentry == nd->path.mnt->mnt_root)) { struct path path; if (!choose_mountpoint(real_mount(nd->path.mnt), &nd->root, &path)) goto in_root; path_put(&nd->path); nd->path = path; nd->inode = path.dentry->d_inode; if (unlikely(nd->flags & LOOKUP_NO_XDEV)) return ERR_PTR(-EXDEV); } /* rare case of legitimate dget_parent()... */ parent = dget_parent(nd->path.dentry); if (unlikely(!path_connected(nd->path.mnt, parent))) { dput(parent); return ERR_PTR(-ENOENT); } return parent; in_root: if (unlikely(nd->flags & LOOKUP_BENEATH)) return ERR_PTR(-EXDEV); return dget(nd->path.dentry); } static const char *handle_dots(struct nameidata *nd, int type) { if (type == LAST_DOTDOT) { const char *error = NULL; struct dentry *parent; if (!nd->root.mnt) { error = ERR_PTR(set_root(nd)); if (error) return error; } if (nd->flags & LOOKUP_RCU) parent = follow_dotdot_rcu(nd); else parent = follow_dotdot(nd); if (IS_ERR(parent)) return ERR_CAST(parent); error = step_into(nd, WALK_NOFOLLOW, parent); if (unlikely(error)) return error; if (unlikely(nd->flags & LOOKUP_IS_SCOPED)) { /* * If there was a racing rename or mount along our * path, then we can't be sure that ".." hasn't jumped * above nd->root (and so userspace should retry or use * some fallback). 
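 *
 * (Hedged user-space sketch of that retry contract; openat2(2),
 * struct open_how and RESOLVE_BENEATH are the real UAPI, the loop
 * itself is only an illustration:
 *
 *	struct open_how how = {
 *		.flags   = O_RDONLY,
 *		.resolve = RESOLVE_BENEATH,
 *	};
 *	int fd;
 *
 *	do {
 *		fd = syscall(__NR_openat2, dirfd, "a/../b",
 *			     &how, sizeof(how));
 *	} while (fd < 0 && errno == EAGAIN);
 *
 * The -EAGAIN produced by the rechecks below is exactly what such a
 * loop is retrying.)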
			 */
			smp_rmb();
			if (__read_seqcount_retry(&mount_lock.seqcount, nd->m_seq))
				return ERR_PTR(-EAGAIN);
			if (__read_seqcount_retry(&rename_lock.seqcount, nd->r_seq))
				return ERR_PTR(-EAGAIN);
		}
	}
	return NULL;
}

static const char *walk_component(struct nameidata *nd, int flags)
{
	struct dentry *dentry;
	/*
	 * "." and ".." are special - ".." especially so because it has
	 * to be able to know about the current root directory and
	 * parent relationships.
	 */
	if (unlikely(nd->last_type != LAST_NORM)) {
		if (!(flags & WALK_MORE) && nd->depth)
			put_link(nd);
		return handle_dots(nd, nd->last_type);
	}
	dentry = lookup_fast(nd);
	if (IS_ERR(dentry))
		return ERR_CAST(dentry);
	if (unlikely(!dentry)) {
		dentry = lookup_slow(&nd->last, nd->path.dentry, nd->flags);
		if (IS_ERR(dentry))
			return ERR_CAST(dentry);
	}
	if (!(flags & WALK_MORE) && nd->depth)
		put_link(nd);
	return step_into(nd, flags, dentry);
}

/*
 * We can do the critical dentry name comparison and hashing
 * operations one word at a time, but we are limited to:
 *
 * - Architectures with fast unaligned word accesses. We could
 *   do a "get_unaligned()" if this helps and is sufficiently
 *   fast.
 *
 * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
 *   do not trap on the (extremely unlikely) case of a page
 *   crossing operation).
 *
 * - Furthermore, we need an efficient 64-bit compile for the
 *   64-bit case in order to generate the "number of bytes in
 *   the final mask". Again, that could be replaced with an
 *   efficient population count instruction or similar.
 */
#ifdef CONFIG_DCACHE_WORD_ACCESS

#include <asm/word-at-a-time.h>

#ifdef HASH_MIX

/* Architecture provides HASH_MIX and fold_hash() in <asm/hash.h> */

#elif defined(CONFIG_64BIT)
/*
 * Register pressure in the mixing function is an issue, particularly
 * on 32-bit x86, but almost any function requires one state value and
 * one temporary. Instead, use a function designed for two state values
 * and no temporaries.
 *
 * This function cannot create a collision in only two iterations, so
 * we have two iterations to achieve avalanche. In those two iterations,
 * we have six layers of mixing, which is enough to spread one bit's
 * influence out to 2^6 = 64 state bits.
 *
 * Rotate constants are scored by considering either 64 one-bit input
 * deltas or 64*63/2 = 2016 two-bit input deltas, and finding the
 * probability of that delta causing a change to each of the 128 output
 * bits, using a sample of random initial states.
 *
 * The Shannon entropy of the computed probabilities is then summed
 * to produce a score. Ideally, any input change has a 50% chance of
 * toggling any given output bit.
 *
 * Mixing scores (in bits) for (12,45):
 *  Input delta:  1-bit     2-bit
 * 1 round:       713.3   42542.6
 * 2 rounds:     2753.7  140389.8
 * 3 rounds:     5954.1  233458.2
 * 4 rounds:     7862.6  256672.2
 * Perfect:      8192    258048
 *              (64*128) (64*63/2 * 128)
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol64(x,12),\
	x += y,	y = rol64(y,45),\
	y *= 9			)

/*
 * Fold two longs into one 32-bit hash value. This must be fast, but
 * latency isn't quite as critical, as there is a fair bit of additional
 * work done before the hash value is used.
 */
static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	y ^= x * GOLDEN_RATIO_64;
	y *= GOLDEN_RATIO_64;
	return y >> 32;
}

#else	/* 32-bit case */

/*
 * Mixing scores (in bits) for (7,20):
 *  Input delta:  1-bit     2-bit
 * 1 round:       330.3    9201.6
 * 2 rounds:     1246.4   25475.4
 * 3 rounds:     1907.1   31295.1
 * 4 rounds:     2042.3   31718.6
 * Perfect:      2048     31744
 *              (32*64)  (32*31/2 * 64)
 */
#define HASH_MIX(x, y, a)	\
	(	x ^= (a),	\
	y ^= x,	x = rol32(x, 7),\
	x += y,	y = rol32(y,20),\
	y *= 9			)

static inline unsigned int fold_hash(unsigned long x, unsigned long y)
{
	/* Use arch-optimized multiply if one exists */
	return __hash_32(y ^ __hash_32(x));
}

#endif

/*
 * Return the hash of a string of known length. This is carefully
 * designed to match hash_name(), which is the more critical function.
 * In particular, we must end by hashing a final word containing 0..7
 * payload bytes, to match the way that hash_name() iterates until it
 * finds the delimiter after the name.
 */
unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
	unsigned long a, x = 0, y = (unsigned long)salt;

	for (;;) {
		if (!len)
			goto done;
		a = load_unaligned_zeropad(name);
		if (len < sizeof(unsigned long))
			break;
		HASH_MIX(x, y, a);
		name += sizeof(unsigned long);
		len -= sizeof(unsigned long);
	}
	x ^= a & bytemask_from_count(len);
done:
	return fold_hash(x, y);
}
EXPORT_SYMBOL(full_name_hash);

/* Return the "hash_len" (hash and length) of a null-terminated string */
u64 hashlen_string(const void *salt, const char *name)
{
	unsigned long a = 0, x = 0, y = (unsigned long)salt;
	unsigned long adata, mask, len;
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

	len = 0;
	goto inside;

	do {
		HASH_MIX(x, y, a);
		len += sizeof(unsigned long);
inside:
		a = load_unaligned_zeropad(name+len);
	} while (!has_zero(a, &adata, &constants));

	adata = prep_zero_mask(a, adata, &constants);
	mask = create_zero_mask(adata);
	x ^= a & zero_bytemask(mask);

	return hashlen_create(fold_hash(x, y), len + find_zero(mask));
}
EXPORT_SYMBOL(hashlen_string);

/*
 * Calculate the length and hash of the path component, and
 * return a pointer past its end (nd->last and *lastword are
 * filled in along the way).
 */
static inline const char *hash_name(struct nameidata *nd,
				    const char *name,
				    unsigned long *lastword)
{
	unsigned long a, b, x, y = (unsigned long)nd->path.dentry;
	unsigned long adata, bdata, mask, len;
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;

	/*
	 * The first iteration is special, because it can result in
	 * '.' and '..' and has no mixing other than the final fold.
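	 *
	 * (Editorial example: for the component ".." the terminating '/'
	 * or NUL already falls inside the first word, so the masked word
	 * is returned through *lastword and - on little-endian - equals
	 * 0x2e2e, letting link_path_walk() classify it as LAST_DOTDOT
	 * with a plain switch instead of a byte-by-byte compare.)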
*/ a = load_unaligned_zeropad(name); b = a ^ REPEAT_BYTE('/'); if (has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)) { adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); mask = create_zero_mask(adata | bdata); a &= zero_bytemask(mask); *lastword = a; len = find_zero(mask); nd->last.hash = fold_hash(a, y); nd->last.len = len; return name + len; } len = 0; x = 0; do { HASH_MIX(x, y, a); len += sizeof(unsigned long); a = load_unaligned_zeropad(name+len); b = a ^ REPEAT_BYTE('/'); } while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants))); adata = prep_zero_mask(a, adata, &constants); bdata = prep_zero_mask(b, bdata, &constants); mask = create_zero_mask(adata | bdata); a &= zero_bytemask(mask); x ^= a; len += find_zero(mask); *lastword = 0; // Multi-word components cannot be DOT or DOTDOT nd->last.hash = fold_hash(x, y); nd->last.len = len; return name + len; } /* * Note that the 'last' word is always zero-masked, but * was loaded as a possibly big-endian word. */ #ifdef __BIG_ENDIAN #define LAST_WORD_IS_DOT (0x2eul << (BITS_PER_LONG-8)) #define LAST_WORD_IS_DOTDOT (0x2e2eul << (BITS_PER_LONG-16)) #endif #else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */ /* Return the hash of a string of known length */ unsigned int full_name_hash(const void *salt, const char *name, unsigned int len) { unsigned long hash = init_name_hash(salt); while (len--) hash = partial_name_hash((unsigned char)*name++, hash); return end_name_hash(hash); } EXPORT_SYMBOL(full_name_hash); /* Return the "hash_len" (hash and length) of a null-terminated string */ u64 hashlen_string(const void *salt, const char *name) { unsigned long hash = init_name_hash(salt); unsigned long len = 0, c; c = (unsigned char)*name; while (c) { len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } return hashlen_create(end_name_hash(hash), len); } EXPORT_SYMBOL(hashlen_string); /* * We know there's a real path component here of at least * one character. */ static inline const char *hash_name(struct nameidata *nd, const char *name, unsigned long *lastword) { unsigned long hash = init_name_hash(nd->path.dentry); unsigned long len = 0, c, last = 0; c = (unsigned char)*name; do { last = (last << 8) + c; len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); // This is reliable for DOT or DOTDOT, since the component // cannot contain NUL characters - top bits being zero means // we cannot have had any other pathnames. *lastword = last; nd->last.hash = end_name_hash(hash); nd->last.len = len; return name + len; } #endif #ifndef LAST_WORD_IS_DOT #define LAST_WORD_IS_DOT 0x2e #define LAST_WORD_IS_DOTDOT 0x2e2e #endif /* * Name resolution. * This is the basic name resolution function, turning a pathname into * the final dentry. We expect 'base' to be positive and a directory. * * Returns 0 and nd will have valid dentry and mnt on success. * Returns error and drops reference to input namei data on failure. */ static int link_path_walk(const char *name, struct nameidata *nd) { int depth = 0; // depth <= nd->depth int err; nd->last_type = LAST_ROOT; nd->flags |= LOOKUP_PARENT; if (IS_ERR(name)) return PTR_ERR(name); if (*name == '/') { do { name++; } while (unlikely(*name == '/')); } if (unlikely(!*name)) { nd->dir_mode = 0; // short-circuit the 'hardening' idiocy return 0; } /* At this point we know we have a real path component. 
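	 *
	 * (Editorial sketch: when the parent has DCACHE_OP_HASH, the loop
	 * below gives the filesystem a chance to rehash the component; a
	 * case-folding ->d_hash() might look like the following, where
	 * "example_d_hash" is hypothetical but mirrors e.g. fs/fat:
	 *
	 *	static int example_d_hash(const struct dentry *dir, struct qstr *q)
	 *	{
	 *		unsigned long hash = init_name_hash(dir);
	 *		int i;
	 *
	 *		for (i = 0; i < q->len; i++)
	 *			hash = partial_name_hash(tolower(q->name[i]), hash);
	 *		q->hash = end_name_hash(hash);
	 *		return 0;
	 *	}
	 * )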
*/ for(;;) { struct mnt_idmap *idmap; const char *link; unsigned long lastword; idmap = mnt_idmap(nd->path.mnt); err = may_lookup(idmap, nd); if (unlikely(err)) return err; nd->last.name = name; name = hash_name(nd, name, &lastword); switch(lastword) { case LAST_WORD_IS_DOTDOT: nd->last_type = LAST_DOTDOT; nd->state |= ND_JUMPED; break; case LAST_WORD_IS_DOT: nd->last_type = LAST_DOT; break; default: nd->last_type = LAST_NORM; nd->state &= ~ND_JUMPED; struct dentry *parent = nd->path.dentry; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { err = parent->d_op->d_hash(parent, &nd->last); if (err < 0) return err; } } if (!*name) goto OK; /* * If it wasn't NUL, we know it was '/'. Skip that * slash, and continue until no more slashes. */ do { name++; } while (unlikely(*name == '/')); if (unlikely(!*name)) { OK: /* pathname or trailing symlink, done */ if (!depth) { nd->dir_vfsuid = i_uid_into_vfsuid(idmap, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; return 0; } /* last component of nested symlink */ name = nd->stack[--depth].name; link = walk_component(nd, 0); } else { /* not the last component */ link = walk_component(nd, WALK_MORE); } if (unlikely(link)) { if (IS_ERR(link)) return PTR_ERR(link); /* a symlink to follow */ nd->stack[depth++].name = name; name = link; continue; } if (unlikely(!d_can_lookup(nd->path.dentry))) { if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return -ECHILD; } return -ENOTDIR; } } } /* must be paired with terminate_walk() */ static const char *path_init(struct nameidata *nd, unsigned flags) { int error; const char *s = nd->pathname; /* LOOKUP_CACHED requires RCU, ask caller to retry */ if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED) return ERR_PTR(-EAGAIN); if (!*s) flags &= ~LOOKUP_RCU; if (flags & LOOKUP_RCU) rcu_read_lock(); else nd->seq = nd->next_seq = 0; nd->flags = flags; nd->state |= ND_JUMPED; nd->m_seq = __read_seqcount_begin(&mount_lock.seqcount); nd->r_seq = __read_seqcount_begin(&rename_lock.seqcount); smp_rmb(); if (nd->state & ND_ROOT_PRESET) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s && unlikely(!d_can_lookup(root))) return ERR_PTR(-ENOTDIR); nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); nd->root_seq = nd->seq; } else { path_get(&nd->path); } return s; } nd->root.mnt = NULL; /* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */ if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) { error = nd_jump_root(nd); if (unlikely(error)) return ERR_PTR(error); return s; } /* Relative pathname -- get the starting-point it is relative to. 
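	 *
	 * (Editorial example: for openat(dirfd, "etc/passwd", ...) the
	 * walk starts from the struct path behind dirfd, while AT_FDCWD
	 * starts from current->fs->pwd; both cases are handled just
	 * below, under RCU or with proper references depending on
	 * LOOKUP_RCU.)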
*/ if (nd->dfd == AT_FDCWD) { if (flags & LOOKUP_RCU) { struct fs_struct *fs = current->fs; unsigned seq; do { seq = read_seqbegin(&fs->seq); nd->path = fs->pwd; nd->inode = nd->path.dentry->d_inode; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); } while (read_seqretry(&fs->seq, seq)); } else { get_fs_pwd(current->fs, &nd->path); nd->inode = nd->path.dentry->d_inode; } } else { /* Caller must check execute permissions on the starting path component */ CLASS(fd_raw, f)(nd->dfd); struct dentry *dentry; if (fd_empty(f)) return ERR_PTR(-EBADF); if (flags & LOOKUP_LINKAT_EMPTY) { if (fd_file(f)->f_cred != current_cred() && !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) return ERR_PTR(-ENOENT); } dentry = fd_file(f)->f_path.dentry; if (*s && unlikely(!d_can_lookup(dentry))) return ERR_PTR(-ENOTDIR); nd->path = fd_file(f)->f_path; if (flags & LOOKUP_RCU) { nd->inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } else { path_get(&nd->path); nd->inode = nd->path.dentry->d_inode; } } /* For scoped-lookups we need to set the root to the dirfd as well. */ if (flags & LOOKUP_IS_SCOPED) { nd->root = nd->path; if (flags & LOOKUP_RCU) { nd->root_seq = nd->seq; } else { path_get(&nd->root); nd->state |= ND_ROOT_GRABBED; } } return s; } static inline const char *lookup_last(struct nameidata *nd) { if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; return walk_component(nd, WALK_TRAILING); } static int handle_lookup_down(struct nameidata *nd) { if (!(nd->flags & LOOKUP_RCU)) dget(nd->path.dentry); nd->next_seq = nd->seq; return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry)); } /* Returns 0 and nd will be valid on success; Returns error, otherwise. */ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { const char *s = path_init(nd, flags); int err; if (unlikely(flags & LOOKUP_DOWN) && !IS_ERR(s)) { err = handle_lookup_down(nd); if (unlikely(err < 0)) s = ERR_PTR(err); } while (!(err = link_path_walk(s, nd)) && (s = lookup_last(nd)) != NULL) ; if (!err && unlikely(nd->flags & LOOKUP_MOUNTPOINT)) { err = handle_lookup_down(nd); nd->state &= ~ND_JUMPED; // no d_weak_revalidate(), please... } if (!err) err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; if (!err) { *path = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; } terminate_walk(nd); return err; } int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); set_nameidata(&nd, dfd, name, root); retval = path_lookupat(&nd, flags | LOOKUP_RCU, path); if (unlikely(retval == -ECHILD)) retval = path_lookupat(&nd, flags, path); if (unlikely(retval == -ESTALE)) retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); if (likely(!retval)) audit_inode(name, path->dentry, flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0); restore_nameidata(); return retval; } /* Returns 0 and nd will be valid on success; Returns error, otherwise. 
*/ static int path_parentat(struct nameidata *nd, unsigned flags, struct path *parent) { const char *s = path_init(nd, flags); int err = link_path_walk(s, nd); if (!err) err = complete_walk(nd); if (!err) { *parent = nd->path; nd->path.mnt = NULL; nd->path.dentry = NULL; } terminate_walk(nd); return err; } /* Note: this does not consume "name" */ static int __filename_parentat(int dfd, struct filename *name, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root) { int retval; struct nameidata nd; if (IS_ERR(name)) return PTR_ERR(name); set_nameidata(&nd, dfd, name, root); retval = path_parentat(&nd, flags | LOOKUP_RCU, parent); if (unlikely(retval == -ECHILD)) retval = path_parentat(&nd, flags, parent); if (unlikely(retval == -ESTALE)) retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent); if (likely(!retval)) { *last = nd.last; *type = nd.last_type; audit_inode(name, parent->dentry, AUDIT_INODE_PARENT); } restore_nameidata(); return retval; } static int filename_parentat(int dfd, struct filename *name, unsigned int flags, struct path *parent, struct qstr *last, int *type) { return __filename_parentat(dfd, name, flags, parent, last, type, NULL); } /* does lookup, returns the object with parent locked */ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path) { struct path parent_path __free(path_put) = {}; struct dentry *d; struct qstr last; int type, error; error = filename_parentat(dfd, name, 0, &parent_path, &last, &type); if (error) return ERR_PTR(error); if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); d = lookup_one_qstr_excl(&last, parent_path.dentry, 0); if (IS_ERR(d)) { inode_unlock(parent_path.dentry->d_inode); return d; } path->dentry = no_free_ptr(parent_path.dentry); path->mnt = no_free_ptr(parent_path.mnt); return d; } struct dentry *kern_path_locked_negative(const char *name, struct path *path) { struct path parent_path __free(path_put) = {}; struct filename *filename __free(putname) = getname_kernel(name); struct dentry *d; struct qstr last; int type, error; error = filename_parentat(AT_FDCWD, filename, 0, &parent_path, &last, &type); if (error) return ERR_PTR(error); if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); d = lookup_one_qstr_excl(&last, parent_path.dentry, LOOKUP_CREATE); if (IS_ERR(d)) { inode_unlock(parent_path.dentry->d_inode); return d; } path->dentry = no_free_ptr(parent_path.dentry); path->mnt = no_free_ptr(parent_path.mnt); return d; } struct dentry *kern_path_locked(const char *name, struct path *path) { struct filename *filename = getname_kernel(name); struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path); putname(filename); return res; } struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path) { struct filename *filename = getname(name); struct dentry *res = __kern_path_locked(dfd, filename, path); putname(filename); return res; } EXPORT_SYMBOL(user_path_locked_at); int kern_path(const char *name, unsigned int flags, struct path *path) { struct filename *filename = getname_kernel(name); int ret = filename_lookup(AT_FDCWD, filename, flags, path, NULL); putname(filename); return ret; } EXPORT_SYMBOL(kern_path); /** * vfs_path_parent_lookup - lookup a parent path relative to a dentry-vfsmount pair * @filename: filename structure * @flags: lookup flags * @parent: pointer to struct path to 
fill * @last: last component * @type: type of the last component * @root: pointer to struct path of the base directory */ int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, const struct path *root) { return __filename_parentat(AT_FDCWD, filename, flags, parent, last, type, root); } EXPORT_SYMBOL(vfs_path_parent_lookup); /** * vfs_path_lookup - lookup a file path relative to a dentry-vfsmount pair * @dentry: pointer to dentry of the base directory * @mnt: pointer to vfs mount of the base directory * @name: pointer to file name * @flags: lookup flags * @path: pointer to struct path to fill */ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, const char *name, unsigned int flags, struct path *path) { struct filename *filename; struct path root = {.mnt = mnt, .dentry = dentry}; int ret; filename = getname_kernel(name); /* the first argument of filename_lookup() is ignored with root */ ret = filename_lookup(AT_FDCWD, filename, flags, path, &root); putname(filename); return ret; } EXPORT_SYMBOL(vfs_path_lookup); static int lookup_noperm_common(struct qstr *qname, struct dentry *base) { const char *name = qname->name; u32 len = qname->len; qname->hash = full_name_hash(base, name, len); if (!len) return -EACCES; if (is_dot_dotdot(name, len)) return -EACCES; while (len--) { unsigned int c = *(const unsigned char *)name++; if (c == '/' || c == '\0') return -EACCES; } /* * See if the low-level filesystem might want * to use its own hash.. */ if (base->d_flags & DCACHE_OP_HASH) { int err = base->d_op->d_hash(base, qname); if (err < 0) return err; } return 0; } static int lookup_one_common(struct mnt_idmap *idmap, struct qstr *qname, struct dentry *base) { int err; err = lookup_noperm_common(qname, base); if (err < 0) return err; return inode_permission(idmap, base->d_inode, MAY_EXEC); } /** * try_lookup_noperm - filesystem helper to lookup single pathname component * @name: qstr storing pathname component to lookup * @base: base directory to lookup from * * Look up a dentry by name in the dcache, returning NULL if it does not * currently exist. The function does not try to create a dentry and if one * is found it doesn't try to revalidate it. * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. It does no permission checking. * * No locks need be held - only a counted reference to @base is needed. * */ struct dentry *try_lookup_noperm(struct qstr *name, struct dentry *base) { int err; err = lookup_noperm_common(name, base); if (err) return ERR_PTR(err); return d_lookup(base, name); } EXPORT_SYMBOL(try_lookup_noperm); /** * lookup_noperm - filesystem helper to lookup single pathname component * @name: qstr storing pathname component to lookup * @base: base directory to lookup from * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. It does no permission checking. * * The caller must hold base->i_rwsem. */ struct dentry *lookup_noperm(struct qstr *name, struct dentry *base) { struct dentry *dentry; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); err = lookup_noperm_common(name, base); if (err) return ERR_PTR(err); dentry = lookup_dcache(name, base, 0); return dentry ? 
		       dentry : __lookup_slow(name, base, 0);
}
EXPORT_SYMBOL(lookup_noperm);

/**
 * lookup_one - lookup single pathname component
 * @idmap: idmap of the mount the lookup is performed from
 * @name: qstr holding pathname component to lookup
 * @base: base directory to lookup from
 *
 * This can be used for in-kernel filesystem clients such as file servers.
 *
 * The caller must hold base->i_rwsem.
 */
struct dentry *lookup_one(struct mnt_idmap *idmap, struct qstr *name,
			  struct dentry *base)
{
	struct dentry *dentry;
	int err;

	WARN_ON_ONCE(!inode_is_locked(base->d_inode));

	err = lookup_one_common(idmap, name, base);
	if (err)
		return ERR_PTR(err);

	dentry = lookup_dcache(name, base, 0);
	return dentry ? dentry : __lookup_slow(name, base, 0);
}
EXPORT_SYMBOL(lookup_one);

/**
 * lookup_one_unlocked - lookup single pathname component
 * @idmap: idmap of the mount the lookup is performed from
 * @name: qstr holding pathname component to lookup
 * @base: base directory to lookup from
 *
 * This can be used for in-kernel filesystem clients such as file servers.
 *
 * Unlike lookup_one, it should be called without the parent
 * i_rwsem held, and will take the i_rwsem itself if necessary.
 */
struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name,
				   struct dentry *base)
{
	int err;
	struct dentry *ret;

	err = lookup_one_common(idmap, name, base);
	if (err)
		return ERR_PTR(err);

	ret = lookup_dcache(name, base, 0);
	if (!ret)
		ret = lookup_slow(name, base, 0);
	return ret;
}
EXPORT_SYMBOL(lookup_one_unlocked);

/**
 * lookup_one_positive_unlocked - lookup single pathname component
 * @idmap: idmap of the mount the lookup is performed from
 * @name: qstr holding pathname component to lookup
 * @base: base directory to lookup from
 *
 * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns
 * known positive or ERR_PTR(). This is what most of the users want.
 *
 * Note that pinned negative with unlocked parent _can_ become positive at any
 * time, so callers of lookup_one_unlocked() need to be very careful; pinned
 * positives have ->d_inode stable, so this one avoids such problems.
 *
 * This can be used for in-kernel filesystem clients such as file servers.
 *
 * The helper should be called without i_rwsem held.
 */
struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap,
					    struct qstr *name,
					    struct dentry *base)
{
	struct dentry *ret = lookup_one_unlocked(idmap, name, base);

	if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
		dput(ret);
		ret = ERR_PTR(-ENOENT);
	}
	return ret;
}
EXPORT_SYMBOL(lookup_one_positive_unlocked);

/**
 * lookup_noperm_unlocked - filesystem helper to lookup single pathname component
 * @name: pathname component to lookup
 * @base: base directory to lookup from
 *
 * Note that this routine is purely a helper for filesystem usage and should
 * not be called by generic code. It does no permission checking.
 *
 * Unlike lookup_noperm(), it should be called without the parent
 * i_rwsem held, and will take the i_rwsem itself if necessary.
 *
 * Unlike try_lookup_noperm() it *does* revalidate the dentry if it already
 * existed.
 */
struct dentry *lookup_noperm_unlocked(struct qstr *name, struct dentry *base)
{
	struct dentry *ret;
	int err;

	err = lookup_noperm_common(name, base);
	if (err)
		return ERR_PTR(err);

	ret = lookup_dcache(name, base, 0);
	if (!ret)
		ret = lookup_slow(name, base, 0);
	return ret;
}
EXPORT_SYMBOL(lookup_noperm_unlocked);
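/*
 * (Editorial sketch, not from the original source: a typical in-kernel
 * consumer of the *_unlocked helpers above looks like
 *
 *	struct qstr q = QSTR_INIT("export", 6);
 *	struct dentry *d = lookup_noperm_unlocked(&q, base);
 *
 *	if (IS_ERR(d))
 *		return PTR_ERR(d);
 *	...
 *	dput(d);
 *
 * where "base" is a pinned directory dentry; no i_rwsem is taken by the
 * caller, matching the locking rules documented above.)
 */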
/*
 * Like lookup_noperm_unlocked(), except that it yields ERR_PTR(-ENOENT)
 * on negatives. Returns known positive or ERR_PTR(); that's what
 * most of the users want. Note that pinned negative with unlocked parent
 * _can_ become positive at any time, so callers of lookup_noperm_unlocked()
 * need to be very careful; pinned positives have ->d_inode stable, so
 * this one avoids such problems.
 */
struct dentry *lookup_noperm_positive_unlocked(struct qstr *name,
					       struct dentry *base)
{
	struct dentry *ret;

	ret = lookup_noperm_unlocked(name, base);
	if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
		dput(ret);
		ret = ERR_PTR(-ENOENT);
	}
	return ret;
}
EXPORT_SYMBOL(lookup_noperm_positive_unlocked);

#ifdef CONFIG_UNIX98_PTYS
int path_pts(struct path *path)
{
	/* Find something mounted on "pts" in the same directory as
	 * the input path.
	 */
	struct dentry *parent = dget_parent(path->dentry);
	struct dentry *child;
	struct qstr this = QSTR_INIT("pts", 3);

	if (unlikely(!path_connected(path->mnt, parent))) {
		dput(parent);
		return -ENOENT;
	}
	dput(path->dentry);
	path->dentry = parent;
	child = d_hash_and_lookup(parent, &this);
	if (IS_ERR_OR_NULL(child))
		return -ENOENT;

	path->dentry = child;
	dput(parent);
	follow_down(path, 0);
	return 0;
}
#endif

int user_path_at(int dfd, const char __user *name, unsigned flags,
		 struct path *path)
{
	struct filename *filename = getname_flags(name, flags);
	int ret = filename_lookup(dfd, filename, flags, path, NULL);

	putname(filename);
	return ret;
}
EXPORT_SYMBOL(user_path_at);

int __check_sticky(struct mnt_idmap *idmap, struct inode *dir,
		   struct inode *inode)
{
	kuid_t fsuid = current_fsuid();

	if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, inode), fsuid))
		return 0;
	if (vfsuid_eq_kuid(i_uid_into_vfsuid(idmap, dir), fsuid))
		return 0;
	return !capable_wrt_inode_uidgid(idmap, inode, CAP_FOWNER);
}
EXPORT_SYMBOL(__check_sticky);

/*
 * Check whether we can remove a link victim from directory dir, check
 * whether the type of victim is right.
 *  1. We can't do it if dir is read-only (done in permission())
 *  2. We should have write and exec permissions on dir
 *  3. We can't remove anything from append-only dir
 *  4. We can't do anything with immutable dir (done in permission())
 *  5. If the sticky bit on dir is set we should either
 *	a. be owner of dir, or
 *	b. be owner of victim, or
 *	c. have CAP_FOWNER capability
 *  6. If the victim is append-only or immutable we can't do anything with
 *     links pointing to it.
 *  7. If the victim has an unknown uid or gid we can't change the inode.
 *  8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 *  9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 * 10. We can't remove a root or mountpoint.
 * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
 *     nfs_async_unlink().
 */
static int may_delete(struct mnt_idmap *idmap, struct inode *dir,
		      struct dentry *victim, bool isdir)
{
	struct inode *inode = d_backing_inode(victim);
	int error;

	if (d_is_negative(victim))
		return -ENOENT;
	BUG_ON(!inode);

	BUG_ON(victim->d_parent->d_inode != dir);

	/*
	 * Inode writeback is not safe when the uid or gid are invalid.
	 */
	if (!vfsuid_valid(i_uid_into_vfsuid(idmap, inode)) ||
	    !vfsgid_valid(i_gid_into_vfsgid(idmap, inode)))
		return -EOVERFLOW;

	audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);

	error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
	if (error)
		return error;
	if (IS_APPEND(dir))
		return -EPERM;

	if (check_sticky(idmap, dir, inode) || IS_APPEND(inode) ||
	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) ||
	    HAS_UNMAPPED_ID(idmap, inode))
		return -EPERM;
	if (isdir) {
		if (!d_is_dir(victim))
			return -ENOTDIR;
		if (IS_ROOT(victim))
			return -EBUSY;
	} else if (d_is_dir(victim))
		return -EISDIR;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (victim->d_flags & DCACHE_NFSFS_RENAMED)
		return -EBUSY;
	return 0;
}

/* Check whether we can create an object with dentry child in directory
 * dir.
 *  1. We can't do it if child already exists (open has special treatment for
 *     this case, but since we are inlined it's OK)
 *  2. We can't do it if dir is read-only (done in permission())
 *  3. We can't do it if the fs can't represent the fsuid or fsgid.
 *  4. We should have write and exec permissions on dir
 *  5. We can't do it if dir is immutable (done in permission())
 */
static inline int may_create(struct mnt_idmap *idmap,
			     struct inode *dir, struct dentry *child)
{
	audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
	if (child->d_inode)
		return -EEXIST;
	if (IS_DEADDIR(dir))
		return -ENOENT;
	if (!fsuidgid_has_mapping(dir->i_sb, idmap))
		return -EOVERFLOW;

	return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
}

// p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held
static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2)
{
	struct dentry *p = p1, *q = p2, *r;

	while ((r = p->d_parent) != p2 && r != p)
		p = r;
	if (r == p2) {
		// p is a child of p2 and an ancestor of p1 or p1 itself
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2);
		return p;
	}
	// p is the root of connected component that contains p1
	// p2 does not occur on the path from p to p1
	while ((r = q->d_parent) != p1 && r != p && r != q)
		q = r;
	if (r == p1) {
		// q is a child of p1 and an ancestor of p2 or p2 itself
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
		return q;
	} else if (likely(r == p)) {
		// both p2 and p1 are descendants of p
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
		return NULL;
	} else { // no common ancestor at the time we'd been called
		mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
		return ERR_PTR(-EXDEV);
	}
}

/*
 * p1 and p2 should be directories on the same fs.
 */
struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
{
	if (p1 == p2) {
		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
		return NULL;
	}

	mutex_lock(&p1->d_sb->s_vfs_rename_mutex);
	return lock_two_directories(p1, p2);
}
EXPORT_SYMBOL(lock_rename);

/*
 * c1 and p2 should be on the same fs.
 */
struct dentry *lock_rename_child(struct dentry *c1, struct dentry *p2)
{
	if (READ_ONCE(c1->d_parent) == p2) {
		/*
		 * hopefully won't need to touch ->s_vfs_rename_mutex at all.
		 */
		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
		/*
		 * now that p2 is locked, nobody can move in or out of it,
		 * so the test below is safe.
		 */
		if (likely(c1->d_parent == p2))
			return NULL;

		/*
		 * c1 got moved out of p2 while we'd been taking locks;
		 * unlock and fall back to slow case.
		 */
		inode_unlock(p2->d_inode);
	}

	mutex_lock(&c1->d_sb->s_vfs_rename_mutex);
	/*
	 * nobody can move out of any directories on this fs.
*/ if (likely(c1->d_parent != p2)) return lock_two_directories(c1->d_parent, p2); /* * c1 got moved into p2 while we were taking locks; * we need p2 locked and ->s_vfs_rename_mutex unlocked, * for consistency with lock_rename(). */ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); mutex_unlock(&c1->d_sb->s_vfs_rename_mutex); return NULL; } EXPORT_SYMBOL(lock_rename_child); void unlock_rename(struct dentry *p1, struct dentry *p2) { inode_unlock(p1->d_inode); if (p1 != p2) { inode_unlock(p2->d_inode); mutex_unlock(&p1->d_sb->s_vfs_rename_mutex); } } EXPORT_SYMBOL(unlock_rename); /** * vfs_prepare_mode - prepare the mode to be used for a new inode * @idmap: idmap of the mount the inode was found from * @dir: parent directory of the new inode * @mode: mode of the new inode * @mask_perms: allowed permission by the vfs * @type: type of file to be created * * This helper consolidates and enforces vfs restrictions on the @mode of a new * object to be created. * * Umask stripping depends on whether the filesystem supports POSIX ACLs (see * the kernel documentation for mode_strip_umask()). Moving umask stripping * after setgid stripping allows the same ordering for both non-POSIX ACL and * POSIX ACL supporting filesystems. * * Note that it's currently valid for @type to be 0 if a directory is created. * Filesystems raise that flag individually and we need to check whether each * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a * non-zero type. * * Returns: mode to be passed to the filesystem */ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap, const struct inode *dir, umode_t mode, umode_t mask_perms, umode_t type) { mode = mode_strip_sgid(idmap, dir, mode); mode = mode_strip_umask(dir, mode); /* * Apply the vfs mandated allowed permission mask and set the type of * file to be created before we call into the filesystem. */ mode &= (mask_perms & ~S_IFMT); mode |= (type & S_IFMT); return mode; } /** * vfs_create - create new file * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child file * @mode: mode of the child file * @want_excl: whether the file must not yet exist * * Create a new file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool want_excl) { int error; error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->create) return -EACCES; /* shouldn't it be ENOSYS? 
*/ mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG); error = security_inode_create(dir, dentry, mode); if (error) return error; error = dir->i_op->create(idmap, dir, dentry, mode, want_excl); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_create); int vfs_mkobj(struct dentry *dentry, umode_t mode, int (*f)(struct dentry *, umode_t, void *), void *arg) { struct inode *dir = dentry->d_parent->d_inode; int error = may_create(&nop_mnt_idmap, dir, dentry); if (error) return error; mode &= S_IALLUGO; mode |= S_IFREG; error = security_inode_create(dir, dentry, mode); if (error) return error; error = f(dentry, mode, arg); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mkobj); bool may_open_dev(const struct path *path) { return !(path->mnt->mnt_flags & MNT_NODEV) && !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } static int may_open(struct mnt_idmap *idmap, const struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; int error; if (!inode) return -ENOENT; switch (inode->i_mode & S_IFMT) { case S_IFLNK: return -ELOOP; case S_IFDIR: if (acc_mode & MAY_WRITE) return -EISDIR; if (acc_mode & MAY_EXEC) return -EACCES; break; case S_IFBLK: case S_IFCHR: if (!may_open_dev(path)) return -EACCES; fallthrough; case S_IFIFO: case S_IFSOCK: if (acc_mode & MAY_EXEC) return -EACCES; flag &= ~O_TRUNC; break; case S_IFREG: if ((acc_mode & MAY_EXEC) && path_noexec(path)) return -EACCES; break; default: VFS_BUG_ON_INODE(!IS_ANON_FILE(inode), inode); } error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); if (error) return error; /* * An append-only file must be opened in append mode for writing. */ if (IS_APPEND(inode)) { if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) return -EPERM; if (flag & O_TRUNC) return -EPERM; } /* O_NOATIME can only be set by the owner or superuser */ if (flag & O_NOATIME && !inode_owner_or_capable(idmap, inode)) return -EPERM; return 0; } static int handle_truncate(struct mnt_idmap *idmap, struct file *filp) { const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; int error = get_write_access(inode); if (error) return error; error = security_file_truncate(filp); if (!error) { error = do_truncate(idmap, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, filp); } put_write_access(inode); return error; } static inline int open_to_namei_flags(int flag) { if ((flag & O_ACCMODE) == 3) flag--; return flag; } static int may_o_create(struct mnt_idmap *idmap, const struct path *dir, struct dentry *dentry, umode_t mode) { int error = security_path_mknod(dir, dentry, mode, 0); if (error) return error; if (!fsuidgid_has_mapping(dir->dentry->d_sb, idmap)) return -EOVERFLOW; error = inode_permission(idmap, dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); if (error) return error; return security_inode_create(dir->dentry->d_inode, dentry, mode); } /* * Attempt to atomically look up, create and open a file from a negative * dentry. * * Returns 0 if successful. The file will have been created and attached to * @file by the filesystem calling finish_open(). * * If the file was looked up only or didn't need creating, FMODE_OPENED won't * be set. The caller will need to perform the open themselves. @path will * have been updated to point to the new dentry. This may be negative. * * Returns an error code otherwise. 
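 *
 * (Editorial note: ->atomic_open() is provided by filesystems where a
 * separate lookup + create + open would cost extra round trips - e.g.
 * NFS and FUSE - so the single call below can do all three steps.)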
 */
static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry,
				  struct file *file,
				  int open_flag, umode_t mode)
{
	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
	struct inode *dir = nd->path.dentry->d_inode;
	int error;

	if (nd->flags & LOOKUP_DIRECTORY)
		open_flag |= O_DIRECTORY;

	file->f_path.dentry = DENTRY_NOT_SET;
	file->f_path.mnt = nd->path.mnt;
	error = dir->i_op->atomic_open(dir, dentry, file,
				       open_to_namei_flags(open_flag), mode);
	d_lookup_done(dentry);
	if (!error) {
		if (file->f_mode & FMODE_OPENED) {
			if (unlikely(dentry != file->f_path.dentry)) {
				dput(dentry);
				dentry = dget(file->f_path.dentry);
			}
		} else if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
			error = -EIO;
		} else {
			if (file->f_path.dentry) {
				dput(dentry);
				dentry = file->f_path.dentry;
			}
			if (unlikely(d_is_negative(dentry)))
				error = -ENOENT;
		}
	}
	if (error) {
		dput(dentry);
		dentry = ERR_PTR(error);
	}
	return dentry;
}

/*
 * Look up and maybe create and open the last component.
 *
 * Must be called with parent locked (exclusive in O_CREAT case).
 *
 * Returns the looked-up dentry on success, that is, if
 *  the file was successfully atomically created (if necessary) and opened, or
 *  the file was not completely opened at this time, though lookups and
 *  creations were performed.
 * These cases are distinguished by presence of FMODE_OPENED on file->f_mode.
 * In the latter case the returned dentry may be negative if O_CREAT
 * hadn't been specified.
 *
 * An error code is returned on failure.
 */
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
				  const struct open_flags *op,
				  bool got_write)
{
	struct mnt_idmap *idmap;
	struct dentry *dir = nd->path.dentry;
	struct inode *dir_inode = dir->d_inode;
	int open_flag = op->open_flag;
	struct dentry *dentry;
	int error, create_error = 0;
	umode_t mode = op->mode;
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

	if (unlikely(IS_DEADDIR(dir_inode)))
		return ERR_PTR(-ENOENT);

	file->f_mode &= ~FMODE_CREATED;
	dentry = d_lookup(dir, &nd->last);
	for (;;) {
		if (!dentry) {
			dentry = d_alloc_parallel(dir, &nd->last, &wq);
			if (IS_ERR(dentry))
				return dentry;
		}
		if (d_in_lookup(dentry))
			break;

		error = d_revalidate(dir_inode, &nd->last, dentry, nd->flags);
		if (likely(error > 0))
			break;
		if (error)
			goto out_dput;
		d_invalidate(dentry);
		dput(dentry);
		dentry = NULL;
	}
	if (dentry->d_inode) {
		/* Cached positive dentry: will open in f_op->open */
		return dentry;
	}

	if (open_flag & O_CREAT)
		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);

	/*
	 * Checking write permission is tricky, because we don't know if we are
	 * going to actually need it: O_CREAT opens should work as long as the
	 * file exists. But checking existence breaks atomicity. The trick is
	 * to check access and if not granted clear O_CREAT from the flags.
	 *
	 * Another problem is returning the "right" error value (e.g. for an
	 * O_EXCL open we want to return EEXIST not EROFS).
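	 *
	 * (Editorial example: open("f", O_CREAT | O_RDONLY) on a read-only
	 * mount must still succeed when "f" already exists; only if the
	 * file actually has to be created is the saved create_error
	 * (-EROFS) reported, while an O_EXCL open of an existing file
	 * keeps reporting -EEXIST rather than -EROFS.)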
*/ if (unlikely(!got_write)) open_flag &= ~O_TRUNC; idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if (open_flag & O_EXCL) open_flag &= ~O_TRUNC; mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode); if (likely(got_write)) create_error = may_o_create(idmap, &nd->path, dentry, mode); else create_error = -EROFS; } if (create_error) open_flag &= ~O_CREAT; if (dir_inode->i_op->atomic_open) { dentry = atomic_open(nd, dentry, file, open_flag, mode); if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT)) dentry = ERR_PTR(create_error); return dentry; } if (d_in_lookup(dentry)) { struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry, nd->flags); d_lookup_done(dentry); if (unlikely(res)) { if (IS_ERR(res)) { error = PTR_ERR(res); goto out_dput; } dput(dentry); dentry = res; } } /* Negative dentry, just create the file */ if (!dentry->d_inode && (open_flag & O_CREAT)) { file->f_mode |= FMODE_CREATED; audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE); if (!dir_inode->i_op->create) { error = -EACCES; goto out_dput; } error = dir_inode->i_op->create(idmap, dir_inode, dentry, mode, open_flag & O_EXCL); if (error) goto out_dput; } if (unlikely(create_error) && !dentry->d_inode) { error = create_error; goto out_dput; } return dentry; out_dput: dput(dentry); return ERR_PTR(error); } static inline bool trailing_slashes(struct nameidata *nd) { return (bool)nd->last.name[nd->last.len]; } static struct dentry *lookup_fast_for_open(struct nameidata *nd, int open_flag) { struct dentry *dentry; if (open_flag & O_CREAT) { if (trailing_slashes(nd)) return ERR_PTR(-EISDIR); /* Don't bother on an O_EXCL create */ if (open_flag & O_EXCL) return NULL; } if (trailing_slashes(nd)) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; dentry = lookup_fast(nd); if (IS_ERR_OR_NULL(dentry)) return dentry; if (open_flag & O_CREAT) { /* Discard negative dentries. Need inode_lock to do the create */ if (!dentry->d_inode) { if (!(nd->flags & LOOKUP_RCU)) dput(dentry); dentry = NULL; } } return dentry; } static const char *open_last_lookups(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool got_write = false; struct dentry *dentry; const char *res; nd->flags |= op->intent; if (nd->last_type != LAST_NORM) { if (nd->depth) put_link(nd); return handle_dots(nd, nd->last_type); } /* We _can_ be in RCU mode here */ dentry = lookup_fast_for_open(nd, open_flag); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (likely(dentry)) goto finish_lookup; if (!(open_flag & O_CREAT)) { if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU)) return ERR_PTR(-ECHILD); } else { if (nd->flags & LOOKUP_RCU) { if (!try_to_unlazy(nd)) return ERR_PTR(-ECHILD); } } if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { got_write = !mnt_want_write(nd->path.mnt); /* * do _not_ fail yet - we might not need that or fail with * a different error; let lookup_open() decide; we'll be * dropping this one anyway. 
*/ } if (open_flag & O_CREAT) inode_lock(dir->d_inode); else inode_lock_shared(dir->d_inode); dentry = lookup_open(nd, file, op, got_write); if (!IS_ERR(dentry)) { if (file->f_mode & FMODE_CREATED) fsnotify_create(dir->d_inode, dentry); if (file->f_mode & FMODE_OPENED) fsnotify_open(file); } if (open_flag & O_CREAT) inode_unlock(dir->d_inode); else inode_unlock_shared(dir->d_inode); if (got_write) mnt_drop_write(nd->path.mnt); if (IS_ERR(dentry)) return ERR_CAST(dentry); if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) { dput(nd->path.dentry); nd->path.dentry = dentry; return NULL; } finish_lookup: if (nd->depth) put_link(nd); res = step_into(nd, WALK_TRAILING, dentry); if (unlikely(res)) nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); return res; } /* * Handle the last step of open() */ static int do_open(struct nameidata *nd, struct file *file, const struct open_flags *op) { struct mnt_idmap *idmap; int open_flag = op->open_flag; bool do_truncate; int acc_mode; int error; if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) { error = complete_walk(nd); if (error) return error; } if (!(file->f_mode & FMODE_CREATED)) audit_inode(nd->name, nd->path.dentry, 0); idmap = mnt_idmap(nd->path.mnt); if (open_flag & O_CREAT) { if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED)) return -EEXIST; if (d_is_dir(nd->path.dentry)) return -EISDIR; error = may_create_in_sticky(idmap, nd, d_backing_inode(nd->path.dentry)); if (unlikely(error)) return error; } if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) return -ENOTDIR; do_truncate = false; acc_mode = op->acc_mode; if (file->f_mode & FMODE_CREATED) { /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; acc_mode = 0; } else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) { error = mnt_want_write(nd->path.mnt); if (error) return error; do_truncate = true; } error = may_open(idmap, &nd->path, acc_mode, open_flag); if (!error && !(file->f_mode & FMODE_OPENED)) error = vfs_open(&nd->path, file); if (!error) error = security_file_post_open(file, op->acc_mode); if (!error && do_truncate) error = handle_truncate(idmap, file); if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; } if (do_truncate) mnt_drop_write(nd->path.mnt); return error; } /** * vfs_tmpfile - create tmpfile * @idmap: idmap of the mount the inode was found from * @parentpath: pointer to the path of the base directory * @file: file descriptor of the new tmpfile * @mode: mode of the new tmpfile * * Create a temporary file. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. 
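 *
 * (Editorial sketch of a caller, cf. kernel_tmpfile_open() below and its
 * use by overlayfs:
 *
 *	file = kernel_tmpfile_open(idmap, &workdir, S_IFREG | 0600,
 *				   O_RDWR, current_cred());
 *
 * "workdir" is a hypothetical struct path; unless O_EXCL is set, the
 * resulting inode is marked I_LINKABLE so it can later be given a name
 * with linkat(..., AT_EMPTY_PATH).)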
*/ int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, struct file *file, umode_t mode) { struct dentry *child; struct inode *dir = d_inode(parentpath->dentry); struct inode *inode; int error; int open_flag = file->f_flags; /* we want directory to be writable */ error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (!dir->i_op->tmpfile) return -EOPNOTSUPP; child = d_alloc(parentpath->dentry, &slash_name); if (unlikely(!child)) return -ENOMEM; file->f_path.mnt = parentpath->mnt; file->f_path.dentry = child; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = dir->i_op->tmpfile(idmap, dir, file, mode); dput(child); if (file->f_mode & FMODE_OPENED) fsnotify_open(file); if (error) return error; /* Don't check for other permissions, the inode was just created */ error = may_open(idmap, &file->f_path, 0, file->f_flags); if (error) return error; inode = file_inode(file); if (!(open_flag & O_EXCL)) { spin_lock(&inode->i_lock); inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } security_inode_post_create_tmpfile(idmap, inode); return 0; } /** * kernel_tmpfile_open - open a tmpfile for kernel internal use * @idmap: idmap of the mount the inode was found from * @parentpath: path of the base directory * @mode: mode of the new tmpfile * @open_flag: flags * @cred: credentials for open * * Create and open a temporary file. The file is not accounted in nr_files, * hence this is only for kernel internal use, and must not be installed into * file tables or such. */ struct file *kernel_tmpfile_open(struct mnt_idmap *idmap, const struct path *parentpath, umode_t mode, int open_flag, const struct cred *cred) { struct file *file; int error; file = alloc_empty_file_noaccount(open_flag, cred); if (IS_ERR(file)) return file; error = vfs_tmpfile(idmap, parentpath, file, mode); if (error) { fput(file); file = ERR_PTR(error); } return file; } EXPORT_SYMBOL(kernel_tmpfile_open); static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file) { struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); if (unlikely(error)) return error; error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; error = vfs_tmpfile(mnt_idmap(path.mnt), &path, file, op->mode); if (error) goto out2; audit_inode(nd->name, file->f_path.dentry, 0); out2: mnt_drop_write(path.mnt); out: path_put(&path); return error; } static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file) { struct path path; int error = path_lookupat(nd, flags, &path); if (!error) { audit_inode(nd->name, path.dentry, 0); error = vfs_open(&path, file); path_put(&path); } return error; } static struct file *path_openat(struct nameidata *nd, const struct open_flags *op, unsigned flags) { struct file *file; int error; file = alloc_empty_file(op->open_flag, current_cred()); if (IS_ERR(file)) return file; if (unlikely(file->f_flags & __O_TMPFILE)) { error = do_tmpfile(nd, flags, op, file); } else if (unlikely(file->f_flags & O_PATH)) { error = do_o_path(nd, flags, file); } else { const char *s = path_init(nd, flags); while (!(error = link_path_walk(s, nd)) && (s = open_last_lookups(nd, file, op)) != NULL) ; if (!error) error = do_open(nd, file, op); terminate_walk(nd); } if (likely(!error)) { if (likely(file->f_mode & FMODE_OPENED)) return file; WARN_ON(1); error = -EINVAL; } fput_close(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; else error = -ESTALE; } return ERR_PTR(error); } 
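/*
 * (Editorial note, not in the original source: the callers below run
 * path_openat() in up to three passes - LOOKUP_RCU first, a ref-walk
 * retry on -ECHILD, then a LOOKUP_REVAL pass on -ESTALE - the same
 * fallback ladder used by filename_lookup() and __filename_parentat()
 * above.)
 */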
struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op) { struct nameidata nd; int flags = op->lookup_flags; struct file *filp; set_nameidata(&nd, dfd, pathname, NULL); filp = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(filp == ERR_PTR(-ECHILD))) filp = path_openat(&nd, op, flags); if (unlikely(filp == ERR_PTR(-ESTALE))) filp = path_openat(&nd, op, flags | LOOKUP_REVAL); restore_nameidata(); return filp; } struct file *do_file_open_root(const struct path *root, const char *name, const struct open_flags *op) { struct nameidata nd; struct file *file; struct filename *filename; int flags = op->lookup_flags; if (d_is_symlink(root->dentry) && op->intent & LOOKUP_OPEN) return ERR_PTR(-ELOOP); filename = getname_kernel(name); if (IS_ERR(filename)) return ERR_CAST(filename); set_nameidata(&nd, -1, filename, root); file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) file = path_openat(&nd, op, flags); if (unlikely(file == ERR_PTR(-ESTALE))) file = path_openat(&nd, op, flags | LOOKUP_REVAL); restore_nameidata(); putname(filename); return file; } static struct dentry *filename_create(int dfd, struct filename *name, struct path *path, unsigned int lookup_flags) { struct dentry *dentry = ERR_PTR(-EEXIST); struct qstr last; bool want_dir = lookup_flags & LOOKUP_DIRECTORY; unsigned int reval_flag = lookup_flags & LOOKUP_REVAL; unsigned int create_flags = LOOKUP_CREATE | LOOKUP_EXCL; int type; int err2; int error; error = filename_parentat(dfd, name, reval_flag, path, &last, &type); if (error) return ERR_PTR(error); /* * Yucky last component or no last component at all? * (foo/., foo/.., /////) */ if (unlikely(type != LAST_NORM)) goto out; /* don't fail immediately if it's r/o, at least try to report other errors */ err2 = mnt_want_write(path->mnt); /* * Do the final lookup. Suppress 'create' if there is a trailing * '/', and a directory wasn't requested. */ if (last.name[last.len] && !want_dir) create_flags &= ~LOOKUP_CREATE; inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path->dentry, reval_flag | create_flags); if (IS_ERR(dentry)) goto unlock; if (unlikely(err2)) { error = err2; goto fail; } return dentry; fail: dput(dentry); dentry = ERR_PTR(error); unlock: inode_unlock(path->dentry->d_inode); if (!err2) mnt_drop_write(path->mnt); out: path_put(path); return dentry; } struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, unsigned int lookup_flags) { struct filename *filename = getname_kernel(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); putname(filename); return res; } EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) { if (!IS_ERR(dentry)) dput(dentry); inode_unlock(path->dentry->d_inode); mnt_drop_write(path->mnt); path_put(path); } EXPORT_SYMBOL(done_path_create); inline struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, unsigned int lookup_flags) { struct filename *filename = getname(pathname); struct dentry *res = filename_create(dfd, filename, path, lookup_flags); putname(filename); return res; } EXPORT_SYMBOL(user_path_create); /** * vfs_mknod - create device node or file * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child device node * @mode: mode of the child device node * @dev: device number of device to create * * Create a device node or file. 
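 *
 * (Editorial example: overlayfs-style whiteouts are created as
 *
 *	vfs_mknod(idmap, dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
 *
 * which is the one character-device case permitted without CAP_MKNOD,
 * as checked below.)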
* * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; int error = may_create(idmap, dir, dentry); if (error) return error; if ((S_ISCHR(mode) || S_ISBLK(mode)) && !is_whiteout && !capable(CAP_MKNOD)) return -EPERM; if (!dir->i_op->mknod) return -EPERM; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); error = devcgroup_inode_mknod(mode, dev); if (error) return error; error = security_inode_mknod(dir, dentry, mode, dev); if (error) return error; error = dir->i_op->mknod(idmap, dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_mknod); static int may_mknod(umode_t mode) { switch (mode & S_IFMT) { case S_IFREG: case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: case 0: /* zero mode translates to S_IFREG */ return 0; case S_IFDIR: return -EPERM; default: return -EINVAL; } } static int do_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev) { struct mnt_idmap *idmap; struct dentry *dentry; struct path path; int error; unsigned int lookup_flags = 0; error = may_mknod(mode); if (error) goto out1; retry: dentry = filename_create(dfd, name, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out1; error = security_path_mknod(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode), dev); if (error) goto out2; idmap = mnt_idmap(path.mnt); switch (mode & S_IFMT) { case 0: case S_IFREG: error = vfs_create(idmap, path.dentry->d_inode, dentry, mode, true); if (!error) security_path_post_mknod(idmap, dentry); break; case S_IFCHR: case S_IFBLK: error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, new_decode_dev(dev)); break; case S_IFIFO: case S_IFSOCK: error = vfs_mknod(idmap, path.dentry->d_inode, dentry, mode, 0); break; } out2: done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out1: putname(name); return error; } SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, unsigned int, dev) { return do_mknodat(dfd, getname(filename), mode, dev); } SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev) { return do_mknodat(AT_FDCWD, getname(filename), mode, dev); } /** * vfs_mkdir - create directory returning correct dentry if possible * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child directory * @mode: mode of the child directory * * Create a directory. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. * * In the event that the filesystem does not use the *@dentry but leaves it * negative or unhashes it and possibly splices a different one returning it, * the original dentry is dput() and the alternate is returned. 
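 *
 * (Editorial sketch of the resulting caller pattern, cf. do_mkdirat()
 * below:
 *
 *	dentry = vfs_mkdir(idmap, dir, dentry, mode);
 *	if (IS_ERR(dentry))
 *		return PTR_ERR(dentry);
 *	...
 *
 * i.e. always continue with the returned dentry, never with the one
 * passed in.)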
* * In case of an error the dentry is dput() and an ERR_PTR() is returned. */ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { int error; unsigned max_links = dir->i_sb->s_max_links; struct dentry *de; error = may_create(idmap, dir, dentry); if (error) goto err; error = -EPERM; if (!dir->i_op->mkdir) goto err; mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0); error = security_inode_mkdir(dir, dentry, mode); if (error) goto err; error = -EMLINK; if (max_links && dir->i_nlink >= max_links) goto err; de = dir->i_op->mkdir(idmap, dir, dentry, mode); error = PTR_ERR(de); if (IS_ERR(de)) goto err; if (de) { dput(dentry); dentry = de; } fsnotify_mkdir(dir, dentry); return dentry; err: dput(dentry); return ERR_PTR(error); } EXPORT_SYMBOL(vfs_mkdir); int do_mkdirat(int dfd, struct filename *name, umode_t mode) { struct dentry *dentry; struct path path; int error; unsigned int lookup_flags = LOOKUP_DIRECTORY; retry: dentry = filename_create(dfd, name, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_putname; error = security_path_mkdir(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode)); if (!error) { dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode); if (IS_ERR(dentry)) error = PTR_ERR(dentry); } done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out_putname: putname(name); return error; } SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) { return do_mkdirat(dfd, getname(pathname), mode); } SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) { return do_mkdirat(AT_FDCWD, getname(pathname), mode); } /** * vfs_rmdir - remove directory * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child directory * * Remove a directory. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. 
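 *
 * The caller is expected to hold the parent's i_rwsem and to have looked the
 * victim up under it; a condensed sketch of that pattern (cf. do_rmdir()
 * below, with parent/last/idmap as placeholders):
 *
 *	inode_lock_nested(parent->d_inode, I_MUTEX_PARENT);
 *	dentry = lookup_one_qstr_excl(&last, parent, 0);
 *	if (!IS_ERR(dentry)) {
 *		error = vfs_rmdir(idmap, parent->d_inode, dentry);
 *		dput(dentry);
 *	}
 *	inode_unlock(parent->d_inode);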
*/ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry) { int error = may_delete(idmap, dir, dentry, 1); if (error) return error; if (!dir->i_op->rmdir) return -EPERM; dget(dentry); inode_lock(dentry->d_inode); error = -EBUSY; if (is_local_mountpoint(dentry) || (dentry->d_inode->i_flags & S_KERNEL_FILE)) goto out; error = security_inode_rmdir(dir, dentry); if (error) goto out; error = dir->i_op->rmdir(dir, dentry); if (error) goto out; shrink_dcache_parent(dentry); dentry->d_inode->i_flags |= S_DEAD; dont_mount(dentry); detach_mounts(dentry); out: inode_unlock(dentry->d_inode); dput(dentry); if (!error) d_delete_notify(dir, dentry); return error; } EXPORT_SYMBOL(vfs_rmdir); int do_rmdir(int dfd, struct filename *name) { int error; struct dentry *dentry; struct path path; struct qstr last; int type; unsigned int lookup_flags = 0; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) goto exit1; switch (type) { case LAST_DOTDOT: error = -ENOTEMPTY; goto exit2; case LAST_DOT: error = -EINVAL; goto exit2; case LAST_ROOT: error = -EBUSY; goto exit2; } error = mnt_want_write(path.mnt); if (error) goto exit2; inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit3; error = security_path_rmdir(&path, dentry); if (error) goto exit4; error = vfs_rmdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry); exit4: dput(dentry); exit3: inode_unlock(path.dentry->d_inode); mnt_drop_write(path.mnt); exit2: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } exit1: putname(name); return error; } SYSCALL_DEFINE1(rmdir, const char __user *, pathname) { return do_rmdir(AT_FDCWD, getname(pathname)); } /** * vfs_unlink - unlink a filesystem object * @idmap: idmap of the mount the inode was found from * @dir: parent directory * @dentry: victim * @delegated_inode: returns victim inode, if the inode is delegated. * * The caller must hold dir->i_rwsem exclusively. * * If vfs_unlink discovers a delegation, it will return -EWOULDBLOCK and * return a reference to the inode in delegated_inode. The caller * should then break the delegation on that inode and retry. Because * breaking a delegation may take a long time, the caller should drop * dir->i_rwsem before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. 
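 *
 * A condensed sketch of the delegation-aware retry loop described above
 * (cf. do_unlinkat() below; locking and lookup are elided):
 *
 *	struct inode *delegated_inode = NULL;
 * retry:
 *	... lock the parent, look up the victim ...
 *	error = vfs_unlink(idmap, dir, dentry, &delegated_inode);
 *	... unlock the parent ...
 *	if (delegated_inode) {
 *		error = break_deleg_wait(&delegated_inode);
 *		if (!error)
 *			goto retry;
 *	}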
*/ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) { struct inode *target = dentry->d_inode; int error = may_delete(idmap, dir, dentry, 0); if (error) return error; if (!dir->i_op->unlink) return -EPERM; inode_lock(target); if (IS_SWAPFILE(target)) error = -EPERM; else if (is_local_mountpoint(dentry)) error = -EBUSY; else { error = security_inode_unlink(dir, dentry); if (!error) { error = try_break_deleg(target, delegated_inode); if (error) goto out; error = dir->i_op->unlink(dir, dentry); if (!error) { dont_mount(dentry); detach_mounts(dentry); } } } out: inode_unlock(target); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ if (!error && dentry->d_flags & DCACHE_NFSFS_RENAMED) { fsnotify_unlink(dir, dentry); } else if (!error) { fsnotify_link_count(target); d_delete_notify(dir, dentry); } return error; } EXPORT_SYMBOL(vfs_unlink); /* * Make sure that the actual truncation of the file will occur outside its * directory's i_rwsem. Truncate can take a long time if there is a lot of * writeout happening, and we don't want to prevent access to the directory * while waiting on the I/O. */ int do_unlinkat(int dfd, struct filename *name) { int error; struct dentry *dentry; struct path path; struct qstr last; int type; struct inode *inode = NULL; struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0; retry: error = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (error) goto exit1; error = -EISDIR; if (type != LAST_NORM) goto exit2; error = mnt_want_write(path.mnt); if (error) goto exit2; retry_deleg: inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ if (last.name[last.len]) goto slashes; inode = dentry->d_inode; ihold(inode); error = security_path_unlink(&path, dentry); if (error) goto exit3; error = vfs_unlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, &delegated_inode); exit3: dput(dentry); } inode_unlock(path.dentry->d_inode); if (inode) iput(inode); /* truncate the inode here */ inode = NULL; if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(path.mnt); exit2: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; inode = NULL; goto retry; } exit1: putname(name); return error; slashes: if (d_is_dir(dentry)) error = -EISDIR; else error = -ENOTDIR; goto exit3; } SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) { if ((flag & ~AT_REMOVEDIR) != 0) return -EINVAL; if (flag & AT_REMOVEDIR) return do_rmdir(dfd, getname(pathname)); return do_unlinkat(dfd, getname(pathname)); } SYSCALL_DEFINE1(unlink, const char __user *, pathname) { return do_unlinkat(AT_FDCWD, getname(pathname)); } /** * vfs_symlink - create symlink * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child symlink file * @oldname: name of the file to link to * * Create a symlink. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. 
*/ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *oldname) { int error; error = may_create(idmap, dir, dentry); if (error) return error; if (!dir->i_op->symlink) return -EPERM; error = security_inode_symlink(dir, dentry, oldname); if (error) return error; error = dir->i_op->symlink(idmap, dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); return error; } EXPORT_SYMBOL(vfs_symlink); int do_symlinkat(struct filename *from, int newdfd, struct filename *to) { int error; struct dentry *dentry; struct path path; unsigned int lookup_flags = 0; if (IS_ERR(from)) { error = PTR_ERR(from); goto out_putnames; } retry: dentry = filename_create(newdfd, to, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_putnames; error = security_path_symlink(&path, dentry, from->name); if (!error) error = vfs_symlink(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, from->name); done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } out_putnames: putname(to); putname(from); return error; } SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, int, newdfd, const char __user *, newname) { return do_symlinkat(getname(oldname), newdfd, getname(newname)); } SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname) { return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname)); } /** * vfs_link - create a new link * @old_dentry: object to be linked * @idmap: idmap of the mount * @dir: new parent * @new_dentry: where to create the new link * @delegated_inode: returns inode needing a delegation break * * The caller must hold dir->i_rwsem exclusively. * * If vfs_link discovers a delegation on the to-be-linked file in need * of breaking, it will return -EWOULDBLOCK and return a reference to the * inode in delegated_inode. The caller should then break the delegation * and retry. Because breaking a delegation may take a long time, the * caller should drop the i_rwsem before doing so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * If the inode has been found through an idmapped mount the idmap of * the vfsmount must be passed through @idmap. This function will then take * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. */ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; int error; if (!inode) return -ENOENT; error = may_create(idmap, dir, new_dentry); if (error) return error; if (dir->i_sb != inode->i_sb) return -EXDEV; /* * A link to an append-only or immutable file cannot be created. */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; /* * Updating the link count will likely cause i_uid and i_gid to * be writen back improperly if their true value is unknown to * the vfs. 
*/ if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; if (!dir->i_op->link) return -EPERM; if (S_ISDIR(inode->i_mode)) return -EPERM; error = security_inode_link(old_dentry, dir, new_dentry); if (error) return error; inode_lock(inode); /* Make sure we don't allow creating hardlink to an unlinked file */ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) error = -ENOENT; else if (max_links && inode->i_nlink >= max_links) error = -EMLINK; else { error = try_break_deleg(inode, delegated_inode); if (!error) error = dir->i_op->link(old_dentry, dir, new_dentry); } if (!error && (inode->i_state & I_LINKABLE)) { spin_lock(&inode->i_lock); inode->i_state &= ~I_LINKABLE; spin_unlock(&inode->i_lock); } inode_unlock(inode); if (!error) fsnotify_link(dir, inode, new_dentry); return error; } EXPORT_SYMBOL(vfs_link); /* * Hardlinks are often used in delicate situations. We avoid * security-related surprises by not following symlinks on the * newname. --KAB * * We don't follow them on the oldname either to be compatible * with linux 2.0, and to avoid hard-linking to directories * and other special files. --ADM */ int do_linkat(int olddfd, struct filename *old, int newdfd, struct filename *new, int flags) { struct mnt_idmap *idmap; struct dentry *new_dentry; struct path old_path, new_path; struct inode *delegated_inode = NULL; int how = 0; int error; if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) { error = -EINVAL; goto out_putnames; } /* * To use null names we require CAP_DAC_READ_SEARCH or * that the open-time creds of the dfd matches current. * This ensures that not everyone will be able to create * a hardlink using the passed file descriptor. */ if (flags & AT_EMPTY_PATH) how |= LOOKUP_LINKAT_EMPTY; if (flags & AT_SYMLINK_FOLLOW) how |= LOOKUP_FOLLOW; retry: error = filename_lookup(olddfd, old, how, &old_path, NULL); if (error) goto out_putnames; new_dentry = filename_create(newdfd, new, &new_path, (how & LOOKUP_REVAL)); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto out_putpath; error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; idmap = mnt_idmap(new_path.mnt); error = may_linkat(idmap, &old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; error = vfs_link(old_path.dentry, idmap, new_path.dentry->d_inode, new_dentry, &delegated_inode); out_dput: done_path_create(&new_path, new_dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) { path_put(&old_path); goto retry; } } if (retry_estale(error, how)) { path_put(&old_path); how |= LOOKUP_REVAL; goto retry; } out_putpath: path_put(&old_path); out_putnames: putname(old); putname(new); return error; } SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, int, flags) { return do_linkat(olddfd, getname_uflags(oldname, flags), newdfd, getname(newname), flags); } SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) { return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0); } /** * vfs_rename - rename a filesystem object * @rd: pointer to &struct renamedata info * * The caller must hold multiple mutexes--see lock_rename()). * * If vfs_rename discovers a delegation in need of breaking at either * the source or destination, it will return -EWOULDBLOCK and return a * reference to the inode in delegated_inode. The caller should then * break the delegation and retry. 
Because breaking a delegation may * take a long time, the caller should drop all locks before doing * so. * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. * * The worst of all namespace operations - renaming directory. "Perverted" * doesn't even start to describe it. Somebody in UCB had a heck of a trip... * Problems: * * a) we can get into loop creation. * b) race potential - two innocent renames can create a loop together. * That's where 4.4BSD screws up. Current fix: serialization on * sb->s_vfs_rename_mutex. We might be more accurate, but that's another * story. * c) we may have to lock up to _four_ objects - parents and victim (if it exists), * and source (if it's a non-directory or a subdirectory that moves to * different parent). * And that - after we got ->i_rwsem on parents (until then we don't know * whether the target exists). Solution: try to be smart with locking * order for inodes. We rely on the fact that tree topology may change * only under ->s_vfs_rename_mutex _and_ that parent of the object we * move will be locked. Thus we can rank directories by the tree * (ancestors first) and rank all non-directories after them. * That works since everybody except rename does "lock parent, lookup, * lock child" and rename is under ->s_vfs_rename_mutex. * HOWEVER, it relies on the assumption that any object with ->lookup() * has no more than 1 dentry. If "hybrid" objects will ever appear, * we'd better make sure that there's no link(2) for them. * d) conversion from fhandle to dentry may come in the wrong moment - when * we are removing the target. Solution: we will have to grab ->i_rwsem * in the fhandle_to_dentry code. [FIXME - current nfsfh.c relies on * ->i_rwsem on parents, which works but leads to some truly excessive * locking]. */ int vfs_rename(struct renamedata *rd) { int error; struct inode *old_dir = d_inode(rd->old_parent); struct inode *new_dir = d_inode(rd->new_parent); struct dentry *old_dentry = rd->old_dentry; struct dentry *new_dentry = rd->new_dentry; struct inode **delegated_inode = rd->delegated_inode; unsigned int flags = rd->flags; bool is_dir = d_is_dir(old_dentry); struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; struct name_snapshot old_name; bool lock_old_subdir, lock_new_subdir; if (source == target) return 0; error = may_delete(rd->old_mnt_idmap, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { error = may_create(rd->new_mnt_idmap, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) error = may_delete(rd->new_mnt_idmap, new_dir, new_dentry, is_dir); else error = may_delete(rd->new_mnt_idmap, new_dir, new_dentry, new_is_dir); } if (error) return error; if (!old_dir->i_op->rename) return -EPERM; /* * If we are going to change the parent - check write permissions, * we'll need to flip '..'. */ if (new_dir != old_dir) { if (is_dir) { error = inode_permission(rd->old_mnt_idmap, source, MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { error = inode_permission(rd->new_mnt_idmap, target, MAY_WRITE); if (error) return error; } } error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry, flags); if (error) return error; take_dentry_name_snapshot(&old_name, old_dentry); dget(new_dentry); /* * Lock children. 
* The source subdirectory needs to be locked on cross-directory * rename or cross-directory exchange since its parent changes. * The target subdirectory needs to be locked on cross-directory * exchange due to parent change and on any rename due to becoming * a victim. * Non-directories need locking in all cases (for NFS reasons); * they get locked after any subdirectories (in inode address order). * * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE. * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex. */ lock_old_subdir = new_dir != old_dir; lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE); if (is_dir) { if (lock_old_subdir) inode_lock_nested(source, I_MUTEX_CHILD); if (target && (!new_is_dir || lock_new_subdir)) inode_lock(target); } else if (new_is_dir) { if (lock_new_subdir) inode_lock_nested(target, I_MUTEX_CHILD); inode_lock(source); } else { lock_two_nondirectories(source, target); } error = -EPERM; if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target))) goto out; error = -EBUSY; if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) goto out; if (max_links && new_dir != old_dir) { error = -EMLINK; if (is_dir && !new_is_dir && new_dir->i_nlink >= max_links) goto out; if ((flags & RENAME_EXCHANGE) && !is_dir && new_is_dir && old_dir->i_nlink >= max_links) goto out; } if (!is_dir) { error = try_break_deleg(source, delegated_inode); if (error) goto out; } if (target && !new_is_dir) { error = try_break_deleg(target, delegated_inode); if (error) goto out; } error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry, new_dir, new_dentry, flags); if (error) goto out; if (!(flags & RENAME_EXCHANGE) && target) { if (is_dir) { shrink_dcache_parent(new_dentry); target->i_flags |= S_DEAD; } dont_mount(new_dentry); detach_mounts(new_dentry); } if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) { if (!(flags & RENAME_EXCHANGE)) d_move(old_dentry, new_dentry); else d_exchange(old_dentry, new_dentry); } out: if (!is_dir || lock_old_subdir) inode_unlock(source); if (target && (!new_is_dir || lock_new_subdir)) inode_unlock(target); dput(new_dentry); if (!error) { fsnotify_move(old_dir, new_dir, &old_name.name, is_dir, !(flags & RENAME_EXCHANGE) ? 
target : NULL, old_dentry); if (flags & RENAME_EXCHANGE) { fsnotify_move(new_dir, old_dir, &old_dentry->d_name, new_is_dir, NULL, new_dentry); } } release_dentry_name_snapshot(&old_name); return error; } EXPORT_SYMBOL(vfs_rename); int do_renameat2(int olddfd, struct filename *from, int newdfd, struct filename *to, unsigned int flags) { struct renamedata rd; struct dentry *old_dentry, *new_dentry; struct dentry *trap; struct path old_path, new_path; struct qstr old_last, new_last; int old_type, new_type; struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; bool should_retry = false; int error = -EINVAL; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) goto put_names; if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && (flags & RENAME_EXCHANGE)) goto put_names; if (flags & RENAME_EXCHANGE) target_flags = 0; if (flags & RENAME_NOREPLACE) target_flags |= LOOKUP_EXCL; retry: error = filename_parentat(olddfd, from, lookup_flags, &old_path, &old_last, &old_type); if (error) goto put_names; error = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last, &new_type); if (error) goto exit1; error = -EXDEV; if (old_path.mnt != new_path.mnt) goto exit2; error = -EBUSY; if (old_type != LAST_NORM) goto exit2; if (flags & RENAME_NOREPLACE) error = -EEXIST; if (new_type != LAST_NORM) goto exit2; error = mnt_want_write(old_path.mnt); if (error) goto exit2; retry_deleg: trap = lock_rename(new_path.dentry, old_path.dentry); if (IS_ERR(trap)) { error = PTR_ERR(trap); goto exit_lock_rename; } old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry, lookup_flags); error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, lookup_flags | target_flags); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto exit4; if (flags & RENAME_EXCHANGE) { if (!d_is_dir(new_dentry)) { error = -ENOTDIR; if (new_last.name[new_last.len]) goto exit5; } } /* unless the source is a directory trailing slashes give -ENOTDIR */ if (!d_is_dir(old_dentry)) { error = -ENOTDIR; if (old_last.name[old_last.len]) goto exit5; if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len]) goto exit5; } /* source should not be ancestor of target */ error = -EINVAL; if (old_dentry == trap) goto exit5; /* target should not be an ancestor of source */ if (!(flags & RENAME_EXCHANGE)) error = -ENOTEMPTY; if (new_dentry == trap) goto exit5; error = security_path_rename(&old_path, old_dentry, &new_path, new_dentry, flags); if (error) goto exit5; rd.old_parent = old_path.dentry; rd.old_dentry = old_dentry; rd.old_mnt_idmap = mnt_idmap(old_path.mnt); rd.new_parent = new_path.dentry; rd.new_dentry = new_dentry; rd.new_mnt_idmap = mnt_idmap(new_path.mnt); rd.delegated_inode = &delegated_inode; rd.flags = flags; error = vfs_rename(&rd); exit5: dput(new_dentry); exit4: dput(old_dentry); exit3: unlock_rename(new_path.dentry, old_path.dentry); exit_lock_rename: if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } mnt_drop_write(old_path.mnt); exit2: if (retry_estale(error, lookup_flags)) should_retry = true; path_put(&new_path); exit1: path_put(&old_path); if (should_retry) { should_retry = false; lookup_flags |= LOOKUP_REVAL; goto retry; } put_names: putname(from); putname(to); return error; } SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, unsigned int, flags) { return 
do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), flags); } SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname) { return do_renameat2(olddfd, getname(oldname), newdfd, getname(newname), 0); } SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) { return do_renameat2(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0); } int readlink_copy(char __user *buffer, int buflen, const char *link, int linklen) { int copylen; copylen = linklen; if (unlikely(copylen > (unsigned) buflen)) copylen = buflen; if (copy_to_user(buffer, link, copylen)) copylen = -EFAULT; return copylen; } /** * vfs_readlink - copy symlink body into userspace buffer * @dentry: dentry on which to get symbolic link * @buffer: user memory pointer * @buflen: size of buffer * * Does not touch atime. That's up to the caller if necessary * * Does not call security hook. */ int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct inode *inode = d_inode(dentry); DEFINE_DELAYED_CALL(done); const char *link; int res; if (inode->i_opflags & IOP_CACHED_LINK) return readlink_copy(buffer, buflen, inode->i_link, inode->i_linklen); if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { if (unlikely(inode->i_op->readlink)) return inode->i_op->readlink(dentry, buffer, buflen); if (!d_is_symlink(dentry)) return -EINVAL; spin_lock(&inode->i_lock); inode->i_opflags |= IOP_DEFAULT_READLINK; spin_unlock(&inode->i_lock); } link = READ_ONCE(inode->i_link); if (!link) { link = inode->i_op->get_link(dentry, inode, &done); if (IS_ERR(link)) return PTR_ERR(link); } res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } EXPORT_SYMBOL(vfs_readlink); /** * vfs_get_link - get symlink body * @dentry: dentry on which to get symbolic link * @done: caller needs to free returned data with this * * Calls security hook and i_op->get_link() on the supplied inode. * * It does not touch atime. That's up to the caller if necessary. * * Does not work on "special" symlinks like /proc/$$/fd/N */ const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) { const char *res = ERR_PTR(-EINVAL); struct inode *inode = d_inode(dentry); if (d_is_symlink(dentry)) { res = ERR_PTR(security_inode_readlink(dentry)); if (!res) res = inode->i_op->get_link(dentry, inode, done); } return res; } EXPORT_SYMBOL(vfs_get_link); /* get the link contents into pagecache */ static char *__page_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { struct folio *folio; struct address_space *mapping = inode->i_mapping; if (!dentry) { folio = filemap_get_folio(mapping, 0); if (IS_ERR(folio)) return ERR_PTR(-ECHILD); if (!folio_test_uptodate(folio)) { folio_put(folio); return ERR_PTR(-ECHILD); } } else { folio = read_mapping_folio(mapping, 0, NULL); if (IS_ERR(folio)) return ERR_CAST(folio); } set_delayed_call(callback, page_put_link, folio); BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); return folio_address(folio); } const char *page_get_link_raw(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { return __page_get_link(dentry, inode, callback); } EXPORT_SYMBOL_GPL(page_get_link_raw); /** * page_get_link() - An implementation of the get_link inode_operation. * @dentry: The directory entry which is the symlink. * @inode: The inode for the symlink. * @callback: Used to drop the reference to the symlink. 
* * Filesystems which store their symlinks in the page cache should use * this to implement the get_link() member of their inode_operations. * * Return: A pointer to the NUL-terminated symlink. */ const char *page_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { char *kaddr = __page_get_link(dentry, inode, callback); if (!IS_ERR(kaddr)) nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); return kaddr; } EXPORT_SYMBOL(page_get_link); /** * page_put_link() - Drop the reference to the symlink. * @arg: The folio which contains the symlink. * * This is used internally by page_get_link(). It is exported for use * by filesystems which need to implement a variant of page_get_link() * themselves. Despite the apparent symmetry, filesystems which use * page_get_link() do not need to call page_put_link(). * * The argument, while it has a void pointer type, must be a pointer to * the folio which was retrieved from the page cache. The delayed_call * infrastructure is used to drop the reference count once the caller * is done with the symlink. */ void page_put_link(void *arg) { folio_put(arg); } EXPORT_SYMBOL(page_put_link); int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { const char *link; int res; DEFINE_DELAYED_CALL(done); link = page_get_link(dentry, d_inode(dentry), &done); res = PTR_ERR(link); if (!IS_ERR(link)) res = readlink_copy(buffer, buflen, link, strlen(link)); do_delayed_call(&done); return res; } EXPORT_SYMBOL(page_readlink); int page_symlink(struct inode *inode, const char *symname, int len) { struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS); struct folio *folio; void *fsdata = NULL; int err; unsigned int flags; retry: if (nofs) flags = memalloc_nofs_save(); err = aops->write_begin(NULL, mapping, 0, len-1, &folio, &fsdata); if (nofs) memalloc_nofs_restore(flags); if (err) goto fail; memcpy(folio_address(folio), symname, len - 1); err = aops->write_end(NULL, mapping, 0, len - 1, len - 1, folio, fsdata); if (err < 0) goto fail; if (err < len-1) goto retry; mark_inode_dirty(inode); return 0; fail: return err; } EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { .get_link = page_get_link, }; EXPORT_SYMBOL(page_symlink_inode_operations);
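/*
 * Editorial sketch (not part of namei.c): how a filesystem that keeps its
 * symlink bodies in the page cache typically wires the helpers above
 * together when creating a symlink.  The function name and inode-allocation
 * step are illustrative placeholders; such filesystems also call
 * inode_nohighmem() so that the BUG_ON() in __page_get_link() holds.
 */
static int example_fs_symlink(struct inode *dir, struct dentry *dentry,
			      const char *symname)
{
	struct inode *inode;
	int err;

	inode = example_fs_new_inode(dir, S_IFLNK | 0777);	/* placeholder */
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	inode->i_op = &page_symlink_inode_operations;
	inode_nohighmem(inode);

	/* write the NUL-terminated target into page 0 of the symlink inode */
	err = page_symlink(inode, symname, strlen(symname) + 1);
	if (err)
		return err;

	d_instantiate(dentry, inode);
	return 0;
}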
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_PREEMPT_H #define __LINUX_PREEMPT_H /* * include/linux/preempt.h - macros for accessing and manipulating * preempt_count (used for kernel preemption, interrupt count, etc.) */ #include <linux/linkage.h> #include <linux/cleanup.h> #include <linux/types.h> /* * We put the hardirq and softirq counter into the preemption * counter. The bitmask has the following meaning: * * - bits 0-7 are the preemption count (max preemption depth: 256) * - bits 8-15 are the softirq count (max # of softirqs: 256) * * The hardirq count could in theory be the same as the number of * interrupts in the system, but we run all interrupt handlers with * interrupts disabled, so we cannot have nesting interrupts. Though * there are a few palaeontologic drivers which reenable interrupts in * the handler, so we need more than one bit here.
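 *
 * As a worked illustration of the layout (using the masks defined below):
 * a preempt_count() of 0x00010002 means one level of hardirq nesting
 * (HARDIRQ_MASK bits) on top of two preempt_disable() levels
 * (PREEMPT_MASK bits), so hardirq_count() and in_interrupt() are non-zero
 * and in_task() is false.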
* * PREEMPT_MASK: 0x000000ff * SOFTIRQ_MASK: 0x0000ff00 * HARDIRQ_MASK: 0x000f0000 * NMI_MASK: 0x00f00000 * PREEMPT_NEED_RESCHED: 0x80000000 */ #define PREEMPT_BITS 8 #define SOFTIRQ_BITS 8 #define HARDIRQ_BITS 4 #define NMI_BITS 4 #define PREEMPT_SHIFT 0 #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) #define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) #define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) #define __IRQ_MASK(x) ((1UL << (x))-1) #define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) #define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) #define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) #define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) #define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) #define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #define NMI_OFFSET (1UL << NMI_SHIFT) #define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) #define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) /* * Disable preemption until the scheduler is running -- use an unconditional * value so that it also works on !PREEMPT_COUNT kernels. * * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count(). */ #define INIT_PREEMPT_COUNT PREEMPT_OFFSET /* * Initial preempt_count value; reflects the preempt_count schedule invariant * which states that during context switches: * * preempt_count() == 2*PREEMPT_DISABLE_OFFSET * * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels. * Note: See finish_task_switch(). */ #define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ #include <asm/preempt.h> /** * interrupt_context_level - return interrupt context level * * Returns the current interrupt context level. * 0 - normal context * 1 - softirq context * 2 - hardirq context * 3 - NMI context */ static __always_inline unsigned char interrupt_context_level(void) { unsigned long pc = preempt_count(); unsigned char level = 0; level += !!(pc & (NMI_MASK)); level += !!(pc & (NMI_MASK | HARDIRQ_MASK)); level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); return level; } /* * These macro definitions avoid redundant invocations of preempt_count() * because such invocations would result in redundant loads given that * preempt_count() is commonly implemented with READ_ONCE(). 
*/ #define nmi_count() (preempt_count() & NMI_MASK) #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #ifdef CONFIG_PREEMPT_RT # define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK) # define irq_count() ((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count()) #else # define softirq_count() (preempt_count() & SOFTIRQ_MASK) # define irq_count() (preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK)) #endif /* * Macros to retrieve the current execution context: * * in_nmi() - We're in NMI context * in_hardirq() - We're in hard IRQ context * in_serving_softirq() - We're in softirq context * in_task() - We're in task context */ #define in_nmi() (nmi_count()) #define in_hardirq() (hardirq_count()) #define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) #ifdef CONFIG_PREEMPT_RT # define in_task() (!((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | in_serving_softirq())) #else # define in_task() (!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) #endif /* * The following macros are deprecated and should not be used in new code: * in_irq() - Obsolete version of in_hardirq() * in_softirq() - We have BH disabled, or are processing softirqs * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled */ #define in_irq() (hardirq_count()) #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) /* * The preempt_count offset after preempt_disable(); */ #if defined(CONFIG_PREEMPT_COUNT) # define PREEMPT_DISABLE_OFFSET PREEMPT_OFFSET #else # define PREEMPT_DISABLE_OFFSET 0 #endif /* * The preempt_count offset after spin_lock() */ #if !defined(CONFIG_PREEMPT_RT) #define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET #else /* Locks on RT do not disable preemption */ #define PREEMPT_LOCK_OFFSET 0 #endif /* * The preempt_count offset needed for things like: * * spin_lock_bh() * * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and * softirqs, such that unlock sequences of: * * spin_unlock(); * local_bh_enable(); * * Work as expected. */ #define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_LOCK_OFFSET) /* * Are we running in atomic context? WARNING: this macro cannot * always detect atomic context; in particular, it cannot know about * held spinlocks in non-preemptible kernels. Thus it should not be * used in the general case to determine whether sleeping is possible. * Do not use in_atomic() in driver code. 
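 *
 * As an illustration of that limitation (not an additional guarantee): on a
 * !CONFIG_PREEMPT_COUNT kernel spin_lock() leaves preempt_count() untouched
 * because preempt_disable() is a bare barrier() there, so
 *
 *	spin_lock(&lock);
 *	WARN_ON(in_atomic());	(does not fire)
 *	spin_unlock(&lock);
 *
 * even though sleeping between lock and unlock would still be a bug.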
*/ #define in_atomic() (preempt_count() != 0) /* * Check whether we were atomic before we did preempt_disable(): * (used by the scheduler) */ #define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET) #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE) extern void preempt_count_add(int val); extern void preempt_count_sub(int val); #define preempt_count_dec_and_test() \ ({ preempt_count_sub(1); should_resched(0); }) #else #define preempt_count_add(val) __preempt_count_add(val) #define preempt_count_sub(val) __preempt_count_sub(val) #define preempt_count_dec_and_test() __preempt_count_dec_and_test() #endif #define __preempt_count_inc() __preempt_count_add(1) #define __preempt_count_dec() __preempt_count_sub(1) #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ do { \ preempt_count_inc(); \ barrier(); \ } while (0) #define sched_preempt_enable_no_resched() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) #define preempt_enable_no_resched() sched_preempt_enable_no_resched() #define preemptible() (preempt_count() == 0 && !irqs_disabled()) #ifdef CONFIG_PREEMPTION #define preempt_enable() \ do { \ barrier(); \ if (unlikely(preempt_count_dec_and_test())) \ __preempt_schedule(); \ } while (0) #define preempt_enable_notrace() \ do { \ barrier(); \ if (unlikely(__preempt_count_dec_and_test())) \ __preempt_schedule_notrace(); \ } while (0) #define preempt_check_resched() \ do { \ if (should_resched(0)) \ __preempt_schedule(); \ } while (0) #else /* !CONFIG_PREEMPTION */ #define preempt_enable() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) #define preempt_enable_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) #define preempt_check_resched() do { } while (0) #endif /* CONFIG_PREEMPTION */ #define preempt_disable_notrace() \ do { \ __preempt_count_inc(); \ barrier(); \ } while (0) #define preempt_enable_no_resched_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) #else /* !CONFIG_PREEMPT_COUNT */ /* * Even if we don't have any preemption, we need preempt disable/enable * to be barriers, so that we don't have things like get_user/put_user * that can cause faults and scheduling migrate into our preempt-protected * region. */ #define preempt_disable() barrier() #define sched_preempt_enable_no_resched() barrier() #define preempt_enable_no_resched() barrier() #define preempt_enable() barrier() #define preempt_check_resched() do { } while (0) #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() #define preempt_enable_notrace() barrier() #define preemptible() 0 #endif /* CONFIG_PREEMPT_COUNT */ #ifdef MODULE /* * Modules have no business playing preemption tricks. 
*/ #undef sched_preempt_enable_no_resched #undef preempt_enable_no_resched #undef preempt_enable_no_resched_notrace #undef preempt_check_resched #endif #define preempt_set_need_resched() \ do { \ set_preempt_need_resched(); \ } while (0) #define preempt_fold_need_resched() \ do { \ if (tif_need_resched()) \ set_preempt_need_resched(); \ } while (0) #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; struct task_struct; /** * preempt_ops - notifiers called when a task is preempted and rescheduled * @sched_in: we're about to be rescheduled: * notifier: struct preempt_notifier for the task being scheduled * cpu: cpu we're scheduled on * @sched_out: we've just been preempted * notifier: struct preempt_notifier for the task being preempted * next: the task that's kicking us out * * Please note that sched_in and out are called under different * contexts. sched_out is called with rq lock held and irq disabled * while sched_in is called without rq lock and irq enabled. This * difference is intentional and depended upon by its users. */ struct preempt_ops { void (*sched_in)(struct preempt_notifier *notifier, int cpu); void (*sched_out)(struct preempt_notifier *notifier, struct task_struct *next); }; /** * preempt_notifier - key for installing preemption notifiers * @link: internal use * @ops: defines the notifier functions to be called * * Usually used in conjunction with container_of(). */ struct preempt_notifier { struct hlist_node link; struct preempt_ops *ops; }; void preempt_notifier_inc(void); void preempt_notifier_dec(void); void preempt_notifier_register(struct preempt_notifier *notifier); void preempt_notifier_unregister(struct preempt_notifier *notifier); static inline void preempt_notifier_init(struct preempt_notifier *notifier, struct preempt_ops *ops) { /* INIT_HLIST_NODE() open coded, to avoid dependency on list.h */ notifier->link.next = NULL; notifier->link.pprev = NULL; notifier->ops = ops; } #endif /* * Migrate-Disable and why it is undesired. * * When a preempted task becomes elegible to run under the ideal model (IOW it * becomes one of the M highest priority tasks), it might still have to wait * for the preemptee's migrate_disable() section to complete. Thereby suffering * a reduction in bandwidth in the exact duration of the migrate_disable() * section. * * Per this argument, the change from preempt_disable() to migrate_disable() * gets us: * * - a higher priority tasks gains reduced wake-up latency; with preempt_disable() * it would have had to wait for the lower priority task. * * - a lower priority tasks; which under preempt_disable() could've instantly * migrated away when another CPU becomes available, is now constrained * by the ability to push the higher priority task away, which might itself be * in a migrate_disable() section, reducing it's available bandwidth. * * IOW it trades latency / moves the interference term, but it stays in the * system, and as long as it remains unbounded, the system is not fully * deterministic. * * * The reason we have it anyway. * * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a * number of primitives into becoming preemptible, they would also allow * migration. This turns out to break a bunch of per-cpu usage. To this end, * all these primitives employ migirate_disable() to restore this implicit * assumption. * * This is a 'temporary' work-around at best. 
The correct solution is getting * rid of the above assumptions and reworking the code to employ explicit * per-cpu locking or short preempt-disable regions. * * The end goal must be to get rid of migrate_disable(), alternatively we need * a schedulability theory that does not depend on abritrary migration. * * * Notes on the implementation. * * The implementation is particularly tricky since existing code patterns * dictate neither migrate_disable() nor migrate_enable() is allowed to block. * This means that it cannot use cpus_read_lock() to serialize against hotplug, * nor can it easily migrate itself into a pending affinity mask change on * migrate_enable(). * * * Note: even non-work-conserving schedulers like semi-partitioned depends on * migration, so migrate_disable() is not only a problem for * work-conserving schedulers. * */ extern void migrate_disable(void); extern void migrate_enable(void); /** * preempt_disable_nested - Disable preemption inside a normally preempt disabled section * * Use for code which requires preemption protection inside a critical * section which has preemption disabled implicitly on non-PREEMPT_RT * enabled kernels, by e.g.: * - holding a spinlock/rwlock * - soft interrupt context * - regular interrupt handlers * * On PREEMPT_RT enabled kernels spinlock/rwlock held sections, soft * interrupt context and regular interrupt handlers are preemptible and * only prevent migration. preempt_disable_nested() ensures that preemption * is disabled for cases which require CPU local serialization even on * PREEMPT_RT. For non-PREEMPT_RT kernels this is a NOP. * * The use cases are code sequences which are not serialized by a * particular lock instance, e.g.: * - seqcount write side critical sections where the seqcount is not * associated to a particular lock and therefore the automatic * protection mechanism does not work. This prevents a live lock * against a preempting high priority reader. * - RMW per CPU variable updates like vmstat. */ /* Macro to avoid header recursion hell vs. lockdep */ #define preempt_disable_nested() \ do { \ if (IS_ENABLED(CONFIG_PREEMPT_RT)) \ preempt_disable(); \ else \ lockdep_assert_preemption_disabled(); \ } while (0) /** * preempt_enable_nested - Undo the effect of preempt_disable_nested() */ static __always_inline void preempt_enable_nested(void) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); } DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable()) DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace()) DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) #ifdef CONFIG_PREEMPT_DYNAMIC extern bool preempt_model_none(void); extern bool preempt_model_voluntary(void); extern bool preempt_model_full(void); extern bool preempt_model_lazy(void); #else static inline bool preempt_model_none(void) { return IS_ENABLED(CONFIG_PREEMPT_NONE); } static inline bool preempt_model_voluntary(void) { return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY); } static inline bool preempt_model_full(void) { return IS_ENABLED(CONFIG_PREEMPT); } static inline bool preempt_model_lazy(void) { return IS_ENABLED(CONFIG_PREEMPT_LAZY); } #endif static inline bool preempt_model_rt(void) { return IS_ENABLED(CONFIG_PREEMPT_RT); } extern const char *preempt_model_str(void); /* * Does the preemption model allow non-cooperative preemption? 
* * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the * PREEMPT_NONE model. */ static inline bool preempt_model_preemptible(void) { return preempt_model_full() || preempt_model_lazy() || preempt_model_rt(); } #endif /* __LINUX_PREEMPT_H */
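/*
 * Editorial usage sketch (not part of preempt.h): preempt_disable() and
 * preempt_enable() bracket a region that must stay on one CPU, for example
 * because it samples smp_processor_id() or touches per-CPU state without
 * the this_cpu_*() accessors.  The function name is a placeholder.
 */
static void example_percpu_section(void)
{
	int cpu;

	preempt_disable();		/* no preemption from here on */
	cpu = smp_processor_id();	/* stays valid while preemption is off */
	/* ... operate on per-CPU data of 'cpu' ... */
	preempt_enable();		/* may reschedule if a resched became pending */
}

/*
 * With the scope guards defined above, the same section can also be written
 * as guard(preempt)(); at the top of the block.  Code that only needs to
 * stay on its CPU but must remain sleepable can use migrate_disable() /
 * migrate_enable() instead, at the cost discussed in the comment above.
 */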
// SPDX-License-Identifier: GPL-2.0-or-later /* * PPP async serial channel driver for Linux. * * Copyright 1999 Paul Mackerras. * * This driver provides the encapsulation and framing for sending * and receiving PPP frames over async serial lines. It relies on * the generic PPP layer to give it frames to send and to process * received frames. It implements the PPP line discipline. * * Part of the code in this driver was inspired by the old async-only * PPP driver, written by Michael Callahan and Al Longyear, and * subsequently hacked by Paul Mackerras. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/tty.h> #include <linux/netdevice.h> #include <linux/poll.h> #include <linux/crc-ccitt.h> #include <linux/ppp_defs.h> #include <linux/ppp-ioctl.h> #include <linux/ppp_channel.h> #include <linux/spinlock.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/jiffies.h> #include <linux/slab.h> #include <linux/unaligned.h> #include <linux/uaccess.h> #include <asm/string.h> #define PPP_VERSION "2.4.2" #define OBUFSIZE 4096 /* Structure for storing local state. */ struct asyncppp { struct tty_struct *tty; unsigned int flags; unsigned int state; unsigned int rbits; int mru; spinlock_t xmit_lock; spinlock_t recv_lock; unsigned long xmit_flags; u32 xaccm[8]; u32 raccm; unsigned int bytes_sent; unsigned int bytes_rcvd; struct sk_buff *tpkt; int tpkt_pos; u16 tfcs; unsigned char *optr; unsigned char *olim; unsigned long last_xmit; struct sk_buff *rpkt; int lcp_fcs; struct sk_buff_head rqueue; struct tasklet_struct tsk; refcount_t refcnt; struct completion dead; struct ppp_channel chan; /* interface to generic ppp layer */ unsigned char obuf[OBUFSIZE]; }; /* Bit numbers in xmit_flags */ #define XMIT_WAKEUP 0 #define XMIT_FULL 1 #define XMIT_BUSY 2 /* State bits */ #define SC_TOSS 1 #define SC_ESCAPE 2 #define SC_PREV_ERROR 4 /* Bits in rbits */ #define SC_RCV_BITS (SC_RCV_B7_1|SC_RCV_B7_0|SC_RCV_ODDP|SC_RCV_EVNP) static int flag_time = HZ; module_param(flag_time, int, 0); MODULE_PARM_DESC(flag_time, "ppp_async: interval between flagged packets (in clock ticks)"); MODULE_DESCRIPTION("PPP async serial channel module"); MODULE_LICENSE("GPL"); MODULE_ALIAS_LDISC(N_PPP); /* * Prototypes.
*/ static int ppp_async_encode(struct asyncppp *ap); static int ppp_async_send(struct ppp_channel *chan, struct sk_buff *skb); static int ppp_async_push(struct asyncppp *ap); static void ppp_async_flush_output(struct asyncppp *ap); static void ppp_async_input(struct asyncppp *ap, const unsigned char *buf, const u8 *flags, int count); static int ppp_async_ioctl(struct ppp_channel *chan, unsigned int cmd, unsigned long arg); static void ppp_async_process(struct tasklet_struct *t); static void async_lcp_peek(struct asyncppp *ap, unsigned char *data, int len, int inbound); static const struct ppp_channel_ops async_ops = { .start_xmit = ppp_async_send, .ioctl = ppp_async_ioctl, }; /* * Routines implementing the PPP line discipline. */ /* * We have a potential race on dereferencing tty->disc_data, * because the tty layer provides no locking at all - thus one * cpu could be running ppp_asynctty_receive while another * calls ppp_asynctty_close, which zeroes tty->disc_data and * frees the memory that ppp_asynctty_receive is using. The best * way to fix this is to use a rwlock in the tty struct, but for now * we use a single global rwlock for all ttys in ppp line discipline. * * FIXME: this is no longer true. The _close path for the ldisc is * now guaranteed to be sane. */ static DEFINE_RWLOCK(disc_data_lock); static struct asyncppp *ap_get(struct tty_struct *tty) { struct asyncppp *ap; read_lock(&disc_data_lock); ap = tty->disc_data; if (ap != NULL) refcount_inc(&ap->refcnt); read_unlock(&disc_data_lock); return ap; } static void ap_put(struct asyncppp *ap) { if (refcount_dec_and_test(&ap->refcnt)) complete(&ap->dead); } /* * Called when a tty is put into PPP line discipline. Called in process * context. */ static int ppp_asynctty_open(struct tty_struct *tty) { struct asyncppp *ap; int err; int speed; if (tty->ops->write == NULL) return -EOPNOTSUPP; err = -ENOMEM; ap = kzalloc(sizeof(*ap), GFP_KERNEL); if (!ap) goto out; /* initialize the asyncppp structure */ ap->tty = tty; ap->mru = PPP_MRU; spin_lock_init(&ap->xmit_lock); spin_lock_init(&ap->recv_lock); ap->xaccm[0] = ~0U; ap->xaccm[3] = 0x60000000U; ap->raccm = ~0U; ap->optr = ap->obuf; ap->olim = ap->obuf; ap->lcp_fcs = -1; skb_queue_head_init(&ap->rqueue); tasklet_setup(&ap->tsk, ppp_async_process); refcount_set(&ap->refcnt, 1); init_completion(&ap->dead); ap->chan.private = ap; ap->chan.ops = &async_ops; ap->chan.mtu = PPP_MRU; speed = tty_get_baud_rate(tty); ap->chan.speed = speed; err = ppp_register_channel(&ap->chan); if (err) goto out_free; tty->disc_data = ap; tty->receive_room = 65536; return 0; out_free: kfree(ap); out: return err; } /* * Called when the tty is put into another line discipline * or it hangs up. We have to wait for any cpu currently * executing in any of the other ppp_asynctty_* routines to * finish before we can call ppp_unregister_channel and free * the asyncppp struct. This routine must be called from * process context, not interrupt or softirq context. */ static void ppp_asynctty_close(struct tty_struct *tty) { struct asyncppp *ap; write_lock_irq(&disc_data_lock); ap = tty->disc_data; tty->disc_data = NULL; write_unlock_irq(&disc_data_lock); if (!ap) return; /* * We have now ensured that nobody can start using ap from now * on, but we have to wait for all existing users to finish. * Note that ppp_unregister_channel ensures that no calls to * our channel ops (i.e. ppp_async_send/ioctl) are in progress * by the time it returns. 
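 *
 * The reader side pairs with this as follows (a condensed restatement of
 * ap_get()/ap_put() above, not additional code):
 *
 *	ap = ap_get(tty);	takes a reference under disc_data_lock
 *	if (!ap)
 *		return;		the ldisc is already being torn down
 *	... use ap ...
 *	ap_put(ap);		dropping the last reference completes &ap->dead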
*/ if (!refcount_dec_and_test(&ap->refcnt)) wait_for_completion(&ap->dead); tasklet_kill(&ap->tsk); ppp_unregister_channel(&ap->chan); kfree_skb(ap->rpkt); skb_queue_purge(&ap->rqueue); kfree_skb(ap->tpkt); kfree(ap); } /* * Called on tty hangup in process context. * * Wait for I/O to driver to complete and unregister PPP channel. * This is already done by the close routine, so just call that. */ static void ppp_asynctty_hangup(struct tty_struct *tty) { ppp_asynctty_close(tty); } /* * Read does nothing - no data is ever available this way. * Pppd reads and writes packets via /dev/ppp instead. */ static ssize_t ppp_asynctty_read(struct tty_struct *tty, struct file *file, u8 *buf, size_t count, void **cookie, unsigned long offset) { return -EAGAIN; } /* * Write on the tty does nothing, the packets all come in * from the ppp generic stuff. */ static ssize_t ppp_asynctty_write(struct tty_struct *tty, struct file *file, const u8 *buf, size_t count) { return -EAGAIN; } /* * Called in process context only. May be re-entered by multiple * ioctl calling threads. */ static int ppp_asynctty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct asyncppp *ap = ap_get(tty); int err, val; int __user *p = (int __user *)arg; if (!ap) return -ENXIO; err = -EFAULT; switch (cmd) { case PPPIOCGCHAN: err = -EFAULT; if (put_user(ppp_channel_index(&ap->chan), p)) break; err = 0; break; case PPPIOCGUNIT: err = -EFAULT; if (put_user(ppp_unit_number(&ap->chan), p)) break; err = 0; break; case TCFLSH: /* flush our buffers and the serial port's buffer */ if (arg == TCIOFLUSH || arg == TCOFLUSH) ppp_async_flush_output(ap); err = n_tty_ioctl_helper(tty, cmd, arg); break; case FIONREAD: val = 0; if (put_user(val, p)) break; err = 0; break; default: /* Try the various mode ioctls */ err = tty_mode_ioctl(tty, cmd, arg); } ap_put(ap); return err; } /* May sleep, don't call from interrupt level or with interrupts disabled */ static void ppp_asynctty_receive(struct tty_struct *tty, const u8 *buf, const u8 *cflags, size_t count) { struct asyncppp *ap = ap_get(tty); unsigned long flags; if (!ap) return; spin_lock_irqsave(&ap->recv_lock, flags); ppp_async_input(ap, buf, cflags, count); spin_unlock_irqrestore(&ap->recv_lock, flags); if (!skb_queue_empty(&ap->rqueue)) tasklet_schedule(&ap->tsk); ap_put(ap); tty_unthrottle(tty); } static void ppp_asynctty_wakeup(struct tty_struct *tty) { struct asyncppp *ap = ap_get(tty); clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); if (!ap) return; set_bit(XMIT_WAKEUP, &ap->xmit_flags); tasklet_schedule(&ap->tsk); ap_put(ap); } static struct tty_ldisc_ops ppp_ldisc = { .owner = THIS_MODULE, .num = N_PPP, .name = "ppp", .open = ppp_asynctty_open, .close = ppp_asynctty_close, .hangup = ppp_asynctty_hangup, .read = ppp_asynctty_read, .write = ppp_asynctty_write, .ioctl = ppp_asynctty_ioctl, .receive_buf = ppp_asynctty_receive, .write_wakeup = ppp_asynctty_wakeup, }; static int __init ppp_async_init(void) { int err; err = tty_register_ldisc(&ppp_ldisc); if (err != 0) printk(KERN_ERR "PPP_async: error %d registering line disc.\n", err); return err; } /* * The following routines provide the PPP channel interface. 
*/ static int ppp_async_ioctl(struct ppp_channel *chan, unsigned int cmd, unsigned long arg) { struct asyncppp *ap = chan->private; void __user *argp = (void __user *)arg; int __user *p = argp; int err, val; u32 accm[8]; err = -EFAULT; switch (cmd) { case PPPIOCGFLAGS: val = ap->flags | ap->rbits; if (put_user(val, p)) break; err = 0; break; case PPPIOCSFLAGS: if (get_user(val, p)) break; ap->flags = val & ~SC_RCV_BITS; spin_lock_irq(&ap->recv_lock); ap->rbits = val & SC_RCV_BITS; spin_unlock_irq(&ap->recv_lock); err = 0; break; case PPPIOCGASYNCMAP: if (put_user(ap->xaccm[0], (u32 __user *)argp)) break; err = 0; break; case PPPIOCSASYNCMAP: if (get_user(ap->xaccm[0], (u32 __user *)argp)) break; err = 0; break; case PPPIOCGRASYNCMAP: if (put_user(ap->raccm, (u32 __user *)argp)) break; err = 0; break; case PPPIOCSRASYNCMAP: if (get_user(ap->raccm, (u32 __user *)argp)) break; err = 0; break; case PPPIOCGXASYNCMAP: if (copy_to_user(argp, ap->xaccm, sizeof(ap->xaccm))) break; err = 0; break; case PPPIOCSXASYNCMAP: if (copy_from_user(accm, argp, sizeof(accm))) break; accm[2] &= ~0x40000000U; /* can't escape 0x5e */ accm[3] |= 0x60000000U; /* must escape 0x7d, 0x7e */ memcpy(ap->xaccm, accm, sizeof(ap->xaccm)); err = 0; break; case PPPIOCGMRU: if (put_user(ap->mru, p)) break; err = 0; break; case PPPIOCSMRU: if (get_user(val, p)) break; if (val > U16_MAX) { err = -EINVAL; break; } if (val < PPP_MRU) val = PPP_MRU; ap->mru = val; err = 0; break; default: err = -ENOTTY; } return err; } /* * This is called at softirq level to deliver received packets * to the ppp_generic code, and to tell the ppp_generic code * if we can accept more output now. */ static void ppp_async_process(struct tasklet_struct *t) { struct asyncppp *ap = from_tasklet(ap, t, tsk); struct sk_buff *skb; /* process received packets */ while ((skb = skb_dequeue(&ap->rqueue)) != NULL) { if (skb->cb[0]) ppp_input_error(&ap->chan, 0); ppp_input(&ap->chan, skb); } /* try to push more stuff out */ if (test_bit(XMIT_WAKEUP, &ap->xmit_flags) && ppp_async_push(ap)) ppp_output_wakeup(&ap->chan); } /* * Procedures for encapsulation and framing. */ /* * Procedure to encode the data for async serial transmission. * Does octet stuffing (escaping), puts the address/control bytes * on if A/C compression is disabled, and does protocol compression. * Assumes ap->tpkt != 0 on entry. * Returns 1 if we finished the current frame, 0 otherwise. */ #define PUT_BYTE(ap, buf, c, islcp) do { \ if ((islcp && c < 0x20) || (ap->xaccm[c >> 5] & (1 << (c & 0x1f)))) {\ *buf++ = PPP_ESCAPE; \ *buf++ = c ^ PPP_TRANS; \ } else \ *buf++ = c; \ } while (0) static int ppp_async_encode(struct asyncppp *ap) { int fcs, i, count, c, proto; unsigned char *buf, *buflim; unsigned char *data; int islcp; buf = ap->obuf; ap->olim = buf; ap->optr = buf; i = ap->tpkt_pos; data = ap->tpkt->data; count = ap->tpkt->len; fcs = ap->tfcs; proto = get_unaligned_be16(data); /* * LCP packets with code values between 1 (configure-request) * and 7 (code-reject) must be sent as though no options * had been negotiated. */ islcp = proto == PPP_LCP && count >= 3 && 1 <= data[2] && data[2] <= 7; if (i == 0) { if (islcp) async_lcp_peek(ap, data, count, 0); /* * Start of a new packet - insert the leading FLAG * character if necessary. 
*/ if (islcp || flag_time == 0 || time_after_eq(jiffies, ap->last_xmit + flag_time)) *buf++ = PPP_FLAG; ap->last_xmit = jiffies; fcs = PPP_INITFCS; /* * Put in the address/control bytes if necessary */ if ((ap->flags & SC_COMP_AC) == 0 || islcp) { PUT_BYTE(ap, buf, 0xff, islcp); fcs = PPP_FCS(fcs, 0xff); PUT_BYTE(ap, buf, 0x03, islcp); fcs = PPP_FCS(fcs, 0x03); } } /* * Once we put in the last byte, we need to put in the FCS * and closing flag, so make sure there is at least 7 bytes * of free space in the output buffer. */ buflim = ap->obuf + OBUFSIZE - 6; while (i < count && buf < buflim) { c = data[i++]; if (i == 1 && c == 0 && (ap->flags & SC_COMP_PROT)) continue; /* compress protocol field */ fcs = PPP_FCS(fcs, c); PUT_BYTE(ap, buf, c, islcp); } if (i < count) { /* * Remember where we are up to in this packet. */ ap->olim = buf; ap->tpkt_pos = i; ap->tfcs = fcs; return 0; } /* * We have finished the packet. Add the FCS and flag. */ fcs = ~fcs; c = fcs & 0xff; PUT_BYTE(ap, buf, c, islcp); c = (fcs >> 8) & 0xff; PUT_BYTE(ap, buf, c, islcp); *buf++ = PPP_FLAG; ap->olim = buf; consume_skb(ap->tpkt); ap->tpkt = NULL; return 1; } /* * Transmit-side routines. */ /* * Send a packet to the peer over an async tty line. * Returns 1 iff the packet was accepted. * If the packet was not accepted, we will call ppp_output_wakeup * at some later time. */ static int ppp_async_send(struct ppp_channel *chan, struct sk_buff *skb) { struct asyncppp *ap = chan->private; ppp_async_push(ap); if (test_and_set_bit(XMIT_FULL, &ap->xmit_flags)) return 0; /* already full */ ap->tpkt = skb; ap->tpkt_pos = 0; ppp_async_push(ap); return 1; } /* * Push as much data as possible out to the tty. */ static int ppp_async_push(struct asyncppp *ap) { int avail, sent, done = 0; struct tty_struct *tty = ap->tty; int tty_stuffed = 0; /* * We can get called recursively here if the tty write * function calls our wakeup function. This can happen * for example on a pty with both the master and slave * set to PPP line discipline. * We use the XMIT_BUSY bit to detect this and get out, * leaving the XMIT_WAKEUP bit set to tell the other * instance that it may now be able to write more now. */ if (test_and_set_bit(XMIT_BUSY, &ap->xmit_flags)) return 0; spin_lock_bh(&ap->xmit_lock); for (;;) { if (test_and_clear_bit(XMIT_WAKEUP, &ap->xmit_flags)) tty_stuffed = 0; if (!tty_stuffed && ap->optr < ap->olim) { avail = ap->olim - ap->optr; set_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); sent = tty->ops->write(tty, ap->optr, avail); if (sent < 0) goto flush; /* error, e.g. loss of CD */ ap->optr += sent; if (sent < avail) tty_stuffed = 1; continue; } if (ap->optr >= ap->olim && ap->tpkt) { if (ppp_async_encode(ap)) { /* finished processing ap->tpkt */ clear_bit(XMIT_FULL, &ap->xmit_flags); done = 1; } continue; } /* * We haven't made any progress this time around. * Clear XMIT_BUSY to let other callers in, but * after doing so we have to check if anyone set * XMIT_WAKEUP since we last checked it. If they * did, we should try again to set XMIT_BUSY and go * around again in case XMIT_BUSY was still set when * the other caller tried. */ clear_bit(XMIT_BUSY, &ap->xmit_flags); /* any more work to do? 
if not, exit the loop */ if (!(test_bit(XMIT_WAKEUP, &ap->xmit_flags) || (!tty_stuffed && ap->tpkt))) break; /* more work to do, see if we can do it now */ if (test_and_set_bit(XMIT_BUSY, &ap->xmit_flags)) break; } spin_unlock_bh(&ap->xmit_lock); return done; flush: clear_bit(XMIT_BUSY, &ap->xmit_flags); if (ap->tpkt) { kfree_skb(ap->tpkt); ap->tpkt = NULL; clear_bit(XMIT_FULL, &ap->xmit_flags); done = 1; } ap->optr = ap->olim; spin_unlock_bh(&ap->xmit_lock); return done; } /* * Flush output from our internal buffers. * Called for the TCFLSH ioctl. Can be entered in parallel * but this is covered by the xmit_lock. */ static void ppp_async_flush_output(struct asyncppp *ap) { int done = 0; spin_lock_bh(&ap->xmit_lock); ap->optr = ap->olim; if (ap->tpkt != NULL) { kfree_skb(ap->tpkt); ap->tpkt = NULL; clear_bit(XMIT_FULL, &ap->xmit_flags); done = 1; } spin_unlock_bh(&ap->xmit_lock); if (done) ppp_output_wakeup(&ap->chan); } /* * Receive-side routines. */ /* see how many ordinary chars there are at the start of buf */ static inline int scan_ordinary(struct asyncppp *ap, const unsigned char *buf, int count) { int i, c; for (i = 0; i < count; ++i) { c = buf[i]; if (c == PPP_ESCAPE || c == PPP_FLAG || (c < 0x20 && (ap->raccm & (1 << c)) != 0)) break; } return i; } /* called when a flag is seen - do end-of-packet processing */ static void process_input_packet(struct asyncppp *ap) { struct sk_buff *skb; unsigned char *p; unsigned int len, fcs; skb = ap->rpkt; if (ap->state & (SC_TOSS | SC_ESCAPE)) goto err; if (skb == NULL) return; /* 0-length packet */ /* check the FCS */ p = skb->data; len = skb->len; if (len < 3) goto err; /* too short */ fcs = PPP_INITFCS; for (; len > 0; --len) fcs = PPP_FCS(fcs, *p++); if (fcs != PPP_GOODFCS) goto err; /* bad FCS */ skb_trim(skb, skb->len - 2); /* check for address/control and protocol compression */ p = skb->data; if (p[0] == PPP_ALLSTATIONS) { /* chop off address/control */ if (p[1] != PPP_UI || skb->len < 3) goto err; p = skb_pull(skb, 2); } /* If protocol field is not compressed, it can be LCP packet */ if (!(p[0] & 0x01)) { unsigned int proto; if (skb->len < 2) goto err; proto = (p[0] << 8) + p[1]; if (proto == PPP_LCP) async_lcp_peek(ap, p, skb->len, 1); } /* queue the frame to be processed */ skb->cb[0] = ap->state; skb_queue_tail(&ap->rqueue, skb); ap->rpkt = NULL; ap->state = 0; return; err: /* frame had an error, remember that, reset SC_TOSS & SC_ESCAPE */ ap->state = SC_PREV_ERROR; if (skb) { /* make skb appear as freshly allocated */ skb_trim(skb, 0); skb_reserve(skb, - skb_headroom(skb)); } } /* Called when the tty driver has data for us. Runs parallel with the other ldisc functions but will not be re-entered */ static void ppp_async_input(struct asyncppp *ap, const u8 *buf, const u8 *flags, int count) { struct sk_buff *skb; int c, i, j, n, s, f; unsigned char *sp; /* update bits used for 8-bit cleanness detection */ if (~ap->rbits & SC_RCV_BITS) { s = 0; for (i = 0; i < count; ++i) { c = buf[i]; if (flags && flags[i] != 0) continue; s |= (c & 0x80)? SC_RCV_B7_1: SC_RCV_B7_0; c = ((c >> 4) ^ c) & 0xf; s |= (0x6996 & (1 << c))? 
SC_RCV_ODDP: SC_RCV_EVNP; } ap->rbits |= s; } while (count > 0) { /* scan through and see how many chars we can do in bulk */ if ((ap->state & SC_ESCAPE) && buf[0] == PPP_ESCAPE) n = 1; else n = scan_ordinary(ap, buf, count); f = 0; if (flags && (ap->state & SC_TOSS) == 0) { /* check the flags to see if any char had an error */ for (j = 0; j < n; ++j) if ((f = flags[j]) != 0) break; } if (f != 0) { /* start tossing */ ap->state |= SC_TOSS; } else if (n > 0 && (ap->state & SC_TOSS) == 0) { /* stuff the chars in the skb */ skb = ap->rpkt; if (!skb) { skb = dev_alloc_skb(ap->mru + PPP_HDRLEN + 2); if (!skb) goto nomem; ap->rpkt = skb; } if (skb->len == 0) { /* Try to get the payload 4-byte aligned. * This should match the * PPP_ALLSTATIONS/PPP_UI/compressed tests in * process_input_packet, but we do not have * enough chars here to test buf[1] and buf[2]. */ if (buf[0] != PPP_ALLSTATIONS) skb_reserve(skb, 2 + (buf[0] & 1)); } if (n > skb_tailroom(skb)) { /* packet overflowed MRU */ ap->state |= SC_TOSS; } else { sp = skb_put_data(skb, buf, n); if (ap->state & SC_ESCAPE) { sp[0] ^= PPP_TRANS; ap->state &= ~SC_ESCAPE; } } } if (n >= count) break; c = buf[n]; if (flags != NULL && flags[n] != 0) { ap->state |= SC_TOSS; } else if (c == PPP_FLAG) { process_input_packet(ap); } else if (c == PPP_ESCAPE) { ap->state |= SC_ESCAPE; } else if (I_IXON(ap->tty)) { if (c == START_CHAR(ap->tty)) start_tty(ap->tty); else if (c == STOP_CHAR(ap->tty)) stop_tty(ap->tty); } /* otherwise it's a char in the recv ACCM */ ++n; buf += n; if (flags) flags += n; count -= n; } return; nomem: printk(KERN_ERR "PPPasync: no memory (input pkt)\n"); ap->state |= SC_TOSS; } /* * We look at LCP frames going past so that we can notice * and react to the LCP configure-ack from the peer. * In the situation where the peer has been sent a configure-ack * already, LCP is up once it has sent its configure-ack * so the immediately following packet can be sent with the * configured LCP options. This allows us to process the following * packet correctly without pppd needing to respond quickly. * * We only respond to the received configure-ack if we have just * sent a configure-request, and the configure-ack contains the * same data (this is checked using a 16-bit crc of the data). */ #define CONFREQ 1 /* LCP code field values */ #define CONFACK 2 #define LCP_MRU 1 /* LCP option numbers */ #define LCP_ASYNCMAP 2 static void async_lcp_peek(struct asyncppp *ap, unsigned char *data, int len, int inbound) { int dlen, fcs, i, code; u32 val; data += 2; /* skip protocol bytes */ len -= 2; if (len < 4) /* 4 = code, ID, length */ return; code = data[0]; if (code != CONFACK && code != CONFREQ) return; dlen = get_unaligned_be16(data + 2); if (len < dlen) return; /* packet got truncated or length is bogus */ if (code == (inbound? CONFACK: CONFREQ)) { /* * sent confreq or received confack: * calculate the crc of the data from the ID field on. 
*/ fcs = PPP_INITFCS; for (i = 1; i < dlen; ++i) fcs = PPP_FCS(fcs, data[i]); if (!inbound) { /* outbound confreq - remember the crc for later */ ap->lcp_fcs = fcs; return; } /* received confack, check the crc */ fcs ^= ap->lcp_fcs; ap->lcp_fcs = -1; if (fcs != 0) return; } else if (inbound) return; /* not interested in received confreq */ /* process the options in the confack */ data += 4; dlen -= 4; /* data[0] is code, data[1] is length */ while (dlen >= 2 && dlen >= data[1] && data[1] >= 2) { switch (data[0]) { case LCP_MRU: val = get_unaligned_be16(data + 2); if (inbound) ap->mru = val; else ap->chan.mtu = val; break; case LCP_ASYNCMAP: val = get_unaligned_be32(data + 2); if (inbound) ap->raccm = val; else ap->xaccm[0] = val; break; } dlen -= data[1]; data += data[1]; } } static void __exit ppp_async_cleanup(void) { tty_unregister_ldisc(&ppp_ldisc); } module_init(ppp_async_init); module_exit(ppp_async_cleanup);
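/*
 * The transmit path above (PUT_BYTE and ppp_async_encode) does HDLC-style
 * octet stuffing against the xaccm[] escape map and appends the complemented
 * CRC-CCITT FCS before the closing flag. Below is a minimal, self-contained
 * user-space sketch of that framing, assuming the default behaviour of
 * escaping all control characters plus 0x7d/0x7e; the helper names
 * (fcs16, async_frame) are illustrative, not kernel API, and address/control
 * and protocol-field compression are left out.
 */
#include <stdint.h>
#include <stdio.h>

#define PPP_FLAG    0x7e
#define PPP_ESCAPE  0x7d
#define PPP_TRANS   0x20
#define PPP_INITFCS 0xffff

/* RFC 1662 bitwise FCS-16 (the same CRC the kernel gets from crc_ccitt()). */
static uint16_t fcs16(uint16_t fcs, uint8_t c)
{
	fcs ^= c;
	for (int i = 0; i < 8; i++)
		fcs = (fcs & 1) ? (fcs >> 1) ^ 0x8408 : fcs >> 1;
	return fcs;
}

/* Escape 0x00-0x1f (default async control-character map), 0x7d and 0x7e. */
static int must_escape(uint8_t c)
{
	return c < 0x20 || c == PPP_ESCAPE || c == PPP_FLAG;
}

static size_t put_byte(uint8_t *out, size_t n, uint8_t c)
{
	if (must_escape(c)) {
		out[n++] = PPP_ESCAPE;
		out[n++] = c ^ PPP_TRANS;
	} else {
		out[n++] = c;
	}
	return n;
}

/* Frame one PPP packet (protocol field + payload already in buf). */
static size_t async_frame(const uint8_t *buf, size_t len, uint8_t *out)
{
	uint16_t fcs = PPP_INITFCS;
	size_t n = 0;

	out[n++] = PPP_FLAG;
	for (size_t i = 0; i < len; i++) {
		fcs = fcs16(fcs, buf[i]);
		n = put_byte(out, n, buf[i]);
	}
	fcs = ~fcs;				/* complement, send LSB first */
	n = put_byte(out, n, fcs & 0xff);
	n = put_byte(out, n, (fcs >> 8) & 0xff);
	out[n++] = PPP_FLAG;
	return n;
}

int main(void)
{
	/* Dummy LCP packet: protocol 0xc021 followed by code/id/len/data. */
	const uint8_t pkt[] = { 0xc0, 0x21, 0x09, 0x01, 0x00, 0x08,
				0x00, 0x00, 0x00, 0x00 };
	uint8_t wire[2 * sizeof(pkt) + 8];	/* worst case: all escaped */
	size_t n = async_frame(pkt, sizeof(pkt), wire);

	for (size_t i = 0; i < n; i++)
		printf("%02x ", wire[i]);
	printf("\n");
	return 0;
}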
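/*
 * On the receive side, ppp_async_input() and process_input_packet() reverse
 * the stuffing and validate a frame by running the FCS over the unstuffed
 * bytes *including* the two received FCS bytes and comparing the result with
 * PPP_GOODFCS, then stripping the optional 0xff 0x03 address/control pair
 * and handling protocol-field compression. A minimal user-space sketch of
 * that check follows; it is illustrative only and the helper names are
 * assumptions, not kernel API.
 */
#include <stdint.h>
#include <stdio.h>

#define PPP_FLAG	0x7e
#define PPP_ESCAPE	0x7d
#define PPP_TRANS	0x20
#define PPP_INITFCS	0xffff
#define PPP_GOODFCS	0xf0b8
#define PPP_ALLSTATIONS	0xff
#define PPP_UI		0x03

static uint16_t fcs16(uint16_t fcs, uint8_t c)
{
	fcs ^= c;
	for (int i = 0; i < 8; i++)
		fcs = (fcs & 1) ? (fcs >> 1) ^ 0x8408 : fcs >> 1;
	return fcs;
}

/* Unstuff one flag-delimited frame (flags already removed), check the FCS. */
static int check_frame(const uint8_t *buf, size_t len)
{
	uint8_t frame[1500];
	uint16_t fcs = PPP_INITFCS;
	unsigned int proto;
	size_t n = 0, off = 0;

	for (size_t i = 0; i < len && n < sizeof(frame); i++) {
		uint8_t c = buf[i];
		if (c == PPP_ESCAPE) {
			if (++i >= len)
				return -1;	/* truncated escape sequence */
			c = buf[i] ^ PPP_TRANS;
		}
		frame[n++] = c;
	}
	if (n < 4)
		return -1;			/* too short: proto + FCS */

	for (size_t i = 0; i < n; i++)
		fcs = fcs16(fcs, frame[i]);
	if (fcs != PPP_GOODFCS)
		return -1;			/* bad FCS */
	n -= 2;					/* drop the two FCS bytes */

	/* Optional address/control field: 0xff 0x03. */
	if (frame[0] == PPP_ALLSTATIONS && frame[1] == PPP_UI)
		off = 2;
	if (n < off + 2)
		return -1;
	/* Protocol field compression: an odd first byte means a 1-byte field. */
	if (frame[off] & 1)
		proto = frame[off];
	else
		proto = (frame[off] << 8) | frame[off + 1];

	printf("good frame, protocol 0x%04x\n", proto);
	return 0;
}

int main(void)
{
	/* Build a self-consistent test frame: payload plus complemented FCS. */
	uint8_t rx[32] = { 0xc0, 0x21, 0x09, 0x01, 0x00, 0x08,
			   0x00, 0x00, 0x00, 0x00 };
	size_t len = 10;
	uint16_t fcs = PPP_INITFCS;

	for (size_t i = 0; i < len; i++)
		fcs = fcs16(fcs, rx[i]);
	fcs = ~fcs;
	rx[len++] = fcs & 0xff;
	rx[len++] = fcs >> 8;

	return check_frame(rx, len) ? 1 : 0;
}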
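/*
 * async_lcp_peek() above recognises a matching configure-ack by remembering
 * a 16-bit CRC of the outbound configure-request (from the ID field onward)
 * and XOR-ing it with the CRC of the inbound configure-ack: a zero result
 * means the ack echoes the same data. A minimal sketch of that comparison,
 * assuming the same bitwise CRC-CCITT as the sketches above; the names and
 * sample packets are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

#define PPP_INITFCS 0xffff
#define CONFREQ 1
#define CONFACK 2

static uint16_t fcs16(uint16_t fcs, uint8_t c)
{
	fcs ^= c;
	for (int i = 0; i < 8; i++)
		fcs = (fcs & 1) ? (fcs >> 1) ^ 0x8408 : fcs >> 1;
	return fcs;
}

/* CRC of an LCP body (code, id, length, options) from the ID field on. */
static uint16_t lcp_crc(const uint8_t *data, size_t dlen)
{
	uint16_t fcs = PPP_INITFCS;

	for (size_t i = 1; i < dlen; i++)	/* skip the code byte */
		fcs = fcs16(fcs, data[i]);
	return fcs;
}

int main(void)
{
	/* Outbound configure-request: code=1, id=5, len=8, MRU option 1500. */
	const uint8_t confreq[] = { CONFREQ, 0x05, 0x00, 0x08,
				    0x01, 0x04, 0x05, 0xdc };
	/* Inbound configure-ack echoing the same id, length and options. */
	const uint8_t confack[] = { CONFACK, 0x05, 0x00, 0x08,
				    0x01, 0x04, 0x05, 0xdc };
	uint16_t remembered = lcp_crc(confreq, sizeof(confreq));
	uint16_t received   = lcp_crc(confack, sizeof(confack));

	if ((remembered ^ received) == 0)
		printf("configure-ack matches the last configure-request\n");
	else
		printf("mismatch - ignore this configure-ack\n");
	return 0;
}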
// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/binfmt_elf.c * * These are the functions used to load ELF format executables as used * on SVr4 machines. Information on the format may be found in the book * "UNIX SYSTEM V RELEASE 4 Programmers Guide: Ansi C and Programming Support * Tools". * * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com).
*/ #include <linux/module.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/log2.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/errno.h> #include <linux/signal.h> #include <linux/binfmts.h> #include <linux/string.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/personality.h> #include <linux/elfcore.h> #include <linux/init.h> #include <linux/highuid.h> #include <linux/compiler.h> #include <linux/highmem.h> #include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/vmalloc.h> #include <linux/security.h> #include <linux/random.h> #include <linux/elf.h> #include <linux/elf-randomize.h> #include <linux/utsname.h> #include <linux/coredump.h> #include <linux/sched.h> #include <linux/sched/coredump.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/sizes.h> #include <linux/types.h> #include <linux/cred.h> #include <linux/dax.h> #include <linux/uaccess.h> #include <linux/rseq.h> #include <asm/param.h> #include <asm/page.h> #ifndef ELF_COMPAT #define ELF_COMPAT 0 #endif #ifndef user_long_t #define user_long_t long #endif #ifndef user_siginfo_t #define user_siginfo_t siginfo_t #endif /* That's for binfmt_elf_fdpic to deal with */ #ifndef elf_check_fdpic #define elf_check_fdpic(ex) false #endif static int load_elf_binary(struct linux_binprm *bprm); /* * If we don't support core dumping, then supply a NULL so we * don't even try. */ #ifdef CONFIG_ELF_CORE static int elf_core_dump(struct coredump_params *cprm); #else #define elf_core_dump NULL #endif #if ELF_EXEC_PAGESIZE > PAGE_SIZE #define ELF_MIN_ALIGN ELF_EXEC_PAGESIZE #else #define ELF_MIN_ALIGN PAGE_SIZE #endif #ifndef ELF_CORE_EFLAGS #define ELF_CORE_EFLAGS 0 #endif #define ELF_PAGESTART(_v) ((_v) & ~(int)(ELF_MIN_ALIGN-1)) #define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1)) #define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) static struct linux_binfmt elf_format = { .module = THIS_MODULE, .load_binary = load_elf_binary, #ifdef CONFIG_COREDUMP .core_dump = elf_core_dump, .min_coredump = ELF_EXEC_PAGESIZE, #endif }; #define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE)) /* * We need to explicitly zero any trailing portion of the page that follows * p_filesz when it ends before the page ends (e.g. bss), otherwise this * memory will contain the junk from the file that should not be present. */ static int padzero(unsigned long address) { unsigned long nbyte; nbyte = ELF_PAGEOFFSET(address); if (nbyte) { nbyte = ELF_MIN_ALIGN - nbyte; if (clear_user((void __user *)address, nbyte)) return -EFAULT; } return 0; } /* Let's use some macros to make this stack manipulation a little clearer */ #ifdef CONFIG_STACK_GROWSUP #define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) + (items)) #define STACK_ROUND(sp, items) \ ((15 + (unsigned long) ((sp) + (items))) &~ 15UL) #define STACK_ALLOC(sp, len) ({ \ elf_addr_t __user *old_sp = (elf_addr_t __user *)sp; sp += len; \ old_sp; }) #else #define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) - (items)) #define STACK_ROUND(sp, items) \ (((unsigned long) (sp - items)) &~ 15UL) #define STACK_ALLOC(sp, len) (sp -= len) #endif #ifndef ELF_BASE_PLATFORM /* * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture. * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value * will be copied to the user stack in the same manner as AT_PLATFORM. 
*/ #define ELF_BASE_PLATFORM NULL #endif static int create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, unsigned long interp_load_addr, unsigned long e_entry, unsigned long phdr_addr) { struct mm_struct *mm = current->mm; unsigned long p = bprm->p; int argc = bprm->argc; int envc = bprm->envc; elf_addr_t __user *sp; elf_addr_t __user *u_platform; elf_addr_t __user *u_base_platform; elf_addr_t __user *u_rand_bytes; const char *k_platform = ELF_PLATFORM; const char *k_base_platform = ELF_BASE_PLATFORM; unsigned char k_rand_bytes[16]; int items; elf_addr_t *elf_info; elf_addr_t flags = 0; int ei_index; const struct cred *cred = current_cred(); struct vm_area_struct *vma; /* * In some cases (e.g. Hyper-Threading), we want to avoid L1 * evictions by the processes running on the same package. One * thing we can do is to shuffle the initial stack for them. */ p = arch_align_stack(p); /* * If this architecture has a platform capability string, copy it * to userspace. In some cases (Sparc), this info is impossible * for userspace to get any other way, in others (i386) it is * merely difficult. */ u_platform = NULL; if (k_platform) { size_t len = strlen(k_platform) + 1; u_platform = (elf_addr_t __user *)STACK_ALLOC(p, len); if (copy_to_user(u_platform, k_platform, len)) return -EFAULT; } /* * If this architecture has a "base" platform capability * string, copy it to userspace. */ u_base_platform = NULL; if (k_base_platform) { size_t len = strlen(k_base_platform) + 1; u_base_platform = (elf_addr_t __user *)STACK_ALLOC(p, len); if (copy_to_user(u_base_platform, k_base_platform, len)) return -EFAULT; } /* * Generate 16 random bytes for userspace PRNG seeding. */ get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes)); u_rand_bytes = (elf_addr_t __user *) STACK_ALLOC(p, sizeof(k_rand_bytes)); if (copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes))) return -EFAULT; /* Create the ELF interpreter info */ elf_info = (elf_addr_t *)mm->saved_auxv; /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ #define NEW_AUX_ENT(id, val) \ do { \ *elf_info++ = id; \ *elf_info++ = val; \ } while (0) #ifdef ARCH_DLINFO /* * ARCH_DLINFO must come first so PPC can do its special alignment of * AUXV. 
* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in * ARCH_DLINFO changes */ ARCH_DLINFO; #endif NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE); NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); NEW_AUX_ENT(AT_PHDR, phdr_addr); NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); NEW_AUX_ENT(AT_PHNUM, exec->e_phnum); NEW_AUX_ENT(AT_BASE, interp_load_addr); if (bprm->interp_flags & BINPRM_FLAGS_PRESERVE_ARGV0) flags |= AT_FLAGS_PRESERVE_ARGV0; NEW_AUX_ENT(AT_FLAGS, flags); NEW_AUX_ENT(AT_ENTRY, e_entry); NEW_AUX_ENT(AT_UID, from_kuid_munged(cred->user_ns, cred->uid)); NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid)); NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid)); NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid)); NEW_AUX_ENT(AT_SECURE, bprm->secureexec); NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); #ifdef ELF_HWCAP2 NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); #endif #ifdef ELF_HWCAP3 NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3); #endif #ifdef ELF_HWCAP4 NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4); #endif NEW_AUX_ENT(AT_EXECFN, bprm->exec); if (k_platform) { NEW_AUX_ENT(AT_PLATFORM, (elf_addr_t)(unsigned long)u_platform); } if (k_base_platform) { NEW_AUX_ENT(AT_BASE_PLATFORM, (elf_addr_t)(unsigned long)u_base_platform); } if (bprm->have_execfd) { NEW_AUX_ENT(AT_EXECFD, bprm->execfd); } #ifdef CONFIG_RSEQ NEW_AUX_ENT(AT_RSEQ_FEATURE_SIZE, offsetof(struct rseq, end)); NEW_AUX_ENT(AT_RSEQ_ALIGN, __alignof__(struct rseq)); #endif #undef NEW_AUX_ENT /* AT_NULL is zero; clear the rest too */ memset(elf_info, 0, (char *)mm->saved_auxv + sizeof(mm->saved_auxv) - (char *)elf_info); /* And advance past the AT_NULL entry. */ elf_info += 2; ei_index = elf_info - (elf_addr_t *)mm->saved_auxv; sp = STACK_ADD(p, ei_index); items = (argc + 1) + (envc + 1) + 1; bprm->p = STACK_ROUND(sp, items); /* Point sp at the lowest address on the stack */ #ifdef CONFIG_STACK_GROWSUP sp = (elf_addr_t __user *)bprm->p - items - ei_index; bprm->exec = (unsigned long)sp; /* XXX: PARISC HACK */ #else sp = (elf_addr_t __user *)bprm->p; #endif /* * Grow the stack manually; some architectures have a limit on how * far ahead a user-space access may be in order to grow the stack. */ if (mmap_write_lock_killable(mm)) return -EINTR; vma = find_extend_vma_locked(mm, bprm->p); mmap_write_unlock(mm); if (!vma) return -EFAULT; /* Now, let's put argc (and argv, envp if appropriate) on the stack */ if (put_user(argc, sp++)) return -EFAULT; /* Populate list of argv pointers back to argv strings. */ p = mm->arg_end = mm->arg_start; while (argc-- > 0) { size_t len; if (put_user((elf_addr_t)p, sp++)) return -EFAULT; len = strnlen_user((void __user *)p, MAX_ARG_STRLEN); if (!len || len > MAX_ARG_STRLEN) return -EINVAL; p += len; } if (put_user(0, sp++)) return -EFAULT; mm->arg_end = p; /* Populate list of envp pointers back to envp strings. */ mm->env_end = mm->env_start = p; while (envc-- > 0) { size_t len; if (put_user((elf_addr_t)p, sp++)) return -EFAULT; len = strnlen_user((void __user *)p, MAX_ARG_STRLEN); if (!len || len > MAX_ARG_STRLEN) return -EINVAL; p += len; } if (put_user(0, sp++)) return -EFAULT; mm->env_end = p; /* Put the elf_info on the stack in the right place. */ if (copy_to_user(sp, mm->saved_auxv, ei_index * sizeof(elf_addr_t))) return -EFAULT; return 0; } /* * Map "eppnt->p_filesz" bytes from "filep" offset "eppnt->p_offset" * into memory at "addr". (Note that p_filesz is rounded up to the * next page, so any extra bytes from the file must be wiped.) 
*/ static unsigned long elf_map(struct file *filep, unsigned long addr, const struct elf_phdr *eppnt, int prot, int type, unsigned long total_size) { unsigned long map_addr; unsigned long size = eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr); unsigned long off = eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr); addr = ELF_PAGESTART(addr); size = ELF_PAGEALIGN(size); /* mmap() will return -EINVAL if given a zero size, but a * segment with zero filesize is perfectly valid */ if (!size) return addr; /* * total_size is the size of the ELF (interpreter) image. * The _first_ mmap needs to know the full size, otherwise * randomization might put this image into an overlapping * position with the ELF binary image. (since size < total_size) * So we first map the 'big' image - and unmap the remainder at * the end. (which unmap is needed for ELF images with holes.) */ if (total_size) { total_size = ELF_PAGEALIGN(total_size); map_addr = vm_mmap(filep, addr, total_size, prot, type, off); if (!BAD_ADDR(map_addr)) vm_munmap(map_addr+size, total_size-size); } else map_addr = vm_mmap(filep, addr, size, prot, type, off); if ((type & MAP_FIXED_NOREPLACE) && PTR_ERR((void *)map_addr) == -EEXIST) pr_info("%d (%s): Uhuuh, elf segment at %px requested but the memory is mapped already\n", task_pid_nr(current), current->comm, (void *)addr); return(map_addr); } /* * Map "eppnt->p_filesz" bytes from "filep" offset "eppnt->p_offset" * into memory at "addr". Memory from "p_filesz" through "p_memsz" * rounded up to the next page is zeroed. */ static unsigned long elf_load(struct file *filep, unsigned long addr, const struct elf_phdr *eppnt, int prot, int type, unsigned long total_size) { unsigned long zero_start, zero_end; unsigned long map_addr; if (eppnt->p_filesz) { map_addr = elf_map(filep, addr, eppnt, prot, type, total_size); if (BAD_ADDR(map_addr)) return map_addr; if (eppnt->p_memsz > eppnt->p_filesz) { zero_start = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) + eppnt->p_filesz; zero_end = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) + eppnt->p_memsz; /* * Zero the end of the last mapped page but ignore * any errors if the segment isn't writable. */ if (padzero(zero_start) && (prot & PROT_WRITE)) return -EFAULT; } } else { map_addr = zero_start = ELF_PAGESTART(addr); zero_end = zero_start + ELF_PAGEOFFSET(eppnt->p_vaddr) + eppnt->p_memsz; } if (eppnt->p_memsz > eppnt->p_filesz) { /* * Map the last of the segment. * If the header is requesting these pages to be * executable, honour that (ppc32 needs this). */ int error; zero_start = ELF_PAGEALIGN(zero_start); zero_end = ELF_PAGEALIGN(zero_end); error = vm_brk_flags(zero_start, zero_end - zero_start, prot & PROT_EXEC ? VM_EXEC : 0); if (error) map_addr = error; } return map_addr; } static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr) { elf_addr_t min_addr = -1; elf_addr_t max_addr = 0; bool pt_load = false; int i; for (i = 0; i < nr; i++) { if (phdr[i].p_type == PT_LOAD) { min_addr = min(min_addr, ELF_PAGESTART(phdr[i].p_vaddr)); max_addr = max(max_addr, phdr[i].p_vaddr + phdr[i].p_memsz); pt_load = true; } } return pt_load ? (max_addr - min_addr) : 0; } static int elf_read(struct file *file, void *buf, size_t len, loff_t pos) { ssize_t rv; rv = kernel_read(file, buf, len, &pos); if (unlikely(rv != len)) { return (rv < 0) ? 
rv : -EIO; } return 0; } static unsigned long maximum_alignment(struct elf_phdr *cmds, int nr) { unsigned long alignment = 0; int i; for (i = 0; i < nr; i++) { if (cmds[i].p_type == PT_LOAD) { unsigned long p_align = cmds[i].p_align; /* skip non-power of two alignments as invalid */ if (!is_power_of_2(p_align)) continue; alignment = max(alignment, p_align); } } /* ensure we align to at least one page */ return ELF_PAGEALIGN(alignment); } /** * load_elf_phdrs() - load ELF program headers * @elf_ex: ELF header of the binary whose program headers should be loaded * @elf_file: the opened ELF binary file * * Loads ELF program headers from the binary file elf_file, which has the ELF * header pointed to by elf_ex, into a newly allocated array. The caller is * responsible for freeing the allocated data. Returns NULL upon failure. */ static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, struct file *elf_file) { struct elf_phdr *elf_phdata = NULL; int retval = -1; unsigned int size; /* * If the size of this structure has changed, then punt, since * we will be doing the wrong thing. */ if (elf_ex->e_phentsize != sizeof(struct elf_phdr)) goto out; /* Sanity check the number of program headers... */ /* ...and their total size. */ size = sizeof(struct elf_phdr) * elf_ex->e_phnum; if (size == 0 || size > 65536) goto out; elf_phdata = kmalloc(size, GFP_KERNEL); if (!elf_phdata) goto out; /* Read in the program headers */ retval = elf_read(elf_file, elf_phdata, size, elf_ex->e_phoff); out: if (retval) { kfree(elf_phdata); elf_phdata = NULL; } return elf_phdata; } #ifndef CONFIG_ARCH_BINFMT_ELF_STATE /** * struct arch_elf_state - arch-specific ELF loading state * * This structure is used to preserve architecture specific data during * the loading of an ELF file, throughout the checking of architecture * specific ELF headers & through to the point where the ELF load is * known to be proceeding (ie. SET_PERSONALITY). * * This implementation is a dummy for architectures which require no * specific state. */ struct arch_elf_state { }; #define INIT_ARCH_ELF_STATE {} /** * arch_elf_pt_proc() - check a PT_LOPROC..PT_HIPROC ELF program header * @ehdr: The main ELF header * @phdr: The program header to check * @elf: The open ELF file * @is_interp: True if the phdr is from the interpreter of the ELF being * loaded, else false. * @state: Architecture-specific state preserved throughout the process * of loading the ELF. * * Inspects the program header phdr to validate its correctness and/or * suitability for the system. Called once per ELF program header in the * range PT_LOPROC to PT_HIPROC, for both the ELF being loaded and its * interpreter. * * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load * with that return code. */ static inline int arch_elf_pt_proc(struct elfhdr *ehdr, struct elf_phdr *phdr, struct file *elf, bool is_interp, struct arch_elf_state *state) { /* Dummy implementation, always proceed */ return 0; } /** * arch_check_elf() - check an ELF executable * @ehdr: The main ELF header * @has_interp: True if the ELF has an interpreter, else false. * @interp_ehdr: The interpreter's ELF header * @state: Architecture-specific state preserved throughout the process * of loading the ELF. * * Provides a final opportunity for architecture code to reject the loading * of the ELF & cause an exec syscall to return an error. This is called after * all program headers to be checked by arch_elf_pt_proc have been. 
* * Return: Zero to proceed with the ELF load, non-zero to fail the ELF load * with that return code. */ static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp, struct elfhdr *interp_ehdr, struct arch_elf_state *state) { /* Dummy implementation, always proceed */ return 0; } #endif /* !CONFIG_ARCH_BINFMT_ELF_STATE */ static inline int make_prot(u32 p_flags, struct arch_elf_state *arch_state, bool has_interp, bool is_interp) { int prot = 0; if (p_flags & PF_R) prot |= PROT_READ; if (p_flags & PF_W) prot |= PROT_WRITE; if (p_flags & PF_X) prot |= PROT_EXEC; return arch_elf_adjust_prot(prot, arch_state, has_interp, is_interp); } /* This is much more generalized than the library routine read function, so we keep this separate. Technically the library read function is only provided so that we can read a.out libraries that have an ELF header */ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, struct file *interpreter, unsigned long no_base, struct elf_phdr *interp_elf_phdata, struct arch_elf_state *arch_state) { struct elf_phdr *eppnt; unsigned long load_addr = 0; int load_addr_set = 0; unsigned long error = ~0UL; unsigned long total_size; int i; /* First of all, some simple consistency checks */ if (interp_elf_ex->e_type != ET_EXEC && interp_elf_ex->e_type != ET_DYN) goto out; if (!elf_check_arch(interp_elf_ex) || elf_check_fdpic(interp_elf_ex)) goto out; if (!can_mmap_file(interpreter)) goto out; total_size = total_mapping_size(interp_elf_phdata, interp_elf_ex->e_phnum); if (!total_size) { error = -EINVAL; goto out; } eppnt = interp_elf_phdata; for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { if (eppnt->p_type == PT_LOAD) { int elf_type = MAP_PRIVATE; int elf_prot = make_prot(eppnt->p_flags, arch_state, true, true); unsigned long vaddr = 0; unsigned long k, map_addr; vaddr = eppnt->p_vaddr; if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) elf_type |= MAP_FIXED; else if (no_base && interp_elf_ex->e_type == ET_DYN) load_addr = -vaddr; map_addr = elf_load(interpreter, load_addr + vaddr, eppnt, elf_prot, elf_type, total_size); total_size = 0; error = map_addr; if (BAD_ADDR(map_addr)) goto out; if (!load_addr_set && interp_elf_ex->e_type == ET_DYN) { load_addr = map_addr - ELF_PAGESTART(vaddr); load_addr_set = 1; } /* * Check to see if the section's size will overflow the * allowed task size. Note that p_filesz must always be * <= p_memsize so it's only necessary to check p_memsz. */ k = load_addr + eppnt->p_vaddr; if (BAD_ADDR(k) || eppnt->p_filesz > eppnt->p_memsz || eppnt->p_memsz > TASK_SIZE || TASK_SIZE - eppnt->p_memsz < k) { error = -ENOMEM; goto out; } } } error = load_addr; out: return error; } /* * These are the functions used to load ELF style executables and shared * libraries. There is no binary dependent code anywhere else. 
*/ static int parse_elf_property(const char *data, size_t *off, size_t datasz, struct arch_elf_state *arch, bool have_prev_type, u32 *prev_type) { size_t o, step; const struct gnu_property *pr; int ret; if (*off == datasz) return -ENOENT; if (WARN_ON_ONCE(*off > datasz || *off % ELF_GNU_PROPERTY_ALIGN)) return -EIO; o = *off; datasz -= *off; if (datasz < sizeof(*pr)) return -ENOEXEC; pr = (const struct gnu_property *)(data + o); o += sizeof(*pr); datasz -= sizeof(*pr); if (pr->pr_datasz > datasz) return -ENOEXEC; WARN_ON_ONCE(o % ELF_GNU_PROPERTY_ALIGN); step = round_up(pr->pr_datasz, ELF_GNU_PROPERTY_ALIGN); if (step > datasz) return -ENOEXEC; /* Properties are supposed to be unique and sorted on pr_type: */ if (have_prev_type && pr->pr_type <= *prev_type) return -ENOEXEC; *prev_type = pr->pr_type; ret = arch_parse_elf_property(pr->pr_type, data + o, pr->pr_datasz, ELF_COMPAT, arch); if (ret) return ret; *off = o + step; return 0; } #define NOTE_DATA_SZ SZ_1K #define NOTE_NAME_SZ (sizeof(NN_GNU_PROPERTY_TYPE_0)) static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, struct arch_elf_state *arch) { union { struct elf_note nhdr; char data[NOTE_DATA_SZ]; } note; loff_t pos; ssize_t n; size_t off, datasz; int ret; bool have_prev_type; u32 prev_type; if (!IS_ENABLED(CONFIG_ARCH_USE_GNU_PROPERTY) || !phdr) return 0; /* load_elf_binary() shouldn't call us unless this is true... */ if (WARN_ON_ONCE(phdr->p_type != PT_GNU_PROPERTY)) return -ENOEXEC; /* If the properties are crazy large, that's too bad (for now): */ if (phdr->p_filesz > sizeof(note)) return -ENOEXEC; pos = phdr->p_offset; n = kernel_read(f, &note, phdr->p_filesz, &pos); BUILD_BUG_ON(sizeof(note) < sizeof(note.nhdr) + NOTE_NAME_SZ); if (n < 0 || n < sizeof(note.nhdr) + NOTE_NAME_SZ) return -EIO; if (note.nhdr.n_type != NT_GNU_PROPERTY_TYPE_0 || note.nhdr.n_namesz != NOTE_NAME_SZ || strncmp(note.data + sizeof(note.nhdr), NN_GNU_PROPERTY_TYPE_0, n - sizeof(note.nhdr))) return -ENOEXEC; off = round_up(sizeof(note.nhdr) + NOTE_NAME_SZ, ELF_GNU_PROPERTY_ALIGN); if (off > n) return -ENOEXEC; if (note.nhdr.n_descsz > n - off) return -ENOEXEC; datasz = off + note.nhdr.n_descsz; have_prev_type = false; do { ret = parse_elf_property(note.data, &off, datasz, arch, have_prev_type, &prev_type); have_prev_type = true; } while (!ret); return ret == -ENOENT ? 
0 : ret; } static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ unsigned long load_bias = 0, phdr_addr = 0; int first_pt_load = 1; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; struct elf_phdr *elf_property_phdata = NULL; unsigned long elf_brk; bool brk_moved = false; int retval, i; unsigned long elf_entry; unsigned long e_entry; unsigned long interp_load_addr = 0; unsigned long start_code, end_code, start_data, end_data; unsigned long reloc_func_desc __maybe_unused = 0; int executable_stack = EXSTACK_DEFAULT; struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf; struct elfhdr *interp_elf_ex = NULL; struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE; struct mm_struct *mm; struct pt_regs *regs; retval = -ENOEXEC; /* First of all, some simple consistency checks */ if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0) goto out; if (elf_ex->e_type != ET_EXEC && elf_ex->e_type != ET_DYN) goto out; if (!elf_check_arch(elf_ex)) goto out; if (elf_check_fdpic(elf_ex)) goto out; if (!can_mmap_file(bprm->file)) goto out; elf_phdata = load_elf_phdrs(elf_ex, bprm->file); if (!elf_phdata) goto out; elf_ppnt = elf_phdata; for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) { char *elf_interpreter; if (elf_ppnt->p_type == PT_GNU_PROPERTY) { elf_property_phdata = elf_ppnt; continue; } if (elf_ppnt->p_type != PT_INTERP) continue; /* * This is the program interpreter used for shared libraries - * for now assume that this is an a.out format binary. */ retval = -ENOEXEC; if (elf_ppnt->p_filesz > PATH_MAX || elf_ppnt->p_filesz < 2) goto out_free_ph; retval = -ENOMEM; elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL); if (!elf_interpreter) goto out_free_ph; retval = elf_read(bprm->file, elf_interpreter, elf_ppnt->p_filesz, elf_ppnt->p_offset); if (retval < 0) goto out_free_interp; /* make sure path is NULL terminated */ retval = -ENOEXEC; if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') goto out_free_interp; interpreter = open_exec(elf_interpreter); kfree(elf_interpreter); retval = PTR_ERR(interpreter); if (IS_ERR(interpreter)) goto out_free_ph; /* * If the binary is not readable then enforce mm->dumpable = 0 * regardless of the interpreter's permissions. */ would_dump(bprm, interpreter); interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL); if (!interp_elf_ex) { retval = -ENOMEM; goto out_free_file; } /* Get the exec headers */ retval = elf_read(interpreter, interp_elf_ex, sizeof(*interp_elf_ex), 0); if (retval < 0) goto out_free_dentry; break; out_free_interp: kfree(elf_interpreter); goto out_free_ph; } elf_ppnt = elf_phdata; for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) switch (elf_ppnt->p_type) { case PT_GNU_STACK: if (elf_ppnt->p_flags & PF_X) executable_stack = EXSTACK_ENABLE_X; else executable_stack = EXSTACK_DISABLE_X; break; case PT_LOPROC ... 
PT_HIPROC: retval = arch_elf_pt_proc(elf_ex, elf_ppnt, bprm->file, false, &arch_state); if (retval) goto out_free_dentry; break; } /* Some simple consistency checks for the interpreter */ if (interpreter) { retval = -ELIBBAD; /* Not an ELF interpreter */ if (memcmp(interp_elf_ex->e_ident, ELFMAG, SELFMAG) != 0) goto out_free_dentry; /* Verify the interpreter has a valid arch */ if (!elf_check_arch(interp_elf_ex) || elf_check_fdpic(interp_elf_ex)) goto out_free_dentry; /* Load the interpreter program headers */ interp_elf_phdata = load_elf_phdrs(interp_elf_ex, interpreter); if (!interp_elf_phdata) goto out_free_dentry; /* Pass PT_LOPROC..PT_HIPROC headers to arch code */ elf_property_phdata = NULL; elf_ppnt = interp_elf_phdata; for (i = 0; i < interp_elf_ex->e_phnum; i++, elf_ppnt++) switch (elf_ppnt->p_type) { case PT_GNU_PROPERTY: elf_property_phdata = elf_ppnt; break; case PT_LOPROC ... PT_HIPROC: retval = arch_elf_pt_proc(interp_elf_ex, elf_ppnt, interpreter, true, &arch_state); if (retval) goto out_free_dentry; break; } } retval = parse_elf_properties(interpreter ?: bprm->file, elf_property_phdata, &arch_state); if (retval) goto out_free_dentry; /* * Allow arch code to reject the ELF at this point, whilst it's * still possible to return an error to the code that invoked * the exec syscall. */ retval = arch_check_elf(elf_ex, !!interpreter, interp_elf_ex, &arch_state); if (retval) goto out_free_dentry; /* Flush all traces of the currently running executable */ retval = begin_new_exec(bprm); if (retval) goto out_free_dentry; /* Do this immediately, since STACK_TOP as used in setup_arg_pages may depend on the personality. */ SET_PERSONALITY2(*elf_ex, &arch_state); if (elf_read_implies_exec(*elf_ex, executable_stack)) current->personality |= READ_IMPLIES_EXEC; const int snapshot_randomize_va_space = READ_ONCE(randomize_va_space); if (!(current->personality & ADDR_NO_RANDOMIZE) && snapshot_randomize_va_space) current->flags |= PF_RANDOMIZE; setup_new_exec(bprm); /* Do this so that we can load the interpreter, if need be. We will change some of these later */ retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), executable_stack); if (retval < 0) goto out_free_dentry; elf_brk = 0; start_code = ~0UL; end_code = 0; start_data = 0; end_data = 0; /* Now we do a little grungy work by mmapping the ELF image into the correct location in memory. */ for(i = 0, elf_ppnt = elf_phdata; i < elf_ex->e_phnum; i++, elf_ppnt++) { int elf_prot, elf_flags; unsigned long k, vaddr; unsigned long total_size = 0; unsigned long alignment; if (elf_ppnt->p_type != PT_LOAD) continue; elf_prot = make_prot(elf_ppnt->p_flags, &arch_state, !!interpreter, false); elf_flags = MAP_PRIVATE; vaddr = elf_ppnt->p_vaddr; /* * The first time through the loop, first_pt_load is true: * layout will be calculated. Once set, use MAP_FIXED since * we know we've already safely mapped the entire region with * MAP_FIXED_NOREPLACE in the once-per-binary logic following. */ if (!first_pt_load) { elf_flags |= MAP_FIXED; } else if (elf_ex->e_type == ET_EXEC) { /* * This logic is run once for the first LOAD Program * Header for ET_EXEC binaries. No special handling * is needed. */ elf_flags |= MAP_FIXED_NOREPLACE; } else if (elf_ex->e_type == ET_DYN) { /* * This logic is run once for the first LOAD Program * Header for ET_DYN binaries to calculate the * randomization (load_bias) for all the LOAD * Program Headers. 
*/ /* * Calculate the entire size of the ELF mapping * (total_size), used for the initial mapping, * due to load_addr_set which is set to true later * once the initial mapping is performed. * * Note that this is only sensible when the LOAD * segments are contiguous (or overlapping). If * used for LOADs that are far apart, this would * cause the holes between LOADs to be mapped, * running the risk of having the mapping fail, * as it would be larger than the ELF file itself. * * As a result, only ET_DYN does this, since * some ET_EXEC (e.g. ia64) may have large virtual * memory holes between LOADs. * */ total_size = total_mapping_size(elf_phdata, elf_ex->e_phnum); if (!total_size) { retval = -EINVAL; goto out_free_dentry; } /* Calculate any requested alignment. */ alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); /** * DOC: PIE handling * * There are effectively two types of ET_DYN ELF * binaries: programs (i.e. PIE: ET_DYN with * PT_INTERP) and loaders (i.e. static PIE: ET_DYN * without PT_INTERP, usually the ELF interpreter * itself). Loaders must be loaded away from programs * since the program may otherwise collide with the * loader (especially for ET_EXEC which does not have * a randomized position). * * For example, to handle invocations of * "./ld.so someprog" to test out a new version of * the loader, the subsequent program that the * loader loads must avoid the loader itself, so * they cannot share the same load range. Sufficient * room for the brk must be allocated with the * loader as well, since brk must be available with * the loader. * * Therefore, programs are loaded offset from * ELF_ET_DYN_BASE and loaders are loaded into the * independently randomized mmap region (0 load_bias * without MAP_FIXED nor MAP_FIXED_NOREPLACE). * * See below for "brk" handling details, which is * also affected by program vs loader and ASLR. */ if (interpreter) { /* On ET_DYN with PT_INTERP, we do the ASLR. */ load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); /* Adjust alignment as requested. */ if (alignment) load_bias &= ~(alignment - 1); elf_flags |= MAP_FIXED_NOREPLACE; } else { /* * For ET_DYN without PT_INTERP, we rely on * the architectures's (potentially ASLR) mmap * base address (via a load_bias of 0). * * When a large alignment is requested, we * must do the allocation at address "0" right * now to discover where things will load so * that we can adjust the resulting alignment. * In this case (load_bias != 0), we can use * MAP_FIXED_NOREPLACE to make sure the mapping * doesn't collide with anything. */ if (alignment > ELF_MIN_ALIGN) { load_bias = elf_load(bprm->file, 0, elf_ppnt, elf_prot, elf_flags, total_size); if (BAD_ADDR(load_bias)) { retval = IS_ERR_VALUE(load_bias) ? PTR_ERR((void*)load_bias) : -EINVAL; goto out_free_dentry; } vm_munmap(load_bias, total_size); /* Adjust alignment as requested. */ if (alignment) load_bias &= ~(alignment - 1); elf_flags |= MAP_FIXED_NOREPLACE; } else load_bias = 0; } /* * Since load_bias is used for all subsequent loading * calculations, we must lower it by the first vaddr * so that the remaining calculations based on the * ELF vaddrs will be correctly offset. The result * is then page aligned. */ load_bias = ELF_PAGESTART(load_bias - vaddr); } error = elf_load(bprm->file, load_bias + vaddr, elf_ppnt, elf_prot, elf_flags, total_size); if (BAD_ADDR(error)) { retval = IS_ERR_VALUE(error) ? 
PTR_ERR((void*)error) : -EINVAL; goto out_free_dentry; } if (first_pt_load) { first_pt_load = 0; if (elf_ex->e_type == ET_DYN) { load_bias += error - ELF_PAGESTART(load_bias + vaddr); reloc_func_desc = load_bias; } } /* * Figure out which segment in the file contains the Program * Header table, and map to the associated memory address. */ if (elf_ppnt->p_offset <= elf_ex->e_phoff && elf_ex->e_phoff < elf_ppnt->p_offset + elf_ppnt->p_filesz) { phdr_addr = elf_ex->e_phoff - elf_ppnt->p_offset + elf_ppnt->p_vaddr; } k = elf_ppnt->p_vaddr; if ((elf_ppnt->p_flags & PF_X) && k < start_code) start_code = k; if (start_data < k) start_data = k; /* * Check to see if the section's size will overflow the * allowed task size. Note that p_filesz must always be * <= p_memsz so it is only necessary to check p_memsz. */ if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz || elf_ppnt->p_memsz > TASK_SIZE || TASK_SIZE - elf_ppnt->p_memsz < k) { /* set_brk can never work. Avoid overflows. */ retval = -EINVAL; goto out_free_dentry; } k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz; if ((elf_ppnt->p_flags & PF_X) && end_code < k) end_code = k; if (end_data < k) end_data = k; k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz; if (k > elf_brk) elf_brk = k; } e_entry = elf_ex->e_entry + load_bias; phdr_addr += load_bias; elf_brk += load_bias; start_code += load_bias; end_code += load_bias; start_data += load_bias; end_data += load_bias; if (interpreter) { elf_entry = load_elf_interp(interp_elf_ex, interpreter, load_bias, interp_elf_phdata, &arch_state); if (!IS_ERR_VALUE(elf_entry)) { /* * load_elf_interp() returns relocation * adjustment */ interp_load_addr = elf_entry; elf_entry += interp_elf_ex->e_entry; } if (BAD_ADDR(elf_entry)) { retval = IS_ERR_VALUE(elf_entry) ? (int)elf_entry : -EINVAL; goto out_free_dentry; } reloc_func_desc = interp_load_addr; exe_file_allow_write_access(interpreter); fput(interpreter); kfree(interp_elf_ex); kfree(interp_elf_phdata); } else { elf_entry = e_entry; if (BAD_ADDR(elf_entry)) { retval = -EINVAL; goto out_free_dentry; } } kfree(elf_phdata); set_binfmt(&elf_format); #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES retval = ARCH_SETUP_ADDITIONAL_PAGES(bprm, elf_ex, !!interpreter); if (retval < 0) goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ retval = create_elf_tables(bprm, elf_ex, interp_load_addr, e_entry, phdr_addr); if (retval < 0) goto out; mm = current->mm; mm->end_code = end_code; mm->start_code = start_code; mm->start_data = start_data; mm->end_data = end_data; mm->start_stack = bprm->p; /** * DOC: "brk" handling * * For architectures with ELF randomization, when executing a * loader directly (i.e. static PIE: ET_DYN without PT_INTERP), * move the brk area out of the mmap region and into the unused * ELF_ET_DYN_BASE region. Since "brk" grows up it may collide * early with the stack growing down or other regions being put * into the mmap region by the kernel (e.g. vdso). * * In the CONFIG_COMPAT_BRK case, though, everything is turned * off because we're not allowed to move the brk at all. */ if (!IS_ENABLED(CONFIG_COMPAT_BRK) && IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && elf_ex->e_type == ET_DYN && !interpreter) { elf_brk = ELF_ET_DYN_BASE; /* This counts as moving the brk, so let brk(2) know. */ brk_moved = true; } mm->start_brk = mm->brk = ELF_PAGEALIGN(elf_brk); if ((current->flags & PF_RANDOMIZE) && snapshot_randomize_va_space > 1) { /* * If we didn't move the brk to ELF_ET_DYN_BASE (above), * leave a gap between .bss and brk. 
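		 *
		 * A rough, purely illustrative sketch of the code just
		 * below (hypothetical numbers, assuming a 4 KiB
		 * PAGE_SIZE): if ELF_PAGEALIGN(elf_brk) left start_brk at
		 * 0x602000, it is first bumped to 0x603000 to leave one
		 * page of slack after .bss, and arch_randomize_brk() then
		 * moves it further up by an arch-specific random amount.
		 * The exact offsets are architecture and ASLR dependent;
		 * do not rely on these values.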
		 */
		if (!brk_moved)
			mm->brk = mm->start_brk = mm->brk + PAGE_SIZE;

		mm->brk = mm->start_brk = arch_randomize_brk(mm);
		brk_moved = true;
	}

#ifdef compat_brk_randomized
	if (brk_moved)
		current->brk_randomized = 1;
#endif

	if (current->personality & MMAP_PAGE_ZERO) {
		/* Why this, you ask??? Well SVr4 maps page 0 as read-only,
		   and some applications "depend" upon this behavior.
		   Since we do not have the power to recompile these, we
		   emulate the SVr4 behavior. Sigh. */
		error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
				MAP_FIXED | MAP_PRIVATE, 0);

		retval = do_mseal(0, PAGE_SIZE, 0);
		if (retval)
			pr_warn_ratelimited("pid=%d, couldn't seal address 0, ret=%d.\n",
					    task_pid_nr(current), retval);
	}

	regs = current_pt_regs();
#ifdef ELF_PLAT_INIT
	/*
	 * The ABI may specify that certain registers be set up in special
	 * ways (on i386 %edx is the address of a DT_FINI function, for
	 * example). In addition, it may also specify (eg, PowerPC64 ELF)
	 * that the e_entry field is the address of the function descriptor
	 * for the startup routine, rather than the address of the startup
	 * routine itself. This macro performs whatever initialization to
	 * the regs structure is required as well as any relocations to the
	 * function descriptor entries when executing dynamically linked apps.
	 */
	ELF_PLAT_INIT(regs, reloc_func_desc);
#endif

	finalize_exec(bprm);
	START_THREAD(elf_ex, regs, elf_entry, bprm->p);
	retval = 0;
out:
	return retval;

	/* error cleanup */
out_free_dentry:
	kfree(interp_elf_ex);
	kfree(interp_elf_phdata);
out_free_file:
	exe_file_allow_write_access(interpreter);
	if (interpreter)
		fput(interpreter);
out_free_ph:
	kfree(elf_phdata);
	goto out;
}

#ifdef CONFIG_ELF_CORE
/*
 * ELF core dumper
 *
 * Modelled on fs/exec.c:aout_core_dump()
 * Jeremy Fitzhardinge <jeremy@sw.oz.au>
 */

/* An ELF note in memory */
struct memelfnote {
	const char *name;
	int type;
	unsigned int datasz;
	void *data;
};

static int notesize(struct memelfnote *en)
{
	int sz;

	sz = sizeof(struct elf_note);
	sz += roundup(strlen(en->name) + 1, 4);
	sz += roundup(en->datasz, 4);

	return sz;
}

static int writenote(struct memelfnote *men, struct coredump_params *cprm)
{
	struct elf_note en;

	en.n_namesz = strlen(men->name) + 1;
	en.n_descsz = men->datasz;
	en.n_type = men->type;

	return dump_emit(cprm, &en, sizeof(en)) &&
	    dump_emit(cprm, men->name, en.n_namesz) && dump_align(cprm, 4) &&
	    dump_emit(cprm, men->data, men->datasz) && dump_align(cprm, 4);
}

static void fill_elf_header(struct elfhdr *elf, int segs,
			    u16 machine, u32 flags)
{
	memset(elf, 0, sizeof(*elf));

	memcpy(elf->e_ident, ELFMAG, SELFMAG);
	elf->e_ident[EI_CLASS] = ELF_CLASS;
	elf->e_ident[EI_DATA] = ELF_DATA;
	elf->e_ident[EI_VERSION] = EV_CURRENT;
	elf->e_ident[EI_OSABI] = ELF_OSABI;

	elf->e_type = ET_CORE;
	elf->e_machine = machine;
	elf->e_version = EV_CURRENT;
	elf->e_phoff = sizeof(struct elfhdr);
	elf->e_flags = flags;
	elf->e_ehsize = sizeof(struct elfhdr);
	elf->e_phentsize = sizeof(struct elf_phdr);
	elf->e_phnum = segs;
}

static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset)
{
	phdr->p_type = PT_NOTE;
	phdr->p_offset = offset;
	phdr->p_vaddr = 0;
	phdr->p_paddr = 0;
	phdr->p_filesz = sz;
	phdr->p_memsz = 0;
	phdr->p_flags = 0;
	phdr->p_align = 4;
}

static void __fill_note(struct memelfnote *note, const char *name, int type,
			unsigned int sz, void *data)
{
	note->name = name;
	note->type = type;
	note->datasz = sz;
	note->data = data;
}

#define fill_note(note, type, sz, data) \
	__fill_note(note, NN_ ## type, NT_ ## type, sz, data)

/*
 * fill up all the fields in prstatus from the given task struct,
except * registers which need to be filled up separately. */ static void fill_prstatus(struct elf_prstatus_common *prstatus, struct task_struct *p, long signr) { prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; prstatus->pr_sigpend = p->pending.signal.sig[0]; prstatus->pr_sighold = p->blocked.sig[0]; rcu_read_lock(); prstatus->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); rcu_read_unlock(); prstatus->pr_pid = task_pid_vnr(p); prstatus->pr_pgrp = task_pgrp_vnr(p); prstatus->pr_sid = task_session_vnr(p); if (thread_group_leader(p)) { struct task_cputime cputime; /* * This is the record for the group leader. It shows the * group-wide total, not its individual thread total. */ thread_group_cputime(p, &cputime); prstatus->pr_utime = ns_to_kernel_old_timeval(cputime.utime); prstatus->pr_stime = ns_to_kernel_old_timeval(cputime.stime); } else { u64 utime, stime; task_cputime(p, &utime, &stime); prstatus->pr_utime = ns_to_kernel_old_timeval(utime); prstatus->pr_stime = ns_to_kernel_old_timeval(stime); } prstatus->pr_cutime = ns_to_kernel_old_timeval(p->signal->cutime); prstatus->pr_cstime = ns_to_kernel_old_timeval(p->signal->cstime); } static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, struct mm_struct *mm) { const struct cred *cred; unsigned int i, len; unsigned int state; /* first copy the parameters from user space */ memset(psinfo, 0, sizeof(struct elf_prpsinfo)); len = mm->arg_end - mm->arg_start; if (len >= ELF_PRARGSZ) len = ELF_PRARGSZ-1; if (copy_from_user(&psinfo->pr_psargs, (const char __user *)mm->arg_start, len)) return -EFAULT; for(i = 0; i < len; i++) if (psinfo->pr_psargs[i] == 0) psinfo->pr_psargs[i] = ' '; psinfo->pr_psargs[len] = 0; rcu_read_lock(); psinfo->pr_ppid = task_pid_vnr(rcu_dereference(p->real_parent)); rcu_read_unlock(); psinfo->pr_pid = task_pid_vnr(p); psinfo->pr_pgrp = task_pgrp_vnr(p); psinfo->pr_sid = task_session_vnr(p); state = READ_ONCE(p->__state); i = state ? ffz(~state) + 1 : 0; psinfo->pr_state = i; psinfo->pr_sname = (i > 5) ? '.' : "RSDTZW"[i]; psinfo->pr_zomb = psinfo->pr_sname == 'Z'; psinfo->pr_nice = task_nice(p); psinfo->pr_flag = p->flags; rcu_read_lock(); cred = __task_cred(p); SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid)); SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); rcu_read_unlock(); get_task_comm(psinfo->pr_fname, p); return 0; } static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm) { elf_addr_t *auxv = (elf_addr_t *) mm->saved_auxv; int i = 0; do i += 2; while (auxv[i - 2] != AT_NULL); fill_note(note, AUXV, i * sizeof(elf_addr_t), auxv); } static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, const kernel_siginfo_t *siginfo) { copy_siginfo_to_external(csigdata, siginfo); fill_note(note, SIGINFO, sizeof(*csigdata), csigdata); } /* * Format of NT_FILE note: * * long count -- how many files are mapped * long page_size -- units for file_ofs * array of [COUNT] elements of * long start * long end * long file_ofs * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL... 
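 *
 * A hypothetical instance with two mappings of one file might look like
 * this (all values invented purely for illustration):
 *
 *   count     = 2
 *   page_size = 4096
 *   [0] start = 0x400000, end = 0x401000, file_ofs = 0
 *   [1] start = 0x401000, end = 0x402000, file_ofs = 1
 *   "/bin/true" NUL "/bin/true" NUL
 *
 * i.e. file_ofs is expressed in page_size units, so an offset of one
 * page is recorded as 1, not 4096.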
*/ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm) { unsigned count, size, names_ofs, remaining, n; user_long_t *data; user_long_t *start_end_ofs; char *name_base, *name_curpos; int i; /* *Estimated* file count and total data size needed */ count = cprm->vma_count; if (count > UINT_MAX / 64) return -EINVAL; size = count * 64; names_ofs = (2 + 3 * count) * sizeof(data[0]); alloc: /* paranoia check */ if (size >= core_file_note_size_limit) { pr_warn_once("coredump Note size too large: %u (does kernel.core_file_note_size_limit sysctl need adjustment?\n", size); return -EINVAL; } size = round_up(size, PAGE_SIZE); /* * "size" can be 0 here legitimately. * Let it ENOMEM and omit NT_FILE section which will be empty anyway. */ data = kvmalloc(size, GFP_KERNEL); if (ZERO_OR_NULL_PTR(data)) return -ENOMEM; start_end_ofs = data + 2; name_base = name_curpos = ((char *)data) + names_ofs; remaining = size - names_ofs; count = 0; for (i = 0; i < cprm->vma_count; i++) { struct core_vma_metadata *m = &cprm->vma_meta[i]; struct file *file; const char *filename; file = m->file; if (!file) continue; filename = file_path(file, name_curpos, remaining); if (IS_ERR(filename)) { if (PTR_ERR(filename) == -ENAMETOOLONG) { kvfree(data); size = size * 5 / 4; goto alloc; } continue; } /* file_path() fills at the end, move name down */ /* n = strlen(filename) + 1: */ n = (name_curpos + remaining) - filename; remaining = filename - name_curpos; memmove(name_curpos, filename, n); name_curpos += n; *start_end_ofs++ = m->start; *start_end_ofs++ = m->end; *start_end_ofs++ = m->pgoff; count++; } /* Now we know exact count of files, can store it */ data[0] = count; data[1] = PAGE_SIZE; /* * Count usually is less than mm->map_count, * we need to move filenames down. */ n = cprm->vma_count - count; if (n != 0) { unsigned shift_bytes = n * 3 * sizeof(data[0]); memmove(name_base - shift_bytes, name_base, name_curpos - name_base); name_curpos -= shift_bytes; } size = name_curpos - (char *)data; fill_note(note, FILE, size, data); return 0; } #include <linux/regset.h> struct elf_thread_core_info { struct elf_thread_core_info *next; struct task_struct *task; struct elf_prstatus prstatus; struct memelfnote notes[]; }; struct elf_note_info { struct elf_thread_core_info *thread; struct memelfnote psinfo; struct memelfnote signote; struct memelfnote auxv; struct memelfnote files; user_siginfo_t csigdata; size_t size; int thread_notes; }; #ifdef CORE_DUMP_USE_REGSET /* * When a regset has a writeback hook, we call it on each thread before * dumping user memory. On register window machines, this makes sure the * user memory backing the register data is up to date before we read it. */ static void do_thread_regset_writeback(struct task_struct *task, const struct user_regset *regset) { if (regset->writeback) regset->writeback(task, regset, 1); } #ifndef PRSTATUS_SIZE #define PRSTATUS_SIZE sizeof(struct elf_prstatus) #endif #ifndef SET_PR_FPVALID #define SET_PR_FPVALID(S) ((S)->pr_fpvalid = 1) #endif static int fill_thread_core_info(struct elf_thread_core_info *t, const struct user_regset_view *view, long signr, struct elf_note_info *info) { unsigned int note_iter, view_iter; /* * NT_PRSTATUS is the one special case, because the regse