42 48 11 1135 1050 471 1050 1094 1119 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BLK_MQ_H #define BLK_MQ_H #include <linux/blkdev.h> #include <linux/sbitmap.h> #include <linux/lockdep.h> #include <linux/scatterlist.h> #include <linux/prefetch.h> #include <linux/srcu.h> struct blk_mq_tags; struct blk_flush_queue; #define BLKDEV_MIN_RQ 4 #define BLKDEV_DEFAULT_RQ 128 enum rq_end_io_ret { RQ_END_IO_NONE, RQ_END_IO_FREE, }; typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t); /* * request flags */ typedef __u32 __bitwise req_flags_t; /* drive already may have started this one */ #define RQF_STARTED ((__force req_flags_t)(1 << 1)) /* request for flush sequence */ #define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4)) /* merge of different types, fail separately */ #define RQF_MIXED_MERGE ((__force req_flags_t)(1 << 5)) /* don't call prep for this one */ #define RQF_DONTPREP ((__force req_flags_t)(1 << 7)) /* use hctx->sched_tags */ #define RQF_SCHED_TAGS ((__force req_flags_t)(1 << 8)) /* use an I/O scheduler for this request */ #define RQF_USE_SCHED ((__force req_flags_t)(1 << 9)) /* vaguely specified driver internal error. Ignored by the block layer */ #define RQF_FAILED ((__force req_flags_t)(1 << 10)) /* don't warn about errors */ #define RQF_QUIET ((__force req_flags_t)(1 << 11)) /* account into disk and partition IO statistics */ #define RQF_IO_STAT ((__force req_flags_t)(1 << 13)) /* runtime pm request */ #define RQF_PM ((__force req_flags_t)(1 << 15)) /* on IO scheduler merge hash */ #define RQF_HASHED ((__force req_flags_t)(1 << 16)) /* track IO completion time */ #define RQF_STATS ((__force req_flags_t)(1 << 17)) /* Look at ->special_vec for the actual data payload instead of the bio chain. */ #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) /* The per-zone write lock is held for this request */ #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) /* ->timeout has been called, don't expire again */ #define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) #define RQF_RESV ((__force req_flags_t)(1 << 23)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ (RQF_STARTED | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD) enum mq_rq_state { MQ_RQ_IDLE = 0, MQ_RQ_IN_FLIGHT = 1, MQ_RQ_COMPLETE = 2, }; /* * Try to put the fields that are referenced together in the same cacheline. * * If you modify this structure, make sure to update blk_rq_init() and * especially blk_mq_rq_ctx_init() to take care of the added fields. */ struct request { struct request_queue *q; struct blk_mq_ctx *mq_ctx; struct blk_mq_hw_ctx *mq_hctx; blk_opf_t cmd_flags; /* op and common flags */ req_flags_t rq_flags; int tag; int internal_tag; unsigned int timeout; /* the following two fields are internal, NEVER access directly */ unsigned int __data_len; /* total data len */ sector_t __sector; /* sector cursor */ struct bio *bio; struct bio *biotail; union { struct list_head queuelist; struct request *rq_next; }; struct block_device *part; #ifdef CONFIG_BLK_RQ_ALLOC_TIME /* Time that the first bio started allocating this request. */ u64 alloc_time_ns; #endif /* Time that this request was allocated for this IO. */ u64 start_time_ns; /* Time that I/O was submitted to the device. */ u64 io_start_time_ns; #ifdef CONFIG_BLK_WBT unsigned short wbt_flags; #endif /* * rq sectors used for blk stats. It has the same value * with blk_rq_sectors(rq), except that it never be zeroed * by completion. */ unsigned short stats_sectors; /* * Number of scatter-gather DMA addr+len pairs after * physical address coalescing is performed. */ unsigned short nr_phys_segments; #ifdef CONFIG_BLK_DEV_INTEGRITY unsigned short nr_integrity_segments; #endif #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct bio_crypt_ctx *crypt_ctx; struct blk_crypto_keyslot *crypt_keyslot; #endif unsigned short ioprio; enum mq_rq_state state; atomic_t ref; unsigned long deadline; /* * The hash is used inside the scheduler, and killed once the * request reaches the dispatch list. The ipi_list is only used * to queue the request for softirq completion, which is long * after the request has been unhashed (and even removed from * the dispatch list). */ union { struct hlist_node hash; /* merge hash */ struct llist_node ipi_list; }; /* * The rb_node is only used inside the io scheduler, requests * are pruned when moved to the dispatch queue. special_vec must * only be used if RQF_SPECIAL_PAYLOAD is set, and those cannot be * insert into an IO scheduler. */ union { struct rb_node rb_node; /* sort/lookup */ struct bio_vec special_vec; }; /* * Three pointers are available for the IO schedulers, if they need * more they have to dynamically allocate it. */ struct { struct io_cq *icq; void *priv[2]; } elv; struct { unsigned int seq; rq_end_io_fn *saved_end_io; } flush; u64 fifo_time; /* * completion callback. */ rq_end_io_fn *end_io; void *end_io_data; }; static inline enum req_op req_op(const struct request *req) { return req->cmd_flags & REQ_OP_MASK; } static inline bool blk_rq_is_passthrough(struct request *rq) { return blk_op_is_passthrough(rq->cmd_flags); } static inline unsigned short req_get_ioprio(struct request *req) { return req->ioprio; } #define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) #define rq_dma_dir(rq) \ (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) #define rq_list_add(listptr, rq) do { \ (rq)->rq_next = *(listptr); \ *(listptr) = rq; \ } while (0) #define rq_list_add_tail(lastpptr, rq) do { \ (rq)->rq_next = NULL; \ **(lastpptr) = rq; \ *(lastpptr) = &rq->rq_next; \ } while (0) #define rq_list_pop(listptr) \ ({ \ struct request *__req = NULL; \ if ((listptr) && *(listptr)) { \ __req = *(listptr); \ *(listptr) = __req->rq_next; \ } \ __req; \ }) #define rq_list_peek(listptr) \ ({ \ struct request *__req = NULL; \ if ((listptr) && *(listptr)) \ __req = *(listptr); \ __req; \ }) #define rq_list_for_each(listptr, pos) \ for (pos = rq_list_peek((listptr)); pos; pos = rq_list_next(pos)) #define rq_list_for_each_safe(listptr, pos, nxt) \ for (pos = rq_list_peek((listptr)), nxt = rq_list_next(pos); \ pos; pos = nxt, nxt = pos ? rq_list_next(pos) : NULL) #define rq_list_next(rq) (rq)->rq_next #define rq_list_empty(list) ((list) == (struct request *) NULL) /** * rq_list_move() - move a struct request from one list to another * @src: The source list @rq is currently in * @dst: The destination list that @rq will be appended to * @rq: The request to move * @prev: The request preceding @rq in @src (NULL if @rq is the head) */ static inline void rq_list_move(struct request **src, struct request **dst, struct request *rq, struct request *prev) { if (prev) prev->rq_next = rq->rq_next; else *src = rq->rq_next; rq_list_add(dst, rq); } /** * enum blk_eh_timer_return - How the timeout handler should proceed * @BLK_EH_DONE: The block driver completed the command or will complete it at * a later time. * @BLK_EH_RESET_TIMER: Reset the request timer and continue waiting for the * request to complete. */ enum blk_eh_timer_return { BLK_EH_DONE, BLK_EH_RESET_TIMER, }; #define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ #define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */ /** * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware * block device */ struct blk_mq_hw_ctx { struct { /** @lock: Protects the dispatch list. */ spinlock_t lock; /** * @dispatch: Used for requests that are ready to be * dispatched to the hardware but for some reason (e.g. lack of * resources) could not be sent to the hardware. As soon as the * driver can send new requests, requests at this list will * be sent first for a fairer dispatch. */ struct list_head dispatch; /** * @state: BLK_MQ_S_* flags. Defines the state of the hw * queue (active, scheduled to restart, stopped). */ unsigned long state; } ____cacheline_aligned_in_smp; /** * @run_work: Used for scheduling a hardware queue run at a later time. */ struct delayed_work run_work; /** @cpumask: Map of available CPUs where this hctx can run. */ cpumask_var_t cpumask; /** * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU * selection from @cpumask. */ int next_cpu; /** * @next_cpu_batch: Counter of how many works left in the batch before * changing to the next CPU. */ int next_cpu_batch; /** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */ unsigned long flags; /** * @sched_data: Pointer owned by the IO scheduler attached to a request * queue. It's up to the IO scheduler how to use this pointer. */ void *sched_data; /** * @queue: Pointer to the request queue that owns this hardware context. */ struct request_queue *queue; /** @fq: Queue of requests that need to perform a flush operation. */ struct blk_flush_queue *fq; /** * @driver_data: Pointer to data owned by the block driver that created * this hctx */ void *driver_data; /** * @ctx_map: Bitmap for each software queue. If bit is on, there is a * pending request in that software queue. */ struct sbitmap ctx_map; /** * @dispatch_from: Software queue to be used when no scheduler was * selected. */ struct blk_mq_ctx *dispatch_from; /** * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to * decide if the hw_queue is busy using Exponential Weighted Moving * Average algorithm. */ unsigned int dispatch_busy; /** @type: HCTX_TYPE_* flags. Type of hardware queue. */ unsigned short type; /** @nr_ctx: Number of software queues. */ unsigned short nr_ctx; /** @ctxs: Array of software queues. */ struct blk_mq_ctx **ctxs; /** @dispatch_wait_lock: Lock for dispatch_wait queue. */ spinlock_t dispatch_wait_lock; /** * @dispatch_wait: Waitqueue to put requests when there is no tag * available at the moment, to wait for another try in the future. */ wait_queue_entry_t dispatch_wait; /** * @wait_index: Index of next available dispatch_wait queue to insert * requests. */ atomic_t wait_index; /** * @tags: Tags owned by the block driver. A tag at this set is only * assigned when a request is dispatched from a hardware queue. */ struct blk_mq_tags *tags; /** * @sched_tags: Tags owned by I/O scheduler. If there is an I/O * scheduler associated with a request queue, a tag is assigned when * that request is allocated. Else, this member is not used. */ struct blk_mq_tags *sched_tags; /** @numa_node: NUMA node the storage adapter has been connected to. */ unsigned int numa_node; /** @queue_num: Index of this hardware queue. */ unsigned int queue_num; /** * @nr_active: Number of active requests. Only used when a tag set is * shared across request queues. */ atomic_t nr_active; /** @cpuhp_online: List to store request if CPU is going to die */ struct hlist_node cpuhp_online; /** @cpuhp_dead: List to store request if some CPU die. */ struct hlist_node cpuhp_dead; /** @kobj: Kernel object for sysfs. */ struct kobject kobj; #ifdef CONFIG_BLK_DEBUG_FS /** * @debugfs_dir: debugfs directory for this hardware queue. Named * as cpu<cpu_number>. */ struct dentry *debugfs_dir; /** @sched_debugfs_dir: debugfs directory for the scheduler. */ struct dentry *sched_debugfs_dir; #endif /** * @hctx_list: if this hctx is not in use, this is an entry in * q->unused_hctx_list. */ struct list_head hctx_list; }; /** * struct blk_mq_queue_map - Map software queues to hardware queues * @mq_map: CPU ID to hardware queue index map. This is an array * with nr_cpu_ids elements. Each element has a value in the range * [@queue_offset, @queue_offset + @nr_queues). * @nr_queues: Number of hardware queues to map CPU IDs onto. * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe * driver to map each hardware queue type (enum hctx_type) onto a distinct * set of hardware queues. */ struct blk_mq_queue_map { unsigned int *mq_map; unsigned int nr_queues; unsigned int queue_offset; }; /** * enum hctx_type - Type of hardware queue * @HCTX_TYPE_DEFAULT: All I/O not otherwise accounted for. * @HCTX_TYPE_READ: Just for READ I/O. * @HCTX_TYPE_POLL: Polled I/O of any kind. * @HCTX_MAX_TYPES: Number of types of hctx. */ enum hctx_type { HCTX_TYPE_DEFAULT, HCTX_TYPE_READ, HCTX_TYPE_POLL, HCTX_MAX_TYPES, }; /** * struct blk_mq_tag_set - tag set that can be shared between request queues * @ops: Pointers to functions that implement block driver behavior. * @map: One or more ctx -> hctx mappings. One map exists for each * hardware queue type (enum hctx_type) that the driver wishes * to support. There are no restrictions on maps being of the * same size, and it's perfectly legal to share maps between * types. * @nr_maps: Number of elements in the @map array. A number in the range * [1, HCTX_MAX_TYPES]. * @nr_hw_queues: Number of hardware queues supported by the block driver that * owns this data structure. * @queue_depth: Number of tags per hardware queue, reserved tags included. * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag * allocations. * @cmd_size: Number of additional bytes to allocate per request. The block * driver owns these additional bytes. * @numa_node: NUMA node the storage adapter has been connected to. * @timeout: Request processing timeout in jiffies. * @flags: Zero or more BLK_MQ_F_* flags. * @driver_data: Pointer to data owned by the block driver that created this * tag set. * @tags: Tag sets. One tag set per hardware queue. Has @nr_hw_queues * elements. * @shared_tags: * Shared set of tags. Has @nr_hw_queues elements. If set, * shared by all @tags. * @tag_list_lock: Serializes tag_list accesses. * @tag_list: List of the request queues that use this tag set. See also * request_queue.tag_set_list. * @srcu: Use as lock when type of the request queue is blocking * (BLK_MQ_F_BLOCKING). */ struct blk_mq_tag_set { const struct blk_mq_ops *ops; struct blk_mq_queue_map map[HCTX_MAX_TYPES]; unsigned int nr_maps; unsigned int nr_hw_queues; unsigned int queue_depth; unsigned int reserved_tags; unsigned int cmd_size; int numa_node; unsigned int timeout; unsigned int flags; void *driver_data; struct blk_mq_tags **tags; struct blk_mq_tags *shared_tags; struct mutex tag_list_lock; struct list_head tag_list; struct srcu_struct *srcu; }; /** * struct blk_mq_queue_data - Data about a request inserted in a queue * * @rq: Request pointer. * @last: If it is the last request in the queue. */ struct blk_mq_queue_data { struct request *rq; bool last; }; typedef bool (busy_tag_iter_fn)(struct request *, void *); /** * struct blk_mq_ops - Callback functions that implements block driver * behaviour. */ struct blk_mq_ops { /** * @queue_rq: Queue a new request from block IO. */ blk_status_t (*queue_rq)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *); /** * @commit_rqs: If a driver uses bd->last to judge when to submit * requests to hardware, it must define this function. In case of errors * that make us stop issuing further requests, this hook serves the * purpose of kicking the hardware (which the last request otherwise * would have done). */ void (*commit_rqs)(struct blk_mq_hw_ctx *); /** * @queue_rqs: Queue a list of new requests. Driver is guaranteed * that each request belongs to the same queue. If the driver doesn't * empty the @rqlist completely, then the rest will be queued * individually by the block layer upon return. */ void (*queue_rqs)(struct request **rqlist); /** * @get_budget: Reserve budget before queue request, once .queue_rq is * run, it is driver's responsibility to release the * reserved budget. Also we have to handle failure case * of .get_budget for avoiding I/O deadlock. */ int (*get_budget)(struct request_queue *); /** * @put_budget: Release the reserved budget. */ void (*put_budget)(struct request_queue *, int); /** * @set_rq_budget_token: store rq's budget token */ void (*set_rq_budget_token)(struct request *, int); /** * @get_rq_budget_token: retrieve rq's budget token */ int (*get_rq_budget_token)(struct request *); /** * @timeout: Called on request timeout. */ enum blk_eh_timer_return (*timeout)(struct request *); /** * @poll: Called to poll for completion of a specific tag. */ int (*poll)(struct blk_mq_hw_ctx *, struct io_comp_batch *); /** * @complete: Mark the request as complete. */ void (*complete)(struct request *); /** * @init_hctx: Called when the block layer side of a hardware queue has * been set up, allowing the driver to allocate/init matching * structures. */ int (*init_hctx)(struct blk_mq_hw_ctx *, void *, unsigned int); /** * @exit_hctx: Ditto for exit/teardown. */ void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); /** * @init_request: Called for every command allocated by the block layer * to allow the driver to set up driver specific data. * * Tag greater than or equal to queue_depth is for setting up * flush request. */ int (*init_request)(struct blk_mq_tag_set *set, struct request *, unsigned int, unsigned int); /** * @exit_request: Ditto for exit/teardown. */ void (*exit_request)(struct blk_mq_tag_set *set, struct request *, unsigned int); /** * @cleanup_rq: Called before freeing one request which isn't completed * yet, and usually for freeing the driver private data. */ void (*cleanup_rq)(struct request *); /** * @busy: If set, returns whether or not this queue currently is busy. */ bool (*busy)(struct request_queue *); /** * @map_queues: This allows drivers specify their own queue mapping by * overriding the setup-time function that builds the mq_map. */ void (*map_queues)(struct blk_mq_tag_set *set); #ifdef CONFIG_BLK_DEBUG_FS /** * @show_rq: Used by the debugfs implementation to show driver-specific * information about a request. */ void (*show_rq)(struct seq_file *m, struct request *rq); #endif }; enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_QUEUE_SHARED = 1 << 1, /* * Set when this device requires underlying blk-mq device for * completing IO: */ BLK_MQ_F_STACKING = 1 << 2, BLK_MQ_F_TAG_HCTX_SHARED = 1 << 3, BLK_MQ_F_BLOCKING = 1 << 5, /* Do not allow an I/O scheduler to be configured. */ BLK_MQ_F_NO_SCHED = 1 << 6, /* * Select 'none' during queue registration in case of a single hwq * or shared hwqs instead of 'mq-deadline'. */ BLK_MQ_F_NO_SCHED_BY_DEFAULT = 1 << 7, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, BLK_MQ_F_ALLOC_POLICY_BITS = 1, BLK_MQ_S_STOPPED = 0, BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_SCHED_RESTART = 2, /* hw queue is inactive after all its CPUs become offline */ BLK_MQ_S_INACTIVE = 3, BLK_MQ_MAX_DEPTH = 10240, BLK_MQ_CPU_WORK_BATCH = 8, }; #define BLK_MQ_FLAG_TO_ALLOC_POLICY(flags) \ ((flags >> BLK_MQ_F_ALLOC_POLICY_START_BIT) & \ ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) #define BLK_ALLOC_POLICY_TO_MQ_FLAG(policy) \ ((policy & ((1 << BLK_MQ_F_ALLOC_POLICY_BITS) - 1)) \ << BLK_MQ_F_ALLOC_POLICY_START_BIT) #define BLK_MQ_NO_HCTX_IDX (-1U) struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, struct lock_class_key *lkclass); #define blk_mq_alloc_disk(set, queuedata) \ ({ \ static struct lock_class_key __key; \ \ __blk_mq_alloc_disk(set, queuedata, &__key); \ }) struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, struct lock_class_key *lkclass); struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); void blk_mq_destroy_queue(struct request_queue *); int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int queue_depth, unsigned int set_flags); void blk_mq_free_tag_set(struct blk_mq_tag_set *set); void blk_mq_free_request(struct request *rq); int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, unsigned int poll_flags); bool blk_mq_queue_inflight(struct request_queue *q); enum { /* return when out of requests */ BLK_MQ_REQ_NOWAIT = (__force blk_mq_req_flags_t)(1 << 0), /* allocate from reserved pool */ BLK_MQ_REQ_RESERVED = (__force blk_mq_req_flags_t)(1 << 1), /* set RQF_PM */ BLK_MQ_REQ_PM = (__force blk_mq_req_flags_t)(1 << 2), }; struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx); /* * Tag address space map. */ struct blk_mq_tags { unsigned int nr_tags; unsigned int nr_reserved_tags; unsigned int active_queues; struct sbitmap_queue bitmap_tags; struct sbitmap_queue breserved_tags; struct request **rqs; struct request **static_rqs; struct list_head page_list; /* * used to clear request reference in rqs[] before freeing one * request pool */ spinlock_t lock; }; static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { if (tag < tags->nr_tags) { prefetch(tags->rqs[tag]); return tags->rqs[tag]; } return NULL; } enum { BLK_MQ_UNIQUE_TAG_BITS = 16, BLK_MQ_UNIQUE_TAG_MASK = (1 << BLK_MQ_UNIQUE_TAG_BITS) - 1, }; u32 blk_mq_unique_tag(struct request *rq); static inline u16 blk_mq_unique_tag_to_hwq(u32 unique_tag) { return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS; } static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) { return unique_tag & BLK_MQ_UNIQUE_TAG_MASK; } /** * blk_mq_rq_state() - read the current MQ_RQ_* state of a request * @rq: target request. */ static inline enum mq_rq_state blk_mq_rq_state(struct request *rq) { return READ_ONCE(rq->state); } static inline int blk_mq_request_started(struct request *rq) { return blk_mq_rq_state(rq) != MQ_RQ_IDLE; } static inline int blk_mq_request_completed(struct request *rq) { return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE; } /* * * Set the state to complete when completing a request from inside ->queue_rq. * This is used by drivers that want to ensure special complete actions that * need access to the request are called on failure, e.g. by nvme for * multipathing. */ static inline void blk_mq_set_request_complete(struct request *rq) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); } /* * Complete the request directly instead of deferring it to softirq or * completing it another CPU. Useful in preemptible instead of an interrupt. */ static inline void blk_mq_complete_request_direct(struct request *rq, void (*complete)(struct request *rq)) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); complete(rq); } void blk_mq_start_request(struct request *rq); void blk_mq_end_request(struct request *rq, blk_status_t error); void __blk_mq_end_request(struct request *rq, blk_status_t error); void blk_mq_end_request_batch(struct io_comp_batch *ib); /* * Only need start/end time stamping if we have iostat or * blk stats enabled, or using an IO scheduler. */ static inline bool blk_mq_need_time_stamp(struct request *rq) { /* * passthrough io doesn't use iostat accounting, cgroup stats * and io scheduler functionalities. */ if (blk_rq_is_passthrough(rq)) return false; return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS | RQF_USE_SCHED)); } static inline bool blk_mq_is_reserved_rq(struct request *rq) { return rq->rq_flags & RQF_RESV; } /* * Batched completions only work when there is no I/O error and no special * ->end_io handler. */ static inline bool blk_mq_add_to_batch(struct request *req, struct io_comp_batch *iob, int ioerror, void (*complete)(struct io_comp_batch *)) { /* * blk_mq_end_request_batch() can't end request allocated from * sched tags */ if (!iob || (req->rq_flags & RQF_SCHED_TAGS) || ioerror || (req->end_io && !blk_rq_is_passthrough(req))) return false; if (!iob->complete) iob->complete = complete; else if (iob->complete != complete) return false; iob->need_ts |= blk_mq_need_time_stamp(req); rq_list_add(&iob->req_list, req); return true; } void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list); void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_complete_request(struct request *rq); bool blk_mq_complete_request_remote(struct request *rq); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx); void blk_mq_stop_hw_queues(struct request_queue *q); void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_quiesce_queue(struct request_queue *q); void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set); void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set); void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set); void blk_mq_unquiesce_queue(struct request_queue *q); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); void blk_mq_map_queues(struct blk_mq_queue_map *qmap); void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues); void blk_mq_quiesce_queue_nowait(struct request_queue *q); unsigned int blk_mq_rq_cpu(struct request *rq); bool __blk_should_fake_timeout(struct request_queue *q); static inline bool blk_should_fake_timeout(struct request_queue *q) { if (IS_ENABLED(CONFIG_FAIL_IO_TIMEOUT) && test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags)) return __blk_should_fake_timeout(q); return false; } /** * blk_mq_rq_from_pdu - cast a PDU to a request * @pdu: the PDU (Protocol Data Unit) to be casted * * Return: request * * Driver command data is immediately after the request. So subtract request * size to get back to the original request. */ static inline struct request *blk_mq_rq_from_pdu(void *pdu) { return pdu - sizeof(struct request); } /** * blk_mq_rq_to_pdu - cast a request to a PDU * @rq: the request to be casted * * Return: pointer to the PDU * * Driver command data is immediately after the request. So add request to get * the PDU. */ static inline void *blk_mq_rq_to_pdu(struct request *rq) { return rq + 1; } #define queue_for_each_hw_ctx(q, hctx, i) \ xa_for_each(&(q)->hctx_table, (i), (hctx)) #define hctx_for_each_ctx(hctx, ctx, i) \ for ((i) = 0; (i) < (hctx)->nr_ctx && \ ({ ctx = (hctx)->ctxs[(i)]; 1; }); (i)++) static inline void blk_mq_cleanup_rq(struct request *rq) { if (rq->q->mq_ops->cleanup_rq) rq->q->mq_ops->cleanup_rq(rq); } static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, unsigned int nr_segs) { rq->nr_phys_segments = nr_segs; rq->__data_len = bio->bi_iter.bi_size; rq->bio = rq->biotail = bio; rq->ioprio = bio_prio(bio); } void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, struct lock_class_key *key); static inline bool rq_is_sync(struct request *rq) { return op_is_sync(rq->cmd_flags); } void blk_rq_init(struct request_queue *q, struct request *rq); int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data); void blk_rq_unprep_clone(struct request *rq); blk_status_t blk_insert_cloned_request(struct request *rq); struct rq_map_data { struct page **pages; unsigned long offset; unsigned short page_order; unsigned short nr_entries; bool null_mapped; bool from_user; }; int blk_rq_map_user(struct request_queue *, struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t); int blk_rq_map_user_io(struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t, bool, int, bool, int); int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); int blk_rq_unmap_user(struct bio *); int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); int blk_rq_append_bio(struct request *rq, struct bio *bio); void blk_execute_rq_nowait(struct request *rq, bool at_head); blk_status_t blk_execute_rq(struct request *rq, bool at_head); bool blk_rq_is_poll(struct request *rq); struct req_iterator { struct bvec_iter iter; struct bio *bio; }; #define __rq_for_each_bio(_bio, rq) \ if ((rq->bio)) \ for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next) #define rq_for_each_segment(bvl, _rq, _iter) \ __rq_for_each_bio(_iter.bio, _rq) \ bio_for_each_segment(bvl, _iter.bio, _iter.iter) #define rq_for_each_bvec(bvl, _rq, _iter) \ __rq_for_each_bio(_iter.bio, _rq) \ bio_for_each_bvec(bvl, _iter.bio, _iter.iter) #define rq_iter_last(bvec, _iter) \ (_iter.bio->bi_next == NULL && \ bio_iter_last(bvec, _iter.iter)) /* * blk_rq_pos() : the current sector * blk_rq_bytes() : bytes left in the entire request * blk_rq_cur_bytes() : bytes left in the current segment * blk_rq_sectors() : sectors left in the entire request * blk_rq_cur_sectors() : sectors left in the current segment * blk_rq_stats_sectors() : sectors of the entire request used for stats */ static inline sector_t blk_rq_pos(const struct request *rq) { return rq->__sector; } static inline unsigned int blk_rq_bytes(const struct request *rq) { return rq->__data_len; } static inline int blk_rq_cur_bytes(const struct request *rq) { if (!rq->bio) return 0; if (!bio_has_data(rq->bio)) /* dataless requests such as discard */ return rq->bio->bi_iter.bi_size; return bio_iovec(rq->bio).bv_len; } static inline unsigned int blk_rq_sectors(const struct request *rq) { return blk_rq_bytes(rq) >> SECTOR_SHIFT; } static inline unsigned int blk_rq_cur_sectors(const struct request *rq) { return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT; } static inline unsigned int blk_rq_stats_sectors(const struct request *rq) { return rq->stats_sectors; } /* * Some commands like WRITE SAME have a payload or data transfer size which * is different from the size of the request. Any driver that supports such * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to * calculate the data transfer size. */ static inline unsigned int blk_rq_payload_bytes(struct request *rq) { if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) return rq->special_vec.bv_len; return blk_rq_bytes(rq); } /* * Return the first full biovec in the request. The caller needs to check that * there are any bvecs before calling this helper. */ static inline struct bio_vec req_bvec(struct request *rq) { if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) return rq->special_vec; return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter); } static inline unsigned int blk_rq_count_bios(struct request *rq) { unsigned int nr_bios = 0; struct bio *bio; __rq_for_each_bio(bio, rq) nr_bios++; return nr_bios; } void blk_steal_bios(struct bio_list *list, struct request *rq); /* * Request completion related functions. * * blk_update_request() completes given number of bytes and updates * the request without completing it. */ bool blk_update_request(struct request *rq, blk_status_t error, unsigned int nr_bytes); void blk_abort_request(struct request *); /* * Number of physical segments as sent to the device. * * Normally this is the number of discontiguous data segments sent by the * submitter. But for data-less command like discard we might have no * actual data segments submitted, but the driver might have to add it's * own special payload. In that case we still return 1 here so that this * special payload will be mapped. */ static inline unsigned short blk_rq_nr_phys_segments(struct request *rq) { if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) return 1; return rq->nr_phys_segments; } /* * Number of discard segments (or ranges) the driver needs to fill in. * Each discard bio merged into a request is counted as one segment. */ static inline unsigned short blk_rq_nr_discard_segments(struct request *rq) { return max_t(unsigned short, rq->nr_phys_segments, 1); } int __blk_rq_map_sg(struct request_queue *q, struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg); static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, struct scatterlist *sglist) { struct scatterlist *last_sg = NULL; return __blk_rq_map_sg(q, rq, sglist, &last_sg); } void blk_dump_rq_flags(struct request *, char *); #ifdef CONFIG_BLK_DEV_ZONED static inline unsigned int blk_rq_zone_no(struct request *rq) { return disk_zone_no(rq->q->disk, blk_rq_pos(rq)); } static inline unsigned int blk_rq_zone_is_seq(struct request *rq) { return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq)); } /** * blk_rq_is_seq_zoned_write() - Check if @rq requires write serialization. * @rq: Request to examine. * * Note: REQ_OP_ZONE_APPEND requests do not require serialization. */ static inline bool blk_rq_is_seq_zoned_write(struct request *rq) { return op_needs_zoned_write_locking(req_op(rq)) && blk_rq_zone_is_seq(rq); } bool blk_req_needs_zone_write_lock(struct request *rq); bool blk_req_zone_write_trylock(struct request *rq); void __blk_req_zone_write_lock(struct request *rq); void __blk_req_zone_write_unlock(struct request *rq); static inline void blk_req_zone_write_lock(struct request *rq) { if (blk_req_needs_zone_write_lock(rq)) __blk_req_zone_write_lock(rq); } static inline void blk_req_zone_write_unlock(struct request *rq) { if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED) __blk_req_zone_write_unlock(rq); } static inline bool blk_req_zone_is_write_locked(struct request *rq) { return rq->q->disk->seq_zones_wlock && test_bit(blk_rq_zone_no(rq), rq->q->disk->seq_zones_wlock); } static inline bool blk_req_can_dispatch_to_zone(struct request *rq) { if (!blk_req_needs_zone_write_lock(rq)) return true; return !blk_req_zone_is_write_locked(rq); } #else /* CONFIG_BLK_DEV_ZONED */ static inline bool blk_rq_is_seq_zoned_write(struct request *rq) { return false; } static inline bool blk_req_needs_zone_write_lock(struct request *rq) { return false; } static inline void blk_req_zone_write_lock(struct request *rq) { } static inline void blk_req_zone_write_unlock(struct request *rq) { } static inline bool blk_req_zone_is_write_locked(struct request *rq) { return false; } static inline bool blk_req_can_dispatch_to_zone(struct request *rq) { return true; } #endif /* CONFIG_BLK_DEV_ZONED */ #endif /* BLK_MQ_H */
12 4 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _NET_GSO_H #define _NET_GSO_H #include <linux/skbuff.h> /* Keeps track of mac header offset relative to skb->head. * It is useful for TSO of Tunneling protocol. e.g. GRE. * For non-tunnel skb it points to skb_mac_header() and for * tunnel skb it points to outer mac header. * Keeps track of level of encapsulation of network headers. */ struct skb_gso_cb { union { int mac_offset; int data_offset; }; int encap_level; __wsum csum; __u16 csum_start; }; #define SKB_GSO_CB_OFFSET 32 #define SKB_GSO_CB(skb) ((struct skb_gso_cb *)((skb)->cb + SKB_GSO_CB_OFFSET)) static inline int skb_tnl_header_len(const struct sk_buff *inner_skb) { return (skb_mac_header(inner_skb) - inner_skb->head) - SKB_GSO_CB(inner_skb)->mac_offset; } static inline int gso_pskb_expand_head(struct sk_buff *skb, int extra) { int new_headroom, headroom; int ret; headroom = skb_headroom(skb); ret = pskb_expand_head(skb, extra, 0, GFP_ATOMIC); if (ret) return ret; new_headroom = skb_headroom(skb); SKB_GSO_CB(skb)->mac_offset += (new_headroom - headroom); return 0; } static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res) { /* Do not update partial checksums if remote checksum is enabled. */ if (skb->remcsum_offload) return; SKB_GSO_CB(skb)->csum = res; SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head; } /* Compute the checksum for a gso segment. First compute the checksum value * from the start of transport header to SKB_GSO_CB(skb)->csum_start, and * then add in skb->csum (checksum from csum_start to end of packet). * skb->csum and csum_start are then updated to reflect the checksum of the * resultant packet starting from the transport header-- the resultant checksum * is in the res argument (i.e. normally zero or ~ of checksum of a pseudo * header. */ static inline __sum16 gso_make_checksum(struct sk_buff *skb, __wsum res) { unsigned char *csum_start = skb_transport_header(skb); int plen = (skb->head + SKB_GSO_CB(skb)->csum_start) - csum_start; __wsum partial = SKB_GSO_CB(skb)->csum; SKB_GSO_CB(skb)->csum = res; SKB_GSO_CB(skb)->csum_start = csum_start - skb->head; return csum_fold(csum_partial(csum_start, plen, partial)); } struct sk_buff *__skb_gso_segment(struct sk_buff *skb, netdev_features_t features, bool tx_path); static inline struct sk_buff *skb_gso_segment(struct sk_buff *skb, netdev_features_t features) { return __skb_gso_segment(skb, features, true); } struct sk_buff *skb_eth_gso_segment(struct sk_buff *skb, netdev_features_t features, __be16 type); struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, netdev_features_t features); bool skb_gso_validate_network_len(const struct sk_buff *skb, unsigned int mtu); bool skb_gso_validate_mac_len(const struct sk_buff *skb, unsigned int len); static inline void skb_gso_error_unwind(struct sk_buff *skb, __be16 protocol, int pulled_hlen, u16 mac_offset, int mac_len) { skb->protocol = protocol; skb->encapsulation = 1; skb_push(skb, pulled_hlen); skb_reset_transport_header(skb); skb->mac_header = mac_offset; skb->network_header = skb->mac_header + mac_len; skb->mac_len = mac_len; } #endif /* _NET_GSO_H */
2 2 2 1 1 4 1 1 2 2 1 1 3 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 // SPDX-License-Identifier: GPL-2.0-or-later /* * taskstats.c - Export per-task statistics to userland * * Copyright (C) Shailabh Nagar, IBM Corp. 2006 * (C) Balbir Singh, IBM Corp. 2006 */ #include <linux/kernel.h> #include <linux/taskstats_kern.h> #include <linux/tsacct_kern.h> #include <linux/acct.h> #include <linux/delayacct.h> #include <linux/cpumask.h> #include <linux/percpu.h> #include <linux/slab.h> #include <linux/cgroupstats.h> #include <linux/cgroup.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/pid_namespace.h> #include <net/genetlink.h> #include <linux/atomic.h> #include <linux/sched/cputime.h> /* * Maximum length of a cpumask that can be specified in * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute */ #define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) static DEFINE_PER_CPU(__u32, taskstats_seqnum); static int family_registered; struct kmem_cache *taskstats_cache; static struct genl_family family; static const struct nla_policy taskstats_cmd_get_policy[] = { [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; static const struct nla_policy cgroupstats_cmd_get_policy[] = { [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, }; struct listener { struct list_head list; pid_t pid; char valid; }; struct listener_list { struct rw_semaphore sem; struct list_head list; }; static DEFINE_PER_CPU(struct listener_list, listener_array); enum actions { REGISTER, DEREGISTER, CPU_DONT_CARE }; static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, size_t size) { struct sk_buff *skb; void *reply; /* * If new attributes are added, please revisit this allocation */ skb = genlmsg_new(size, GFP_KERNEL); if (!skb) return -ENOMEM; if (!info) { int seq = this_cpu_inc_return(taskstats_seqnum) - 1; reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); } else reply = genlmsg_put_reply(skb, info, &family, 0, cmd); if (reply == NULL) { nlmsg_free(skb); return -EINVAL; } *skbp = skb; return 0; } /* * Send taskstats data in @skb to listener with nl_pid @pid */ static int send_reply(struct sk_buff *skb, struct genl_info *info) { struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); void *reply = genlmsg_data(genlhdr); genlmsg_end(skb, reply); return genlmsg_reply(skb, info); } /* * Send taskstats data in @skb to listeners registered for @cpu's exit data */ static void send_cpu_listeners(struct sk_buff *skb, struct listener_list *listeners) { struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); struct listener *s, *tmp; struct sk_buff *skb_next, *skb_cur = skb; void *reply = genlmsg_data(genlhdr); int delcount = 0; genlmsg_end(skb, reply); down_read(&listeners->sem); list_for_each_entry(s, &listeners->list, list) { int rc; skb_next = NULL; if (!list_is_last(&s->list, &listeners->list)) { skb_next = skb_clone(skb_cur, GFP_KERNEL); if (!skb_next) break; } rc = genlmsg_unicast(&init_net, skb_cur, s->pid); if (rc == -ECONNREFUSED) { s->valid = 0; delcount++; } skb_cur = skb_next; } up_read(&listeners->sem); if (skb_cur) nlmsg_free(skb_cur); if (!delcount) return; /* Delete invalidated entries */ down_write(&listeners->sem); list_for_each_entry_safe(s, tmp, &listeners->list, list) { if (!s->valid) { list_del(&s->list); kfree(s); } } up_write(&listeners->sem); } static void exe_add_tsk(struct taskstats *stats, struct task_struct *tsk) { /* No idea if I'm allowed to access that here, now. */ struct file *exe_file = get_task_exe_file(tsk); if (exe_file) { /* Following cp_new_stat64() in stat.c . */ stats->ac_exe_dev = huge_encode_dev(exe_file->f_inode->i_sb->s_dev); stats->ac_exe_inode = exe_file->f_inode->i_ino; fput(exe_file); } else { stats->ac_exe_dev = 0; stats->ac_exe_inode = 0; } } static void fill_stats(struct user_namespace *user_ns, struct pid_namespace *pid_ns, struct task_struct *tsk, struct taskstats *stats) { memset(stats, 0, sizeof(*stats)); /* * Each accounting subsystem adds calls to its functions to * fill in relevant parts of struct taskstsats as follows * * per-task-foo(stats, tsk); */ delayacct_add_tsk(stats, tsk); /* fill in basic acct fields */ stats->version = TASKSTATS_VERSION; stats->nvcsw = tsk->nvcsw; stats->nivcsw = tsk->nivcsw; bacct_add_tsk(user_ns, pid_ns, stats, tsk); /* fill in extended acct fields */ xacct_add_tsk(stats, tsk); /* add executable info */ exe_add_tsk(stats, tsk); } static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) { struct task_struct *tsk; tsk = find_get_task_by_vpid(pid); if (!tsk) return -ESRCH; fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats); put_task_struct(tsk); return 0; } static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) { struct task_struct *tsk, *first; unsigned long flags; int rc = -ESRCH; u64 delta, utime, stime; u64 start_time; /* * Add additional stats from live tasks except zombie thread group * leaders who are already counted with the dead tasks */ rcu_read_lock(); first = find_task_by_vpid(tgid); if (!first || !lock_task_sighand(first, &flags)) goto out; if (first->signal->stats) memcpy(stats, first->signal->stats, sizeof(*stats)); else memset(stats, 0, sizeof(*stats)); start_time = ktime_get_ns(); for_each_thread(first, tsk) { if (tsk->exit_state) continue; /* * Accounting subsystem can call its functions here to * fill in relevant parts of struct taskstsats as follows * * per-task-foo(stats, tsk); */ delayacct_add_tsk(stats, tsk); /* calculate task elapsed time in nsec */ delta = start_time - tsk->start_time; /* Convert to micro seconds */ do_div(delta, NSEC_PER_USEC); stats->ac_etime += delta; task_cputime(tsk, &utime, &stime); stats->ac_utime += div_u64(utime, NSEC_PER_USEC); stats->ac_stime += div_u64(stime, NSEC_PER_USEC); stats->nvcsw += tsk->nvcsw; stats->nivcsw += tsk->nivcsw; } unlock_task_sighand(first, &flags); rc = 0; out: rcu_read_unlock(); stats->version = TASKSTATS_VERSION; /* * Accounting subsystems can also add calls here to modify * fields of taskstats. */ return rc; } static void fill_tgid_exit(struct task_struct *tsk) { unsigned long flags; spin_lock_irqsave(&tsk->sighand->siglock, flags); if (!tsk->signal->stats) goto ret; /* * Each accounting subsystem calls its functions here to * accumalate its per-task stats for tsk, into the per-tgid structure * * per-task-foo(tsk->signal->stats, tsk); */ delayacct_add_tsk(tsk->signal->stats, tsk); ret: spin_unlock_irqrestore(&tsk->sighand->siglock, flags); return; } static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) { struct listener_list *listeners; struct listener *s, *tmp, *s2; unsigned int cpu; int ret = 0; if (!cpumask_subset(mask, cpu_possible_mask)) return -EINVAL; if (current_user_ns() != &init_user_ns) return -EINVAL; if (task_active_pid_ns(current) != &init_pid_ns) return -EINVAL; if (isadd == REGISTER) { for_each_cpu(cpu, mask) { s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, cpu_to_node(cpu)); if (!s) { ret = -ENOMEM; goto cleanup; } s->pid = pid; s->valid = 1; listeners = &per_cpu(listener_array, cpu); down_write(&listeners->sem); list_for_each_entry(s2, &listeners->list, list) { if (s2->pid == pid && s2->valid) goto exists; } list_add(&s->list, &listeners->list); s = NULL; exists: up_write(&listeners->sem); kfree(s); /* nop if NULL */ } return 0; } /* Deregister or cleanup */ cleanup: for_each_cpu(cpu, mask) { listeners = &per_cpu(listener_array, cpu); down_write(&listeners->sem); list_for_each_entry_safe(s, tmp, &listeners->list, list) { if (s->pid == pid) { list_del(&s->list); kfree(s); break; } } up_write(&listeners->sem); } return ret; } static int parse(struct nlattr *na, struct cpumask *mask) { char *data; int len; int ret; if (na == NULL) return 1; len = nla_len(na); if (len > TASKSTATS_CPUMASK_MAXLEN) return -E2BIG; if (len < 1) return -EINVAL; data = kmalloc(len, GFP_KERNEL); if (!data) return -ENOMEM; nla_strscpy(data, na, len); ret = cpulist_parse(data, mask); kfree(data); return ret; } static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) { struct nlattr *na, *ret; int aggr; aggr = (type == TASKSTATS_TYPE_PID) ? TASKSTATS_TYPE_AGGR_PID : TASKSTATS_TYPE_AGGR_TGID; na = nla_nest_start_noflag(skb, aggr); if (!na) goto err; if (nla_put(skb, type, sizeof(pid), &pid) < 0) { nla_nest_cancel(skb, na); goto err; } ret = nla_reserve_64bit(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats), TASKSTATS_TYPE_NULL); if (!ret) { nla_nest_cancel(skb, na); goto err; } nla_nest_end(skb, na); return nla_data(ret); err: return NULL; } static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) { int rc = 0; struct sk_buff *rep_skb; struct cgroupstats *stats; struct nlattr *na; size_t size; u32 fd; struct fd f; na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; if (!na) return -EINVAL; fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); f = fdget(fd); if (!f.file) return 0; size = nla_total_size(sizeof(struct cgroupstats)); rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, size); if (rc < 0) goto err; na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, sizeof(struct cgroupstats)); if (na == NULL) { nlmsg_free(rep_skb); rc = -EMSGSIZE; goto err; } stats = nla_data(na); memset(stats, 0, sizeof(*stats)); rc = cgroupstats_build(stats, f.file->f_path.dentry); if (rc < 0) { nlmsg_free(rep_skb); goto err; } rc = send_reply(rep_skb, info); err: fdput(f); return rc; } static int cmd_attr_register_cpumask(struct genl_info *info) { cpumask_var_t mask; int rc; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); if (rc < 0) goto out; rc = add_del_listener(info->snd_portid, mask, REGISTER); out: free_cpumask_var(mask); return rc; } static int cmd_attr_deregister_cpumask(struct genl_info *info) { cpumask_var_t mask; int rc; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); if (rc < 0) goto out; rc = add_del_listener(info->snd_portid, mask, DEREGISTER); out: free_cpumask_var(mask); return rc; } static size_t taskstats_packet_size(void) { size_t size; size = nla_total_size(sizeof(u32)) + nla_total_size_64bit(sizeof(struct taskstats)) + nla_total_size(0); return size; } static int cmd_attr_pid(struct genl_info *info) { struct taskstats *stats; struct sk_buff *rep_skb; size_t size; u32 pid; int rc; size = taskstats_packet_size(); rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); if (rc < 0) return rc; rc = -EINVAL; pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); if (!stats) goto err; rc = fill_stats_for_pid(pid, stats); if (rc < 0) goto err; return send_reply(rep_skb, info); err: nlmsg_free(rep_skb); return rc; } static int cmd_attr_tgid(struct genl_info *info) { struct taskstats *stats; struct sk_buff *rep_skb; size_t size; u32 tgid; int rc; size = taskstats_packet_size(); rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); if (rc < 0) return rc; rc = -EINVAL; tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); if (!stats) goto err; rc = fill_stats_for_tgid(tgid, stats); if (rc < 0) goto err; return send_reply(rep_skb, info); err: nlmsg_free(rep_skb); return rc; } static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) { if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) return cmd_attr_register_cpumask(info); else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) return cmd_attr_deregister_cpumask(info); else if (info->attrs[TASKSTATS_CMD_ATTR_PID]) return cmd_attr_pid(info); else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) return cmd_attr_tgid(info); else return -EINVAL; } static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; struct taskstats *stats_new, *stats; /* Pairs with smp_store_release() below. */ stats = smp_load_acquire(&sig->stats); if (stats || thread_group_empty(tsk)) return stats; /* No problem if kmem_cache_zalloc() fails */ stats_new = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); spin_lock_irq(&tsk->sighand->siglock); stats = sig->stats; if (!stats) { /* * Pairs with smp_store_release() above and order the * kmem_cache_zalloc(). */ smp_store_release(&sig->stats, stats_new); stats = stats_new; stats_new = NULL; } spin_unlock_irq(&tsk->sighand->siglock); if (stats_new) kmem_cache_free(taskstats_cache, stats_new); return stats; } /* Send pid data out on exit */ void taskstats_exit(struct task_struct *tsk, int group_dead) { int rc; struct listener_list *listeners; struct taskstats *stats; struct sk_buff *rep_skb; size_t size; int is_thread_group; if (!family_registered) return; /* * Size includes space for nested attributes */ size = taskstats_packet_size(); is_thread_group = !!taskstats_tgid_alloc(tsk); if (is_thread_group) { /* PID + STATS + TGID + STATS */ size = 2 * size; /* fill the tsk->signal->stats structure */ fill_tgid_exit(tsk); } listeners = raw_cpu_ptr(&listener_array); if (list_empty(&listeners->list)) return; rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); if (rc < 0) return; stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, task_pid_nr_ns(tsk, &init_pid_ns)); if (!stats) goto err; fill_stats(&init_user_ns, &init_pid_ns, tsk, stats); if (group_dead) stats->ac_flag |= AGROUP; /* * Doesn't matter if tsk is the leader or the last group member leaving */ if (!is_thread_group || !group_dead) goto send; stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, task_tgid_nr_ns(tsk, &init_pid_ns)); if (!stats) goto err; memcpy(stats, tsk->signal->stats, sizeof(*stats)); send: send_cpu_listeners(rep_skb, listeners); return; err: nlmsg_free(rep_skb); } static const struct genl_ops taskstats_ops[] = { { .cmd = TASKSTATS_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = taskstats_user_cmd, .policy = taskstats_cmd_get_policy, .maxattr = ARRAY_SIZE(taskstats_cmd_get_policy) - 1, .flags = GENL_ADMIN_PERM, }, { .cmd = CGROUPSTATS_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = cgroupstats_user_cmd, .policy = cgroupstats_cmd_get_policy, .maxattr = ARRAY_SIZE(cgroupstats_cmd_get_policy) - 1, }, }; static struct genl_family family __ro_after_init = { .name = TASKSTATS_GENL_NAME, .version = TASKSTATS_GENL_VERSION, .module = THIS_MODULE, .ops = taskstats_ops, .n_ops = ARRAY_SIZE(taskstats_ops), .resv_start_op = CGROUPSTATS_CMD_GET + 1, .netnsok = true, }; /* Needed early in initialization */ void __init taskstats_init_early(void) { unsigned int i; taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC); for_each_possible_cpu(i) { INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); init_rwsem(&(per_cpu(listener_array, i).sem)); } } static int __init taskstats_init(void) { int rc; rc = genl_register_family(&family); if (rc) return rc; family_registered = 1; pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); return 0; } /* * late initcall ensures initialization of statistics collection * mechanisms precedes initialization of the taskstats interface */ late_initcall(taskstats_init);
34 2 24 30 626 947 20 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BITOPS_H #define _LINUX_BITOPS_H #include <asm/types.h> #include <linux/bits.h> #include <linux/typecheck.h> #include <uapi/linux/kernel.h> /* Set bits in the first 'n' bytes when loaded from memory */ #ifdef __LITTLE_ENDIAN # define aligned_byte_mask(n) ((1UL << 8*(n))-1) #else # define aligned_byte_mask(n) (~0xffUL << (BITS_PER_LONG - 8 - 8*(n))) #endif #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) #define BITS_TO_LONGS(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) #define BITS_TO_U64(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u64)) #define BITS_TO_U32(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(u32)) #define BITS_TO_BYTES(nr) __KERNEL_DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); extern unsigned int __sw_hweight32(unsigned int w); extern unsigned long __sw_hweight64(__u64 w); /* * Defined here because those may be needed by architecture-specific static * inlines. */ #include <asm-generic/bitops/generic-non-atomic.h> /* * Many architecture-specific non-atomic bitops contain inline asm code and due * to that the compiler can't optimize them to compile-time expressions or * constants. In contrary, generic_*() helpers are defined in pure C and * compilers optimize them just well. * Therefore, to make `unsigned long foo = 0; __set_bit(BAR, &foo)` effectively * equal to `unsigned long foo = BIT(BAR)`, pick the generic C alternative when * the arguments can be resolved at compile time. That expression itself is a * constant and doesn't bring any functional changes to the rest of cases. * The casts to `uintptr_t` are needed to mitigate `-Waddress` warnings when * passing a bitmap from .bss or .data (-> `!!addr` is always true). */ #define bitop(op, nr, addr) \ ((__builtin_constant_p(nr) && \ __builtin_constant_p((uintptr_t)(addr) != (uintptr_t)NULL) && \ (uintptr_t)(addr) != (uintptr_t)NULL && \ __builtin_constant_p(*(const unsigned long *)(addr))) ? \ const##op(nr, addr) : op(nr, addr)) #define __set_bit(nr, addr) bitop(___set_bit, nr, addr) #define __clear_bit(nr, addr) bitop(___clear_bit, nr, addr) #define __change_bit(nr, addr) bitop(___change_bit, nr, addr) #define __test_and_set_bit(nr, addr) bitop(___test_and_set_bit, nr, addr) #define __test_and_clear_bit(nr, addr) bitop(___test_and_clear_bit, nr, addr) #define __test_and_change_bit(nr, addr) bitop(___test_and_change_bit, nr, addr) #define test_bit(nr, addr) bitop(_test_bit, nr, addr) #define test_bit_acquire(nr, addr) bitop(_test_bit_acquire, nr, addr) /* * Include this here because some architectures need generic_ffs/fls in * scope */ #include <asm/bitops.h> /* Check that the bitops prototypes are sane */ #define __check_bitop_pr(name) \ static_assert(__same_type(arch_##name, generic_##name) && \ __same_type(const_##name, generic_##name) && \ __same_type(_##name, generic_##name)) __check_bitop_pr(__set_bit); __check_bitop_pr(__clear_bit); __check_bitop_pr(__change_bit); __check_bitop_pr(__test_and_set_bit); __check_bitop_pr(__test_and_clear_bit); __check_bitop_pr(__test_and_change_bit); __check_bitop_pr(test_bit); #undef __check_bitop_pr static inline int get_bitmask_order(unsigned int count) { int order; order = fls(count); return order; /* We could be slightly more clever with -1 here... */ } static __always_inline unsigned long hweight_long(unsigned long w) { return sizeof(w) == 4 ? hweight32(w) : hweight64((__u64)w); } /** * rol64 - rotate a 64-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u64 rol64(__u64 word, unsigned int shift) { return (word << (shift & 63)) | (word >> ((-shift) & 63)); } /** * ror64 - rotate a 64-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u64 ror64(__u64 word, unsigned int shift) { return (word >> (shift & 63)) | (word << ((-shift) & 63)); } /** * rol32 - rotate a 32-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u32 rol32(__u32 word, unsigned int shift) { return (word << (shift & 31)) | (word >> ((-shift) & 31)); } /** * ror32 - rotate a 32-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u32 ror32(__u32 word, unsigned int shift) { return (word >> (shift & 31)) | (word << ((-shift) & 31)); } /** * rol16 - rotate a 16-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u16 rol16(__u16 word, unsigned int shift) { return (word << (shift & 15)) | (word >> ((-shift) & 15)); } /** * ror16 - rotate a 16-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u16 ror16(__u16 word, unsigned int shift) { return (word >> (shift & 15)) | (word << ((-shift) & 15)); } /** * rol8 - rotate an 8-bit value left * @word: value to rotate * @shift: bits to roll */ static inline __u8 rol8(__u8 word, unsigned int shift) { return (word << (shift & 7)) | (word >> ((-shift) & 7)); } /** * ror8 - rotate an 8-bit value right * @word: value to rotate * @shift: bits to roll */ static inline __u8 ror8(__u8 word, unsigned int shift) { return (word >> (shift & 7)) | (word << ((-shift) & 7)); } /** * sign_extend32 - sign extend a 32-bit value using specified bit as sign-bit * @value: value to sign extend * @index: 0 based bit index (0<=index<32) to sign bit * * This is safe to use for 16- and 8-bit types as well. */ static __always_inline __s32 sign_extend32(__u32 value, int index) { __u8 shift = 31 - index; return (__s32)(value << shift) >> shift; } /** * sign_extend64 - sign extend a 64-bit value using specified bit as sign-bit * @value: value to sign extend * @index: 0 based bit index (0<=index<64) to sign bit */ static __always_inline __s64 sign_extend64(__u64 value, int index) { __u8 shift = 63 - index; return (__s64)(value << shift) >> shift; } static inline unsigned fls_long(unsigned long l) { if (sizeof(l) == 4) return fls(l); return fls64(l); } static inline int get_count_order(unsigned int count) { if (count == 0) return -1; return fls(--count); } /** * get_count_order_long - get order after rounding @l up to power of 2 * @l: parameter * * it is same as get_count_order() but with long type parameter */ static inline int get_count_order_long(unsigned long l) { if (l == 0UL) return -1; return (int)fls_long(--l); } /** * __ffs64 - find first set bit in a 64 bit word * @word: The 64 bit word * * On 64 bit arches this is a synonym for __ffs * The result is not defined if no bits are set, so check that @word * is non-zero before calling this. */ static inline unsigned long __ffs64(u64 word) { #if BITS_PER_LONG == 32 if (((u32)word) == 0UL) return __ffs((u32)(word >> 32)) + 32; #elif BITS_PER_LONG != 64 #error BITS_PER_LONG not 32 or 64 #endif return __ffs((unsigned long)word); } /** * fns - find N'th set bit in a word * @word: The word to search * @n: Bit to find */ static inline unsigned long fns(unsigned long word, unsigned int n) { unsigned int bit; while (word) { bit = __ffs(word); if (n-- == 0) return bit; __clear_bit(bit, &word); } return BITS_PER_LONG; } /** * assign_bit - Assign value to a bit in memory * @nr: the bit to set * @addr: the address to start counting from * @value: the value to assign */ static __always_inline void assign_bit(long nr, volatile unsigned long *addr, bool value) { if (value) set_bit(nr, addr); else clear_bit(nr, addr); } static __always_inline void __assign_bit(long nr, volatile unsigned long *addr, bool value) { if (value) __set_bit(nr, addr); else __clear_bit(nr, addr); } /** * __ptr_set_bit - Set bit in a pointer's value * @nr: the bit to set * @addr: the address of the pointer variable * * Example: * void *p = foo(); * __ptr_set_bit(bit, &p); */ #define __ptr_set_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ __set_bit(nr, (unsigned long *)(addr)); \ }) /** * __ptr_clear_bit - Clear bit in a pointer's value * @nr: the bit to clear * @addr: the address of the pointer variable * * Example: * void *p = foo(); * __ptr_clear_bit(bit, &p); */ #define __ptr_clear_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ __clear_bit(nr, (unsigned long *)(addr)); \ }) /** * __ptr_test_bit - Test bit in a pointer's value * @nr: the bit to test * @addr: the address of the pointer variable * * Example: * void *p = foo(); * if (__ptr_test_bit(bit, &p)) { * ... * } else { * ... * } */ #define __ptr_test_bit(nr, addr) \ ({ \ typecheck_pointer(*(addr)); \ test_bit(nr, (unsigned long *)(addr)); \ }) #ifdef __KERNEL__ #ifndef set_mask_bits #define set_mask_bits(ptr, mask, bits) \ ({ \ const typeof(*(ptr)) mask__ = (mask), bits__ = (bits); \ typeof(*(ptr)) old__, new__; \ \ old__ = READ_ONCE(*(ptr)); \ do { \ new__ = (old__ & ~mask__) | bits__; \ } while (!try_cmpxchg(ptr, &old__, new__)); \ \ old__; \ }) #endif #ifndef bit_clear_unless #define bit_clear_unless(ptr, clear, test) \ ({ \ const typeof(*(ptr)) clear__ = (clear), test__ = (test);\ typeof(*(ptr)) old__, new__; \ \ old__ = READ_ONCE(*(ptr)); \ do { \ if (old__ & test__) \ break; \ new__ = old__ & ~clear__; \ } while (!try_cmpxchg(ptr, &old__, new__)); \ \ !(old__ & test__); \ }) #endif #endif /* __KERNEL__ */ #endif
105 40 65 95 10 102 3 468 264 105 18 23 11 54 3 75 31 48 2 2 2 1 2 24 21 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 // SPDX-License-Identifier: GPL-2.0-only /* * * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/pagemap.h> #include <linux/mount.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/statfs.h> #include <linux/magic.h> #include <linux/fscache.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include "v9fs.h" #include "v9fs_vfs.h" #include "fid.h" #include "xattr.h" #include "acl.h" static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl; /** * v9fs_set_super - set the superblock * @s: super block * @data: file system specific data * */ static int v9fs_set_super(struct super_block *s, void *data) { s->s_fs_info = data; return set_anon_super(s, data); } /** * v9fs_fill_super - populate superblock with info * @sb: superblock * @v9ses: session information * @flags: flags propagated from v9fs_mount() * */ static int v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, int flags) { int ret; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize_bits = fls(v9ses->maxdata - 1); sb->s_blocksize = 1 << sb->s_blocksize_bits; sb->s_magic = V9FS_MAGIC; if (v9fs_proto_dotl(v9ses)) { sb->s_op = &v9fs_super_ops_dotl; if (!(v9ses->flags & V9FS_NO_XATTR)) sb->s_xattr = v9fs_xattr_handlers; } else { sb->s_op = &v9fs_super_ops; sb->s_time_max = U32_MAX; } sb->s_time_min = 0; ret = super_setup_bdi(sb); if (ret) return ret; if (!v9ses->cache) { sb->s_bdi->ra_pages = 0; sb->s_bdi->io_pages = 0; } else { sb->s_bdi->ra_pages = v9ses->maxdata >> PAGE_SHIFT; sb->s_bdi->io_pages = v9ses->maxdata >> PAGE_SHIFT; } sb->s_flags |= SB_ACTIVE; #ifdef CONFIG_9P_FS_POSIX_ACL if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL) sb->s_flags |= SB_POSIXACL; #endif return 0; } /** * v9fs_mount - mount a superblock * @fs_type: file system type * @flags: mount flags * @dev_name: device name that was mounted * @data: mount options * */ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { struct super_block *sb = NULL; struct inode *inode = NULL; struct dentry *root = NULL; struct v9fs_session_info *v9ses = NULL; umode_t mode = 0777 | S_ISVTX; struct p9_fid *fid; int retval = 0; p9_debug(P9_DEBUG_VFS, "\n"); v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) return ERR_PTR(-ENOMEM); fid = v9fs_session_init(v9ses, dev_name, data); if (IS_ERR(fid)) { retval = PTR_ERR(fid); goto free_session; } sb = sget(fs_type, NULL, v9fs_set_super, flags, v9ses); if (IS_ERR(sb)) { retval = PTR_ERR(sb); goto clunk_fid; } retval = v9fs_fill_super(sb, v9ses, flags); if (retval) goto release_sb; if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) sb->s_d_op = &v9fs_cached_dentry_operations; else sb->s_d_op = &v9fs_dentry_operations; inode = v9fs_get_inode(sb, S_IFDIR | mode, 0); if (IS_ERR(inode)) { retval = PTR_ERR(inode); goto release_sb; } root = d_make_root(inode); if (!root) { retval = -ENOMEM; goto release_sb; } sb->s_root = root; if (v9fs_proto_dotl(v9ses)) { struct p9_stat_dotl *st = NULL; st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); if (IS_ERR(st)) { retval = PTR_ERR(st); goto release_sb; } d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); v9fs_stat2inode_dotl(st, d_inode(root), 0); kfree(st); } else { struct p9_wstat *st = NULL; st = p9_client_stat(fid); if (IS_ERR(st)) { retval = PTR_ERR(st); goto release_sb; } d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); v9fs_stat2inode(st, d_inode(root), sb, 0); p9stat_free(st); kfree(st); } retval = v9fs_get_acl(inode, fid); if (retval) goto release_sb; v9fs_fid_add(root, &fid); p9_debug(P9_DEBUG_VFS, " simple set mount, return 0\n"); return dget(sb->s_root); clunk_fid: p9_fid_put(fid); v9fs_session_close(v9ses); free_session: kfree(v9ses); return ERR_PTR(retval); release_sb: /* * we will do the session_close and root dentry release * in the below call. But we need to clunk fid, because we haven't * attached the fid to dentry so it won't get clunked * automatically. */ p9_fid_put(fid); deactivate_locked_super(sb); return ERR_PTR(retval); } /** * v9fs_kill_super - Kill Superblock * @s: superblock * */ static void v9fs_kill_super(struct super_block *s) { struct v9fs_session_info *v9ses = s->s_fs_info; p9_debug(P9_DEBUG_VFS, " %p\n", s); kill_anon_super(s); v9fs_session_cancel(v9ses); v9fs_session_close(v9ses); kfree(v9ses); s->s_fs_info = NULL; p9_debug(P9_DEBUG_VFS, "exiting kill_super\n"); } static void v9fs_umount_begin(struct super_block *sb) { struct v9fs_session_info *v9ses; v9ses = sb->s_fs_info; v9fs_session_begin_cancel(v9ses); } static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct v9fs_session_info *v9ses; struct p9_fid *fid; struct p9_rstatfs rs; int res; fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) { res = PTR_ERR(fid); goto done; } v9ses = v9fs_dentry2v9ses(dentry); if (v9fs_proto_dotl(v9ses)) { res = p9_client_statfs(fid, &rs); if (res == 0) { buf->f_type = rs.type; buf->f_bsize = rs.bsize; buf->f_blocks = rs.blocks; buf->f_bfree = rs.bfree; buf->f_bavail = rs.bavail; buf->f_files = rs.files; buf->f_ffree = rs.ffree; buf->f_fsid = u64_to_fsid(rs.fsid); buf->f_namelen = rs.namelen; } if (res != -ENOSYS) goto done; } res = simple_statfs(dentry, buf); done: p9_fid_put(fid); return res; } static int v9fs_drop_inode(struct inode *inode) { struct v9fs_session_info *v9ses; v9ses = v9fs_inode2v9ses(inode); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) return generic_drop_inode(inode); /* * in case of non cached mode always drop the * inode because we want the inode attribute * to always match that on the server. */ return 1; } static int v9fs_write_inode(struct inode *inode, struct writeback_control *wbc) { /* * send an fsync request to server irrespective of * wbc->sync_mode. */ p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); return netfs_unpin_writeback(inode, wbc); } static int v9fs_write_inode_dotl(struct inode *inode, struct writeback_control *wbc) { p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode); return netfs_unpin_writeback(inode, wbc); } static const struct super_operations v9fs_super_ops = { .alloc_inode = v9fs_alloc_inode, .free_inode = v9fs_free_inode, .statfs = simple_statfs, .evict_inode = v9fs_evict_inode, .show_options = v9fs_show_options, .umount_begin = v9fs_umount_begin, .write_inode = v9fs_write_inode, }; static const struct super_operations v9fs_super_ops_dotl = { .alloc_inode = v9fs_alloc_inode, .free_inode = v9fs_free_inode, .statfs = v9fs_statfs, .drop_inode = v9fs_drop_inode, .evict_inode = v9fs_evict_inode, .show_options = v9fs_show_options, .umount_begin = v9fs_umount_begin, .write_inode = v9fs_write_inode_dotl, }; struct file_system_type v9fs_fs_type = { .name = "9p", .mount = v9fs_mount, .kill_sb = v9fs_kill_super, .owner = THIS_MODULE, .fs_flags = FS_RENAME_DOES_D_MOVE, }; MODULE_ALIAS_FS("9p");
5 1 2 1 1 1 1 1 86 87 2 1 5 2 3 1 5 3 2 2 1 1 17 18 18 25 2 1 1 3 5 11 3 18 18 3 4 3 2 100 13 2 2 1 1 1 22 1 12 9 2 3 13 13 2 5 7 4 1 8 8 2 4 1 3 4 16 96 1 94 96 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 // SPDX-License-Identifier: GPL-2.0 /* * XFRM virtual interface * * Copyright (C) 2018 secunet Security Networks AG * * Author: * Steffen Klassert <steffen.klassert@secunet.com> */ #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/sockios.h> #include <linux/icmp.h> #include <linux/if.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_link.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/init.h> #include <linux/route.h> #include <linux/rtnetlink.h> #include <linux/netfilter_ipv6.h> #include <linux/slab.h> #include <linux/hash.h> #include <linux/uaccess.h> #include <linux/atomic.h> #include <net/gso.h> #include <net/icmp.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <net/ip_tunnels.h> #include <net/addrconf.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/dst_metadata.h> #include <net/netns/generic.h> #include <linux/etherdevice.h> static int xfrmi_dev_init(struct net_device *dev); static void xfrmi_dev_setup(struct net_device *dev); static struct rtnl_link_ops xfrmi_link_ops __read_mostly; static unsigned int xfrmi_net_id __read_mostly; static const struct net_device_ops xfrmi_netdev_ops; #define XFRMI_HASH_BITS 8 #define XFRMI_HASH_SIZE BIT(XFRMI_HASH_BITS) struct xfrmi_net { /* lists for storing interfaces in use */ struct xfrm_if __rcu *xfrmi[XFRMI_HASH_SIZE]; struct xfrm_if __rcu *collect_md_xfrmi; }; static const struct nla_policy xfrm_lwt_policy[LWT_XFRM_MAX + 1] = { [LWT_XFRM_IF_ID] = NLA_POLICY_MIN(NLA_U32, 1), [LWT_XFRM_LINK] = NLA_POLICY_MIN(NLA_U32, 1), }; static void xfrmi_destroy_state(struct lwtunnel_state *lwt) { } static int xfrmi_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct nlattr *tb[LWT_XFRM_MAX + 1]; struct lwtunnel_state *new_state; struct xfrm_md_info *info; int ret; ret = nla_parse_nested(tb, LWT_XFRM_MAX, nla, xfrm_lwt_policy, extack); if (ret < 0) return ret; if (!tb[LWT_XFRM_IF_ID]) { NL_SET_ERR_MSG(extack, "if_id must be set"); return -EINVAL; } new_state = lwtunnel_state_alloc(sizeof(*info)); if (!new_state) { NL_SET_ERR_MSG(extack, "failed to create encap info"); return -ENOMEM; } new_state->type = LWTUNNEL_ENCAP_XFRM; info = lwt_xfrm_info(new_state); info->if_id = nla_get_u32(tb[LWT_XFRM_IF_ID]); if (tb[LWT_XFRM_LINK]) info->link = nla_get_u32(tb[LWT_XFRM_LINK]); *ts = new_state; return 0; } static int xfrmi_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) { struct xfrm_md_info *info = lwt_xfrm_info(lwt); if (nla_put_u32(skb, LWT_XFRM_IF_ID, info->if_id) || (info->link && nla_put_u32(skb, LWT_XFRM_LINK, info->link))) return -EMSGSIZE; return 0; } static int xfrmi_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size(sizeof(u32)) + /* LWT_XFRM_IF_ID */ nla_total_size(sizeof(u32)); /* LWT_XFRM_LINK */ } static int xfrmi_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct xfrm_md_info *a_info = lwt_xfrm_info(a); struct xfrm_md_info *b_info = lwt_xfrm_info(b); return memcmp(a_info, b_info, sizeof(*a_info)); } static const struct lwtunnel_encap_ops xfrmi_encap_ops = { .build_state = xfrmi_build_state, .destroy_state = xfrmi_destroy_state, .fill_encap = xfrmi_fill_encap_info, .get_encap_size = xfrmi_encap_nlsize, .cmp_encap = xfrmi_encap_cmp, .owner = THIS_MODULE, }; #define for_each_xfrmi_rcu(start, xi) \ for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next)) static u32 xfrmi_hash(u32 if_id) { return hash_32(if_id, XFRMI_HASH_BITS); } static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if *xi; for_each_xfrmi_rcu(xfrmn->xfrmi[xfrmi_hash(x->if_id)], xi) { if (x->if_id == xi->p.if_id && (xi->dev->flags & IFF_UP)) return xi; } xi = rcu_dereference(xfrmn->collect_md_xfrmi); if (xi && (xi->dev->flags & IFF_UP)) return xi; return NULL; } static bool xfrmi_decode_session(struct sk_buff *skb, unsigned short family, struct xfrm_if_decode_session_result *res) { struct net_device *dev; struct xfrm_if *xi; int ifindex = 0; if (!secpath_exists(skb) || !skb->dev) return false; switch (family) { case AF_INET6: ifindex = inet6_sdif(skb); break; case AF_INET: ifindex = inet_sdif(skb); break; } if (ifindex) { struct net *net = xs_net(xfrm_input_state(skb)); dev = dev_get_by_index_rcu(net, ifindex); } else { dev = skb->dev; } if (!dev || !(dev->flags & IFF_UP)) return false; if (dev->netdev_ops != &xfrmi_netdev_ops) return false; xi = netdev_priv(dev); res->net = xi->net; if (xi->p.collect_md) res->if_id = xfrm_input_state(skb)->if_id; else res->if_id = xi->p.if_id; return true; } static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) { struct xfrm_if __rcu **xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; rcu_assign_pointer(xi->next , rtnl_dereference(*xip)); rcu_assign_pointer(*xip, xi); } static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi) { struct xfrm_if __rcu **xip; struct xfrm_if *iter; for (xip = &xfrmn->xfrmi[xfrmi_hash(xi->p.if_id)]; (iter = rtnl_dereference(*xip)) != NULL; xip = &iter->next) { if (xi == iter) { rcu_assign_pointer(*xip, xi->next); break; } } } static void xfrmi_dev_free(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); gro_cells_destroy(&xi->gro_cells); free_percpu(dev->tstats); } static int xfrmi_create(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct net *net = dev_net(dev); struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); int err; dev->rtnl_link_ops = &xfrmi_link_ops; err = register_netdevice(dev); if (err < 0) goto out; if (xi->p.collect_md) rcu_assign_pointer(xfrmn->collect_md_xfrmi, xi); else xfrmi_link(xfrmn, xi); return 0; out: return err; } static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p) { struct xfrm_if __rcu **xip; struct xfrm_if *xi; struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); for (xip = &xfrmn->xfrmi[xfrmi_hash(p->if_id)]; (xi = rtnl_dereference(*xip)) != NULL; xip = &xi->next) if (xi->p.if_id == p->if_id) return xi; return NULL; } static void xfrmi_dev_uninit(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); if (xi->p.collect_md) RCU_INIT_POINTER(xfrmn->collect_md_xfrmi, NULL); else xfrmi_unlink(xfrmn, xi); } static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) { skb_clear_tstamp(skb); skb->pkt_type = PACKET_HOST; skb->skb_iif = 0; skb->ignore_df = 0; skb_dst_drop(skb); nf_reset_ct(skb); nf_reset_trace(skb); if (!xnet) return; ipvs_reset(skb); secpath_reset(skb); skb_orphan(skb); skb->mark = 0; } static int xfrmi_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type, unsigned short family) { struct sec_path *sp; sp = skb_sec_path(skb); if (sp && (sp->len || sp->olen) && !xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family)) goto discard; XFRM_SPI_SKB_CB(skb)->family = family; if (family == AF_INET) { XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; } else { XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; } return xfrm_input(skb, nexthdr, spi, encap_type); discard: kfree_skb(skb); return 0; } static int xfrmi4_rcv(struct sk_buff *skb) { return xfrmi_input(skb, ip_hdr(skb)->protocol, 0, 0, AF_INET); } static int xfrmi6_rcv(struct sk_buff *skb) { return xfrmi_input(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], 0, 0, AF_INET6); } static int xfrmi4_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET); } static int xfrmi6_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return xfrmi_input(skb, nexthdr, spi, encap_type, AF_INET6); } static int xfrmi_rcv_cb(struct sk_buff *skb, int err) { const struct xfrm_mode *inner_mode; struct net_device *dev; struct xfrm_state *x; struct xfrm_if *xi; bool xnet; int link; if (err && !secpath_exists(skb)) return 0; x = xfrm_input_state(skb); xi = xfrmi_lookup(xs_net(x), x); if (!xi) return 1; link = skb->dev->ifindex; dev = xi->dev; skb->dev = dev; if (err) { DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_dropped); return 0; } xnet = !net_eq(xi->net, dev_net(skb->dev)); if (xnet) { inner_mode = &x->inner_mode; if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); if (inner_mode == NULL) { XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINSTATEMODEERROR); return -EINVAL; } } if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, inner_mode->family)) return -EPERM; } xfrmi_scrub_packet(skb, xnet); if (xi->p.collect_md) { struct metadata_dst *md_dst; md_dst = metadata_dst_alloc(0, METADATA_XFRM, GFP_ATOMIC); if (!md_dst) return -ENOMEM; md_dst->u.xfrm_info.if_id = x->if_id; md_dst->u.xfrm_info.link = link; skb_dst_set(skb, (struct dst_entry *)md_dst); } dev_sw_netstats_rx_add(dev, skb->len); return 0; } static int xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct xfrm_if *xi = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); unsigned int length = skb->len; struct net_device *tdev; struct xfrm_state *x; int err = -1; u32 if_id; int mtu; if (xi->p.collect_md) { struct xfrm_md_info *md_info = skb_xfrm_md_info(skb); if (unlikely(!md_info)) return -EINVAL; if_id = md_info->if_id; fl->flowi_oif = md_info->link; if (md_info->dst_orig) { struct dst_entry *tmp_dst = dst; dst = md_info->dst_orig; skb_dst_set(skb, dst); md_info->dst_orig = NULL; dst_release(tmp_dst); } } else { if_id = xi->p.if_id; } dst_hold(dst); dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, if_id); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto tx_err_link_failure; } x = dst->xfrm; if (!x) goto tx_err_link_failure; if (x->if_id != if_id) goto tx_err_link_failure; tdev = dst->dev; if (tdev == dev) { DEV_STATS_INC(dev, collisions); net_warn_ratelimited("%s: Local routing loop detected!\n", dev->name); goto tx_err_dst_release; } mtu = dst_mtu(dst); if ((!skb_is_gso(skb) && skb->len > mtu) || (skb_is_gso(skb) && !skb_gso_validate_network_len(skb, mtu))) { skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IPV6)) { if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; if (skb->len > 1280) icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); else goto xmit; } else { if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) goto xmit; icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); } dst_release(dst); return -EMSGSIZE; } xmit: xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = tdev; err = dst_output(xi->net, skb->sk, skb); if (net_xmit_eval(err) == 0) { dev_sw_netstats_tx_add(dev, 1, length); } else { DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_aborted_errors); } return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); dst_link_failure(skb); tx_err_dst_release: dst_release(dst); return err; } static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct dst_entry *dst = skb_dst(skb); struct flowi fl; int ret; memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IPV6): memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6); if (!dst) { fl.u.ip6.flowi6_oif = dev->ifindex; fl.u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6); if (dst->error) { dst_release(dst); DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } skb_dst_set(skb, dst); } break; case htons(ETH_P_IP): memset(IPCB(skb), 0, sizeof(*IPCB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET); if (!dst) { struct rtable *rt; fl.u.ip4.flowi4_oif = dev->ifindex; fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_err; } skb_dst_set(skb, &rt->dst); } break; default: goto tx_err; } fl.flowi_oif = xi->p.link; ret = xfrmi_xmit2(skb, dev, &fl); if (ret < 0) goto tx_err; return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } static int xfrmi4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; struct net *net = dev_net(skb->dev); int protocol = iph->protocol; struct ip_comp_hdr *ipch; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah ; struct xfrm_state *x; struct xfrm_if *xi; __be32 spi; switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; break; case ICMP_REDIRECT: break; default: return 0; } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET); if (!x) return 0; xi = xfrmi_lookup(net, x); if (!xi) { xfrm_state_put(x); return -1; } if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, protocol); else ipv4_redirect(skb, net, 0, protocol); xfrm_state_put(x); return 0; } static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data; struct net *net = dev_net(skb->dev); int protocol = iph->nexthdr; struct ip_comp_hdr *ipch; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah; struct xfrm_state *x; struct xfrm_if *xi; __be32 spi; switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data + offset); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data + offset); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data + offset); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } if (type != ICMPV6_PKT_TOOBIG && type != NDISC_REDIRECT) return 0; x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET6); if (!x) return 0; xi = xfrmi_lookup(net, x); if (!xi) { xfrm_state_put(x); return -1; } if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); else ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL)); xfrm_state_put(x); return 0; } static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p) { if (xi->p.link != p->link) return -EINVAL; xi->p.if_id = p->if_id; return 0; } static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p) { struct net *net = xi->net; struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); int err; xfrmi_unlink(xfrmn, xi); synchronize_net(); err = xfrmi_change(xi, p); xfrmi_link(xfrmn, xi); netdev_state_change(xi->dev); return err; } static int xfrmi_get_iflink(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); return xi->p.link; } static const struct net_device_ops xfrmi_netdev_ops = { .ndo_init = xfrmi_dev_init, .ndo_uninit = xfrmi_dev_uninit, .ndo_start_xmit = xfrmi_xmit, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = xfrmi_get_iflink, }; static void xfrmi_dev_setup(struct net_device *dev) { dev->netdev_ops = &xfrmi_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_NONE; dev->mtu = ETH_DATA_LEN; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP_MAX_MTU; dev->flags = IFF_NOARP; dev->needs_free_netdev = true; dev->priv_destructor = xfrmi_dev_free; netif_keep_dst(dev); eth_broadcast_addr(dev->broadcast); } #define XFRMI_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) static int xfrmi_dev_init(struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct net_device *phydev = __dev_get_by_index(xi->net, xi->p.link); int err; dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; err = gro_cells_init(&xi->gro_cells, dev); if (err) { free_percpu(dev->tstats); return err; } dev->features |= NETIF_F_LLTX; dev->features |= XFRMI_FEATURES; dev->hw_features |= XFRMI_FEATURES; if (phydev) { dev->needed_headroom = phydev->needed_headroom; dev->needed_tailroom = phydev->needed_tailroom; if (is_zero_ether_addr(dev->dev_addr)) eth_hw_addr_inherit(dev, phydev); if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, phydev->broadcast, dev->addr_len); } else { eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); } return 0; } static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { return 0; } static void xfrmi_netlink_parms(struct nlattr *data[], struct xfrm_if_parms *parms) { memset(parms, 0, sizeof(*parms)); if (!data) return; if (data[IFLA_XFRM_LINK]) parms->link = nla_get_u32(data[IFLA_XFRM_LINK]); if (data[IFLA_XFRM_IF_ID]) parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]); if (data[IFLA_XFRM_COLLECT_METADATA]) parms->collect_md = true; } static int xfrmi_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); struct xfrm_if_parms p = {}; struct xfrm_if *xi; int err; xfrmi_netlink_parms(data, &p); if (p.collect_md) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); if (p.link || p.if_id) { NL_SET_ERR_MSG(extack, "link and if_id must be zero"); return -EINVAL; } if (rtnl_dereference(xfrmn->collect_md_xfrmi)) return -EEXIST; } else { if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } xi = xfrmi_locate(net, &p); if (xi) return -EEXIST; } xi = netdev_priv(dev); xi->p = p; xi->net = net; xi->dev = dev; err = xfrmi_create(dev); return err; } static void xfrmi_dellink(struct net_device *dev, struct list_head *head) { unregister_netdevice_queue(dev, head); } static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct xfrm_if *xi = netdev_priv(dev); struct net *net = xi->net; struct xfrm_if_parms p = {}; xfrmi_netlink_parms(data, &p); if (!p.if_id) { NL_SET_ERR_MSG(extack, "if_id must be non zero"); return -EINVAL; } if (p.collect_md) { NL_SET_ERR_MSG(extack, "collect_md can't be changed"); return -EINVAL; } xi = xfrmi_locate(net, &p); if (!xi) { xi = netdev_priv(dev); } else { if (xi->dev != dev) return -EEXIST; if (xi->p.collect_md) { NL_SET_ERR_MSG(extack, "device can't be changed to collect_md"); return -EINVAL; } } return xfrmi_update(xi, &p); } static size_t xfrmi_get_size(const struct net_device *dev) { return /* IFLA_XFRM_LINK */ nla_total_size(4) + /* IFLA_XFRM_IF_ID */ nla_total_size(4) + /* IFLA_XFRM_COLLECT_METADATA */ nla_total_size(0) + 0; } static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); struct xfrm_if_parms *parm = &xi->p; if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) || nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id) || (xi->p.collect_md && nla_put_flag(skb, IFLA_XFRM_COLLECT_METADATA))) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static struct net *xfrmi_get_link_net(const struct net_device *dev) { struct xfrm_if *xi = netdev_priv(dev); return xi->net; } static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { [IFLA_XFRM_UNSPEC] = { .strict_start_type = IFLA_XFRM_COLLECT_METADATA }, [IFLA_XFRM_LINK] = { .type = NLA_U32 }, [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, [IFLA_XFRM_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { .kind = "xfrm", .maxtype = IFLA_XFRM_MAX, .policy = xfrmi_policy, .priv_size = sizeof(struct xfrm_if), .setup = xfrmi_dev_setup, .validate = xfrmi_validate, .newlink = xfrmi_newlink, .dellink = xfrmi_dellink, .changelink = xfrmi_changelink, .get_size = xfrmi_get_size, .fill_info = xfrmi_fill_info, .get_link_net = xfrmi_get_link_net, }; static void __net_exit xfrmi_exit_batch_net(struct list_head *net_exit_list) { struct net *net; LIST_HEAD(list); rtnl_lock(); list_for_each_entry(net, net_exit_list, exit_list) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if __rcu **xip; struct xfrm_if *xi; int i; for (i = 0; i < XFRMI_HASH_SIZE; i++) { for (xip = &xfrmn->xfrmi[i]; (xi = rtnl_dereference(*xip)) != NULL; xip = &xi->next) unregister_netdevice_queue(xi->dev, &list); } xi = rtnl_dereference(xfrmn->collect_md_xfrmi); if (xi) unregister_netdevice_queue(xi->dev, &list); } unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations xfrmi_net_ops = { .exit_batch = xfrmi_exit_batch_net, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), }; static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = { .handler = xfrmi6_rcv, .input_handler = xfrmi6_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = { .handler = xfrm6_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = { .handler = xfrm6_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 10, }; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) static int xfrmi6_rcv_tunnel(struct sk_buff *skb) { const xfrm_address_t *saddr; __be32 spi; saddr = (const xfrm_address_t *)&ipv6_hdr(skb)->saddr; spi = xfrm6_tunnel_spi_lookup(dev_net(skb->dev), saddr); return xfrm6_rcv_spi(skb, IPPROTO_IPV6, spi, NULL); } static struct xfrm6_tunnel xfrmi_ipv6_handler __read_mostly = { .handler = xfrmi6_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 2, }; static struct xfrm6_tunnel xfrmi_ip6ip_handler __read_mostly = { .handler = xfrmi6_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi6_err, .priority = 2, }; #endif static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = { .handler = xfrmi4_rcv, .input_handler = xfrmi4_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 10, }; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) static int xfrmi4_rcv_tunnel(struct sk_buff *skb) { return xfrm4_rcv_spi(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr); } static struct xfrm_tunnel xfrmi_ipip_handler __read_mostly = { .handler = xfrmi4_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 3, }; static struct xfrm_tunnel xfrmi_ipip6_handler __read_mostly = { .handler = xfrmi4_rcv_tunnel, .cb_handler = xfrmi_rcv_cb, .err_handler = xfrmi4_err, .priority = 2, }; #endif static int __init xfrmi4_init(void) { int err; err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) err = xfrm4_tunnel_register(&xfrmi_ipip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ipip_failed; err = xfrm4_tunnel_register(&xfrmi_ipip6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipip6_failed; #endif return 0; #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) xfrm_tunnel_ipip6_failed: xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); xfrm_tunnel_ipip_failed: xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); #endif xfrm_proto_comp_failed: xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: return err; } static void xfrmi4_fini(void) { #if IS_REACHABLE(CONFIG_INET_XFRM_TUNNEL) xfrm4_tunnel_deregister(&xfrmi_ipip6_handler, AF_INET6); xfrm4_tunnel_deregister(&xfrmi_ipip_handler, AF_INET); #endif xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP); } static int __init xfrmi6_init(void) { int err; err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) err = xfrm6_tunnel_register(&xfrmi_ipv6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipv6_failed; err = xfrm6_tunnel_register(&xfrmi_ip6ip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ip6ip_failed; #endif return 0; #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) xfrm_tunnel_ip6ip_failed: xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); xfrm_tunnel_ipv6_failed: xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); #endif xfrm_proto_comp_failed: xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: return err; } static void xfrmi6_fini(void) { #if IS_REACHABLE(CONFIG_INET6_XFRM_TUNNEL) xfrm6_tunnel_deregister(&xfrmi_ip6ip_handler, AF_INET); xfrm6_tunnel_deregister(&xfrmi_ipv6_handler, AF_INET6); #endif xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP); xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH); xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP); } static const struct xfrm_if_cb xfrm_if_cb = { .decode_session = xfrmi_decode_session, }; static int __init xfrmi_init(void) { const char *msg; int err; pr_info("IPsec XFRM device driver\n"); msg = "tunnel device"; err = register_pernet_device(&xfrmi_net_ops); if (err < 0) goto pernet_dev_failed; msg = "xfrm4 protocols"; err = xfrmi4_init(); if (err < 0) goto xfrmi4_failed; msg = "xfrm6 protocols"; err = xfrmi6_init(); if (err < 0) goto xfrmi6_failed; msg = "netlink interface"; err = rtnl_link_register(&xfrmi_link_ops); if (err < 0) goto rtnl_link_failed; err = register_xfrm_interface_bpf(); if (err < 0) goto kfunc_failed; lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); xfrm_if_register_cb(&xfrm_if_cb); return err; kfunc_failed: rtnl_link_unregister(&xfrmi_link_ops); rtnl_link_failed: xfrmi6_fini(); xfrmi6_failed: xfrmi4_fini(); xfrmi4_failed: unregister_pernet_device(&xfrmi_net_ops); pernet_dev_failed: pr_err("xfrmi init: failed to register %s\n", msg); return err; } static void __exit xfrmi_fini(void) { xfrm_if_unregister_cb(); lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); rtnl_link_unregister(&xfrmi_link_ops); xfrmi4_fini(); xfrmi6_fini(); unregister_pernet_device(&xfrmi_net_ops); } module_init(xfrmi_init); module_exit(xfrmi_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("xfrm"); MODULE_ALIAS_NETDEV("xfrm0"); MODULE_AUTHOR("Steffen Klassert"); MODULE_DESCRIPTION("XFRM virtual interface");
10 11 85 86 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 // SPDX-License-Identifier: GPL-2.0-only /* * KVM dirty ring implementation * * Copyright 2019 Red Hat, Inc. */ #include <linux/kvm_host.h> #include <linux/kvm.h> #include <linux/vmalloc.h> #include <linux/kvm_dirty_ring.h> #include <trace/events/kvm.h> #include "kvm_mm.h" int __weak kvm_cpu_dirty_log_size(void) { return 0; } u32 kvm_dirty_ring_get_rsvd_entries(void) { return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size(); } bool kvm_use_dirty_bitmap(struct kvm *kvm) { lockdep_assert_held(&kvm->slots_lock); return !kvm->dirty_ring_size || kvm->dirty_ring_with_bitmap; } #ifndef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm) { return false; } #endif static u32 kvm_dirty_ring_used(struct kvm_dirty_ring *ring) { return READ_ONCE(ring->dirty_index) - READ_ONCE(ring->reset_index); } static bool kvm_dirty_ring_soft_full(struct kvm_dirty_ring *ring) { return kvm_dirty_ring_used(ring) >= ring->soft_limit; } static bool kvm_dirty_ring_full(struct kvm_dirty_ring *ring) { return kvm_dirty_ring_used(ring) >= ring->size; } static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask) { struct kvm_memory_slot *memslot; int as_id, id; as_id = slot >> 16; id = (u16)slot; if (as_id >= kvm_arch_nr_memslot_as_ids(kvm) || id >= KVM_USER_MEM_SLOTS) return; memslot = id_to_memslot(__kvm_memslots(kvm, as_id), id); if (!memslot || (offset + __fls(mask)) >= memslot->npages) return; KVM_MMU_LOCK(kvm); kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, memslot, offset, mask); KVM_MMU_UNLOCK(kvm); } int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size) { ring->dirty_gfns = vzalloc(size); if (!ring->dirty_gfns) return -ENOMEM; ring->size = size / sizeof(struct kvm_dirty_gfn); ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries(); ring->dirty_index = 0; ring->reset_index = 0; ring->index = index; return 0; } static inline void kvm_dirty_gfn_set_invalid(struct kvm_dirty_gfn *gfn) { smp_store_release(&gfn->flags, 0); } static inline void kvm_dirty_gfn_set_dirtied(struct kvm_dirty_gfn *gfn) { gfn->flags = KVM_DIRTY_GFN_F_DIRTY; } static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn) { return smp_load_acquire(&gfn->flags) & KVM_DIRTY_GFN_F_RESET; } int kvm_dirty_ring_reset(struct kvm *kvm, struct kvm_dirty_ring *ring) { u32 cur_slot, next_slot; u64 cur_offset, next_offset; unsigned long mask; int count = 0; struct kvm_dirty_gfn *entry; bool first_round = true; /* This is only needed to make compilers happy */ cur_slot = cur_offset = mask = 0; while (true) { entry = &ring->dirty_gfns[ring->reset_index & (ring->size - 1)]; if (!kvm_dirty_gfn_harvested(entry)) break; next_slot = READ_ONCE(entry->slot); next_offset = READ_ONCE(entry->offset); /* Update the flags to reflect that this GFN is reset */ kvm_dirty_gfn_set_invalid(entry); ring->reset_index++; count++; /* * Try to coalesce the reset operations when the guest is * scanning pages in the same slot. */ if (!first_round && next_slot == cur_slot) { s64 delta = next_offset - cur_offset; if (delta >= 0 && delta < BITS_PER_LONG) { mask |= 1ull << delta; continue; } /* Backwards visit, careful about overflows! */ if (delta > -BITS_PER_LONG && delta < 0 && (mask << -delta >> -delta) == mask) { cur_offset = next_offset; mask = (mask << -delta) | 1; continue; } } kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask); cur_slot = next_slot; cur_offset = next_offset; mask = 1; first_round = false; } kvm_reset_dirty_gfn(kvm, cur_slot, cur_offset, mask); /* * The request KVM_REQ_DIRTY_RING_SOFT_FULL will be cleared * by the VCPU thread next time when it enters the guest. */ trace_kvm_dirty_ring_reset(ring); return count; } void kvm_dirty_ring_push(struct kvm_vcpu *vcpu, u32 slot, u64 offset) { struct kvm_dirty_ring *ring = &vcpu->dirty_ring; struct kvm_dirty_gfn *entry; /* It should never get full */ WARN_ON_ONCE(kvm_dirty_ring_full(ring)); entry = &ring->dirty_gfns[ring->dirty_index & (ring->size - 1)]; entry->slot = slot; entry->offset = offset; /* * Make sure the data is filled in before we publish this to * the userspace program. There's no paired kernel-side reader. */ smp_wmb(); kvm_dirty_gfn_set_dirtied(entry); ring->dirty_index++; trace_kvm_dirty_ring_push(ring, slot, offset); if (kvm_dirty_ring_soft_full(ring)) kvm_make_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu); } bool kvm_dirty_ring_check_request(struct kvm_vcpu *vcpu) { /* * The VCPU isn't runnable when the dirty ring becomes soft full. * The KVM_REQ_DIRTY_RING_SOFT_FULL event is always set to prevent * the VCPU from running until the dirty pages are harvested and * the dirty ring is reset by userspace. */ if (kvm_check_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu) && kvm_dirty_ring_soft_full(&vcpu->dirty_ring)) { kvm_make_request(KVM_REQ_DIRTY_RING_SOFT_FULL, vcpu); vcpu->run->exit_reason = KVM_EXIT_DIRTY_RING_FULL; trace_kvm_dirty_ring_exit(vcpu); return true; } return false; } struct page *kvm_dirty_ring_get_page(struct kvm_dirty_ring *ring, u32 offset) { return vmalloc_to_page((void *)ring->dirty_gfns + offset * PAGE_SIZE); } void kvm_dirty_ring_free(struct kvm_dirty_ring *ring) { vfree(ring->dirty_gfns); ring->dirty_gfns = NULL; }
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 /* * Copyright (c) 2007-2011 Atheros Communications Inc. * Copyright (c) 2011-2012 Qualcomm Atheros, Inc. * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include <linux/module.h> #include <linux/usb.h> #include "debug.h" #include "core.h" /* constants */ #define TX_URB_COUNT 32 #define RX_URB_COUNT 32 #define ATH6KL_USB_RX_BUFFER_SIZE 4096 /* tx/rx pipes for usb */ enum ATH6KL_USB_PIPE_ID { ATH6KL_USB_PIPE_TX_CTRL = 0, ATH6KL_USB_PIPE_TX_DATA_LP, ATH6KL_USB_PIPE_TX_DATA_MP, ATH6KL_USB_PIPE_TX_DATA_HP, ATH6KL_USB_PIPE_RX_CTRL, ATH6KL_USB_PIPE_RX_DATA, ATH6KL_USB_PIPE_RX_DATA2, ATH6KL_USB_PIPE_RX_INT, ATH6KL_USB_PIPE_MAX }; #define ATH6KL_USB_PIPE_INVALID ATH6KL_USB_PIPE_MAX struct ath6kl_usb_pipe { struct list_head urb_list_head; struct usb_anchor urb_submitted; u32 urb_alloc; u32 urb_cnt; u32 urb_cnt_thresh; unsigned int usb_pipe_handle; u32 flags; u8 ep_address; u8 logical_pipe_num; struct ath6kl_usb *ar_usb; u16 max_packet_size; struct work_struct io_complete_work; struct sk_buff_head io_comp_queue; struct usb_endpoint_descriptor *ep_desc; }; #define ATH6KL_USB_PIPE_FLAG_TX (1 << 0) /* usb device object */ struct ath6kl_usb { /* protects pipe->urb_list_head and pipe->urb_cnt */ spinlock_t cs_lock; struct usb_device *udev; struct usb_interface *interface; struct ath6kl_usb_pipe pipes[ATH6KL_USB_PIPE_MAX]; u8 *diag_cmd_buffer; u8 *diag_resp_buffer; struct ath6kl *ar; struct workqueue_struct *wq; }; /* usb urb object */ struct ath6kl_urb_context { struct list_head link; struct ath6kl_usb_pipe *pipe; struct sk_buff *skb; struct ath6kl *ar; }; /* USB endpoint definitions */ #define ATH6KL_USB_EP_ADDR_APP_CTRL_IN 0x81 #define ATH6KL_USB_EP_ADDR_APP_DATA_IN 0x82 #define ATH6KL_USB_EP_ADDR_APP_DATA2_IN 0x83 #define ATH6KL_USB_EP_ADDR_APP_INT_IN 0x84 #define ATH6KL_USB_EP_ADDR_APP_CTRL_OUT 0x01 #define ATH6KL_USB_EP_ADDR_APP_DATA_LP_OUT 0x02 #define ATH6KL_USB_EP_ADDR_APP_DATA_MP_OUT 0x03 #define ATH6KL_USB_EP_ADDR_APP_DATA_HP_OUT 0x04 /* diagnostic command defnitions */ #define ATH6KL_USB_CONTROL_REQ_SEND_BMI_CMD 1 #define ATH6KL_USB_CONTROL_REQ_RECV_BMI_RESP 2 #define ATH6KL_USB_CONTROL_REQ_DIAG_CMD 3 #define ATH6KL_USB_CONTROL_REQ_DIAG_RESP 4 #define ATH6KL_USB_CTRL_DIAG_CC_READ 0 #define ATH6KL_USB_CTRL_DIAG_CC_WRITE 1 struct ath6kl_usb_ctrl_diag_cmd_write { __le32 cmd; __le32 address; __le32 value; __le32 _pad[1]; } __packed; struct ath6kl_usb_ctrl_diag_cmd_read { __le32 cmd; __le32 address; } __packed; struct ath6kl_usb_ctrl_diag_resp_read { __le32 value; } __packed; /* function declarations */ static void ath6kl_usb_recv_complete(struct urb *urb); #define ATH6KL_USB_IS_BULK_EP(attr) (((attr) & 3) == 0x02) #define ATH6KL_USB_IS_INT_EP(attr) (((attr) & 3) == 0x03) #define ATH6KL_USB_IS_ISOC_EP(attr) (((attr) & 3) == 0x01) #define ATH6KL_USB_IS_DIR_IN(addr) ((addr) & 0x80) /* pipe/urb operations */ static struct ath6kl_urb_context * ath6kl_usb_alloc_urb_from_pipe(struct ath6kl_usb_pipe *pipe) { struct ath6kl_urb_context *urb_context = NULL; unsigned long flags; /* bail if this pipe is not initialized */ if (!pipe->ar_usb) return NULL; spin_lock_irqsave(&pipe->ar_usb->cs_lock, flags); if (!list_empty(&pipe->urb_list_head)) { urb_context = list_first_entry(&pipe->urb_list_head, struct ath6kl_urb_context, link); list_del(&urb_context->link); pipe->urb_cnt--; } spin_unlock_irqrestore(&pipe->ar_usb->cs_lock, flags); return urb_context; } static void ath6kl_usb_free_urb_to_pipe(struct ath6kl_usb_pipe *pipe, struct ath6kl_urb_context *urb_context) { unsigned long flags; /* bail if this pipe is not initialized */ if (!pipe->ar_usb) return; spin_lock_irqsave(&pipe->ar_usb->cs_lock, flags); pipe->urb_cnt++; list_add(&urb_context->link, &pipe->urb_list_head); spin_unlock_irqrestore(&pipe->ar_usb->cs_lock, flags); } static void ath6kl_usb_cleanup_recv_urb(struct ath6kl_urb_context *urb_context) { dev_kfree_skb(urb_context->skb); urb_context->skb = NULL; ath6kl_usb_free_urb_to_pipe(urb_context->pipe, urb_context); } static inline struct ath6kl_usb *ath6kl_usb_priv(struct ath6kl *ar) { return ar->hif_priv; } /* pipe resource allocation/cleanup */ static int ath6kl_usb_alloc_pipe_resources(struct ath6kl_usb_pipe *pipe, int urb_cnt) { struct ath6kl_urb_context *urb_context; int status = 0, i; INIT_LIST_HEAD(&pipe->urb_list_head); init_usb_anchor(&pipe->urb_submitted); for (i = 0; i < urb_cnt; i++) { urb_context = kzalloc(sizeof(struct ath6kl_urb_context), GFP_KERNEL); if (urb_context == NULL) { status = -ENOMEM; goto fail_alloc_pipe_resources; } urb_context->pipe = pipe; /* * we are only allocate the urb contexts here, the actual URB * is allocated from the kernel as needed to do a transaction */ pipe->urb_alloc++; ath6kl_usb_free_urb_to_pipe(pipe, urb_context); } ath6kl_dbg(ATH6KL_DBG_USB, "ath6kl usb: alloc resources lpipe:%d hpipe:0x%X urbs:%d\n", pipe->logical_pipe_num, pipe->usb_pipe_handle, pipe->urb_alloc); fail_alloc_pipe_resources: return status; } static void ath6kl_usb_free_pipe_resources(struct ath6kl_usb_pipe *pipe) { struct ath6kl_urb_context *urb_context; if (pipe->ar_usb == NULL) { /* nothing allocated for this pipe */ return; } ath6kl_dbg(ATH6KL_DBG_USB, "ath6kl usb: free resources lpipe:%d" "hpipe:0x%X urbs:%d avail:%d\n", pipe->logical_pipe_num, pipe->usb_pipe_handle, pipe->urb_alloc, pipe->urb_cnt); if (pipe->urb_alloc != pipe->urb_cnt) { ath6kl_dbg(ATH6KL_DBG_USB, "ath6kl usb: urb leak! lpipe:%d" "hpipe:0x%X urbs:%d avail:%d\n", pipe->logical_pipe_num, pipe->usb_pipe_handle, pipe->urb_alloc, pipe->urb_cnt); } while (true) { urb_context = ath6kl_usb_alloc_urb_from_pipe(pipe); if (urb_context == NULL) break; kfree(urb_context); } } static void ath6kl_usb_cleanup_pipe_resources(struct ath6kl_usb *ar_usb) { int i; for (i = 0; i < ATH6KL_USB_PIPE_MAX; i++) ath6kl_usb_free_pipe_resources(&ar_usb->pipes[i]); } static u8 ath6kl_usb_get_logical_pipe_num(struct ath6kl_usb *ar_usb, u8 ep_address, int *urb_count) { u8 pipe_num = ATH6KL_USB_PIPE_INVALID; switch (ep_address) { case ATH6KL_USB_EP_ADDR_APP_CTRL_IN: pipe_num = ATH6KL_USB_PIPE_RX_CTRL; *urb_count = RX_URB_COUNT; break; case ATH6KL_USB_EP_ADDR_APP_DATA_IN: pipe_num = ATH6KL_USB_PIPE_RX_DATA; *urb_count = RX_URB_COUNT; break; case ATH6KL_USB_EP_ADDR_APP_INT_IN: pipe_num = ATH6KL_USB_PIPE_RX_INT; *urb_count = RX_URB_COUNT; break; case ATH6KL_USB_EP_ADDR_APP_DATA2_IN: pipe_num = ATH6KL_USB_PIPE_RX_DATA2; *urb_count = RX_URB_COUNT; break; case ATH6KL_USB_EP_ADDR_APP_CTRL_OUT: pipe_num = ATH6KL_USB_PIPE_TX_CTRL; *urb_count = TX_URB_COUNT; break; case ATH6KL_USB_EP_ADDR_APP_DATA_LP_OUT: pipe_num = ATH6KL_USB_PIPE_TX_DATA_LP; *urb_count = TX_URB_COUNT; break; case ATH6KL_USB_EP_ADDR_APP_DATA_MP_OUT: pipe_num = ATH6KL_USB_PIPE_TX_DATA_MP; *urb_count = TX_URB_COUNT; break; case ATH6KL_USB_EP_ADDR_APP_DATA_HP_OUT: pipe_num = ATH6KL_USB_PIPE_TX_DATA_HP; *urb_count = TX_URB_COUNT; break; default: /* note: there may be endpoints not currently used */ break; } return pipe_num; } static int ath6kl_usb_setup_pipe_resources(struct ath6kl_usb *ar_usb) { struct usb_interface *interface = ar_usb->interface; struct usb_host_interface *iface_desc = interface->cur_altsetting; struct usb_endpoint_descriptor *endpoint; struct ath6kl_usb_pipe *pipe; int i, urbcount, status = 0; u8 pipe_num; ath6kl_dbg(ATH6KL_DBG_USB, "setting up USB Pipes using interface\n"); /* walk descriptors and setup pipes */ for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { endpoint = &iface_desc->endpoint[i].desc; if (ATH6KL_USB_IS_BULK_EP(endpoint->bmAttributes)) { ath6kl_dbg(ATH6KL_DBG_USB, "%s Bulk Ep:0x%2.2X maxpktsz:%d\n", ATH6KL_USB_IS_DIR_IN (endpoint->bEndpointAddress) ? "RX" : "TX", endpoint->bEndpointAddress, le16_to_cpu(endpoint->wMaxPacketSize)); } else if (ATH6KL_USB_IS_INT_EP(endpoint->bmAttributes)) { ath6kl_dbg(ATH6KL_DBG_USB, "%s Int Ep:0x%2.2X maxpktsz:%d interval:%d\n", ATH6KL_USB_IS_DIR_IN (endpoint->bEndpointAddress) ? "RX" : "TX", endpoint->bEndpointAddress, le16_to_cpu(endpoint->wMaxPacketSize), endpoint->bInterval); } else if (ATH6KL_USB_IS_ISOC_EP(endpoint->bmAttributes)) { /* TODO for ISO */ ath6kl_dbg(ATH6KL_DBG_USB, "%s ISOC Ep:0x%2.2X maxpktsz:%d interval:%d\n", ATH6KL_USB_IS_DIR_IN (endpoint->bEndpointAddress) ? "RX" : "TX", endpoint->bEndpointAddress, le16_to_cpu(endpoint->wMaxPacketSize), endpoint->bInterval); } /* Ignore broken descriptors. */ if (usb_endpoint_maxp(endpoint) == 0) continue; urbcount = 0; pipe_num = ath6kl_usb_get_logical_pipe_num(ar_usb, endpoint->bEndpointAddress, &urbcount); if (pipe_num == ATH6KL_USB_PIPE_INVALID) continue; pipe = &ar_usb->pipes[pipe_num]; if (pipe->ar_usb != NULL) { /* hmmm..pipe was already setup */ continue; } pipe->ar_usb = ar_usb; pipe->logical_pipe_num = pipe_num; pipe->ep_address = endpoint->bEndpointAddress; pipe->max_packet_size = le16_to_cpu(endpoint->wMaxPacketSize); if (ATH6KL_USB_IS_BULK_EP(endpoint->bmAttributes)) { if (ATH6KL_USB_IS_DIR_IN(pipe->ep_address)) { pipe->usb_pipe_handle = usb_rcvbulkpipe(ar_usb->udev, pipe->ep_address); } else { pipe->usb_pipe_handle = usb_sndbulkpipe(ar_usb->udev, pipe->ep_address); } } else if (ATH6KL_USB_IS_INT_EP(endpoint->bmAttributes)) { if (ATH6KL_USB_IS_DIR_IN(pipe->ep_address)) { pipe->usb_pipe_handle = usb_rcvintpipe(ar_usb->udev, pipe->ep_address); } else { pipe->usb_pipe_handle = usb_sndintpipe(ar_usb->udev, pipe->ep_address); } } else if (ATH6KL_USB_IS_ISOC_EP(endpoint->bmAttributes)) { /* TODO for ISO */ if (ATH6KL_USB_IS_DIR_IN(pipe->ep_address)) { pipe->usb_pipe_handle = usb_rcvisocpipe(ar_usb->udev, pipe->ep_address); } else { pipe->usb_pipe_handle = usb_sndisocpipe(ar_usb->udev, pipe->ep_address); } } pipe->ep_desc = endpoint; if (!ATH6KL_USB_IS_DIR_IN(pipe->ep_address)) pipe->flags |= ATH6KL_USB_PIPE_FLAG_TX; status = ath6kl_usb_alloc_pipe_resources(pipe, urbcount); if (status != 0) break; } return status; } /* pipe operations */ static void ath6kl_usb_post_recv_transfers(struct ath6kl_usb_pipe *recv_pipe, int buffer_length) { struct ath6kl_urb_context *urb_context; struct urb *urb; int usb_status; while (true) { urb_context = ath6kl_usb_alloc_urb_from_pipe(recv_pipe); if (urb_context == NULL) break; urb_context->skb = dev_alloc_skb(buffer_length); if (urb_context->skb == NULL) goto err_cleanup_urb; urb = usb_alloc_urb(0, GFP_ATOMIC); if (urb == NULL) goto err_cleanup_urb; usb_fill_bulk_urb(urb, recv_pipe->ar_usb->udev, recv_pipe->usb_pipe_handle, urb_context->skb->data, buffer_length, ath6kl_usb_recv_complete, urb_context); ath6kl_dbg(ATH6KL_DBG_USB_BULK, "ath6kl usb: bulk recv submit:%d, 0x%X (ep:0x%2.2X), %d bytes buf:0x%p\n", recv_pipe->logical_pipe_num, recv_pipe->usb_pipe_handle, recv_pipe->ep_address, buffer_length, urb_context->skb); usb_anchor_urb(urb, &recv_pipe->urb_submitted); usb_status = usb_submit_urb(urb, GFP_ATOMIC); if (usb_status) { ath6kl_dbg(ATH6KL_DBG_USB_BULK, "ath6kl usb : usb bulk recv failed %d\n", usb_status); usb_unanchor_urb(urb); usb_free_urb(urb); goto err_cleanup_urb; } usb_free_urb(urb); } return; err_cleanup_urb: ath6kl_usb_cleanup_recv_urb(urb_context); return; } static void ath6kl_usb_flush_all(struct ath6kl_usb *ar_usb) { int i; for (i = 0; i < ATH6KL_USB_PIPE_MAX; i++) { if (ar_usb->pipes[i].ar_usb != NULL) usb_kill_anchored_urbs(&ar_usb->pipes[i].urb_submitted); } /* * Flushing any pending I/O may schedule work this call will block * until all scheduled work runs to completion. */ flush_workqueue(ar_usb->wq); } static void ath6kl_usb_start_recv_pipes(struct ath6kl_usb *ar_usb) { /* * note: control pipe is no longer used * ar_usb->pipes[ATH6KL_USB_PIPE_RX_CTRL].urb_cnt_thresh = * ar_usb->pipes[ATH6KL_USB_PIPE_RX_CTRL].urb_alloc/2; * ath6kl_usb_post_recv_transfers(&ar_usb-> * pipes[ATH6KL_USB_PIPE_RX_CTRL], * ATH6KL_USB_RX_BUFFER_SIZE); */ ar_usb->pipes[ATH6KL_USB_PIPE_RX_DATA].urb_cnt_thresh = 1; ath6kl_usb_post_recv_transfers(&ar_usb->pipes[ATH6KL_USB_PIPE_RX_DATA], ATH6KL_USB_RX_BUFFER_SIZE); } /* hif usb rx/tx completion functions */ static void ath6kl_usb_recv_complete(struct urb *urb) { struct ath6kl_urb_context *urb_context = urb->context; struct ath6kl_usb_pipe *pipe = urb_context->pipe; struct sk_buff *skb = NULL; int status = 0; ath6kl_dbg(ATH6KL_DBG_USB_BULK, "%s: recv pipe: %d, stat:%d, len:%d urb:0x%p\n", __func__, pipe->logical_pipe_num, urb->status, urb->actual_length, urb); if (urb->status != 0) { status = -EIO; switch (urb->status) { case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: /* * no need to spew these errors when device * removed or urb killed due to driver shutdown */ status = -ECANCELED; break; default: ath6kl_dbg(ATH6KL_DBG_USB_BULK, "%s recv pipe: %d (ep:0x%2.2X), failed:%d\n", __func__, pipe->logical_pipe_num, pipe->ep_address, urb->status); break; } goto cleanup_recv_urb; } if (urb->actual_length == 0) goto cleanup_recv_urb; skb = urb_context->skb; /* we are going to pass it up */ urb_context->skb = NULL; skb_put(skb, urb->actual_length); /* note: queue implements a lock */ skb_queue_tail(&pipe->io_comp_queue, skb); queue_work(pipe->ar_usb->wq, &pipe->io_complete_work); cleanup_recv_urb: ath6kl_usb_cleanup_recv_urb(urb_context); if (status == 0 && pipe->urb_cnt >= pipe->urb_cnt_thresh) { /* our free urbs are piling up, post more transfers */ ath6kl_usb_post_recv_transfers(pipe, ATH6KL_USB_RX_BUFFER_SIZE); } } static void ath6kl_usb_usb_transmit_complete(struct urb *urb) { struct ath6kl_urb_context *urb_context = urb->context; struct ath6kl_usb_pipe *pipe = urb_context->pipe; struct sk_buff *skb; ath6kl_dbg(ATH6KL_DBG_USB_BULK, "%s: pipe: %d, stat:%d, len:%d\n", __func__, pipe->logical_pipe_num, urb->status, urb->actual_length); if (urb->status != 0) { ath6kl_dbg(ATH6KL_DBG_USB_BULK, "%s: pipe: %d, failed:%d\n", __func__, pipe->logical_pipe_num, urb->status); } skb = urb_context->skb; urb_context->skb = NULL; ath6kl_usb_free_urb_to_pipe(urb_context->pipe, urb_context); /* note: queue implements a lock */ skb_queue_tail(&pipe->io_comp_queue, skb); queue_work(pipe->ar_usb->wq, &pipe->io_complete_work); } static void ath6kl_usb_io_comp_work(struct work_struct *work) { struct ath6kl_usb_pipe *pipe = container_of(work, struct ath6kl_usb_pipe, io_complete_work); struct ath6kl_usb *ar_usb; struct sk_buff *skb; ar_usb = pipe->ar_usb; while ((skb = skb_dequeue(&pipe->io_comp_queue))) { if (pipe->flags & ATH6KL_USB_PIPE_FLAG_TX) { ath6kl_dbg(ATH6KL_DBG_USB_BULK, "ath6kl usb xmit callback buf:0x%p\n", skb); ath6kl_core_tx_complete(ar_usb->ar, skb); } else { ath6kl_dbg(ATH6KL_DBG_USB_BULK, "ath6kl usb recv callback buf:0x%p\n", skb); ath6kl_core_rx_complete(ar_usb->ar, skb, pipe->logical_pipe_num); } } } #define ATH6KL_USB_MAX_DIAG_CMD (sizeof(struct ath6kl_usb_ctrl_diag_cmd_write)) #define ATH6KL_USB_MAX_DIAG_RESP (sizeof(struct ath6kl_usb_ctrl_diag_resp_read)) static void ath6kl_usb_destroy(struct ath6kl_usb *ar_usb) { ath6kl_usb_flush_all(ar_usb); ath6kl_usb_cleanup_pipe_resources(ar_usb); usb_set_intfdata(ar_usb->interface, NULL); kfree(ar_usb->diag_cmd_buffer); kfree(ar_usb->diag_resp_buffer); destroy_workqueue(ar_usb->wq); kfree(ar_usb); } static struct ath6kl_usb *ath6kl_usb_create(struct usb_interface *interface) { struct usb_device *dev = interface_to_usbdev(interface); struct ath6kl_usb *ar_usb; struct ath6kl_usb_pipe *pipe; int status = 0; int i; /* ath6kl_usb_destroy() needs ar_usb != NULL && ar_usb->wq != NULL. */ ar_usb = kzalloc(sizeof(struct ath6kl_usb), GFP_KERNEL); if (ar_usb == NULL) return NULL; ar_usb->wq = alloc_workqueue("ath6kl_wq", 0, 0); if (!ar_usb->wq) { kfree(ar_usb); return NULL; } usb_set_intfdata(interface, ar_usb); spin_lock_init(&(ar_usb->cs_lock)); ar_usb->udev = dev; ar_usb->interface = interface; for (i = 0; i < ATH6KL_USB_PIPE_MAX; i++) { pipe = &ar_usb->pipes[i]; INIT_WORK(&pipe->io_complete_work, ath6kl_usb_io_comp_work); skb_queue_head_init(&pipe->io_comp_queue); } ar_usb->diag_cmd_buffer = kzalloc(ATH6KL_USB_MAX_DIAG_CMD, GFP_KERNEL); if (ar_usb->diag_cmd_buffer == NULL) { status = -ENOMEM; goto fail_ath6kl_usb_create; } ar_usb->diag_resp_buffer = kzalloc(ATH6KL_USB_MAX_DIAG_RESP, GFP_KERNEL); if (ar_usb->diag_resp_buffer == NULL) { status = -ENOMEM; goto fail_ath6kl_usb_create; } status = ath6kl_usb_setup_pipe_resources(ar_usb); fail_ath6kl_usb_create: if (status != 0) { ath6kl_usb_destroy(ar_usb); ar_usb = NULL; } return ar_usb; } static void ath6kl_usb_device_detached(struct usb_interface *interface) { struct ath6kl_usb *ar_usb; ar_usb = usb_get_intfdata(interface); if (ar_usb == NULL) return; ath6kl_stop_txrx(ar_usb->ar); /* Delay to wait for the target to reboot */ mdelay(20); ath6kl_core_cleanup(ar_usb->ar); ath6kl_usb_destroy(ar_usb); } /* exported hif usb APIs for htc pipe */ static void hif_start(struct ath6kl *ar) { struct ath6kl_usb *device = ath6kl_usb_priv(ar); int i; ath6kl_usb_start_recv_pipes(device); /* set the TX resource avail threshold for each TX pipe */ for (i = ATH6KL_USB_PIPE_TX_CTRL; i <= ATH6KL_USB_PIPE_TX_DATA_HP; i++) { device->pipes[i].urb_cnt_thresh = device->pipes[i].urb_alloc / 2; } } static int ath6kl_usb_send(struct ath6kl *ar, u8 PipeID, struct sk_buff *hdr_skb, struct sk_buff *skb) { struct ath6kl_usb *device = ath6kl_usb_priv(ar); struct ath6kl_usb_pipe *pipe = &device->pipes[PipeID]; struct ath6kl_urb_context *urb_context; int usb_status, status = 0; struct urb *urb; u8 *data; u32 len; ath6kl_dbg(ATH6KL_DBG_USB_BULK, "+%s pipe : %d, buf:0x%p\n", __func__, PipeID, skb); urb_context = ath6kl_usb_alloc_urb_from_pipe(pipe); if (urb_context == NULL) { /* * TODO: it is possible to run out of urbs if * 2 endpoints map to the same pipe ID */ ath6kl_dbg(ATH6KL_DBG_USB_BULK, "%s pipe:%d no urbs left. URB Cnt : %d\n", __func__, PipeID, pipe->urb_cnt); status = -ENOMEM; goto fail_hif_send; } urb_context->skb = skb; data = skb->data; len = skb->len; urb = usb_alloc_urb(0, GFP_ATOMIC); if (urb == NULL) { status = -ENOMEM; ath6kl_usb_free_urb_to_pipe(urb_context->pipe, urb_context); goto fail_hif_send; } usb_fill_bulk_urb(urb, device->udev, pipe->usb_pipe_handle, data, len, ath6kl_usb_usb_transmit_complete, urb_context); if ((len % pipe->max_packet_size) == 0) { /* hit a max packet boundary on this pipe */ urb->transfer_flags |= URB_ZERO_PACKET; } ath6kl_dbg(ATH6KL_DBG_USB_BULK, "athusb bulk send submit:%d, 0x%X (ep:0x%2.2X), %d bytes\n", pipe->logical_pipe_num, pipe->usb_pipe_handle, pipe->ep_address, len); usb_anchor_urb(urb, &pipe->urb_submitted); usb_status = usb_submit_urb(urb, GFP_ATOMIC); if (usb_status) { ath6kl_dbg(ATH6KL_DBG_USB_BULK, "ath6kl usb : usb bulk transmit failed %d\n", usb_status); usb_unanchor_urb(urb); ath6kl_usb_free_urb_to_pipe(urb_context->pipe, urb_context); status = -EINVAL; } usb_free_urb(urb); fail_hif_send: return status; } static void hif_stop(struct ath6kl *ar) { struct ath6kl_usb *device = ath6kl_usb_priv(ar); ath6kl_usb_flush_all(device); } static void ath6kl_usb_get_default_pipe(struct ath6kl *ar, u8 *ul_pipe, u8 *dl_pipe) { *ul_pipe = ATH6KL_USB_PIPE_TX_CTRL; *dl_pipe = ATH6KL_USB_PIPE_RX_CTRL; } static int ath6kl_usb_map_service_pipe(struct ath6kl *ar, u16 svc_id, u8 *ul_pipe, u8 *dl_pipe) { int status = 0; switch (svc_id) { case HTC_CTRL_RSVD_SVC: case WMI_CONTROL_SVC: *ul_pipe = ATH6KL_USB_PIPE_TX_CTRL; /* due to large control packets, shift to data pipe */ *dl_pipe = ATH6KL_USB_PIPE_RX_DATA; break; case WMI_DATA_BE_SVC: case WMI_DATA_BK_SVC: *ul_pipe = ATH6KL_USB_PIPE_TX_DATA_LP; /* * Disable rxdata2 directly, it will be enabled * if FW enable rxdata2 */ *dl_pipe = ATH6KL_USB_PIPE_RX_DATA; break; case WMI_DATA_VI_SVC: if (test_bit(ATH6KL_FW_CAPABILITY_MAP_LP_ENDPOINT, ar->fw_capabilities)) *ul_pipe = ATH6KL_USB_PIPE_TX_DATA_LP; else *ul_pipe = ATH6KL_USB_PIPE_TX_DATA_MP; /* * Disable rxdata2 directly, it will be enabled * if FW enable rxdata2 */ *dl_pipe = ATH6KL_USB_PIPE_RX_DATA; break; case WMI_DATA_VO_SVC: if (test_bit(ATH6KL_FW_CAPABILITY_MAP_LP_ENDPOINT, ar->fw_capabilities)) *ul_pipe = ATH6KL_USB_PIPE_TX_DATA_LP; else *ul_pipe = ATH6KL_USB_PIPE_TX_DATA_MP; /* * Disable rxdata2 directly, it will be enabled * if FW enable rxdata2 */ *dl_pipe = ATH6KL_USB_PIPE_RX_DATA; break; default: status = -EPERM; break; } return status; } static u16 ath6kl_usb_get_free_queue_number(struct ath6kl *ar, u8 pipe_id) { struct ath6kl_usb *device = ath6kl_usb_priv(ar); return device->pipes[pipe_id].urb_cnt; } static void hif_detach_htc(struct ath6kl *ar) { struct ath6kl_usb *device = ath6kl_usb_priv(ar); ath6kl_usb_flush_all(device); } static int ath6kl_usb_submit_ctrl_out(struct ath6kl_usb *ar_usb, u8 req, u16 value, u16 index, void *data, u32 size) { u8 *buf = NULL; int ret; if (size > 0) { buf = kmemdup(data, size, GFP_KERNEL); if (buf == NULL) return -ENOMEM; } /* note: if successful returns number of bytes transfered */ ret = usb_control_msg(ar_usb->udev, usb_sndctrlpipe(ar_usb->udev, 0), req, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, value, index, buf, size, 1000); if (ret < 0) { ath6kl_warn("Failed to submit usb control message: %d\n", ret); kfree(buf); return ret; } kfree(buf); return 0; } static int ath6kl_usb_submit_ctrl_in(struct ath6kl_usb *ar_usb, u8 req, u16 value, u16 index, void *data, u32 size) { u8 *buf = NULL; int ret; if (size > 0) { buf = kmalloc(size, GFP_KERNEL); if (buf == NULL) return -ENOMEM; } /* note: if successful returns number of bytes transfered */ ret = usb_control_msg(ar_usb->udev, usb_rcvctrlpipe(ar_usb->udev, 0), req, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, value, index, buf, size, 2000); if (ret < 0) { ath6kl_warn("Failed to read usb control message: %d\n", ret); kfree(buf); return ret; } memcpy((u8 *) data, buf, size); kfree(buf); return 0; } static int ath6kl_usb_ctrl_msg_exchange(struct ath6kl_usb *ar_usb, u8 req_val, u8 *req_buf, u32 req_len, u8 resp_val, u8 *resp_buf, u32 *resp_len) { int ret; /* send command */ ret = ath6kl_usb_submit_ctrl_out(ar_usb, req_val, 0, 0, req_buf, req_len); if (ret != 0) return ret; if (resp_buf == NULL) { /* no expected response */ return ret; } /* get response */ ret = ath6kl_usb_submit_ctrl_in(ar_usb, resp_val, 0, 0, resp_buf, *resp_len); return ret; } static int ath6kl_usb_diag_read32(struct ath6kl *ar, u32 address, u32 *data) { struct ath6kl_usb *ar_usb = ar->hif_priv; struct ath6kl_usb_ctrl_diag_resp_read *resp; struct ath6kl_usb_ctrl_diag_cmd_read *cmd; u32 resp_len; int ret; cmd = (struct ath6kl_usb_ctrl_diag_cmd_read *) ar_usb->diag_cmd_buffer; memset(cmd, 0, sizeof(*cmd)); cmd->cmd = ATH6KL_USB_CTRL_DIAG_CC_READ; cmd->address = cpu_to_le32(address); resp_len = sizeof(*resp); ret = ath6kl_usb_ctrl_msg_exchange(ar_usb, ATH6KL_USB_CONTROL_REQ_DIAG_CMD, (u8 *) cmd, sizeof(struct ath6kl_usb_ctrl_diag_cmd_write), ATH6KL_USB_CONTROL_REQ_DIAG_RESP, ar_usb->diag_resp_buffer, &resp_len); if (ret) { ath6kl_warn("diag read32 failed: %d\n", ret); return ret; } resp = (struct ath6kl_usb_ctrl_diag_resp_read *) ar_usb->diag_resp_buffer; *data = le32_to_cpu(resp->value); return ret; } static int ath6kl_usb_diag_write32(struct ath6kl *ar, u32 address, __le32 data) { struct ath6kl_usb *ar_usb = ar->hif_priv; struct ath6kl_usb_ctrl_diag_cmd_write *cmd; int ret; cmd = (struct ath6kl_usb_ctrl_diag_cmd_write *) ar_usb->diag_cmd_buffer; memset(cmd, 0, sizeof(struct ath6kl_usb_ctrl_diag_cmd_write)); cmd->cmd = cpu_to_le32(ATH6KL_USB_CTRL_DIAG_CC_WRITE); cmd->address = cpu_to_le32(address); cmd->value = data; ret = ath6kl_usb_ctrl_msg_exchange(ar_usb, ATH6KL_USB_CONTROL_REQ_DIAG_CMD, (u8 *) cmd, sizeof(*cmd), 0, NULL, NULL); if (ret) { ath6kl_warn("diag_write32 failed: %d\n", ret); return ret; } return 0; } static int ath6kl_usb_bmi_read(struct ath6kl *ar, u8 *buf, u32 len) { struct ath6kl_usb *ar_usb = ar->hif_priv; int ret; /* get response */ ret = ath6kl_usb_submit_ctrl_in(ar_usb, ATH6KL_USB_CONTROL_REQ_RECV_BMI_RESP, 0, 0, buf, len); if (ret) { ath6kl_err("Unable to read the bmi data from the device: %d\n", ret); return ret; } return 0; } static int ath6kl_usb_bmi_write(struct ath6kl *ar, u8 *buf, u32 len) { struct ath6kl_usb *ar_usb = ar->hif_priv; int ret; /* send command */ ret = ath6kl_usb_submit_ctrl_out(ar_usb, ATH6KL_USB_CONTROL_REQ_SEND_BMI_CMD, 0, 0, buf, len); if (ret) { ath6kl_err("unable to send the bmi data to the device: %d\n", ret); return ret; } return 0; } static int ath6kl_usb_power_on(struct ath6kl *ar) { hif_start(ar); return 0; } static int ath6kl_usb_power_off(struct ath6kl *ar) { hif_detach_htc(ar); return 0; } static void ath6kl_usb_stop(struct ath6kl *ar) { hif_stop(ar); } static void ath6kl_usb_cleanup_scatter(struct ath6kl *ar) { /* * USB doesn't support it. Just return. */ return; } static int ath6kl_usb_suspend(struct ath6kl *ar, struct cfg80211_wowlan *wow) { /* * cfg80211 suspend/WOW currently not supported for USB. */ return 0; } static int ath6kl_usb_resume(struct ath6kl *ar) { /* * cfg80211 resume currently not supported for USB. */ return 0; } static const struct ath6kl_hif_ops ath6kl_usb_ops = { .diag_read32 = ath6kl_usb_diag_read32, .diag_write32 = ath6kl_usb_diag_write32, .bmi_read = ath6kl_usb_bmi_read, .bmi_write = ath6kl_usb_bmi_write, .power_on = ath6kl_usb_power_on, .power_off = ath6kl_usb_power_off, .stop = ath6kl_usb_stop, .pipe_send = ath6kl_usb_send, .pipe_get_default = ath6kl_usb_get_default_pipe, .pipe_map_service = ath6kl_usb_map_service_pipe, .pipe_get_free_queue_number = ath6kl_usb_get_free_queue_number, .cleanup_scatter = ath6kl_usb_cleanup_scatter, .suspend = ath6kl_usb_suspend, .resume = ath6kl_usb_resume, }; /* ath6kl usb driver registered functions */ static int ath6kl_usb_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct usb_device *dev = interface_to_usbdev(interface); struct ath6kl *ar; struct ath6kl_usb *ar_usb = NULL; int vendor_id, product_id; int ret = 0; usb_get_dev(dev); vendor_id = le16_to_cpu(dev->descriptor.idVendor); product_id = le16_to_cpu(dev->descriptor.idProduct); ath6kl_dbg(ATH6KL_DBG_USB, "vendor_id = %04x\n", vendor_id); ath6kl_dbg(ATH6KL_DBG_USB, "product_id = %04x\n", product_id); if (interface->cur_altsetting) ath6kl_dbg(ATH6KL_DBG_USB, "USB Interface %d\n", interface->cur_altsetting->desc.bInterfaceNumber); if (dev->speed == USB_SPEED_HIGH) ath6kl_dbg(ATH6KL_DBG_USB, "USB 2.0 Host\n"); else ath6kl_dbg(ATH6KL_DBG_USB, "USB 1.1 Host\n"); ar_usb = ath6kl_usb_create(interface); if (ar_usb == NULL) { ret = -ENOMEM; goto err_usb_put; } ar = ath6kl_core_create(&ar_usb->udev->dev); if (ar == NULL) { ath6kl_err("Failed to alloc ath6kl core\n"); ret = -ENOMEM; goto err_usb_destroy; } ar->hif_priv = ar_usb; ar->hif_type = ATH6KL_HIF_TYPE_USB; ar->hif_ops = &ath6kl_usb_ops; ar->mbox_info.block_size = 16; ar->bmi.max_data_size = 252; ar_usb->ar = ar; ret = ath6kl_core_init(ar, ATH6KL_HTC_TYPE_PIPE); if (ret) { ath6kl_err("Failed to init ath6kl core: %d\n", ret); goto err_core_free; } return ret; err_core_free: ath6kl_core_destroy(ar); err_usb_destroy: ath6kl_usb_destroy(ar_usb); err_usb_put: usb_put_dev(dev); return ret; } static void ath6kl_usb_remove(struct usb_interface *interface) { usb_put_dev(interface_to_usbdev(interface)); ath6kl_usb_device_detached(interface); } #ifdef CONFIG_PM static int ath6kl_usb_pm_suspend(struct usb_interface *interface, pm_message_t message) { struct ath6kl_usb *device; device = usb_get_intfdata(interface); ath6kl_usb_flush_all(device); return 0; } static int ath6kl_usb_pm_resume(struct usb_interface *interface) { struct ath6kl_usb *device; device = usb_get_intfdata(interface); ath6kl_usb_post_recv_transfers(&device->pipes[ATH6KL_USB_PIPE_RX_DATA], ATH6KL_USB_RX_BUFFER_SIZE); ath6kl_usb_post_recv_transfers(&device->pipes[ATH6KL_USB_PIPE_RX_DATA2], ATH6KL_USB_RX_BUFFER_SIZE); return 0; } #else #define ath6kl_usb_pm_suspend NULL #define ath6kl_usb_pm_resume NULL #endif /* table of devices that work with this driver */ static const struct usb_device_id ath6kl_usb_ids[] = { {USB_DEVICE(0x0cf3, 0x9375)}, {USB_DEVICE(0x0cf3, 0x9374)}, {USB_DEVICE(0x04da, 0x390d)}, { /* Terminating entry */ }, }; MODULE_DEVICE_TABLE(usb, ath6kl_usb_ids); static struct usb_driver ath6kl_usb_driver = { .name = "ath6kl_usb", .probe = ath6kl_usb_probe, .suspend = ath6kl_usb_pm_suspend, .resume = ath6kl_usb_pm_resume, .disconnect = ath6kl_usb_remove, .id_table = ath6kl_usb_ids, .supports_autosuspend = true, .disable_hub_initiated_lpm = 1, }; module_usb_driver(ath6kl_usb_driver); MODULE_AUTHOR("Atheros Communications, Inc."); MODULE_DESCRIPTION("Driver support for Atheros AR600x USB devices"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_FIRMWARE(AR6004_HW_1_0_FIRMWARE_FILE); MODULE_FIRMWARE(AR6004_HW_1_0_BOARD_DATA_FILE); MODULE_FIRMWARE(AR6004_HW_1_0_DEFAULT_BOARD_DATA_FILE); MODULE_FIRMWARE(AR6004_HW_1_1_FIRMWARE_FILE); MODULE_FIRMWARE(AR6004_HW_1_1_BOARD_DATA_FILE); MODULE_FIRMWARE(AR6004_HW_1_1_DEFAULT_BOARD_DATA_FILE); MODULE_FIRMWARE(AR6004_HW_1_2_FIRMWARE_FILE); MODULE_FIRMWARE(AR6004_HW_1_2_BOARD_DATA_FILE); MODULE_FIRMWARE(AR6004_HW_1_2_DEFAULT_BOARD_DATA_FILE); MODULE_FIRMWARE(AR6004_HW_1_3_FW_DIR "/" AR6004_HW_1_3_FIRMWARE_FILE); MODULE_FIRMWARE(AR6004_HW_1_3_BOARD_DATA_FILE); MODULE_FIRMWARE(AR6004_HW_1_3_DEFAULT_BOARD_DATA_FILE);
3 3 5 51 42 2 13 1 2 2 2 4 6 3 18 18 19 9 5 7 3 1 3 14 2 1 1 1 5 2 2 8 1 4 5 2 3 13 13 6 6 3 2 1 68 1 52 1 14 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_acct.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_labels.h> #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> struct nft_ct_helper_obj { struct nf_conntrack_helper *helper4; struct nf_conntrack_helper *helper6; u8 l4proto; }; #ifdef CONFIG_NF_CONNTRACK_ZONES static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template); static unsigned int nft_ct_pcpu_template_refcnt __read_mostly; static DEFINE_MUTEX(nft_ct_pcpu_mutex); #endif static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c, enum nft_ct_keys k, enum ip_conntrack_dir d) { if (d < IP_CT_DIR_MAX) return k == NFT_CT_BYTES ? atomic64_read(&c[d].bytes) : atomic64_read(&c[d].packets); return nft_ct_get_eval_counter(c, k, IP_CT_DIR_ORIGINAL) + nft_ct_get_eval_counter(c, k, IP_CT_DIR_REPLY); } static void nft_ct_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct *priv = nft_expr_priv(expr); u32 *dest = &regs->data[priv->dreg]; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; const struct nf_conn_help *help; const struct nf_conntrack_tuple *tuple; const struct nf_conntrack_helper *helper; unsigned int state; ct = nf_ct_get(pkt->skb, &ctinfo); switch (priv->key) { case NFT_CT_STATE: if (ct) state = NF_CT_STATE_BIT(ctinfo); else if (ctinfo == IP_CT_UNTRACKED) state = NF_CT_STATE_UNTRACKED_BIT; else state = NF_CT_STATE_INVALID_BIT; *dest = state; return; default: break; } if (ct == NULL) goto err; switch (priv->key) { case NFT_CT_DIRECTION: nft_reg_store8(dest, CTINFO2DIR(ctinfo)); return; case NFT_CT_STATUS: *dest = ct->status; return; #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: *dest = READ_ONCE(ct->mark); return; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: *dest = ct->secmark; return; #endif case NFT_CT_EXPIRATION: *dest = jiffies_to_msecs(nf_ct_expires(ct)); return; case NFT_CT_HELPER: if (ct->master == NULL) goto err; help = nfct_help(ct->master); if (help == NULL) goto err; helper = rcu_dereference(help->helper); if (helper == NULL) goto err; strscpy_pad((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN); return; #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: { struct nf_conn_labels *labels = nf_ct_labels_find(ct); if (labels) memcpy(dest, labels->bits, NF_CT_LABELS_MAX_SIZE); else memset(dest, 0, NF_CT_LABELS_MAX_SIZE); return; } #endif case NFT_CT_BYTES: case NFT_CT_PKTS: { const struct nf_conn_acct *acct = nf_conn_acct_find(ct); u64 count = 0; if (acct) count = nft_ct_get_eval_counter(acct->counter, priv->key, priv->dir); memcpy(dest, &count, sizeof(count)); return; } case NFT_CT_AVGPKT: { const struct nf_conn_acct *acct = nf_conn_acct_find(ct); u64 avgcnt = 0, bcnt = 0, pcnt = 0; if (acct) { pcnt = nft_ct_get_eval_counter(acct->counter, NFT_CT_PKTS, priv->dir); bcnt = nft_ct_get_eval_counter(acct->counter, NFT_CT_BYTES, priv->dir); if (pcnt != 0) avgcnt = div64_u64(bcnt, pcnt); } memcpy(dest, &avgcnt, sizeof(avgcnt)); return; } case NFT_CT_L3PROTOCOL: nft_reg_store8(dest, nf_ct_l3num(ct)); return; case NFT_CT_PROTOCOL: nft_reg_store8(dest, nf_ct_protonum(ct)); return; #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: { const struct nf_conntrack_zone *zone = nf_ct_zone(ct); u16 zoneid; if (priv->dir < IP_CT_DIR_MAX) zoneid = nf_ct_zone_id(zone, priv->dir); else zoneid = zone->id; nft_reg_store16(dest, zoneid); return; } #endif case NFT_CT_ID: *dest = nf_ct_get_id(ct); return; default: break; } tuple = &ct->tuplehash[priv->dir].tuple; switch (priv->key) { case NFT_CT_SRC: memcpy(dest, tuple->src.u3.all, nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); return; case NFT_CT_DST: memcpy(dest, tuple->dst.u3.all, nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); return; case NFT_CT_PROTO_SRC: nft_reg_store16(dest, (__force u16)tuple->src.u.all); return; case NFT_CT_PROTO_DST: nft_reg_store16(dest, (__force u16)tuple->dst.u.all); return; case NFT_CT_SRC_IP: if (nf_ct_l3num(ct) != NFPROTO_IPV4) goto err; *dest = (__force __u32)tuple->src.u3.ip; return; case NFT_CT_DST_IP: if (nf_ct_l3num(ct) != NFPROTO_IPV4) goto err; *dest = (__force __u32)tuple->dst.u3.ip; return; case NFT_CT_SRC_IP6: if (nf_ct_l3num(ct) != NFPROTO_IPV6) goto err; memcpy(dest, tuple->src.u3.ip6, sizeof(struct in6_addr)); return; case NFT_CT_DST_IP6: if (nf_ct_l3num(ct) != NFPROTO_IPV6) goto err; memcpy(dest, tuple->dst.u3.ip6, sizeof(struct in6_addr)); return; default: break; } return; err: regs->verdict.code = NFT_BREAK; } #ifdef CONFIG_NF_CONNTRACK_ZONES static void nft_ct_set_zone_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nf_conntrack_zone zone = { .dir = NF_CT_DEFAULT_ZONE_DIR }; const struct nft_ct *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; enum ip_conntrack_info ctinfo; u16 value = nft_reg_load16(&regs->data[priv->sreg]); struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (ct) /* already tracked */ return; zone.id = value; switch (priv->dir) { case IP_CT_DIR_ORIGINAL: zone.dir = NF_CT_ZONE_DIR_ORIG; break; case IP_CT_DIR_REPLY: zone.dir = NF_CT_ZONE_DIR_REPL; break; default: break; } ct = this_cpu_read(nft_ct_pcpu_template); if (likely(refcount_read(&ct->ct_general.use) == 1)) { refcount_inc(&ct->ct_general.use); nf_ct_zone_add(ct, &zone); } else { /* previous skb got queued to userspace, allocate temporary * one until percpu template can be reused. */ ct = nf_ct_tmpl_alloc(nft_net(pkt), &zone, GFP_ATOMIC); if (!ct) { regs->verdict.code = NF_DROP; return; } __set_bit(IPS_CONFIRMED_BIT, &ct->status); } nf_ct_set(skb, ct, IP_CT_NEW); } #endif static void nft_ct_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; #if defined(CONFIG_NF_CONNTRACK_MARK) || defined(CONFIG_NF_CONNTRACK_SECMARK) u32 value = regs->data[priv->sreg]; #endif enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL || nf_ct_is_template(ct)) return; switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: if (READ_ONCE(ct->mark) != value) { WRITE_ONCE(ct->mark, value); nf_conntrack_event_cache(IPCT_MARK, ct); } break; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: if (ct->secmark != value) { ct->secmark = value; nf_conntrack_event_cache(IPCT_SECMARK, ct); } break; #endif #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: nf_connlabels_replace(ct, &regs->data[priv->sreg], &regs->data[priv->sreg], NF_CT_LABELS_MAX_SIZE / sizeof(u32)); break; #endif #ifdef CONFIG_NF_CONNTRACK_EVENTS case NFT_CT_EVENTMASK: { struct nf_conntrack_ecache *e = nf_ct_ecache_find(ct); u32 ctmask = regs->data[priv->sreg]; if (e) { if (e->ctmask != ctmask) e->ctmask = ctmask; break; } if (ctmask && !nf_ct_is_confirmed(ct)) nf_ct_ecache_ext_add(ct, ctmask, 0, GFP_ATOMIC); break; } #endif default: break; } } static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = { [NFTA_CT_DREG] = { .type = NLA_U32 }, [NFTA_CT_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_CT_DIRECTION] = { .type = NLA_U8 }, [NFTA_CT_SREG] = { .type = NLA_U32 }, }; #ifdef CONFIG_NF_CONNTRACK_ZONES static void nft_ct_tmpl_put_pcpu(void) { struct nf_conn *ct; int cpu; for_each_possible_cpu(cpu) { ct = per_cpu(nft_ct_pcpu_template, cpu); if (!ct) break; nf_ct_put(ct); per_cpu(nft_ct_pcpu_template, cpu) = NULL; } } static bool nft_ct_tmpl_alloc_pcpu(void) { struct nf_conntrack_zone zone = { .id = 0 }; struct nf_conn *tmp; int cpu; if (nft_ct_pcpu_template_refcnt) return true; for_each_possible_cpu(cpu) { tmp = nf_ct_tmpl_alloc(&init_net, &zone, GFP_KERNEL); if (!tmp) { nft_ct_tmpl_put_pcpu(); return false; } __set_bit(IPS_CONFIRMED_BIT, &tmp->status); per_cpu(nft_ct_pcpu_template, cpu) = tmp; } return true; } #endif static int nft_ct_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_ct *priv = nft_expr_priv(expr); unsigned int len; int err; priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); priv->dir = IP_CT_DIR_MAX; switch (priv->key) { case NFT_CT_DIRECTION: if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = sizeof(u8); break; case NFT_CT_STATE: case NFT_CT_STATUS: #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: #endif case NFT_CT_EXPIRATION: if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = sizeof(u32); break; #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = NF_CT_LABELS_MAX_SIZE; break; #endif case NFT_CT_HELPER: if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = NF_CT_HELPER_NAME_LEN; break; case NFT_CT_L3PROTOCOL: case NFT_CT_PROTOCOL: /* For compatibility, do not report error if NFTA_CT_DIRECTION * attribute is specified. */ len = sizeof(u8); break; case NFT_CT_SRC: case NFT_CT_DST: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; switch (ctx->family) { case NFPROTO_IPV4: len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip); break; case NFPROTO_IPV6: case NFPROTO_INET: len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip6); break; default: return -EAFNOSUPPORT; } break; case NFT_CT_SRC_IP: case NFT_CT_DST_IP: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip); break; case NFT_CT_SRC_IP6: case NFT_CT_DST_IP6: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip6); break; case NFT_CT_PROTO_SRC: case NFT_CT_PROTO_DST: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; len = sizeof_field(struct nf_conntrack_tuple, src.u.all); break; case NFT_CT_BYTES: case NFT_CT_PKTS: case NFT_CT_AVGPKT: len = sizeof(u64); break; #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: len = sizeof(u16); break; #endif case NFT_CT_ID: len = sizeof(u32); break; default: return -EOPNOTSUPP; } if (tb[NFTA_CT_DIRECTION] != NULL) { priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); switch (priv->dir) { case IP_CT_DIR_ORIGINAL: case IP_CT_DIR_REPLY: break; default: return -EINVAL; } } priv->len = len; err = nft_parse_register_store(ctx, tb[NFTA_CT_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); if (err < 0) return err; err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) return err; if (priv->key == NFT_CT_BYTES || priv->key == NFT_CT_PKTS || priv->key == NFT_CT_AVGPKT) nf_ct_set_acct(ctx->net, true); return 0; } static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) { switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: nf_connlabels_put(ctx->net); break; #endif #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: mutex_lock(&nft_ct_pcpu_mutex); if (--nft_ct_pcpu_template_refcnt == 0) nft_ct_tmpl_put_pcpu(); mutex_unlock(&nft_ct_pcpu_mutex); break; #endif default: break; } } static int nft_ct_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_ct *priv = nft_expr_priv(expr); unsigned int len; int err; priv->dir = IP_CT_DIR_MAX; priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; len = sizeof_field(struct nf_conn, mark); break; #endif #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; len = NF_CT_LABELS_MAX_SIZE; err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1); if (err) return err; break; #endif #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: mutex_lock(&nft_ct_pcpu_mutex); if (!nft_ct_tmpl_alloc_pcpu()) { mutex_unlock(&nft_ct_pcpu_mutex); return -ENOMEM; } nft_ct_pcpu_template_refcnt++; mutex_unlock(&nft_ct_pcpu_mutex); len = sizeof(u16); break; #endif #ifdef CONFIG_NF_CONNTRACK_EVENTS case NFT_CT_EVENTMASK: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; len = sizeof(u32); break; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; len = sizeof(u32); break; #endif default: return -EOPNOTSUPP; } if (tb[NFTA_CT_DIRECTION]) { priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); switch (priv->dir) { case IP_CT_DIR_ORIGINAL: case IP_CT_DIR_REPLY: break; default: err = -EINVAL; goto err1; } } priv->len = len; err = nft_parse_register_load(tb[NFTA_CT_SREG], &priv->sreg, len); if (err < 0) goto err1; err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) goto err1; return 0; err1: __nft_ct_set_destroy(ctx, priv); return err; } static void nft_ct_get_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { nf_ct_netns_put(ctx->net, ctx->family); } static void nft_ct_set_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_ct *priv = nft_expr_priv(expr); __nft_ct_set_destroy(ctx, priv); nf_ct_netns_put(ctx->net, ctx->family); } static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_ct *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_CT_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) goto nla_put_failure; switch (priv->key) { case NFT_CT_SRC: case NFT_CT_DST: case NFT_CT_SRC_IP: case NFT_CT_DST_IP: case NFT_CT_SRC_IP6: case NFT_CT_DST_IP6: case NFT_CT_PROTO_SRC: case NFT_CT_PROTO_DST: if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) goto nla_put_failure; break; case NFT_CT_BYTES: case NFT_CT_PKTS: case NFT_CT_AVGPKT: case NFT_CT_ZONE: if (priv->dir < IP_CT_DIR_MAX && nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) goto nla_put_failure; break; default: break; } return 0; nla_put_failure: return -1; } static bool nft_ct_get_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_ct *priv = nft_expr_priv(expr); const struct nft_ct *ct; if (!nft_reg_track_cmp(track, expr, priv->dreg)) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } ct = nft_expr_priv(track->regs[priv->dreg].selector); if (priv->key != ct->key) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } if (!track->regs[priv->dreg].bitwise) return true; return nft_expr_reduce_bitwise(track, expr); } static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_ct *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_CT_SREG, priv->sreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) goto nla_put_failure; switch (priv->key) { case NFT_CT_ZONE: if (priv->dir < IP_CT_DIR_MAX && nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) goto nla_put_failure; break; default: break; } return 0; nla_put_failure: return -1; } static struct nft_expr_type nft_ct_type; static const struct nft_expr_ops nft_ct_get_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_get_eval, .init = nft_ct_get_init, .destroy = nft_ct_get_destroy, .dump = nft_ct_get_dump, .reduce = nft_ct_get_reduce, }; static bool nft_ct_set_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { int i; for (i = 0; i < NFT_REG32_NUM; i++) { if (!track->regs[i].selector) continue; if (track->regs[i].selector->ops != &nft_ct_get_ops) continue; __nft_reg_track_cancel(track, i); } return false; } #ifdef CONFIG_RETPOLINE static const struct nft_expr_ops nft_ct_get_fast_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_get_fast_eval, .init = nft_ct_get_init, .destroy = nft_ct_get_destroy, .dump = nft_ct_get_dump, .reduce = nft_ct_set_reduce, }; #endif static const struct nft_expr_ops nft_ct_set_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_set_eval, .init = nft_ct_set_init, .destroy = nft_ct_set_destroy, .dump = nft_ct_set_dump, .reduce = nft_ct_set_reduce, }; #ifdef CONFIG_NF_CONNTRACK_ZONES static const struct nft_expr_ops nft_ct_set_zone_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_set_zone_eval, .init = nft_ct_set_init, .destroy = nft_ct_set_destroy, .dump = nft_ct_set_dump, .reduce = nft_ct_set_reduce, }; #endif static const struct nft_expr_ops * nft_ct_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { if (tb[NFTA_CT_KEY] == NULL) return ERR_PTR(-EINVAL); if (tb[NFTA_CT_DREG] && tb[NFTA_CT_SREG]) return ERR_PTR(-EINVAL); if (tb[NFTA_CT_DREG]) { #ifdef CONFIG_RETPOLINE u32 k = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); switch (k) { case NFT_CT_STATE: case NFT_CT_DIRECTION: case NFT_CT_STATUS: case NFT_CT_MARK: case NFT_CT_SECMARK: return &nft_ct_get_fast_ops; } #endif return &nft_ct_get_ops; } if (tb[NFTA_CT_SREG]) { #ifdef CONFIG_NF_CONNTRACK_ZONES if (nla_get_be32(tb[NFTA_CT_KEY]) == htonl(NFT_CT_ZONE)) return &nft_ct_set_zone_ops; #endif return &nft_ct_set_ops; } return ERR_PTR(-EINVAL); } static struct nft_expr_type nft_ct_type __read_mostly = { .name = "ct", .select_ops = nft_ct_select_ops, .policy = nft_ct_policy, .maxattr = NFTA_CT_MAX, .owner = THIS_MODULE, }; static void nft_notrack_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct sk_buff *skb = pkt->skb; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(pkt->skb, &ctinfo); /* Previously seen (loopback or untracked)? Ignore. */ if (ct || ctinfo == IP_CT_UNTRACKED) return; nf_ct_set(skb, ct, IP_CT_UNTRACKED); } static struct nft_expr_type nft_notrack_type; static const struct nft_expr_ops nft_notrack_ops = { .type = &nft_notrack_type, .size = NFT_EXPR_SIZE(0), .eval = nft_notrack_eval, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_notrack_type __read_mostly = { .name = "notrack", .ops = &nft_notrack_ops, .owner = THIS_MODULE, }; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT static int nft_ct_timeout_parse_policy(void *timeouts, const struct nf_conntrack_l4proto *l4proto, struct net *net, const struct nlattr *attr) { struct nlattr **tb; int ret = 0; tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb), GFP_KERNEL); if (!tb) return -ENOMEM; ret = nla_parse_nested_deprecated(tb, l4proto->ctnl_timeout.nlattr_max, attr, l4proto->ctnl_timeout.nla_policy, NULL); if (ret < 0) goto err; ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts); err: kfree(tb); return ret; } struct nft_ct_timeout_obj { struct nf_ct_timeout *timeout; u8 l4proto; }; static void nft_ct_timeout_obj_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct_timeout_obj *priv = nft_obj_data(obj); struct nf_conn *ct = (struct nf_conn *)skb_nfct(pkt->skb); struct nf_conn_timeout *timeout; const unsigned int *values; if (priv->l4proto != pkt->tprot) return; if (!ct || nf_ct_is_template(ct) || nf_ct_is_confirmed(ct)) return; timeout = nf_ct_timeout_find(ct); if (!timeout) { timeout = nf_ct_timeout_ext_add(ct, priv->timeout, GFP_ATOMIC); if (!timeout) { regs->verdict.code = NF_DROP; return; } } rcu_assign_pointer(timeout->timeout, priv->timeout); /* adjust the timeout as per 'new' state. ct is unconfirmed, * so the current timestamp must not be added. */ values = nf_ct_timeout_data(timeout); if (values) nf_ct_refresh(ct, pkt->skb, values[0]); } static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_ct_timeout_obj *priv = nft_obj_data(obj); const struct nf_conntrack_l4proto *l4proto; struct nf_ct_timeout *timeout; int l3num = ctx->family; __u8 l4num; int ret; if (!tb[NFTA_CT_TIMEOUT_L4PROTO] || !tb[NFTA_CT_TIMEOUT_DATA]) return -EINVAL; if (tb[NFTA_CT_TIMEOUT_L3PROTO]) l3num = ntohs(nla_get_be16(tb[NFTA_CT_TIMEOUT_L3PROTO])); l4num = nla_get_u8(tb[NFTA_CT_TIMEOUT_L4PROTO]); priv->l4proto = l4num; l4proto = nf_ct_l4proto_find(l4num); if (l4proto->l4proto != l4num) { ret = -EOPNOTSUPP; goto err_proto_put; } timeout = kzalloc(sizeof(struct nf_ct_timeout) + l4proto->ctnl_timeout.obj_size, GFP_KERNEL); if (timeout == NULL) { ret = -ENOMEM; goto err_proto_put; } ret = nft_ct_timeout_parse_policy(&timeout->data, l4proto, ctx->net, tb[NFTA_CT_TIMEOUT_DATA]); if (ret < 0) goto err_free_timeout; timeout->l3num = l3num; timeout->l4proto = l4proto; ret = nf_ct_netns_get(ctx->net, ctx->family); if (ret < 0) goto err_free_timeout; priv->timeout = timeout; return 0; err_free_timeout: kfree(timeout); err_proto_put: return ret; } static void nft_ct_timeout_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { struct nft_ct_timeout_obj *priv = nft_obj_data(obj); struct nf_ct_timeout *timeout = priv->timeout; nf_ct_untimeout(ctx->net, timeout); nf_ct_netns_put(ctx->net, ctx->family); kfree(priv->timeout); } static int nft_ct_timeout_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { const struct nft_ct_timeout_obj *priv = nft_obj_data(obj); const struct nf_ct_timeout *timeout = priv->timeout; struct nlattr *nest_params; int ret; if (nla_put_u8(skb, NFTA_CT_TIMEOUT_L4PROTO, timeout->l4proto->l4proto) || nla_put_be16(skb, NFTA_CT_TIMEOUT_L3PROTO, htons(timeout->l3num))) return -1; nest_params = nla_nest_start(skb, NFTA_CT_TIMEOUT_DATA); if (!nest_params) return -1; ret = timeout->l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->data); if (ret < 0) return -1; nla_nest_end(skb, nest_params); return 0; } static const struct nla_policy nft_ct_timeout_policy[NFTA_CT_TIMEOUT_MAX + 1] = { [NFTA_CT_TIMEOUT_L3PROTO] = {.type = NLA_U16 }, [NFTA_CT_TIMEOUT_L4PROTO] = {.type = NLA_U8 }, [NFTA_CT_TIMEOUT_DATA] = {.type = NLA_NESTED }, }; static struct nft_object_type nft_ct_timeout_obj_type; static const struct nft_object_ops nft_ct_timeout_obj_ops = { .type = &nft_ct_timeout_obj_type, .size = sizeof(struct nft_ct_timeout_obj), .eval = nft_ct_timeout_obj_eval, .init = nft_ct_timeout_obj_init, .destroy = nft_ct_timeout_obj_destroy, .dump = nft_ct_timeout_obj_dump, }; static struct nft_object_type nft_ct_timeout_obj_type __read_mostly = { .type = NFT_OBJECT_CT_TIMEOUT, .ops = &nft_ct_timeout_obj_ops, .maxattr = NFTA_CT_TIMEOUT_MAX, .policy = nft_ct_timeout_policy, .owner = THIS_MODULE, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_ct_helper_obj *priv = nft_obj_data(obj); struct nf_conntrack_helper *help4, *help6; char name[NF_CT_HELPER_NAME_LEN]; int family = ctx->family; int err; if (!tb[NFTA_CT_HELPER_NAME] || !tb[NFTA_CT_HELPER_L4PROTO]) return -EINVAL; priv->l4proto = nla_get_u8(tb[NFTA_CT_HELPER_L4PROTO]); if (!priv->l4proto) return -ENOENT; nla_strscpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name)); if (tb[NFTA_CT_HELPER_L3PROTO]) family = ntohs(nla_get_be16(tb[NFTA_CT_HELPER_L3PROTO])); help4 = NULL; help6 = NULL; switch (family) { case NFPROTO_IPV4: if (ctx->family == NFPROTO_IPV6) return -EINVAL; help4 = nf_conntrack_helper_try_module_get(name, family, priv->l4proto); break; case NFPROTO_IPV6: if (ctx->family == NFPROTO_IPV4) return -EINVAL; help6 = nf_conntrack_helper_try_module_get(name, family, priv->l4proto); break; case NFPROTO_NETDEV: case NFPROTO_BRIDGE: case NFPROTO_INET: help4 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV4, priv->l4proto); help6 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV6, priv->l4proto); break; default: return -EAFNOSUPPORT; } /* && is intentional; only error if INET found neither ipv4 or ipv6 */ if (!help4 && !help6) return -ENOENT; priv->helper4 = help4; priv->helper6 = help6; err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) goto err_put_helper; return 0; err_put_helper: if (priv->helper4) nf_conntrack_helper_put(priv->helper4); if (priv->helper6) nf_conntrack_helper_put(priv->helper6); return err; } static void nft_ct_helper_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { struct nft_ct_helper_obj *priv = nft_obj_data(obj); if (priv->helper4) nf_conntrack_helper_put(priv->helper4); if (priv->helper6) nf_conntrack_helper_put(priv->helper6); nf_ct_netns_put(ctx->net, ctx->family); } static void nft_ct_helper_obj_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct_helper_obj *priv = nft_obj_data(obj); struct nf_conn *ct = (struct nf_conn *)skb_nfct(pkt->skb); struct nf_conntrack_helper *to_assign = NULL; struct nf_conn_help *help; if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct) || priv->l4proto != nf_ct_protonum(ct)) return; switch (nf_ct_l3num(ct)) { case NFPROTO_IPV4: to_assign = priv->helper4; break; case NFPROTO_IPV6: to_assign = priv->helper6; break; default: WARN_ON_ONCE(1); return; } if (!to_assign) return; if (test_bit(IPS_HELPER_BIT, &ct->status)) return; help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help) { rcu_assign_pointer(help->helper, to_assign); set_bit(IPS_HELPER_BIT, &ct->status); } } static int nft_ct_helper_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { const struct nft_ct_helper_obj *priv = nft_obj_data(obj); const struct nf_conntrack_helper *helper; u16 family; if (priv->helper4 && priv->helper6) { family = NFPROTO_INET; helper = priv->helper4; } else if (priv->helper6) { family = NFPROTO_IPV6; helper = priv->helper6; } else { family = NFPROTO_IPV4; helper = priv->helper4; } if (nla_put_string(skb, NFTA_CT_HELPER_NAME, helper->name)) return -1; if (nla_put_u8(skb, NFTA_CT_HELPER_L4PROTO, priv->l4proto)) return -1; if (nla_put_be16(skb, NFTA_CT_HELPER_L3PROTO, htons(family))) return -1; return 0; } static const struct nla_policy nft_ct_helper_policy[NFTA_CT_HELPER_MAX + 1] = { [NFTA_CT_HELPER_NAME] = { .type = NLA_STRING, .len = NF_CT_HELPER_NAME_LEN - 1 }, [NFTA_CT_HELPER_L3PROTO] = { .type = NLA_U16 }, [NFTA_CT_HELPER_L4PROTO] = { .type = NLA_U8 }, }; static struct nft_object_type nft_ct_helper_obj_type; static const struct nft_object_ops nft_ct_helper_obj_ops = { .type = &nft_ct_helper_obj_type, .size = sizeof(struct nft_ct_helper_obj), .eval = nft_ct_helper_obj_eval, .init = nft_ct_helper_obj_init, .destroy = nft_ct_helper_obj_destroy, .dump = nft_ct_helper_obj_dump, }; static struct nft_object_type nft_ct_helper_obj_type __read_mostly = { .type = NFT_OBJECT_CT_HELPER, .ops = &nft_ct_helper_obj_ops, .maxattr = NFTA_CT_HELPER_MAX, .policy = nft_ct_helper_policy, .owner = THIS_MODULE, }; struct nft_ct_expect_obj { u16 l3num; __be16 dport; u8 l4proto; u8 size; u32 timeout; }; static int nft_ct_expect_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_ct_expect_obj *priv = nft_obj_data(obj); if (!tb[NFTA_CT_EXPECT_L4PROTO] || !tb[NFTA_CT_EXPECT_DPORT] || !tb[NFTA_CT_EXPECT_TIMEOUT] || !tb[NFTA_CT_EXPECT_SIZE]) return -EINVAL; priv->l3num = ctx->family; if (tb[NFTA_CT_EXPECT_L3PROTO]) priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO])); switch (priv->l3num) { case NFPROTO_IPV4: case NFPROTO_IPV6: if (priv->l3num != ctx->family) return -EINVAL; fallthrough; case NFPROTO_INET: break; default: return -EOPNOTSUPP; } priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]); switch (priv->l4proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_DCCP: case IPPROTO_SCTP: break; default: return -EOPNOTSUPP; } priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]); priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]); priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]); return nf_ct_netns_get(ctx->net, ctx->family); } static void nft_ct_expect_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { nf_ct_netns_put(ctx->net, ctx->family); } static int nft_ct_expect_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { const struct nft_ct_expect_obj *priv = nft_obj_data(obj); if (nla_put_be16(skb, NFTA_CT_EXPECT_L3PROTO, htons(priv->l3num)) || nla_put_u8(skb, NFTA_CT_EXPECT_L4PROTO, priv->l4proto) || nla_put_be16(skb, NFTA_CT_EXPECT_DPORT, priv->dport) || nla_put_u32(skb, NFTA_CT_EXPECT_TIMEOUT, priv->timeout) || nla_put_u8(skb, NFTA_CT_EXPECT_SIZE, priv->size)) return -1; return 0; } static void nft_ct_expect_obj_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct_expect_obj *priv = nft_obj_data(obj); struct nf_conntrack_expect *exp; enum ip_conntrack_info ctinfo; struct nf_conn_help *help; enum ip_conntrack_dir dir; u16 l3num = priv->l3num; struct nf_conn *ct; ct = nf_ct_get(pkt->skb, &ctinfo); if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) { regs->verdict.code = NFT_BREAK; return; } dir = CTINFO2DIR(ctinfo); help = nfct_help(ct); if (!help) help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (!help) { regs->verdict.code = NF_DROP; return; } if (help->expecting[NF_CT_EXPECT_CLASS_DEFAULT] >= priv->size) { regs->verdict.code = NFT_BREAK; return; } if (l3num == NFPROTO_INET) l3num = nf_ct_l3num(ct); exp = nf_ct_expect_alloc(ct); if (exp == NULL) { regs->verdict.code = NF_DROP; return; } nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, l3num, &ct->tuplehash[!dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3, priv->l4proto, NULL, &priv->dport); exp->timeout.expires = jiffies + priv->timeout * HZ; if (nf_ct_expect_related(exp, 0) != 0) regs->verdict.code = NF_DROP; } static const struct nla_policy nft_ct_expect_policy[NFTA_CT_EXPECT_MAX + 1] = { [NFTA_CT_EXPECT_L3PROTO] = { .type = NLA_U16 }, [NFTA_CT_EXPECT_L4PROTO] = { .type = NLA_U8 }, [NFTA_CT_EXPECT_DPORT] = { .type = NLA_U16 }, [NFTA_CT_EXPECT_TIMEOUT] = { .type = NLA_U32 }, [NFTA_CT_EXPECT_SIZE] = { .type = NLA_U8 }, }; static struct nft_object_type nft_ct_expect_obj_type; static const struct nft_object_ops nft_ct_expect_obj_ops = { .type = &nft_ct_expect_obj_type, .size = sizeof(struct nft_ct_expect_obj), .eval = nft_ct_expect_obj_eval, .init = nft_ct_expect_obj_init, .destroy = nft_ct_expect_obj_destroy, .dump = nft_ct_expect_obj_dump, }; static struct nft_object_type nft_ct_expect_obj_type __read_mostly = { .type = NFT_OBJECT_CT_EXPECT, .ops = &nft_ct_expect_obj_ops, .maxattr = NFTA_CT_EXPECT_MAX, .policy = nft_ct_expect_policy, .owner = THIS_MODULE, }; static int __init nft_ct_module_init(void) { int err; BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > NFT_REG_SIZE); err = nft_register_expr(&nft_ct_type); if (err < 0) return err; err = nft_register_expr(&nft_notrack_type); if (err < 0) goto err1; err = nft_register_obj(&nft_ct_helper_obj_type); if (err < 0) goto err2; err = nft_register_obj(&nft_ct_expect_obj_type); if (err < 0) goto err3; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT err = nft_register_obj(&nft_ct_timeout_obj_type); if (err < 0) goto err4; #endif return 0; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT err4: nft_unregister_obj(&nft_ct_expect_obj_type); #endif err3: nft_unregister_obj(&nft_ct_helper_obj_type); err2: nft_unregister_expr(&nft_notrack_type); err1: nft_unregister_expr(&nft_ct_type); return err; } static void __exit nft_ct_module_exit(void) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT nft_unregister_obj(&nft_ct_timeout_obj_type); #endif nft_unregister_obj(&nft_ct_expect_obj_type); nft_unregister_obj(&nft_ct_helper_obj_type); nft_unregister_expr(&nft_notrack_type); nft_unregister_expr(&nft_ct_type); } module_init(nft_ct_module_init); module_exit(nft_ct_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS_NFT_EXPR("ct"); MODULE_ALIAS_NFT_EXPR("notrack"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_EXPECT); MODULE_DESCRIPTION("Netfilter nf_tables conntrack module");
1 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _NET_CORE_DEV_H #define _NET_CORE_DEV_H #include <linux/types.h> struct net; struct net_device; struct netdev_bpf; struct netdev_phys_item_id; struct netlink_ext_ack; struct cpumask; /* Random bits of netdevice that don't need to be exposed */ #define FLOW_LIMIT_HISTORY (1 << 7) /* must be ^2 and !overflow buckets */ struct sd_flow_limit { u64 count; unsigned int num_buckets; unsigned int history_head; u16 history[FLOW_LIMIT_HISTORY]; u8 buckets[]; }; extern int netdev_flow_limit_table_len; #ifdef CONFIG_PROC_FS int __init dev_proc_init(void); #else #define dev_proc_init() 0 #endif void linkwatch_init_dev(struct net_device *dev); void linkwatch_run_queue(void); void dev_addr_flush(struct net_device *dev); int dev_addr_init(struct net_device *dev); void dev_addr_check(struct net_device *dev); /* sysctls not referred to from outside net/core/ */ extern int netdev_budget; extern unsigned int netdev_budget_usecs; extern unsigned int sysctl_skb_defer_max; extern int netdev_tstamp_prequeue; extern int netdev_unregister_timeout_secs; extern int weight_p; extern int dev_weight_rx_bias; extern int dev_weight_tx_bias; /* rtnl helpers */ extern struct list_head net_todo_list; void netdev_run_todo(void); /* netdev management, shared between various uAPI entry points */ struct netdev_name_node { struct hlist_node hlist; struct list_head list; struct net_device *dev; const char *name; }; int netdev_get_name(struct net *net, char *name, int ifindex); int dev_change_name(struct net_device *dev, const char *newname); #define netdev_for_each_altname(dev, namenode) \ list_for_each_entry((namenode), &(dev)->name_node->list, list) #define netdev_for_each_altname_safe(dev, namenode, next) \ list_for_each_entry_safe((namenode), (next), &(dev)->name_node->list, \ list) int netdev_name_node_alt_create(struct net_device *dev, const char *name); int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); int dev_validate_mtu(struct net_device *dev, int mtu, struct netlink_ext_ack *extack); int dev_set_mtu_ext(struct net_device *dev, int mtu, struct netlink_ext_ack *extack); int dev_get_phys_port_id(struct net_device *dev, struct netdev_phys_item_id *ppid); int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len); int dev_change_proto_down(struct net_device *dev, bool proto_down); void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask, u32 value); typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags); int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len); void dev_set_group(struct net_device *dev, int new_group); int dev_change_carrier(struct net_device *dev, bool new_carrier); void __dev_set_rx_mode(struct net_device *dev); void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, unsigned int gchanges, u32 portid, const struct nlmsghdr *nlh); void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh); static inline void netif_set_gso_max_size(struct net_device *dev, unsigned int size) { /* dev->gso_max_size is read locklessly from sk_setup_caps() */ WRITE_ONCE(dev->gso_max_size, size); if (size <= GSO_LEGACY_MAX_SIZE) WRITE_ONCE(dev->gso_ipv4_max_size, size); } static inline void netif_set_gso_max_segs(struct net_device *dev, unsigned int segs) { /* dev->gso_max_segs is read locklessly from sk_setup_caps() */ WRITE_ONCE(dev->gso_max_segs, segs); } static inline void netif_set_gro_max_size(struct net_device *dev, unsigned int size) { /* This pairs with the READ_ONCE() in skb_gro_receive() */ WRITE_ONCE(dev->gro_max_size, size); if (size <= GRO_LEGACY_MAX_SIZE) WRITE_ONCE(dev->gro_ipv4_max_size, size); } static inline void netif_set_gso_ipv4_max_size(struct net_device *dev, unsigned int size) { /* dev->gso_ipv4_max_size is read locklessly from sk_setup_caps() */ WRITE_ONCE(dev->gso_ipv4_max_size, size); } static inline void netif_set_gro_ipv4_max_size(struct net_device *dev, unsigned int size) { /* This pairs with the READ_ONCE() in skb_gro_receive() */ WRITE_ONCE(dev->gro_ipv4_max_size, size); } int rps_cpumask_housekeeping(struct cpumask *mask); #if defined(CONFIG_DEBUG_NET) && defined(CONFIG_BPF_SYSCALL) void xdp_do_check_flushed(struct napi_struct *napi); #else static inline void xdp_do_check_flushed(struct napi_struct *napi) { } #endif struct napi_struct *napi_by_id(unsigned int napi_id); #endif
585 5643 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NETFILTER_NETDEV_H_ #define _NETFILTER_NETDEV_H_ #include <linux/netfilter.h> #include <linux/netdevice.h> #ifdef CONFIG_NETFILTER_INGRESS static inline bool nf_hook_ingress_active(const struct sk_buff *skb) { #ifdef CONFIG_JUMP_LABEL if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS])) return false; #endif return rcu_access_pointer(skb->dev->nf_hooks_ingress); } /* caller must hold rcu_read_lock */ static inline int nf_hook_ingress(struct sk_buff *skb) { struct nf_hook_entries *e = rcu_dereference(skb->dev->nf_hooks_ingress); struct nf_hook_state state; int ret; /* Must recheck the ingress hook head, in the event it became NULL * after the check in nf_hook_ingress_active evaluated to true. */ if (unlikely(!e)) return 0; nf_hook_state_init(&state, NF_NETDEV_INGRESS, NFPROTO_NETDEV, skb->dev, NULL, NULL, dev_net(skb->dev), NULL); ret = nf_hook_slow(skb, &state, e, 0); if (ret == 0) return -1; return ret; } #else /* CONFIG_NETFILTER_INGRESS */ static inline int nf_hook_ingress_active(struct sk_buff *skb) { return 0; } static inline int nf_hook_ingress(struct sk_buff *skb) { return 0; } #endif /* CONFIG_NETFILTER_INGRESS */ #ifdef CONFIG_NETFILTER_EGRESS static inline bool nf_hook_egress_active(void) { #ifdef CONFIG_JUMP_LABEL if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_EGRESS])) return false; #endif return true; } /** * nf_hook_egress - classify packets before transmission * @skb: packet to be classified * @rc: result code which shall be returned by __dev_queue_xmit() on failure * @dev: netdev whose egress hooks shall be applied to @skb * * Returns @skb on success or %NULL if the packet was consumed or filtered. * Caller must hold rcu_read_lock. * * On ingress, packets are classified first by tc, then by netfilter. * On egress, the order is reversed for symmetry. Conceptually, tc and * netfilter can be thought of as layers, with netfilter layered above tc: * When tc redirects a packet to another interface, netfilter is not applied * because the packet is on the tc layer. * * The nf_skip_egress flag controls whether netfilter is applied on egress. * It is updated by __netif_receive_skb_core() and __dev_queue_xmit() when the * packet passes through tc and netfilter. Because __dev_queue_xmit() may be * called recursively by tunnel drivers such as vxlan, the flag is reverted to * false after sch_handle_egress(). This ensures that netfilter is applied * both on the overlay and underlying network. */ static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, struct net_device *dev) { struct nf_hook_entries *e; struct nf_hook_state state; int ret; #ifdef CONFIG_NETFILTER_SKIP_EGRESS if (skb->nf_skip_egress) return skb; #endif e = rcu_dereference_check(dev->nf_hooks_egress, rcu_read_lock_bh_held()); if (!e) return skb; nf_hook_state_init(&state, NF_NETDEV_EGRESS, NFPROTO_NETDEV, NULL, dev, NULL, dev_net(dev), NULL); /* nf assumes rcu_read_lock, not just read_lock_bh */ rcu_read_lock(); ret = nf_hook_slow(skb, &state, e, 0); rcu_read_unlock(); if (ret == 1) { return skb; } else if (ret < 0) { *rc = NET_XMIT_DROP; return NULL; } else { /* ret == 0 */ *rc = NET_XMIT_SUCCESS; return NULL; } } #else /* CONFIG_NETFILTER_EGRESS */ static inline bool nf_hook_egress_active(void) { return false; } static inline struct sk_buff *nf_hook_egress(struct sk_buff *skb, int *rc, struct net_device *dev) { return skb; } #endif /* CONFIG_NETFILTER_EGRESS */ static inline void nf_skip_egress(struct sk_buff *skb, bool skip) { #ifdef CONFIG_NETFILTER_SKIP_EGRESS skb->nf_skip_egress = skip; #endif } static inline void nf_hook_netdev_init(struct net_device *dev) { #ifdef CONFIG_NETFILTER_INGRESS RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); #endif #ifdef CONFIG_NETFILTER_EGRESS RCU_INIT_POINTER(dev->nf_hooks_egress, NULL); #endif } #endif /* _NETFILTER_NETDEV_H_ */
2 2 25 25 25 25 24 23 23 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 // SPDX-License-Identifier: GPL-2.0 /* * fs/partitions/atari.c * * Code extracted from drivers/block/genhd.c * * Copyright (C) 1991-1998 Linus Torvalds * Re-organised Feb 1998 Russell King */ #include <linux/ctype.h> #include "check.h" #include "atari.h" /* ++guenther: this should be settable by the user ("make config")?. */ #define ICD_PARTS /* check if a partition entry looks valid -- Atari format is assumed if at least one of the primary entries is ok this way */ #define VALID_PARTITION(pi,hdsiz) \ (((pi)->flg & 1) && \ isalnum((pi)->id[0]) && isalnum((pi)->id[1]) && isalnum((pi)->id[2]) && \ be32_to_cpu((pi)->st) <= (hdsiz) && \ be32_to_cpu((pi)->st) + be32_to_cpu((pi)->siz) <= (hdsiz)) static inline int OK_id(char *s) { return memcmp (s, "GEM", 3) == 0 || memcmp (s, "BGM", 3) == 0 || memcmp (s, "LNX", 3) == 0 || memcmp (s, "SWP", 3) == 0 || memcmp (s, "RAW", 3) == 0 ; } int atari_partition(struct parsed_partitions *state) { Sector sect; struct rootsector *rs; struct partition_info *pi; u32 extensect; u32 hd_size; int slot; #ifdef ICD_PARTS int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ #endif /* * ATARI partition scheme supports 512 lba only. If this is not * the case, bail early to avoid miscalculating hd_size. */ if (queue_logical_block_size(state->disk->queue) != 512) return 0; rs = read_part_sector(state, 0, &sect); if (!rs) return -1; /* Verify this is an Atari rootsector: */ hd_size = get_capacity(state->disk); if (!VALID_PARTITION(&rs->part[0], hd_size) && !VALID_PARTITION(&rs->part[1], hd_size) && !VALID_PARTITION(&rs->part[2], hd_size) && !VALID_PARTITION(&rs->part[3], hd_size)) { /* * if there's no valid primary partition, assume that no Atari * format partition table (there's no reliable magic or the like * :-() */ put_dev_sector(sect); return 0; } pi = &rs->part[0]; strlcat(state->pp_buf, " AHDI", PAGE_SIZE); for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) { struct rootsector *xrs; Sector sect2; ulong partsect; if ( !(pi->flg & 1) ) continue; /* active partition */ if (memcmp (pi->id, "XGM", 3) != 0) { /* we don't care about other id's */ put_partition (state, slot, be32_to_cpu(pi->st), be32_to_cpu(pi->siz)); continue; } /* extension partition */ #ifdef ICD_PARTS part_fmt = 1; #endif strlcat(state->pp_buf, " XGM<", PAGE_SIZE); partsect = extensect = be32_to_cpu(pi->st); while (1) { xrs = read_part_sector(state, partsect, &sect2); if (!xrs) { printk (" block %ld read failed\n", partsect); put_dev_sector(sect); return -1; } /* ++roman: sanity check: bit 0 of flg field must be set */ if (!(xrs->part[0].flg & 1)) { printk( "\nFirst sub-partition in extended partition is not valid!\n" ); put_dev_sector(sect2); break; } put_partition(state, slot, partsect + be32_to_cpu(xrs->part[0].st), be32_to_cpu(xrs->part[0].siz)); if (!(xrs->part[1].flg & 1)) { /* end of linked partition list */ put_dev_sector(sect2); break; } if (memcmp( xrs->part[1].id, "XGM", 3 ) != 0) { printk("\nID of extended partition is not XGM!\n"); put_dev_sector(sect2); break; } partsect = be32_to_cpu(xrs->part[1].st) + extensect; put_dev_sector(sect2); if (++slot == state->limit) { printk( "\nMaximum number of partitions reached!\n" ); break; } } strlcat(state->pp_buf, " >", PAGE_SIZE); } #ifdef ICD_PARTS if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */ pi = &rs->icdpart[0]; /* sanity check: no ICD format if first partition invalid */ if (OK_id(pi->id)) { strlcat(state->pp_buf, " ICD<", PAGE_SIZE); for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) { /* accept only GEM,BGM,RAW,LNX,SWP partitions */ if (!((pi->flg & 1) && OK_id(pi->id))) continue; put_partition (state, slot, be32_to_cpu(pi->st), be32_to_cpu(pi->siz)); } strlcat(state->pp_buf, " >", PAGE_SIZE); } } #endif put_dev_sector(sect); strlcat(state->pp_buf, "\n", PAGE_SIZE); return 1; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 /* SPDX-License-Identifier: GPL-2.0 */ /* include/net/dsfield.h - Manipulation of the Differentiated Services field */ /* Written 1998-2000 by Werner Almesberger, EPFL ICA */ #ifndef __NET_DSFIELD_H #define __NET_DSFIELD_H #include <linux/types.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <asm/byteorder.h> static inline __u8 ipv4_get_dsfield(const struct iphdr *iph) { return iph->tos; } static inline __u8 ipv6_get_dsfield(const struct ipv6hdr *ipv6h) { return ntohs(*(__force const __be16 *)ipv6h) >> 4; } static inline void ipv4_change_dsfield(struct iphdr *iph,__u8 mask, __u8 value) { __u32 check = ntohs((__force __be16)iph->check); __u8 dsfield; dsfield = (iph->tos & mask) | value; check += iph->tos; if ((check+1) >> 16) check = (check+1) & 0xffff; check -= dsfield; check += check >> 16; /* adjust carry */ iph->check = (__force __sum16)htons(check); iph->tos = dsfield; } static inline void ipv6_change_dsfield(struct ipv6hdr *ipv6h,__u8 mask, __u8 value) { __be16 *p = (__force __be16 *)ipv6h; *p = (*p & htons((((u16)mask << 4) | 0xf00f))) | htons((u16)value << 4); } #endif
1 25 25 25 7 25 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 // SPDX-License-Identifier: GPL-2.0 /* * fs/partitions/amiga.c * * Code extracted from drivers/block/genhd.c * * Copyright (C) 1991-1998 Linus Torvalds * Re-organised Feb 1998 Russell King */ #define pr_fmt(fmt) fmt #include <linux/types.h> #include <linux/mm_types.h> #include <linux/overflow.h> #include <linux/affs_hardblocks.h> #include "check.h" /* magic offsets in partition DosEnvVec */ #define NR_HD 3 #define NR_SECT 5 #define LO_CYL 9 #define HI_CYL 10 static __inline__ u32 checksum_block(__be32 *m, int size) { u32 sum = 0; while (size--) sum += be32_to_cpu(*m++); return sum; } int amiga_partition(struct parsed_partitions *state) { Sector sect; unsigned char *data; struct RigidDiskBlock *rdb; struct PartitionBlock *pb; u64 start_sect, nr_sects; sector_t blk, end_sect; u32 cylblk; /* rdb_CylBlocks = nr_heads*sect_per_track */ u32 nr_hd, nr_sect, lo_cyl, hi_cyl; int part, res = 0; unsigned int blksize = 1; /* Multiplier for disk block size */ int slot = 1; for (blk = 0; ; blk++, put_dev_sector(sect)) { if (blk == RDB_ALLOCATION_LIMIT) goto rdb_done; data = read_part_sector(state, blk, &sect); if (!data) { pr_err("Dev %s: unable to read RDB block %llu\n", state->disk->disk_name, blk); res = -1; goto rdb_done; } if (*(__be32 *)data != cpu_to_be32(IDNAME_RIGIDDISK)) continue; rdb = (struct RigidDiskBlock *)data; if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F) == 0) break; /* Try again with 0xdc..0xdf zeroed, Windows might have * trashed it. */ *(__be32 *)(data+0xdc) = 0; if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) { pr_err("Trashed word at 0xd0 in block %llu ignored in checksum calculation\n", blk); break; } pr_err("Dev %s: RDB in block %llu has bad checksum\n", state->disk->disk_name, blk); } /* blksize is blocks per 512 byte standard block */ blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512; { char tmp[7 + 10 + 1 + 1]; /* Be more informative */ snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512); strlcat(state->pp_buf, tmp, PAGE_SIZE); } blk = be32_to_cpu(rdb->rdb_PartitionList); put_dev_sector(sect); for (part = 1; (s32) blk>0 && part<=16; part++, put_dev_sector(sect)) { /* Read in terms partition table understands */ if (check_mul_overflow(blk, (sector_t) blksize, &blk)) { pr_err("Dev %s: overflow calculating partition block %llu! Skipping partitions %u and beyond\n", state->disk->disk_name, blk, part); break; } data = read_part_sector(state, blk, &sect); if (!data) { pr_err("Dev %s: unable to read partition block %llu\n", state->disk->disk_name, blk); res = -1; goto rdb_done; } pb = (struct PartitionBlock *)data; blk = be32_to_cpu(pb->pb_Next); if (pb->pb_ID != cpu_to_be32(IDNAME_PARTITION)) continue; if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 ) continue; /* RDB gives us more than enough rope to hang ourselves with, * many times over (2^128 bytes if all fields max out). * Some careful checks are in order, so check for potential * overflows. * We are multiplying four 32 bit numbers to one sector_t! */ nr_hd = be32_to_cpu(pb->pb_Environment[NR_HD]); nr_sect = be32_to_cpu(pb->pb_Environment[NR_SECT]); /* CylBlocks is total number of blocks per cylinder */ if (check_mul_overflow(nr_hd, nr_sect, &cylblk)) { pr_err("Dev %s: heads*sects %u overflows u32, skipping partition!\n", state->disk->disk_name, cylblk); continue; } /* check for consistency with RDB defined CylBlocks */ if (cylblk > be32_to_cpu(rdb->rdb_CylBlocks)) { pr_warn("Dev %s: cylblk %u > rdb_CylBlocks %u!\n", state->disk->disk_name, cylblk, be32_to_cpu(rdb->rdb_CylBlocks)); } /* RDB allows for variable logical block size - * normalize to 512 byte blocks and check result. */ if (check_mul_overflow(cylblk, blksize, &cylblk)) { pr_err("Dev %s: partition %u bytes per cyl. overflows u32, skipping partition!\n", state->disk->disk_name, part); continue; } /* Calculate partition start and end. Limit of 32 bit on cylblk * guarantees no overflow occurs if LBD support is enabled. */ lo_cyl = be32_to_cpu(pb->pb_Environment[LO_CYL]); start_sect = ((u64) lo_cyl * cylblk); hi_cyl = be32_to_cpu(pb->pb_Environment[HI_CYL]); nr_sects = (((u64) hi_cyl - lo_cyl + 1) * cylblk); if (!nr_sects) continue; /* Warn user if partition end overflows u32 (AmigaDOS limit) */ if ((start_sect + nr_sects) > UINT_MAX) { pr_warn("Dev %s: partition %u (%llu-%llu) needs 64 bit device support!\n", state->disk->disk_name, part, start_sect, start_sect + nr_sects); } if (check_add_overflow(start_sect, nr_sects, &end_sect)) { pr_err("Dev %s: partition %u (%llu-%llu) needs LBD device support, skipping partition!\n", state->disk->disk_name, part, start_sect, end_sect); continue; } /* Tell Kernel about it */ put_partition(state,slot++,start_sect,nr_sects); { /* Be even more informative to aid mounting */ char dostype[4]; char tmp[42]; __be32 *dt = (__be32 *)dostype; *dt = pb->pb_Environment[16]; if (dostype[3] < ' ') snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)", dostype[0], dostype[1], dostype[2], dostype[3] + '@' ); else snprintf(tmp, sizeof(tmp), " (%c%c%c%c)", dostype[0], dostype[1], dostype[2], dostype[3]); strlcat(state->pp_buf, tmp, PAGE_SIZE); snprintf(tmp, sizeof(tmp), "(res %d spb %d)", be32_to_cpu(pb->pb_Environment[6]), be32_to_cpu(pb->pb_Environment[4])); strlcat(state->pp_buf, tmp, PAGE_SIZE); } res = 1; } strlcat(state->pp_buf, "\n", PAGE_SIZE); rdb_done: return res; }
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 // SPDX-License-Identifier: GPL-2.0-only /* * drivers/mfd/mfd-core.c * * core MFD support * Copyright (c) 2006 Ian Molton * Copyright (c) 2007,2008 Dmitry Baryshkov */ #include <linux/kernel.h> #include <linux/platform_device.h> #include <linux/acpi.h> #include <linux/list.h> #include <linux/property.h> #include <linux/mfd/core.h> #include <linux/pm_runtime.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/irqdomain.h> #include <linux/of.h> #include <linux/of_address.h> #include <linux/regulator/consumer.h> static LIST_HEAD(mfd_of_node_list); struct mfd_of_node_entry { struct list_head list; struct device *dev; struct device_node *np; }; static struct device_type mfd_dev_type = { .name = "mfd_device", }; #if IS_ENABLED(CONFIG_ACPI) struct match_ids_walk_data { struct acpi_device_id *ids; struct acpi_device *adev; }; static int match_device_ids(struct acpi_device *adev, void *data) { struct match_ids_walk_data *wd = data; if (!acpi_match_device_ids(adev, wd->ids)) { wd->adev = adev; return 1; } return 0; } static void mfd_acpi_add_device(const struct mfd_cell *cell, struct platform_device *pdev) { const struct mfd_cell_acpi_match *match = cell->acpi_match; struct acpi_device *adev = NULL; struct acpi_device *parent; parent = ACPI_COMPANION(pdev->dev.parent); if (!parent) return; /* * MFD child device gets its ACPI handle either from the ACPI device * directly under the parent that matches the either _HID or _CID, or * _ADR or it will use the parent handle if is no ID is given. * * Note that use of _ADR is a grey area in the ACPI specification, * though at least Intel Galileo Gen 2 is using it to distinguish * the children devices. */ if (match) { if (match->pnpid) { struct acpi_device_id ids[2] = {}; struct match_ids_walk_data wd = { .adev = NULL, .ids = ids, }; strscpy(ids[0].id, match->pnpid, sizeof(ids[0].id)); acpi_dev_for_each_child(parent, match_device_ids, &wd); adev = wd.adev; } else { adev = acpi_find_child_device(parent, match->adr, false); } } ACPI_COMPANION_SET(&pdev->dev, adev ?: parent); } #else static inline void mfd_acpi_add_device(const struct mfd_cell *cell, struct platform_device *pdev) { } #endif static int mfd_match_of_node_to_dev(struct platform_device *pdev, struct device_node *np, const struct mfd_cell *cell) { #if IS_ENABLED(CONFIG_OF) struct mfd_of_node_entry *of_entry; u64 of_node_addr; /* Skip if OF node has previously been allocated to a device */ list_for_each_entry(of_entry, &mfd_of_node_list, list) if (of_entry->np == np) return -EAGAIN; if (!cell->use_of_reg) /* No of_reg defined - allocate first free compatible match */ goto allocate_of_node; /* We only care about each node's first defined address */ if (of_property_read_reg(np, 0, &of_node_addr, NULL)) /* OF node does not contatin a 'reg' property to match to */ return -EAGAIN; if (cell->of_reg != of_node_addr) /* No match */ return -EAGAIN; allocate_of_node: of_entry = kzalloc(sizeof(*of_entry), GFP_KERNEL); if (!of_entry) return -ENOMEM; of_entry->dev = &pdev->dev; of_entry->np = np; list_add_tail(&of_entry->list, &mfd_of_node_list); pdev->dev.of_node = np; pdev->dev.fwnode = &np->fwnode; #endif return 0; } static int mfd_add_device(struct device *parent, int id, const struct mfd_cell *cell, struct resource *mem_base, int irq_base, struct irq_domain *domain) { struct resource *res; struct platform_device *pdev; struct device_node *np = NULL; struct mfd_of_node_entry *of_entry, *tmp; bool disabled = false; int ret = -ENOMEM; int platform_id; int r; if (id == PLATFORM_DEVID_AUTO) platform_id = id; else platform_id = id + cell->id; pdev = platform_device_alloc(cell->name, platform_id); if (!pdev) goto fail_alloc; pdev->mfd_cell = kmemdup(cell, sizeof(*cell), GFP_KERNEL); if (!pdev->mfd_cell) goto fail_device; res = kcalloc(cell->num_resources, sizeof(*res), GFP_KERNEL); if (!res) goto fail_device; pdev->dev.parent = parent; pdev->dev.type = &mfd_dev_type; pdev->dev.dma_mask = parent->dma_mask; pdev->dev.dma_parms = parent->dma_parms; pdev->dev.coherent_dma_mask = parent->coherent_dma_mask; ret = regulator_bulk_register_supply_alias( &pdev->dev, cell->parent_supplies, parent, cell->parent_supplies, cell->num_parent_supplies); if (ret < 0) goto fail_res; if (IS_ENABLED(CONFIG_OF) && parent->of_node && cell->of_compatible) { for_each_child_of_node(parent->of_node, np) { if (of_device_is_compatible(np, cell->of_compatible)) { /* Skip 'disabled' devices */ if (!of_device_is_available(np)) { disabled = true; continue; } ret = mfd_match_of_node_to_dev(pdev, np, cell); if (ret == -EAGAIN) continue; of_node_put(np); if (ret) goto fail_alias; goto match; } } if (disabled) { /* Ignore 'disabled' devices error free */ ret = 0; goto fail_alias; } match: if (!pdev->dev.of_node) pr_warn("%s: Failed to locate of_node [id: %d]\n", cell->name, platform_id); } mfd_acpi_add_device(cell, pdev); if (cell->pdata_size) { ret = platform_device_add_data(pdev, cell->platform_data, cell->pdata_size); if (ret) goto fail_of_entry; } if (cell->swnode) { ret = device_add_software_node(&pdev->dev, cell->swnode); if (ret) goto fail_of_entry; } for (r = 0; r < cell->num_resources; r++) { res[r].name = cell->resources[r].name; res[r].flags = cell->resources[r].flags; /* Find out base to use */ if ((cell->resources[r].flags & IORESOURCE_MEM) && mem_base) { res[r].parent = mem_base; res[r].start = mem_base->start + cell->resources[r].start; res[r].end = mem_base->start + cell->resources[r].end; } else if (cell->resources[r].flags & IORESOURCE_IRQ) { if (domain) { /* Unable to create mappings for IRQ ranges. */ WARN_ON(cell->resources[r].start != cell->resources[r].end); res[r].start = res[r].end = irq_create_mapping( domain, cell->resources[r].start); } else { res[r].start = irq_base + cell->resources[r].start; res[r].end = irq_base + cell->resources[r].end; } } else { res[r].parent = cell->resources[r].parent; res[r].start = cell->resources[r].start; res[r].end = cell->resources[r].end; } if (!cell->ignore_resource_conflicts) { if (has_acpi_companion(&pdev->dev)) { ret = acpi_check_resource_conflict(&res[r]); if (ret) goto fail_res_conflict; } } } ret = platform_device_add_resources(pdev, res, cell->num_resources); if (ret) goto fail_res_conflict; ret = platform_device_add(pdev); if (ret) goto fail_res_conflict; if (cell->pm_runtime_no_callbacks) pm_runtime_no_callbacks(&pdev->dev); kfree(res); return 0; fail_res_conflict: if (cell->swnode) device_remove_software_node(&pdev->dev); fail_of_entry: list_for_each_entry_safe(of_entry, tmp, &mfd_of_node_list, list) if (of_entry->dev == &pdev->dev) { list_del(&of_entry->list); kfree(of_entry); } fail_alias: regulator_bulk_unregister_supply_alias(&pdev->dev, cell->parent_supplies, cell->num_parent_supplies); fail_res: kfree(res); fail_device: platform_device_put(pdev); fail_alloc: return ret; } /** * mfd_add_devices - register child devices * * @parent: Pointer to parent device. * @id: Can be PLATFORM_DEVID_AUTO to let the Platform API take care * of device numbering, or will be added to a device's cell_id. * @cells: Array of (struct mfd_cell)s describing child devices. * @n_devs: Number of child devices to register. * @mem_base: Parent register range resource for child devices. * @irq_base: Base of the range of virtual interrupt numbers allocated for * this MFD device. Unused if @domain is specified. * @domain: Interrupt domain to create mappings for hardware interrupts. */ int mfd_add_devices(struct device *parent, int id, const struct mfd_cell *cells, int n_devs, struct resource *mem_base, int irq_base, struct irq_domain *domain) { int i; int ret; for (i = 0; i < n_devs; i++) { ret = mfd_add_device(parent, id, cells + i, mem_base, irq_base, domain); if (ret) goto fail; } return 0; fail: if (i) mfd_remove_devices(parent); return ret; } EXPORT_SYMBOL(mfd_add_devices); static int mfd_remove_devices_fn(struct device *dev, void *data) { struct platform_device *pdev; const struct mfd_cell *cell; struct mfd_of_node_entry *of_entry, *tmp; int *level = data; if (dev->type != &mfd_dev_type) return 0; pdev = to_platform_device(dev); cell = mfd_get_cell(pdev); if (level && cell->level > *level) return 0; if (cell->swnode) device_remove_software_node(&pdev->dev); list_for_each_entry_safe(of_entry, tmp, &mfd_of_node_list, list) if (of_entry->dev == &pdev->dev) { list_del(&of_entry->list); kfree(of_entry); } regulator_bulk_unregister_supply_alias(dev, cell->parent_supplies, cell->num_parent_supplies); platform_device_unregister(pdev); return 0; } void mfd_remove_devices_late(struct device *parent) { int level = MFD_DEP_LEVEL_HIGH; device_for_each_child_reverse(parent, &level, mfd_remove_devices_fn); } EXPORT_SYMBOL(mfd_remove_devices_late); void mfd_remove_devices(struct device *parent) { int level = MFD_DEP_LEVEL_NORMAL; device_for_each_child_reverse(parent, &level, mfd_remove_devices_fn); } EXPORT_SYMBOL(mfd_remove_devices); static void devm_mfd_dev_release(struct device *dev, void *res) { mfd_remove_devices(dev); } /** * devm_mfd_add_devices - Resource managed version of mfd_add_devices() * * Returns 0 on success or an appropriate negative error number on failure. * All child-devices of the MFD will automatically be removed when it gets * unbinded. * * @dev: Pointer to parent device. * @id: Can be PLATFORM_DEVID_AUTO to let the Platform API take care * of device numbering, or will be added to a device's cell_id. * @cells: Array of (struct mfd_cell)s describing child devices. * @n_devs: Number of child devices to register. * @mem_base: Parent register range resource for child devices. * @irq_base: Base of the range of virtual interrupt numbers allocated for * this MFD device. Unused if @domain is specified. * @domain: Interrupt domain to create mappings for hardware interrupts. */ int devm_mfd_add_devices(struct device *dev, int id, const struct mfd_cell *cells, int n_devs, struct resource *mem_base, int irq_base, struct irq_domain *domain) { struct device **ptr; int ret; ptr = devres_alloc(devm_mfd_dev_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) return -ENOMEM; ret = mfd_add_devices(dev, id, cells, n_devs, mem_base, irq_base, domain); if (ret < 0) { devres_free(ptr); return ret; } *ptr = dev; devres_add(dev, ptr); return ret; } EXPORT_SYMBOL(devm_mfd_add_devices); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Ian Molton, Dmitry Baryshkov");
160 7203 8 717 1 223 3 226 223 3 224 1 68 68 431 2132 23 86 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 /* SPDX-License-Identifier: GPL-2.0 */ /* * Linux Socket Filter Data Structures */ #ifndef __LINUX_FILTER_H__ #define __LINUX_FILTER_H__ #include <linux/atomic.h> #include <linux/bpf.h> #include <linux/refcount.h> #include <linux/compat.h> #include <linux/skbuff.h> #include <linux/linkage.h> #include <linux/printk.h> #include <linux/workqueue.h> #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/capability.h> #include <linux/set_memory.h> #include <linux/kallsyms.h> #include <linux/if_vlan.h> #include <linux/vmalloc.h> #include <linux/sockptr.h> #include <crypto/sha1.h> #include <linux/u64_stats_sync.h> #include <net/sch_generic.h> #include <asm/byteorder.h> #include <uapi/linux/filter.h> struct sk_buff; struct sock; struct seccomp_data; struct bpf_prog_aux; struct xdp_rxq_info; struct xdp_buff; struct sock_reuseport; struct ctl_table; struct ctl_table_header; /* ArgX, context and stack frame pointer register positions. Note, * Arg1, Arg2, Arg3, etc are used as argument mappings of function * calls in BPF_CALL instruction. */ #define BPF_REG_ARG1 BPF_REG_1 #define BPF_REG_ARG2 BPF_REG_2 #define BPF_REG_ARG3 BPF_REG_3 #define BPF_REG_ARG4 BPF_REG_4 #define BPF_REG_ARG5 BPF_REG_5 #define BPF_REG_CTX BPF_REG_6 #define BPF_REG_FP BPF_REG_10 /* Additional register mappings for converted user programs. */ #define BPF_REG_A BPF_REG_0 #define BPF_REG_X BPF_REG_7 #define BPF_REG_TMP BPF_REG_2 /* scratch reg */ #define BPF_REG_D BPF_REG_8 /* data, callee-saved */ #define BPF_REG_H BPF_REG_9 /* hlen, callee-saved */ /* Kernel hidden auxiliary/helper register. */ #define BPF_REG_AX MAX_BPF_REG #define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) #define MAX_BPF_JIT_REG MAX_BPF_EXT_REG /* unused opcode to mark special call to bpf_tail_call() helper */ #define BPF_TAIL_CALL 0xf0 /* unused opcode to mark special load instruction. Same as BPF_ABS */ #define BPF_PROBE_MEM 0x20 /* unused opcode to mark special ldsx instruction. Same as BPF_IND */ #define BPF_PROBE_MEMSX 0x40 /* unused opcode to mark call to interpreter with arguments */ #define BPF_CALL_ARGS 0xe0 /* unused opcode to mark speculation barrier for mitigating * Speculative Store Bypass */ #define BPF_NOSPEC 0xc0 /* As per nm, we expose JITed images as text (code) section for * kallsyms. That way, tools like perf can find it to match * addresses. */ #define BPF_SYM_ELF_TYPE 't' /* BPF program can access up to 512 bytes of stack space. */ #define MAX_BPF_STACK 512 /* Helper macros for filter block array initializers. */ /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */ #define BPF_ALU64_REG_OFF(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) #define BPF_ALU64_REG(OP, DST, SRC) \ BPF_ALU64_REG_OFF(OP, DST, SRC, 0) #define BPF_ALU32_REG_OFF(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) #define BPF_ALU32_REG(OP, DST, SRC) \ BPF_ALU32_REG_OFF(OP, DST, SRC, 0) /* ALU ops on immediates, bpf_add|sub|...: dst_reg += imm32 */ #define BPF_ALU64_IMM_OFF(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) #define BPF_ALU64_IMM(OP, DST, IMM) \ BPF_ALU64_IMM_OFF(OP, DST, IMM, 0) #define BPF_ALU32_IMM_OFF(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) #define BPF_ALU32_IMM(OP, DST, IMM) \ BPF_ALU32_IMM_OFF(OP, DST, IMM, 0) /* Endianess conversion, cpu_to_{l,b}e(), {l,b}e_to_cpu() */ #define BPF_ENDIAN(TYPE, DST, LEN) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_END | BPF_SRC(TYPE), \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = LEN }) /* Byte Swap, bswap16/32/64 */ #define BPF_BSWAP(DST, LEN) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_END | BPF_SRC(BPF_TO_LE), \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = LEN }) /* Short form of mov, dst_reg = src_reg */ #define BPF_MOV64_REG(DST, SRC) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = 0 }) #define BPF_MOV32_REG(DST, SRC) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = 0 }) /* Short form of mov, dst_reg = imm32 */ #define BPF_MOV64_IMM(DST, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) #define BPF_MOV32_IMM(DST, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) /* Short form of movsx, dst_reg = (s8,s16,s32)src_reg */ #define BPF_MOVSX64_REG(DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) #define BPF_MOVSX32_REG(DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Special form of mov32, used for doing explicit zero extension on dst. */ #define BPF_ZEXT_REG(DST) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_X, \ .dst_reg = DST, \ .src_reg = DST, \ .off = 0, \ .imm = 1 }) static inline bool insn_is_zext(const struct bpf_insn *insn) { return insn->code == (BPF_ALU | BPF_MOV | BPF_X) && insn->imm == 1; } /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ #define BPF_LD_IMM64(DST, IMM) \ BPF_LD_IMM64_RAW(DST, 0, IMM) #define BPF_LD_IMM64_RAW(DST, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_LD | BPF_DW | BPF_IMM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = (__u32) (IMM) }), \ ((struct bpf_insn) { \ .code = 0, /* zero is reserved opcode */ \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = ((__u64) (IMM)) >> 32 }) /* pseudo BPF_LD_IMM64 insn used to refer to process-local map_fd */ #define BPF_LD_MAP_FD(DST, MAP_FD) \ BPF_LD_IMM64_RAW(DST, BPF_PSEUDO_MAP_FD, MAP_FD) /* Short form of mov based on type, BPF_X: dst_reg = src_reg, BPF_K: dst_reg = imm32 */ #define BPF_MOV64_RAW(TYPE, DST, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU64 | BPF_MOV | BPF_SRC(TYPE), \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = IMM }) #define BPF_MOV32_RAW(TYPE, DST, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_ALU | BPF_MOV | BPF_SRC(TYPE), \ .dst_reg = DST, \ .src_reg = SRC, \ .off = 0, \ .imm = IMM }) /* Direct packet access, R0 = *(uint *) (skb->data + imm32) */ #define BPF_LD_ABS(SIZE, IMM) \ ((struct bpf_insn) { \ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_ABS, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = IMM }) /* Indirect packet access, R0 = *(uint *) (skb->data + src_reg + imm32) */ #define BPF_LD_IND(SIZE, SRC, IMM) \ ((struct bpf_insn) { \ .code = BPF_LD | BPF_SIZE(SIZE) | BPF_IND, \ .dst_reg = 0, \ .src_reg = SRC, \ .off = 0, \ .imm = IMM }) /* Memory load, dst_reg = *(uint *) (src_reg + off16) */ #define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Memory load, dst_reg = *(signed size *) (src_reg + off16) */ #define BPF_LDX_MEMSX(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEMSX, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Memory store, *(uint *) (dst_reg + off16) = src_reg */ #define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg * BPF_AND *(uint *) (dst_reg + off16) &= src_reg * BPF_OR *(uint *) (dst_reg + off16) |= src_reg * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg); * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg); * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = OP }) /* Legacy alias */ #define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF) /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ #define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ ((struct bpf_insn) { \ .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) /* Conditional jumps against registers, if (dst_reg 'op' src_reg) goto pc + off16 */ #define BPF_JMP_REG(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Conditional jumps against immediates, if (dst_reg 'op' imm32) goto pc + off16 */ #define BPF_JMP_IMM(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) /* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */ #define BPF_JMP32_REG(OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = 0 }) /* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. */ #define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ .dst_reg = DST, \ .src_reg = 0, \ .off = OFF, \ .imm = IMM }) /* Unconditional jumps, goto pc + off16 */ #define BPF_JMP_A(OFF) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_JA, \ .dst_reg = 0, \ .src_reg = 0, \ .off = OFF, \ .imm = 0 }) /* Relative call */ #define BPF_CALL_REL(TGT) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_CALL, \ .dst_reg = 0, \ .src_reg = BPF_PSEUDO_CALL, \ .off = 0, \ .imm = TGT }) /* Convert function address to BPF immediate */ #define BPF_CALL_IMM(x) ((void *)(x) - (void *)__bpf_call_base) #define BPF_EMIT_CALL(FUNC) \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_CALL, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = BPF_CALL_IMM(FUNC) }) /* Raw code statement block */ #define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ ((struct bpf_insn) { \ .code = CODE, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ .imm = IMM }) /* Program exit */ #define BPF_EXIT_INSN() \ ((struct bpf_insn) { \ .code = BPF_JMP | BPF_EXIT, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = 0 }) /* Speculation barrier */ #define BPF_ST_NOSPEC() \ ((struct bpf_insn) { \ .code = BPF_ST | BPF_NOSPEC, \ .dst_reg = 0, \ .src_reg = 0, \ .off = 0, \ .imm = 0 }) /* Internal classic blocks for direct assignment */ #define __BPF_STMT(CODE, K) \ ((struct sock_filter) BPF_STMT(CODE, K)) #define __BPF_JUMP(CODE, K, JT, JF) \ ((struct sock_filter) BPF_JUMP(CODE, K, JT, JF)) #define bytes_to_bpf_size(bytes) \ ({ \ int bpf_size = -EINVAL; \ \ if (bytes == sizeof(u8)) \ bpf_size = BPF_B; \ else if (bytes == sizeof(u16)) \ bpf_size = BPF_H; \ else if (bytes == sizeof(u32)) \ bpf_size = BPF_W; \ else if (bytes == sizeof(u64)) \ bpf_size = BPF_DW; \ \ bpf_size; \ }) #define bpf_size_to_bytes(bpf_size) \ ({ \ int bytes = -EINVAL; \ \ if (bpf_size == BPF_B) \ bytes = sizeof(u8); \ else if (bpf_size == BPF_H) \ bytes = sizeof(u16); \ else if (bpf_size == BPF_W) \ bytes = sizeof(u32); \ else if (bpf_size == BPF_DW) \ bytes = sizeof(u64); \ \ bytes; \ }) #define BPF_SIZEOF(type) \ ({ \ const int __size = bytes_to_bpf_size(sizeof(type)); \ BUILD_BUG_ON(__size < 0); \ __size; \ }) #define BPF_FIELD_SIZEOF(type, field) \ ({ \ const int __size = bytes_to_bpf_size(sizeof_field(type, field)); \ BUILD_BUG_ON(__size < 0); \ __size; \ }) #define BPF_LDST_BYTES(insn) \ ({ \ const int __size = bpf_size_to_bytes(BPF_SIZE((insn)->code)); \ WARN_ON(__size < 0); \ __size; \ }) #define __BPF_MAP_0(m, v, ...) v #define __BPF_MAP_1(m, v, t, a, ...) m(t, a) #define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__) #define __BPF_MAP_3(m, v, t, a, ...) m(t, a), __BPF_MAP_2(m, v, __VA_ARGS__) #define __BPF_MAP_4(m, v, t, a, ...) m(t, a), __BPF_MAP_3(m, v, __VA_ARGS__) #define __BPF_MAP_5(m, v, t, a, ...) m(t, a), __BPF_MAP_4(m, v, __VA_ARGS__) #define __BPF_REG_0(...) __BPF_PAD(5) #define __BPF_REG_1(...) __BPF_MAP(1, __VA_ARGS__), __BPF_PAD(4) #define __BPF_REG_2(...) __BPF_MAP(2, __VA_ARGS__), __BPF_PAD(3) #define __BPF_REG_3(...) __BPF_MAP(3, __VA_ARGS__), __BPF_PAD(2) #define __BPF_REG_4(...) __BPF_MAP(4, __VA_ARGS__), __BPF_PAD(1) #define __BPF_REG_5(...) __BPF_MAP(5, __VA_ARGS__) #define __BPF_MAP(n, ...) __BPF_MAP_##n(__VA_ARGS__) #define __BPF_REG(n, ...) __BPF_REG_##n(__VA_ARGS__) #define __BPF_CAST(t, a) \ (__force t) \ (__force \ typeof(__builtin_choose_expr(sizeof(t) == sizeof(unsigned long), \ (unsigned long)0, (t)0))) a #define __BPF_V void #define __BPF_N #define __BPF_DECL_ARGS(t, a) t a #define __BPF_DECL_REGS(t, a) u64 a #define __BPF_PAD(n) \ __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2, \ u64, __ur_3, u64, __ur_4, u64, __ur_5) #define BPF_CALL_x(x, name, ...) \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ typedef u64 (*btf_##name)(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ { \ return ((btf_##name)____##name)(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ } \ static __always_inline \ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) #define BPF_CALL_0(name, ...) BPF_CALL_x(0, name, __VA_ARGS__) #define BPF_CALL_1(name, ...) BPF_CALL_x(1, name, __VA_ARGS__) #define BPF_CALL_2(name, ...) BPF_CALL_x(2, name, __VA_ARGS__) #define BPF_CALL_3(name, ...) BPF_CALL_x(3, name, __VA_ARGS__) #define BPF_CALL_4(name, ...) BPF_CALL_x(4, name, __VA_ARGS__) #define BPF_CALL_5(name, ...) BPF_CALL_x(5, name, __VA_ARGS__) #define bpf_ctx_range(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 #define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2) \ offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1 #if BITS_PER_LONG == 64 # define bpf_ctx_range_ptr(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 #else # define bpf_ctx_range_ptr(TYPE, MEMBER) \ offsetof(TYPE, MEMBER) ... offsetof(TYPE, MEMBER) + 8 - 1 #endif /* BITS_PER_LONG == 64 */ #define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE) \ ({ \ BUILD_BUG_ON(sizeof_field(TYPE, MEMBER) != (SIZE)); \ *(PTR_SIZE) = (SIZE); \ offsetof(TYPE, MEMBER); \ }) /* A struct sock_filter is architecture independent. */ struct compat_sock_fprog { u16 len; compat_uptr_t filter; /* struct sock_filter * */ }; struct sock_fprog_kern { u16 len; struct sock_filter *filter; }; /* Some arches need doubleword alignment for their instructions and/or data */ #define BPF_IMAGE_ALIGNMENT 8 struct bpf_binary_header { u32 size; u8 image[] __aligned(BPF_IMAGE_ALIGNMENT); }; struct bpf_prog_stats { u64_stats_t cnt; u64_stats_t nsecs; u64_stats_t misses; struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); struct sk_filter { refcount_t refcnt; struct rcu_head rcu; struct bpf_prog *prog; }; DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); extern struct mutex nf_conn_btf_access_lock; extern int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, int off, int size); typedef unsigned int (*bpf_dispatcher_fn)(const void *ctx, const struct bpf_insn *insnsi, unsigned int (*bpf_func)(const void *, const struct bpf_insn *)); static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog, const void *ctx, bpf_dispatcher_fn dfunc) { u32 ret; cant_migrate(); if (static_branch_unlikely(&bpf_stats_enabled_key)) { struct bpf_prog_stats *stats; u64 start = sched_clock(); unsigned long flags; ret = dfunc(ctx, prog->insnsi, prog->bpf_func); stats = this_cpu_ptr(prog->stats); flags = u64_stats_update_begin_irqsave(&stats->syncp); u64_stats_inc(&stats->cnt); u64_stats_add(&stats->nsecs, sched_clock() - start); u64_stats_update_end_irqrestore(&stats->syncp, flags); } else { ret = dfunc(ctx, prog->insnsi, prog->bpf_func); } return ret; } static __always_inline u32 bpf_prog_run(const struct bpf_prog *prog, const void *ctx) { return __bpf_prog_run(prog, ctx, bpf_dispatcher_nop_func); } /* * Use in preemptible and therefore migratable context to make sure that * the execution of the BPF program runs on one CPU. * * This uses migrate_disable/enable() explicitly to document that the * invocation of a BPF program does not require reentrancy protection * against a BPF program which is invoked from a preempting task. */ static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog, const void *ctx) { u32 ret; migrate_disable(); ret = bpf_prog_run(prog, ctx); migrate_enable(); return ret; } #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN struct bpf_skb_data_end { struct qdisc_skb_cb qdisc_cb; void *data_meta; void *data_end; }; struct bpf_nh_params { u32 nh_family; union { u32 ipv4_nh; struct in6_addr ipv6_nh; }; }; struct bpf_redirect_info { u64 tgt_index; void *tgt_value; struct bpf_map *map; u32 flags; u32 kern_flags; u32 map_id; enum bpf_map_type map_type; struct bpf_nh_params nh; }; DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); /* flags for bpf_redirect_info kern_flags */ #define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ /* Compute the linear packet data range [data, data_end) which * will be accessed by various program types (cls_bpf, act_bpf, * lwt, ...). Subsystems allowing direct data access must (!) * ensure that cb[] area can be written to when BPF program is * invoked (otherwise cb[] save/restore is necessary). */ static inline void bpf_compute_data_pointers(struct sk_buff *skb) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; BUILD_BUG_ON(sizeof(*cb) > sizeof_field(struct sk_buff, cb)); cb->data_meta = skb->data - skb_metadata_len(skb); cb->data_end = skb->data + skb_headlen(skb); } /* Similar to bpf_compute_data_pointers(), except that save orginal * data in cb->data and cb->meta_data for restore. */ static inline void bpf_compute_and_save_data_end( struct sk_buff *skb, void **saved_data_end) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; *saved_data_end = cb->data_end; cb->data_end = skb->data + skb_headlen(skb); } /* Restore data saved by bpf_compute_and_save_data_end(). */ static inline void bpf_restore_data_end( struct sk_buff *skb, void *saved_data_end) { struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; cb->data_end = saved_data_end; } static inline u8 *bpf_skb_cb(const struct sk_buff *skb) { /* eBPF programs may read/write skb->cb[] area to transfer meta * data between tail calls. Since this also needs to work with * tc, that scratch memory is mapped to qdisc_skb_cb's data area. * * In some socket filter cases, the cb unfortunately needs to be * saved/restored so that protocol specific skb->cb[] data won't * be lost. In any case, due to unpriviledged eBPF programs * attached to sockets, we need to clear the bpf_skb_cb() area * to not leak previous contents to user space. */ BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != BPF_SKB_CB_LEN); BUILD_BUG_ON(sizeof_field(struct __sk_buff, cb) != sizeof_field(struct qdisc_skb_cb, data)); return qdisc_skb_cb(skb)->data; } /* Must be invoked with migration disabled */ static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog, const void *ctx) { const struct sk_buff *skb = ctx; u8 *cb_data = bpf_skb_cb(skb); u8 cb_saved[BPF_SKB_CB_LEN]; u32 res; if (unlikely(prog->cb_access)) { memcpy(cb_saved, cb_data, sizeof(cb_saved)); memset(cb_data, 0, sizeof(cb_saved)); } res = bpf_prog_run(prog, skb); if (unlikely(prog->cb_access)) memcpy(cb_data, cb_saved, sizeof(cb_saved)); return res; } static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog, struct sk_buff *skb) { u32 res; migrate_disable(); res = __bpf_prog_run_save_cb(prog, skb); migrate_enable(); return res; } static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, struct sk_buff *skb) { u8 *cb_data = bpf_skb_cb(skb); u32 res; if (unlikely(prog->cb_access)) memset(cb_data, 0, BPF_SKB_CB_LEN); res = bpf_prog_run_pin_on_cpu(prog, skb); return res; } DECLARE_BPF_DISPATCHER(xdp) DECLARE_STATIC_KEY_FALSE(bpf_master_redirect_enabled_key); u32 xdp_master_redirect(struct xdp_buff *xdp); void bpf_prog_change_xdp(struct bpf_prog *prev_prog, struct bpf_prog *prog); static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) { return prog->len * sizeof(struct bpf_insn); } static inline u32 bpf_prog_tag_scratch_size(const struct bpf_prog *prog) { return round_up(bpf_prog_insn_size(prog) + sizeof(__be64) + 1, SHA1_BLOCK_SIZE); } static inline unsigned int bpf_prog_size(unsigned int proglen) { return max(sizeof(struct bpf_prog), offsetof(struct bpf_prog, insns[proglen])); } static inline bool bpf_prog_was_classic(const struct bpf_prog *prog) { /* When classic BPF programs have been loaded and the arch * does not have a classic BPF JIT (anymore), they have been * converted via bpf_migrate_filter() to eBPF and thus always * have an unspec program type. */ return prog->type == BPF_PROG_TYPE_UNSPEC; } static inline u32 bpf_ctx_off_adjust_machine(u32 size) { const u32 size_machine = sizeof(unsigned long); if (size > size_machine && size % size_machine == 0) size = size_machine; return size; } static inline bool bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) { return size <= size_default && (size & (size - 1)) == 0; } static inline u8 bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default) { u8 access_off = off & (size_default - 1); #ifdef __LITTLE_ENDIAN return access_off; #else return size_default - (access_off + size); #endif } #define bpf_ctx_wide_access_ok(off, size, type, field) \ (size == sizeof(__u64) && \ off >= offsetof(type, field) && \ off + sizeof(__u64) <= offsetofend(type, field) && \ off % sizeof(__u64) == 0) #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) static inline void bpf_prog_lock_ro(struct bpf_prog *fp) { #ifndef CONFIG_BPF_JIT_ALWAYS_ON if (!fp->jited) { set_vm_flush_reset_perms(fp); set_memory_ro((unsigned long)fp, fp->pages); } #endif } static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) { set_vm_flush_reset_perms(hdr); set_memory_rox((unsigned long)hdr, hdr->size >> PAGE_SHIFT); } int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); static inline int sk_filter(struct sock *sk, struct sk_buff *skb) { return sk_filter_trim_cap(sk, skb, 1); } struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); void bpf_prog_free(struct bpf_prog *fp); bool bpf_opcode_in_insntable(u8 code); void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, const u32 *insn_to_jit_off); int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog); void bpf_prog_jit_attempt_done(struct bpf_prog *prog); struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); void __bpf_prog_free(struct bpf_prog *fp); static inline void bpf_prog_unlock_free(struct bpf_prog *fp) { __bpf_prog_free(fp); } typedef int (*bpf_aux_classic_check_t)(struct sock_filter *filter, unsigned int flen); int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig); void bpf_prog_destroy(struct bpf_prog *fp); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); void sk_reuseport_prog_free(struct bpf_prog *prog); int sk_detach_filter(struct sock *sk); int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len); bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #define __bpf_call_base_args \ ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ (void *)__bpf_call_base) struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_jit_needs_zext(void); bool bpf_jit_supports_subprog_tailcalls(void); bool bpf_jit_supports_kfunc_call(void); bool bpf_jit_supports_far_kfunc_call(void); bool bpf_jit_supports_exceptions(void); void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie); bool bpf_helper_changes_pkt_data(void *func); static inline bool bpf_dump_raw_ok(const struct cred *cred) { /* Reconstruction of call-sites is dependent on kallsyms, * thus make dump the same restriction. */ return kallsyms_show_value(cred); } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt); void bpf_clear_redirect_map(struct bpf_map *map); static inline bool xdp_return_frame_no_direct(void) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; } static inline void xdp_set_return_frame_no_direct(void) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; } static inline void xdp_clear_return_frame_no_direct(void) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; } static inline int xdp_ok_fwd_dev(const struct net_device *fwd, unsigned int pktlen) { unsigned int len; if (unlikely(!(fwd->flags & IFF_UP))) return -ENETDOWN; len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; if (pktlen > len) return -EMSGSIZE; return 0; } /* The pair of xdp_do_redirect and xdp_do_flush MUST be called in the * same cpu context. Further for best results no more than a single map * for the do_redirect/do_flush pair should be used. This limitation is * because we only track one map and force a flush when the map changes. * This does not appear to be a real limitation for existing software. */ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *prog); int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *prog); int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp, struct xdp_frame *xdpf, struct bpf_prog *prog); void xdp_do_flush(void); void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act); #ifdef CONFIG_INET struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, struct sock *migrating_sk, u32 hash); #else static inline struct sock * bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, struct bpf_prog *prog, struct sk_buff *skb, struct sock *migrating_sk, u32 hash) { return NULL; } #endif #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; extern int bpf_jit_harden; extern int bpf_jit_kallsyms; extern long bpf_jit_limit; extern long bpf_jit_limit_max; typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); void bpf_jit_fill_hole_with_zero(void *area, unsigned int size); struct bpf_binary_header * bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, unsigned int alignment, bpf_jit_fill_hole_t bpf_fill_ill_insns); void bpf_jit_binary_free(struct bpf_binary_header *hdr); u64 bpf_jit_alloc_exec_limit(void); void *bpf_jit_alloc_exec(unsigned long size); void bpf_jit_free_exec(void *addr); void bpf_jit_free(struct bpf_prog *fp); struct bpf_binary_header * bpf_jit_binary_pack_hdr(const struct bpf_prog *fp); void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insns); void bpf_prog_pack_free(void *ptr, u32 size); static inline bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) { return list_empty(&fp->aux->ksym.lnode) || fp->aux->ksym.lnode.prev == LIST_POISON2; } struct bpf_binary_header * bpf_jit_binary_pack_alloc(unsigned int proglen, u8 **ro_image, unsigned int alignment, struct bpf_binary_header **rw_hdr, u8 **rw_image, bpf_jit_fill_hole_t bpf_fill_ill_insns); int bpf_jit_binary_pack_finalize(struct bpf_prog *prog, struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header); void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header, struct bpf_binary_header *rw_header); int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke); int bpf_jit_get_func_addr(const struct bpf_prog *prog, const struct bpf_insn *insn, bool extra_pass, u64 *func_addr, bool *func_addr_fixed); struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp); void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, u32 pass, void *image) { pr_err("flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n", flen, proglen, pass, image, current->comm, task_pid_nr(current)); if (image) print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET, 16, 1, image, proglen, false); } static inline bool bpf_jit_is_ebpf(void) { # ifdef CONFIG_HAVE_EBPF_JIT return true; # else return false; # endif } static inline bool ebpf_jit_enabled(void) { return bpf_jit_enable && bpf_jit_is_ebpf(); } static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) { return fp->jited && bpf_jit_is_ebpf(); } static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) { /* These are the prerequisites, should someone ever have the * idea to call blinding outside of them, we make sure to * bail out. */ if (!bpf_jit_is_ebpf()) return false; if (!prog->jit_requested) return false; if (!bpf_jit_harden) return false; if (bpf_jit_harden == 1 && bpf_capable()) return false; return true; } static inline bool bpf_jit_kallsyms_enabled(void) { /* There are a couple of corner cases where kallsyms should * not be enabled f.e. on hardening. */ if (bpf_jit_harden) return false; if (!bpf_jit_kallsyms) return false; if (bpf_jit_kallsyms == 1) return true; return false; } const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym); bool is_bpf_text_address(unsigned long addr); int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym); struct bpf_prog *bpf_prog_ksym_find(unsigned long addr); static inline const char * bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { const char *ret = __bpf_address_lookup(addr, size, off, sym); if (ret && modname) *modname = NULL; return ret; } void bpf_prog_kallsyms_add(struct bpf_prog *fp); void bpf_prog_kallsyms_del(struct bpf_prog *fp); #else /* CONFIG_BPF_JIT */ static inline bool ebpf_jit_enabled(void) { return false; } static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) { return false; } static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) { return false; } static inline int bpf_jit_add_poke_descriptor(struct bpf_prog *prog, struct bpf_jit_poke_descriptor *poke) { return -ENOTSUPP; } static inline void bpf_jit_free(struct bpf_prog *fp) { bpf_prog_unlock_free(fp); } static inline bool bpf_jit_kallsyms_enabled(void) { return false; } static inline const char * __bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char *sym) { return NULL; } static inline bool is_bpf_text_address(unsigned long addr) { return false; } static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, char *sym) { return -ERANGE; } static inline struct bpf_prog *bpf_prog_ksym_find(unsigned long addr) { return NULL; } static inline const char * bpf_address_lookup(unsigned long addr, unsigned long *size, unsigned long *off, char **modname, char *sym) { return NULL; } static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp) { } static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp) { } #endif /* CONFIG_BPF_JIT */ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp); #define BPF_ANC BIT(15) static inline bool bpf_needs_clear_a(const struct sock_filter *first) { switch (first->code) { case BPF_RET | BPF_K: case BPF_LD | BPF_W | BPF_LEN: return false; case BPF_LD | BPF_W | BPF_ABS: case BPF_LD | BPF_H | BPF_ABS: case BPF_LD | BPF_B | BPF_ABS: if (first->k == SKF_AD_OFF + SKF_AD_ALU_XOR_X) return true; return false; default: return true; } } static inline u16 bpf_anc_helper(const struct sock_filter *ftest) { BUG_ON(ftest->code & BPF_ANC); switch (ftest->code) { case BPF_LD | BPF_W | BPF_ABS: case BPF_LD | BPF_H | BPF_ABS: case BPF_LD | BPF_B | BPF_ABS: #define BPF_ANCILLARY(CODE) case SKF_AD_OFF + SKF_AD_##CODE: \ return BPF_ANC | SKF_AD_##CODE switch (ftest->k) { BPF_ANCILLARY(PROTOCOL); BPF_ANCILLARY(PKTTYPE); BPF_ANCILLARY(IFINDEX); BPF_ANCILLARY(NLATTR); BPF_ANCILLARY(NLATTR_NEST); BPF_ANCILLARY(MARK); BPF_ANCILLARY(QUEUE); BPF_ANCILLARY(HATYPE); BPF_ANCILLARY(RXHASH); BPF_ANCILLARY(CPU); BPF_ANCILLARY(ALU_XOR_X); BPF_ANCILLARY(VLAN_TAG); BPF_ANCILLARY(VLAN_TAG_PRESENT); BPF_ANCILLARY(PAY_OFFSET); BPF_ANCILLARY(RANDOM); BPF_ANCILLARY(VLAN_TPID); } fallthrough; default: return ftest->code; } } void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size); static inline int bpf_tell_extensions(void) { return SKF_AD_MAX; } struct bpf_sock_addr_kern { struct sock *sk; struct sockaddr *uaddr; /* Temporary "register" to make indirect stores to nested structures * defined above. We need three registers to make such a store, but * only two (src and dst) are available at convert_ctx_access time */ u64 tmp_reg; void *t_ctx; /* Attach type specific context. */ u32 uaddrlen; }; struct bpf_sock_ops_kern { struct sock *sk; union { u32 args[4]; u32 reply; u32 replylong[4]; }; struct sk_buff *syn_skb; struct sk_buff *skb; void *skb_data_end; u8 op; u8 is_fullsock; u8 remaining_opt_len; u64 temp; /* temp and everything after is not * initialized to 0 before calling * the BPF program. New fields that * should be initialized to 0 should * be inserted before temp. * temp is scratch storage used by * sock_ops_convert_ctx_access * as temporary storage of a register. */ }; struct bpf_sysctl_kern { struct ctl_table_header *head; struct ctl_table *table; void *cur_val; size_t cur_len; void *new_val; size_t new_len; int new_updated; int write; loff_t *ppos; /* Temporary "register" for indirect stores to ppos. */ u64 tmp_reg; }; #define BPF_SOCKOPT_KERN_BUF_SIZE 32 struct bpf_sockopt_buf { u8 data[BPF_SOCKOPT_KERN_BUF_SIZE]; }; struct bpf_sockopt_kern { struct sock *sk; u8 *optval; u8 *optval_end; s32 level; s32 optname; s32 optlen; /* for retval in struct bpf_cg_run_ctx */ struct task_struct *current_task; /* Temporary "register" for indirect stores to ppos. */ u64 tmp_reg; }; int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len); struct bpf_sk_lookup_kern { u16 family; u16 protocol; __be16 sport; u16 dport; struct { __be32 saddr; __be32 daddr; } v4; struct { const struct in6_addr *saddr; const struct in6_addr *daddr; } v6; struct sock *selected_sk; u32 ingress_ifindex; bool no_reuseport; }; extern struct static_key_false bpf_sk_lookup_enabled; /* Runners for BPF_SK_LOOKUP programs to invoke on socket lookup. * * Allowed return values for a BPF SK_LOOKUP program are SK_PASS and * SK_DROP. Their meaning is as follows: * * SK_PASS && ctx.selected_sk != NULL: use selected_sk as lookup result * SK_PASS && ctx.selected_sk == NULL: continue to htable-based socket lookup * SK_DROP : terminate lookup with -ECONNREFUSED * * This macro aggregates return values and selected sockets from * multiple BPF programs according to following rules in order: * * 1. If any program returned SK_PASS and a non-NULL ctx.selected_sk, * macro result is SK_PASS and last ctx.selected_sk is used. * 2. If any program returned SK_DROP return value, * macro result is SK_DROP. * 3. Otherwise result is SK_PASS and ctx.selected_sk is NULL. * * Caller must ensure that the prog array is non-NULL, and that the * array as well as the programs it contains remain valid. */ #define BPF_PROG_SK_LOOKUP_RUN_ARRAY(array, ctx, func) \ ({ \ struct bpf_sk_lookup_kern *_ctx = &(ctx); \ struct bpf_prog_array_item *_item; \ struct sock *_selected_sk = NULL; \ bool _no_reuseport = false; \ struct bpf_prog *_prog; \ bool _all_pass = true; \ u32 _ret; \ \ migrate_disable(); \ _item = &(array)->items[0]; \ while ((_prog = READ_ONCE(_item->prog))) { \ /* restore most recent selection */ \ _ctx->selected_sk = _selected_sk; \ _ctx->no_reuseport = _no_reuseport; \ \ _ret = func(_prog, _ctx); \ if (_ret == SK_PASS && _ctx->selected_sk) { \ /* remember last non-NULL socket */ \ _selected_sk = _ctx->selected_sk; \ _no_reuseport = _ctx->no_reuseport; \ } else if (_ret == SK_DROP && _all_pass) { \ _all_pass = false; \ } \ _item++; \ } \ _ctx->selected_sk = _selected_sk; \ _ctx->no_reuseport = _no_reuseport; \ migrate_enable(); \ _all_pass || _selected_sk ? SK_PASS : SK_DROP; \ }) static inline bool bpf_sk_lookup_run_v4(struct net *net, int protocol, const __be32 saddr, const __be16 sport, const __be32 daddr, const u16 dport, const int ifindex, struct sock **psk) { struct bpf_prog_array *run_array; struct sock *selected_sk = NULL; bool no_reuseport = false; rcu_read_lock(); run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]); if (run_array) { struct bpf_sk_lookup_kern ctx = { .family = AF_INET, .protocol = protocol, .v4.saddr = saddr, .v4.daddr = daddr, .sport = sport, .dport = dport, .ingress_ifindex = ifindex, }; u32 act; act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run); if (act == SK_PASS) { selected_sk = ctx.selected_sk; no_reuseport = ctx.no_reuseport; } else { selected_sk = ERR_PTR(-ECONNREFUSED); } } rcu_read_unlock(); *psk = selected_sk; return no_reuseport; } #if IS_ENABLED(CONFIG_IPV6) static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const u16 dport, const int ifindex, struct sock **psk) { struct bpf_prog_array *run_array; struct sock *selected_sk = NULL; bool no_reuseport = false; rcu_read_lock(); run_array = rcu_dereference(net->bpf.run_array[NETNS_BPF_SK_LOOKUP]); if (run_array) { struct bpf_sk_lookup_kern ctx = { .family = AF_INET6, .protocol = protocol, .v6.saddr = saddr, .v6.daddr = daddr, .sport = sport, .dport = dport, .ingress_ifindex = ifindex, }; u32 act; act = BPF_PROG_SK_LOOKUP_RUN_ARRAY(run_array, ctx, bpf_prog_run); if (act == SK_PASS) { selected_sk = ctx.selected_sk; no_reuseport = ctx.no_reuseport; } else { selected_sk = ERR_PTR(-ECONNREFUSED); } } rcu_read_unlock(); *psk = selected_sk; return no_reuseport; } #endif /* IS_ENABLED(CONFIG_IPV6) */ static __always_inline long __bpf_xdp_redirect_map(struct bpf_map *map, u64 index, u64 flags, const u64 flag_mask, void *lookup_elem(struct bpf_map *map, u32 key)) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); const u64 action_mask = XDP_ABORTED | XDP_DROP | XDP_PASS | XDP_TX; /* Lower bits of the flags are used as return code on lookup failure */ if (unlikely(flags & ~(action_mask | flag_mask))) return XDP_ABORTED; ri->tgt_value = lookup_elem(map, index); if (unlikely(!ri->tgt_value) && !(flags & BPF_F_BROADCAST)) { /* If the lookup fails we want to clear out the state in the * redirect_info struct completely, so that if an eBPF program * performs multiple lookups, the last one always takes * precedence. */ ri->map_id = INT_MAX; /* Valid map id idr range: [1,INT_MAX[ */ ri->map_type = BPF_MAP_TYPE_UNSPEC; return flags & action_mask; } ri->tgt_index = index; ri->map_id = map->id; ri->map_type = map->map_type; if (flags & BPF_F_BROADCAST) { WRITE_ONCE(ri->map, map); ri->flags = flags; } else { WRITE_ONCE(ri->map, NULL); ri->flags = 0; } return XDP_REDIRECT; } #ifdef CONFIG_NET int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len); int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags); int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len); void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush); #else /* CONFIG_NET */ static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) { return -EOPNOTSUPP; } static inline int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) { return -EOPNOTSUPP; } static inline int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return -EOPNOTSUPP; } static inline int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len) { return -EOPNOTSUPP; } static inline void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) { return NULL; } static inline void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, unsigned long len, bool flush) { } #endif /* CONFIG_NET */ #endif /* __LINUX_FILTER_H__ */
1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 /* Protective Load Balancing (PLB) * * PLB was designed to reduce link load imbalance across datacenter * switches. PLB is a host-based optimization; it leverages congestion * signals from the transport layer to randomly change the path of the * connection experiencing sustained congestion. PLB prefers to repath * after idle periods to minimize packet reordering. It repaths by * changing the IPv6 Flow Label on the packets of a connection, which * datacenter switches include as part of ECMP/WCMP hashing. * * PLB is described in detail in: * * Mubashir Adnan Qureshi, Yuchung Cheng, Qianwen Yin, Qiaobin Fu, * Gautam Kumar, Masoud Moshref, Junhua Yan, Van Jacobson, * David Wetherall,Abdul Kabbani: * "PLB: Congestion Signals are Simple and Effective for * Network Load Balancing" * In ACM SIGCOMM 2022, Amsterdam Netherlands. * */ #include <net/tcp.h> /* Called once per round-trip to update PLB state for a connection. */ void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, const int cong_ratio) { struct net *net = sock_net(sk); if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) return; if (cong_ratio >= 0) { if (cong_ratio < READ_ONCE(net->ipv4.sysctl_tcp_plb_cong_thresh)) plb->consec_cong_rounds = 0; else if (plb->consec_cong_rounds < READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds)) plb->consec_cong_rounds++; } } EXPORT_SYMBOL_GPL(tcp_plb_update_state); /* Check whether recent congestion has been persistent enough to warrant * a load balancing decision that switches the connection to another path. */ void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb) { struct net *net = sock_net(sk); u32 max_suspend; bool forced_rehash = false, idle_rehash = false; if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) return; forced_rehash = plb->consec_cong_rounds >= READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds); /* If sender goes idle then we check whether to rehash. */ idle_rehash = READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds) && !tcp_sk(sk)->packets_out && plb->consec_cong_rounds >= READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds); if (!forced_rehash && !idle_rehash) return; /* Note that tcp_jiffies32 can wrap; we detect wraps by checking for * cases where the max suspension end is before the actual suspension * end. We clear pause_until to 0 to indicate there is no recent * RTO event that constrains PLB rehashing. */ max_suspend = 2 * READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ; if (plb->pause_until && (!before(tcp_jiffies32, plb->pause_until) || before(tcp_jiffies32 + max_suspend, plb->pause_until))) plb->pause_until = 0; if (plb->pause_until) return; sk_rethink_txhash(sk); plb->consec_cong_rounds = 0; tcp_sk(sk)->plb_rehash++; NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH); } EXPORT_SYMBOL_GPL(tcp_plb_check_rehash); /* Upon RTO, disallow load balancing for a while, to avoid having load * balancing decisions switch traffic to a black-holed path that was * previously avoided with a sk_rethink_txhash() call at RTO time. */ void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb) { struct net *net = sock_net(sk); u32 pause; if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) return; pause = READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ; pause += get_random_u32_below(pause); plb->pause_until = tcp_jiffies32 + pause; /* Reset PLB state upon RTO, since an RTO causes a sk_rethink_txhash() call * that may switch this connection to a path with completely different * congestion characteristics. */ plb->consec_cong_rounds = 0; } EXPORT_SYMBOL_GPL(tcp_plb_update_state_upon_rto);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/sysv/namei.c * * minix/namei.c * Copyright (C) 1991, 1992 Linus Torvalds * * coh/namei.c * Copyright (C) 1993 Pascal Haible, Bruno Haible * * sysv/namei.c * Copyright (C) 1993 Bruno Haible * Copyright (C) 1997, 1998 Krzysztof G. Baranowski */ #include <linux/pagemap.h> #include "sysv.h" static int add_nondir(struct dentry *dentry, struct inode *inode) { int err = sysv_add_link(dentry, inode); if (!err) { d_instantiate(dentry, inode); return 0; } inode_dec_link_count(inode); iput(inode); return err; } static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { struct inode * inode = NULL; ino_t ino; if (dentry->d_name.len > SYSV_NAMELEN) return ERR_PTR(-ENAMETOOLONG); ino = sysv_inode_by_name(dentry); if (ino) inode = sysv_iget(dir->i_sb, ino); return d_splice_alias(inode, dentry); } static int sysv_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode * inode; int err; if (!old_valid_dev(rdev)) return -EINVAL; inode = sysv_new_inode(dir, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { sysv_set_inode(inode, rdev); mark_inode_dirty(inode); err = add_nondir(dentry, inode); } return err; } static int sysv_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return sysv_mknod(&nop_mnt_idmap, dir, dentry, mode, 0); } static int sysv_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int err = -ENAMETOOLONG; int l = strlen(symname)+1; struct inode * inode; if (l > dir->i_sb->s_blocksize) goto out; inode = sysv_new_inode(dir, S_IFLNK|0777); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out; sysv_set_inode(inode, 0); err = page_symlink(inode, symname, l); if (err) goto out_fail; mark_inode_dirty(inode); err = add_nondir(dentry, inode); out: return err; out_fail: inode_dec_link_count(inode); iput(inode); goto out; } static int sysv_link(struct dentry * old_dentry, struct inode * dir, struct dentry * dentry) { struct inode *inode = d_inode(old_dentry); inode_set_ctime_current(inode); inode_inc_link_count(inode); ihold(inode); return add_nondir(dentry, inode); } static int sysv_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode * inode; int err; inode_inc_link_count(dir); inode = sysv_new_inode(dir, S_IFDIR|mode); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_dir; sysv_set_inode(inode, 0); inode_inc_link_count(inode); err = sysv_make_empty(inode, dir); if (err) goto out_fail; err = sysv_add_link(dentry, inode); if (err) goto out_fail; d_instantiate(dentry, inode); out: return err; out_fail: inode_dec_link_count(inode); inode_dec_link_count(inode); iput(inode); out_dir: inode_dec_link_count(dir); goto out; } static int sysv_unlink(struct inode * dir, struct dentry * dentry) { struct inode * inode = d_inode(dentry); struct page * page; struct sysv_dir_entry * de; int err; de = sysv_find_entry(dentry, &page); if (!de) return -ENOENT; err = sysv_delete_entry(de, page); if (!err) { inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); } unmap_and_put_page(page, de); return err; } static int sysv_rmdir(struct inode * dir, struct dentry * dentry) { struct inode *inode = d_inode(dentry); int err = -ENOTEMPTY; if (sysv_empty_dir(inode)) { err = sysv_unlink(dir, dentry); if (!err) { inode->i_size = 0; inode_dec_link_count(inode); inode_dec_link_count(dir); } } return err; } /* * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. */ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode * old_inode = d_inode(old_dentry); struct inode * new_inode = d_inode(new_dentry); struct page * dir_page = NULL; struct sysv_dir_entry * dir_de = NULL; struct page * old_page; struct sysv_dir_entry * old_de; int err = -ENOENT; if (flags & ~RENAME_NOREPLACE) return -EINVAL; old_de = sysv_find_entry(old_dentry, &old_page); if (!old_de) goto out; if (S_ISDIR(old_inode->i_mode)) { err = -EIO; dir_de = sysv_dotdot(old_inode, &dir_page); if (!dir_de) goto out_old; } if (new_inode) { struct page * new_page; struct sysv_dir_entry * new_de; err = -ENOTEMPTY; if (dir_de && !sysv_empty_dir(new_inode)) goto out_dir; err = -ENOENT; new_de = sysv_find_entry(new_dentry, &new_page); if (!new_de) goto out_dir; err = sysv_set_link(new_de, new_page, old_inode); unmap_and_put_page(new_page, new_de); if (err) goto out_dir; inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); } else { err = sysv_add_link(new_dentry, old_inode); if (err) goto out_dir; if (dir_de) inode_inc_link_count(new_dir); } err = sysv_delete_entry(old_de, old_page); if (err) goto out_dir; mark_inode_dirty(old_inode); if (dir_de) { err = sysv_set_link(dir_de, dir_page, new_dir); if (!err) inode_dec_link_count(old_dir); } out_dir: if (dir_de) unmap_and_put_page(dir_page, dir_de); out_old: unmap_and_put_page(old_page, old_de); out: return err; } /* * directories can handle most operations... */ const struct inode_operations sysv_dir_inode_operations = { .create = sysv_create, .lookup = sysv_lookup, .link = sysv_link, .unlink = sysv_unlink, .symlink = sysv_symlink, .mkdir = sysv_mkdir, .rmdir = sysv_rmdir, .mknod = sysv_mknod, .rename = sysv_rename, .getattr = sysv_getattr, };
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 // SPDX-License-Identifier: GPL-2.0-only #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <asm/string.h> #include <linux/kmod.h> #include <linux/sysctl.h> #include <net/ip_vs.h> /* IPVS pe list */ static LIST_HEAD(ip_vs_pe); /* semaphore for IPVS PEs. */ static DEFINE_MUTEX(ip_vs_pe_mutex); /* Get pe in the pe list by name */ struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name) { struct ip_vs_pe *pe; IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__, pe_name); rcu_read_lock(); list_for_each_entry_rcu(pe, &ip_vs_pe, n_list) { /* Test and get the modules atomically */ if (pe->module && !try_module_get(pe->module)) { /* This pe is just deleted */ continue; } if (strcmp(pe_name, pe->name)==0) { /* HIT */ rcu_read_unlock(); return pe; } module_put(pe->module); } rcu_read_unlock(); return NULL; } /* Lookup pe and try to load it if it doesn't exist */ struct ip_vs_pe *ip_vs_pe_getbyname(const char *name) { struct ip_vs_pe *pe; /* Search for the pe by name */ pe = __ip_vs_pe_getbyname(name); /* If pe not found, load the module and search again */ if (!pe) { request_module("ip_vs_pe_%s", name); pe = __ip_vs_pe_getbyname(name); } return pe; } /* Register a pe in the pe list */ int register_ip_vs_pe(struct ip_vs_pe *pe) { struct ip_vs_pe *tmp; /* increase the module use count */ if (!ip_vs_use_count_inc()) return -ENOENT; mutex_lock(&ip_vs_pe_mutex); /* Make sure that the pe with this name doesn't exist * in the pe list. */ list_for_each_entry(tmp, &ip_vs_pe, n_list) { if (strcmp(tmp->name, pe->name) == 0) { mutex_unlock(&ip_vs_pe_mutex); ip_vs_use_count_dec(); pr_err("%s(): [%s] pe already existed " "in the system\n", __func__, pe->name); return -EINVAL; } } /* Add it into the d-linked pe list */ list_add_rcu(&pe->n_list, &ip_vs_pe); mutex_unlock(&ip_vs_pe_mutex); pr_info("[%s] pe registered.\n", pe->name); return 0; } EXPORT_SYMBOL_GPL(register_ip_vs_pe); /* Unregister a pe from the pe list */ int unregister_ip_vs_pe(struct ip_vs_pe *pe) { mutex_lock(&ip_vs_pe_mutex); /* Remove it from the d-linked pe list */ list_del_rcu(&pe->n_list); mutex_unlock(&ip_vs_pe_mutex); /* decrease the module use count */ ip_vs_use_count_dec(); pr_info("[%s] pe unregistered.\n", pe->name); return 0; } EXPORT_SYMBOL_GPL(unregister_ip_vs_pe);
34 1 33 34 11 11 470 470 470 470 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 // SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_proto_tcp.c: TCP load balancing support for IPVS * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Julian Anastasov <ja@ssi.bg> * * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> * * Network name space (netns) aware. * Global data moved to netns i.e struct netns_ipvs * tcp_timeouts table has copy per netns in a hash table per * protocol ip_vs_proto_data and is handled by netns */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/kernel.h> #include <linux/ip.h> #include <linux/tcp.h> /* for tcphdr */ #include <net/ip.h> #include <net/tcp.h> /* for csum_tcpudp_magic */ #include <net/ip6_checksum.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/indirect_call_wrapper.h> #include <net/ip_vs.h> static int tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp); static int tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, int *verdict, struct ip_vs_conn **cpp, struct ip_vs_iphdr *iph) { struct ip_vs_service *svc; struct tcphdr _tcph, *th; __be16 _ports[2], *ports = NULL; /* In the event of icmp, we're only guaranteed to have the first 8 * bytes of the transport header, so we only check the rest of the * TCP packet for non-ICMP packets */ if (likely(!ip_vs_iph_icmp(iph))) { th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph); if (th) { if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn)) return 1; ports = &th->source; } } else { ports = skb_header_pointer( skb, iph->len, sizeof(_ports), &_ports); } if (!ports) { *verdict = NF_DROP; return 0; } /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ if (likely(!ip_vs_iph_inverse(iph))) svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->daddr, ports[1]); else svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol, &iph->saddr, ports[0]); if (svc) { int ignored; if (ip_vs_todrop(ipvs)) { /* * It seems that we are very loaded. * We have to drop this packet :( */ *verdict = NF_DROP; return 0; } /* * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph); if (!*cpp && ignored <= 0) { if (!ignored) *verdict = ip_vs_leave(svc, skb, pd, iph); else *verdict = NF_DROP; return 0; } } /* NF_ACCEPT */ return 1; } static inline void tcp_fast_csum_update(int af, struct tcphdr *tcph, const union nf_inet_addr *oldip, const union nf_inet_addr *newip, __be16 oldport, __be16 newport) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) tcph->check = csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldport, newport, ~csum_unfold(tcph->check)))); else #endif tcph->check = csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldport, newport, ~csum_unfold(tcph->check)))); } static inline void tcp_partial_csum_update(int af, struct tcphdr *tcph, const union nf_inet_addr *oldip, const union nf_inet_addr *newip, __be16 oldlen, __be16 newlen) { #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) tcph->check = ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldlen, newlen, csum_unfold(tcph->check)))); else #endif tcph->check = ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldlen, newlen, csum_unfold(tcph->check)))); } INDIRECT_CALLABLE_SCOPE int tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct tcphdr *tcph; unsigned int tcphoff = iph->len; bool payload_csum = false; int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!tcp_csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ if (!(ret = ip_vs_app_pkt_out(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) oldlen = skb->len - tcphoff; else payload_csum = true; } tcph = (void *)skb_network_header(skb) + tcphoff; tcph->source = cp->vport; /* Adjust TCP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, htons(oldlen), htons(skb->len - tcphoff)); } else if (!payload_csum) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = cp->app ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ tcph->check = 0; skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) tcph->check = csum_ipv6_magic(&cp->vaddr.in6, &cp->caddr.in6, skb->len - tcphoff, cp->protocol, skb->csum); else #endif tcph->check = csum_tcpudp_magic(cp->vaddr.ip, cp->caddr.ip, skb->len - tcphoff, cp->protocol, skb->csum); skb->ip_summed = CHECKSUM_UNNECESSARY; IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", pp->name, tcph->check, (char*)&(tcph->check) - (char*)tcph); } return 1; } static int tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_iphdr *iph) { struct tcphdr *tcph; unsigned int tcphoff = iph->len; bool payload_csum = false; int oldlen; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6 && iph->fragoffs) return 1; #endif oldlen = skb->len - tcphoff; /* csum_check requires unshared skb */ if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph))) return 0; if (unlikely(cp->app != NULL)) { int ret; /* Some checks before mangling */ if (!tcp_csum_check(cp->af, skb, pp)) return 0; /* * Attempt ip_vs_app call. * It will fix ip_vs_conn and iph ack_seq stuff */ if (!(ret = ip_vs_app_pkt_in(cp, skb, iph))) return 0; /* ret=2: csum update is needed after payload mangling */ if (ret == 1) oldlen = skb->len - tcphoff; else payload_csum = true; } tcph = (void *)skb_network_header(skb) + tcphoff; tcph->dest = cp->dport; /* * Adjust TCP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, htons(oldlen), htons(skb->len - tcphoff)); } else if (!payload_csum) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = cp->app ? CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ tcph->check = 0; skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) tcph->check = csum_ipv6_magic(&cp->caddr.in6, &cp->daddr.in6, skb->len - tcphoff, cp->protocol, skb->csum); else #endif tcph->check = csum_tcpudp_magic(cp->caddr.ip, cp->daddr.ip, skb->len - tcphoff, cp->protocol, skb->csum); skb->ip_summed = CHECKSUM_UNNECESSARY; } return 1; } static int tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) { unsigned int tcphoff; #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) tcphoff = sizeof(struct ipv6hdr); else #endif tcphoff = ip_hdrlen(skb); switch (skb->ip_summed) { case CHECKSUM_NONE: skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); fallthrough; case CHECKSUM_COMPLETE: #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len - tcphoff, ipv6_hdr(skb)->nexthdr, skb->csum)) { IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } } else #endif if (csum_tcpudp_magic(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, skb->len - tcphoff, ip_hdr(skb)->protocol, skb->csum)) { IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } break; default: /* No need to checksum. */ break; } return 1; } #define TCP_DIR_INPUT 0 #define TCP_DIR_OUTPUT 4 #define TCP_DIR_INPUT_ONLY 8 static const int tcp_state_off[IP_VS_DIR_LAST] = { [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, }; /* * Timeout table[state] */ static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_NONE] = 2*HZ, [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, [IP_VS_TCP_S_CLOSE] = 10*HZ, [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, [IP_VS_TCP_S_LAST_ACK] = 30*HZ, [IP_VS_TCP_S_LISTEN] = 2*60*HZ, [IP_VS_TCP_S_SYNACK] = 120*HZ, [IP_VS_TCP_S_LAST] = 2*HZ, }; static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_NONE] = "NONE", [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", [IP_VS_TCP_S_CLOSE] = "CLOSE", [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", [IP_VS_TCP_S_LISTEN] = "LISTEN", [IP_VS_TCP_S_SYNACK] = "SYNACK", [IP_VS_TCP_S_LAST] = "BUG!", }; static const bool tcp_state_active_table[IP_VS_TCP_S_LAST] = { [IP_VS_TCP_S_NONE] = false, [IP_VS_TCP_S_ESTABLISHED] = true, [IP_VS_TCP_S_SYN_SENT] = true, [IP_VS_TCP_S_SYN_RECV] = true, [IP_VS_TCP_S_FIN_WAIT] = false, [IP_VS_TCP_S_TIME_WAIT] = false, [IP_VS_TCP_S_CLOSE] = false, [IP_VS_TCP_S_CLOSE_WAIT] = false, [IP_VS_TCP_S_LAST_ACK] = false, [IP_VS_TCP_S_LISTEN] = false, [IP_VS_TCP_S_SYNACK] = true, }; #define sNO IP_VS_TCP_S_NONE #define sES IP_VS_TCP_S_ESTABLISHED #define sSS IP_VS_TCP_S_SYN_SENT #define sSR IP_VS_TCP_S_SYN_RECV #define sFW IP_VS_TCP_S_FIN_WAIT #define sTW IP_VS_TCP_S_TIME_WAIT #define sCL IP_VS_TCP_S_CLOSE #define sCW IP_VS_TCP_S_CLOSE_WAIT #define sLA IP_VS_TCP_S_LAST_ACK #define sLI IP_VS_TCP_S_LISTEN #define sSA IP_VS_TCP_S_SYNACK struct tcp_states_t { int next_state[IP_VS_TCP_S_LAST]; }; static const char * tcp_state_name(int state) { if (state >= IP_VS_TCP_S_LAST) return "ERR!"; return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; } static bool tcp_state_active(int state) { if (state >= IP_VS_TCP_S_LAST) return false; return tcp_state_active_table[state]; } static struct tcp_states_t tcp_states[] = { /* INPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, /* OUTPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, /* INPUT-ONLY */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; static struct tcp_states_t tcp_states_dos[] = { /* INPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, /*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, /*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, /* OUTPUT */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, /*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, /*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, /* INPUT-ONLY */ /* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ /*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, /*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, /*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, /*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, }; static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags) { int on = (flags & 1); /* secure_tcp */ /* ** FIXME: change secure_tcp to independent sysctl var ** or make it per-service or per-app because it is valid ** for most if not for all of the applications. Something ** like "capabilities" (flags) for each object. */ pd->tcp_state_table = (on ? tcp_states_dos : tcp_states); } static inline int tcp_state_idx(struct tcphdr *th) { if (th->rst) return 3; if (th->syn) return 0; if (th->fin) return 1; if (th->ack) return 2; return -1; } static inline void set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, int direction, struct tcphdr *th) { int state_idx; int new_state = IP_VS_TCP_S_CLOSE; int state_off = tcp_state_off[direction]; /* * Update state offset to INPUT_ONLY if necessary * or delete NO_OUTPUT flag if output packet detected */ if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { if (state_off == TCP_DIR_OUTPUT) cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; else state_off = TCP_DIR_INPUT_ONLY; } if ((state_idx = tcp_state_idx(th)) < 0) { IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); goto tcp_state_out; } new_state = pd->tcp_state_table[state_off+state_idx].next_state[cp->state]; tcp_state_out: if (new_state != cp->state) { struct ip_vs_dest *dest = cp->dest; IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] c:%s:%d v:%s:%d " "d:%s:%d state: %s->%s conn->refcnt:%d\n", pd->pp->name, ((state_off == TCP_DIR_OUTPUT) ? "output " : "input "), th->syn ? 'S' : '.', th->fin ? 'F' : '.', th->ack ? 'A' : '.', th->rst ? 'R' : '.', IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), tcp_state_name(cp->state), tcp_state_name(new_state), refcount_read(&cp->refcnt)); if (dest) { if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && !tcp_state_active(new_state)) { atomic_dec(&dest->activeconns); atomic_inc(&dest->inactconns); cp->flags |= IP_VS_CONN_F_INACTIVE; } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && tcp_state_active(new_state)) { atomic_inc(&dest->activeconns); atomic_dec(&dest->inactconns); cp->flags &= ~IP_VS_CONN_F_INACTIVE; } } if (new_state == IP_VS_TCP_S_ESTABLISHED) ip_vs_control_assure_ct(cp); } if (likely(pd)) cp->timeout = pd->timeout_table[cp->state = new_state]; else /* What to do ? */ cp->timeout = tcp_timeouts[cp->state = new_state]; } /* * Handle state transitions */ static void tcp_state_transition(struct ip_vs_conn *cp, int direction, const struct sk_buff *skb, struct ip_vs_proto_data *pd) { struct tcphdr _tcph, *th; #ifdef CONFIG_IP_VS_IPV6 int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr); #else int ihl = ip_hdrlen(skb); #endif th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph); if (th == NULL) return; spin_lock_bh(&cp->lock); set_tcp_state(pd, cp, direction, th); spin_unlock_bh(&cp->lock); } static inline __u16 tcp_app_hashkey(__be16 port) { return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) & TCP_APP_TAB_MASK; } static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_app *i; __u16 hash; __be16 port = inc->port; int ret = 0; struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); hash = tcp_app_hashkey(port); list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) { if (i->port == port) { ret = -EEXIST; goto out; } } list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]); atomic_inc(&pd->appcnt); out: return ret; } static void tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP); atomic_dec(&pd->appcnt); list_del_rcu(&inc->p_list); } static int tcp_app_conn_bind(struct ip_vs_conn *cp) { struct netns_ipvs *ipvs = cp->ipvs; int hash; struct ip_vs_app *inc; int result = 0; /* Default binding: bind app only for NAT */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) return 0; /* Lookup application incarnations and bind the right one */ hash = tcp_app_hashkey(cp->vport); list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) { if (inc->port == cp->vport) { if (unlikely(!ip_vs_app_inc_get(inc))) break; IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" "%s:%u to app %s on port %u\n", __func__, IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), inc->name, ntohs(inc->port)); cp->app = inc; if (inc->init_conn) result = inc->init_conn(inc, cp); break; } } return result; } /* * Set LISTEN timeout. (ip_vs_conn_put will setup timer) */ void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) { struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP); spin_lock_bh(&cp->lock); cp->state = IP_VS_TCP_S_LISTEN; cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN] : tcp_timeouts[IP_VS_TCP_S_LISTEN]); spin_unlock_bh(&cp->lock); } /* --------------------------------------------- * timeouts is netns related now. * --------------------------------------------- */ static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE); pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts, sizeof(tcp_timeouts)); if (!pd->timeout_table) return -ENOMEM; pd->tcp_state_table = tcp_states; return 0; } static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd) { kfree(pd->timeout_table); } struct ip_vs_protocol ip_vs_protocol_tcp = { .name = "TCP", .protocol = IPPROTO_TCP, .num_states = IP_VS_TCP_S_LAST, .dont_defrag = 0, .init = NULL, .exit = NULL, .init_netns = __ip_vs_tcp_init, .exit_netns = __ip_vs_tcp_exit, .register_app = tcp_register_app, .unregister_app = tcp_unregister_app, .conn_schedule = tcp_conn_schedule, .conn_in_get = ip_vs_conn_in_get_proto, .conn_out_get = ip_vs_conn_out_get_proto, .snat_handler = tcp_snat_handler, .dnat_handler = tcp_dnat_handler, .state_name = tcp_state_name, .state_transition = tcp_state_transition, .app_conn_bind = tcp_app_conn_bind, .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = tcp_timeout_change, };
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 /* SPDX-License-Identifier: GPL-2.0 */ /* * Header for use in defining a given L4 protocol for connection tracking. * * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * - generalized L3 protocol dependent part. * * Derived from include/linux/netfiter_ipv4/ip_conntrack_protcol.h */ #ifndef _NF_CONNTRACK_L4PROTO_H #define _NF_CONNTRACK_L4PROTO_H #include <linux/netlink.h> #include <net/netlink.h> #include <net/netfilter/nf_conntrack.h> #include <net/netns/generic.h> struct seq_file; struct nf_conntrack_l4proto { /* L4 Protocol number. */ u_int8_t l4proto; /* Resolve clashes on insertion races. */ bool allow_clash; /* protoinfo nlattr size, closes a hole */ u16 nlattr_size; /* called by gc worker if table is full */ bool (*can_early_drop)(const struct nf_conn *ct); /* convert protoinfo to nfnetink attributes */ int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct, bool destroy); /* convert nfnetlink attributes to protoinfo */ int (*from_nlattr)(struct nlattr *tb[], struct nf_conn *ct); int (*tuple_to_nlattr)(struct sk_buff *skb, const struct nf_conntrack_tuple *t); /* Calculate tuple nlattr size */ unsigned int (*nlattr_tuple_size)(void); int (*nlattr_to_tuple)(struct nlattr *tb[], struct nf_conntrack_tuple *t, u_int32_t flags); const struct nla_policy *nla_policy; struct { int (*nlattr_to_obj)(struct nlattr *tb[], struct net *net, void *data); int (*obj_to_nlattr)(struct sk_buff *skb, const void *data); u16 obj_size; u16 nlattr_max; const struct nla_policy *nla_policy; } ctnl_timeout; #ifdef CONFIG_NF_CONNTRACK_PROCFS /* Print out the private part of the conntrack. */ void (*print_conntrack)(struct seq_file *s, struct nf_conn *); #endif }; bool icmp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple); bool icmpv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, struct net *net, struct nf_conntrack_tuple *tuple); bool nf_conntrack_invert_icmp_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig); bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig); int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state, u8 l4proto, union nf_inet_addr *outer_daddr); int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state); int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, const struct nf_hook_state *state); int nf_conntrack_icmp_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_icmpv6_packet(struct nf_conn *ct, struct sk_buff *skb, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_udp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_udplite_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_tcp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_sctp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); int nf_conntrack_gre_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); void nf_conntrack_generic_init_net(struct net *net); void nf_conntrack_tcp_init_net(struct net *net); void nf_conntrack_udp_init_net(struct net *net); void nf_conntrack_gre_init_net(struct net *net); void nf_conntrack_dccp_init_net(struct net *net); void nf_conntrack_sctp_init_net(struct net *net); void nf_conntrack_icmp_init_net(struct net *net); void nf_conntrack_icmpv6_init_net(struct net *net); /* Existing built-in generic protocol */ extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic; #define MAX_NF_CT_PROTO IPPROTO_UDPLITE const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto); /* Generic netlink helpers */ int nf_ct_port_tuple_to_nlattr(struct sk_buff *skb, const struct nf_conntrack_tuple *tuple); int nf_ct_port_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *t, u_int32_t flags); unsigned int nf_ct_port_nlattr_tuple_size(void); extern const struct nla_policy nf_ct_port_nla_policy[]; #ifdef CONFIG_SYSCTL __printf(4, 5) __cold void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, const struct nf_hook_state *state, const char *fmt, ...); __printf(4, 5) __cold void nf_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_hook_state *state, u8 protonum, const char *fmt, ...); #else static inline __printf(4, 5) __cold void nf_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_hook_state *state, u8 protonum, const char *fmt, ...) {} static inline __printf(4, 5) __cold void nf_ct_l4proto_log_invalid(const struct sk_buff *skb, const struct nf_conn *ct, const struct nf_hook_state *state, const char *fmt, ...) { } #endif /* CONFIG_SYSCTL */ #if IS_ENABLED(CONFIG_NF_CONNTRACK) static inline struct nf_generic_net *nf_generic_pernet(struct net *net) { return &net->ct.nf_ct_proto.generic; } static inline struct nf_tcp_net *nf_tcp_pernet(struct net *net) { return &net->ct.nf_ct_proto.tcp; } static inline struct nf_udp_net *nf_udp_pernet(struct net *net) { return &net->ct.nf_ct_proto.udp; } static inline struct nf_icmp_net *nf_icmp_pernet(struct net *net) { return &net->ct.nf_ct_proto.icmp; } static inline struct nf_icmp_net *nf_icmpv6_pernet(struct net *net) { return &net->ct.nf_ct_proto.icmpv6; } /* Caller must check nf_ct_protonum(ct) is IPPROTO_TCP before calling. */ static inline void nf_ct_set_tcp_be_liberal(struct nf_conn *ct) { ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; } /* Caller must check nf_ct_protonum(ct) is IPPROTO_TCP before calling. */ static inline bool nf_conntrack_tcp_established(const struct nf_conn *ct) { return ct->proto.tcp.state == TCP_CONNTRACK_ESTABLISHED && test_bit(IPS_ASSURED_BIT, &ct->status); } #endif #ifdef CONFIG_NF_CT_PROTO_DCCP static inline struct nf_dccp_net *nf_dccp_pernet(struct net *net) { return &net->ct.nf_ct_proto.dccp; } #endif #ifdef CONFIG_NF_CT_PROTO_SCTP static inline struct nf_sctp_net *nf_sctp_pernet(struct net *net) { return &net->ct.nf_ct_proto.sctp; } #endif #ifdef CONFIG_NF_CT_PROTO_GRE static inline struct nf_gre_net *nf_gre_pernet(struct net *net) { return &net->ct.nf_ct_proto.gre; } #endif #endif /*_NF_CONNTRACK_PROTOCOL_H*/
23 9 15 10 10 1 6 1 6 4 3 3 10 1 9 8 1 1 1 1 16 9 9 8 1 9 12 12 9 3 5 2 5 7 10 10 1 9 9 5 5 9 4 1 4 4 3 3 4 3 8 8 3 2 6 3 2 1 2 2 2 2 1 2 1 1 2 1 1 2 3 3 3 3 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 // SPDX-License-Identifier: GPL-2.0+ /* * A virtual v4l2-mem2mem example device. * * This is a virtual device driver for testing mem-to-mem vb2 framework. * It simulates a device that uses memory buffers for both source and * destination, processes the data and issues an "irq" (simulated by a delayed * workqueue). * The device is capable of multi-instance, multi-buffer-per-transaction * operation (via the mem2mem framework). * * Copyright (c) 2009-2010 Samsung Electronics Co., Ltd. * Pawel Osciak, <pawel@osciak.com> * Marek Szyprowski, <m.szyprowski@samsung.com> */ #include <linux/module.h> #include <linux/delay.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/platform_device.h> #include <media/v4l2-mem2mem.h> #include <media/v4l2-device.h> #include <media/v4l2-ioctl.h> #include <media/v4l2-ctrls.h> #include <media/v4l2-event.h> #include <media/videobuf2-vmalloc.h> MODULE_DESCRIPTION("Virtual device for mem2mem framework testing"); MODULE_AUTHOR("Pawel Osciak, <pawel@osciak.com>"); MODULE_LICENSE("GPL"); MODULE_VERSION("0.2"); MODULE_ALIAS("mem2mem_testdev"); static unsigned int debug; module_param(debug, uint, 0644); MODULE_PARM_DESC(debug, "debug level"); /* Default transaction time in msec */ static unsigned int default_transtime = 40; /* Max 25 fps */ module_param(default_transtime, uint, 0644); MODULE_PARM_DESC(default_transtime, "default transaction time in ms"); #define MIN_W 32 #define MIN_H 32 #define MAX_W 640 #define MAX_H 480 /* Pixel alignment for non-bayer formats */ #define WIDTH_ALIGN 2 #define HEIGHT_ALIGN 1 /* Pixel alignment for bayer formats */ #define BAYER_WIDTH_ALIGN 2 #define BAYER_HEIGHT_ALIGN 2 /* Flags that indicate a format can be used for capture/output */ #define MEM2MEM_CAPTURE BIT(0) #define MEM2MEM_OUTPUT BIT(1) #define MEM2MEM_NAME "vim2m" /* Per queue */ #define MEM2MEM_DEF_NUM_BUFS VIDEO_MAX_FRAME /* In bytes, per queue */ #define MEM2MEM_VID_MEM_LIMIT (16 * 1024 * 1024) /* Flags that indicate processing mode */ #define MEM2MEM_HFLIP BIT(0) #define MEM2MEM_VFLIP BIT(1) #define dprintk(dev, lvl, fmt, arg...) \ v4l2_dbg(lvl, debug, &(dev)->v4l2_dev, "%s: " fmt, __func__, ## arg) static void vim2m_dev_release(struct device *dev) {} static struct platform_device vim2m_pdev = { .name = MEM2MEM_NAME, .dev.release = vim2m_dev_release, }; struct vim2m_fmt { u32 fourcc; int depth; /* Types the format can be used for */ u32 types; }; static struct vim2m_fmt formats[] = { { .fourcc = V4L2_PIX_FMT_RGB565, /* rrrrrggg gggbbbbb */ .depth = 16, .types = MEM2MEM_CAPTURE | MEM2MEM_OUTPUT, }, { .fourcc = V4L2_PIX_FMT_RGB565X, /* gggbbbbb rrrrrggg */ .depth = 16, .types = MEM2MEM_CAPTURE | MEM2MEM_OUTPUT, }, { .fourcc = V4L2_PIX_FMT_RGB24, .depth = 24, .types = MEM2MEM_CAPTURE | MEM2MEM_OUTPUT, }, { .fourcc = V4L2_PIX_FMT_BGR24, .depth = 24, .types = MEM2MEM_CAPTURE | MEM2MEM_OUTPUT, }, { .fourcc = V4L2_PIX_FMT_YUYV, .depth = 16, .types = MEM2MEM_CAPTURE, }, { .fourcc = V4L2_PIX_FMT_SBGGR8, .depth = 8, .types = MEM2MEM_CAPTURE, }, { .fourcc = V4L2_PIX_FMT_SGBRG8, .depth = 8, .types = MEM2MEM_CAPTURE, }, { .fourcc = V4L2_PIX_FMT_SGRBG8, .depth = 8, .types = MEM2MEM_CAPTURE, }, { .fourcc = V4L2_PIX_FMT_SRGGB8, .depth = 8, .types = MEM2MEM_CAPTURE, }, }; #define NUM_FORMATS ARRAY_SIZE(formats) /* Per-queue, driver-specific private data */ struct vim2m_q_data { unsigned int width; unsigned int height; unsigned int sizeimage; unsigned int sequence; struct vim2m_fmt *fmt; }; enum { V4L2_M2M_SRC = 0, V4L2_M2M_DST = 1, }; #define V4L2_CID_TRANS_TIME_MSEC (V4L2_CID_USER_BASE + 0x1000) #define V4L2_CID_TRANS_NUM_BUFS (V4L2_CID_USER_BASE + 0x1001) static struct vim2m_fmt *find_format(u32 fourcc) { struct vim2m_fmt *fmt; unsigned int k; for (k = 0; k < NUM_FORMATS; k++) { fmt = &formats[k]; if (fmt->fourcc == fourcc) break; } if (k == NUM_FORMATS) return NULL; return &formats[k]; } static void get_alignment(u32 fourcc, unsigned int *walign, unsigned int *halign) { switch (fourcc) { case V4L2_PIX_FMT_SBGGR8: case V4L2_PIX_FMT_SGBRG8: case V4L2_PIX_FMT_SGRBG8: case V4L2_PIX_FMT_SRGGB8: *walign = BAYER_WIDTH_ALIGN; *halign = BAYER_HEIGHT_ALIGN; return; default: *walign = WIDTH_ALIGN; *halign = HEIGHT_ALIGN; return; } } struct vim2m_dev { struct v4l2_device v4l2_dev; struct video_device vfd; #ifdef CONFIG_MEDIA_CONTROLLER struct media_device mdev; #endif atomic_t num_inst; struct mutex dev_mutex; struct v4l2_m2m_dev *m2m_dev; }; struct vim2m_ctx { struct v4l2_fh fh; struct vim2m_dev *dev; struct v4l2_ctrl_handler hdl; /* Processed buffers in this transaction */ u8 num_processed; /* Transaction length (i.e. how many buffers per transaction) */ u32 translen; /* Transaction time (i.e. simulated processing time) in milliseconds */ u32 transtime; struct mutex vb_mutex; struct delayed_work work_run; /* Abort requested by m2m */ int aborting; /* Processing mode */ int mode; enum v4l2_colorspace colorspace; enum v4l2_ycbcr_encoding ycbcr_enc; enum v4l2_xfer_func xfer_func; enum v4l2_quantization quant; /* Source and destination queue data */ struct vim2m_q_data q_data[2]; }; static inline struct vim2m_ctx *file2ctx(struct file *file) { return container_of(file->private_data, struct vim2m_ctx, fh); } static struct vim2m_q_data *get_q_data(struct vim2m_ctx *ctx, enum v4l2_buf_type type) { switch (type) { case V4L2_BUF_TYPE_VIDEO_OUTPUT: return &ctx->q_data[V4L2_M2M_SRC]; case V4L2_BUF_TYPE_VIDEO_CAPTURE: return &ctx->q_data[V4L2_M2M_DST]; default: return NULL; } } static const char *type_name(enum v4l2_buf_type type) { switch (type) { case V4L2_BUF_TYPE_VIDEO_OUTPUT: return "Output"; case V4L2_BUF_TYPE_VIDEO_CAPTURE: return "Capture"; default: return "Invalid"; } } #define CLIP(__color) \ (u8)(((__color) > 0xff) ? 0xff : (((__color) < 0) ? 0 : (__color))) static void copy_line(struct vim2m_q_data *q_data_out, u8 *src, u8 *dst, bool reverse) { int x, depth = q_data_out->fmt->depth >> 3; if (!reverse) { memcpy(dst, src, q_data_out->width * depth); } else { for (x = 0; x < q_data_out->width >> 1; x++) { memcpy(dst, src, depth); memcpy(dst + depth, src - depth, depth); src -= depth << 1; dst += depth << 1; } return; } } static void copy_two_pixels(struct vim2m_q_data *q_data_in, struct vim2m_q_data *q_data_out, u8 *src[2], u8 **dst, int ypos, bool reverse) { struct vim2m_fmt *out = q_data_out->fmt; struct vim2m_fmt *in = q_data_in->fmt; u8 _r[2], _g[2], _b[2], *r, *g, *b; int i; /* Step 1: read two consecutive pixels from src pointer */ r = _r; g = _g; b = _b; switch (in->fourcc) { case V4L2_PIX_FMT_RGB565: /* rrrrrggg gggbbbbb */ for (i = 0; i < 2; i++) { u16 pix = le16_to_cpu(*(__le16 *)(src[i])); *r++ = (u8)(((pix & 0xf800) >> 11) << 3) | 0x07; *g++ = (u8)((((pix & 0x07e0) >> 5)) << 2) | 0x03; *b++ = (u8)((pix & 0x1f) << 3) | 0x07; } break; case V4L2_PIX_FMT_RGB565X: /* gggbbbbb rrrrrggg */ for (i = 0; i < 2; i++) { u16 pix = be16_to_cpu(*(__be16 *)(src[i])); *r++ = (u8)(((pix & 0xf800) >> 11) << 3) | 0x07; *g++ = (u8)((((pix & 0x07e0) >> 5)) << 2) | 0x03; *b++ = (u8)((pix & 0x1f) << 3) | 0x07; } break; default: case V4L2_PIX_FMT_RGB24: for (i = 0; i < 2; i++) { *r++ = src[i][0]; *g++ = src[i][1]; *b++ = src[i][2]; } break; case V4L2_PIX_FMT_BGR24: for (i = 0; i < 2; i++) { *b++ = src[i][0]; *g++ = src[i][1]; *r++ = src[i][2]; } break; } /* Step 2: store two consecutive points, reversing them if needed */ r = _r; g = _g; b = _b; switch (out->fourcc) { case V4L2_PIX_FMT_RGB565: /* rrrrrggg gggbbbbb */ for (i = 0; i < 2; i++) { u16 pix; __le16 *dst_pix = (__le16 *)*dst; pix = ((*r << 8) & 0xf800) | ((*g << 3) & 0x07e0) | (*b >> 3); *dst_pix = cpu_to_le16(pix); *dst += 2; } return; case V4L2_PIX_FMT_RGB565X: /* gggbbbbb rrrrrggg */ for (i = 0; i < 2; i++) { u16 pix; __be16 *dst_pix = (__be16 *)*dst; pix = ((*r << 8) & 0xf800) | ((*g << 3) & 0x07e0) | (*b >> 3); *dst_pix = cpu_to_be16(pix); *dst += 2; } return; case V4L2_PIX_FMT_RGB24: for (i = 0; i < 2; i++) { *(*dst)++ = *r++; *(*dst)++ = *g++; *(*dst)++ = *b++; } return; case V4L2_PIX_FMT_BGR24: for (i = 0; i < 2; i++) { *(*dst)++ = *b++; *(*dst)++ = *g++; *(*dst)++ = *r++; } return; case V4L2_PIX_FMT_YUYV: default: { u8 y, y1, u, v; y = ((8453 * (*r) + 16594 * (*g) + 3223 * (*b) + 524288) >> 15); u = ((-4878 * (*r) - 9578 * (*g) + 14456 * (*b) + 4210688) >> 15); v = ((14456 * (*r++) - 12105 * (*g++) - 2351 * (*b++) + 4210688) >> 15); y1 = ((8453 * (*r) + 16594 * (*g) + 3223 * (*b) + 524288) >> 15); *(*dst)++ = y; *(*dst)++ = u; *(*dst)++ = y1; *(*dst)++ = v; return; } case V4L2_PIX_FMT_SBGGR8: if (!(ypos & 1)) { *(*dst)++ = *b; *(*dst)++ = *++g; } else { *(*dst)++ = *g; *(*dst)++ = *++r; } return; case V4L2_PIX_FMT_SGBRG8: if (!(ypos & 1)) { *(*dst)++ = *g; *(*dst)++ = *++b; } else { *(*dst)++ = *r; *(*dst)++ = *++g; } return; case V4L2_PIX_FMT_SGRBG8: if (!(ypos & 1)) { *(*dst)++ = *g; *(*dst)++ = *++r; } else { *(*dst)++ = *b; *(*dst)++ = *++g; } return; case V4L2_PIX_FMT_SRGGB8: if (!(ypos & 1)) { *(*dst)++ = *r; *(*dst)++ = *++g; } else { *(*dst)++ = *g; *(*dst)++ = *++b; } return; } } static int device_process(struct vim2m_ctx *ctx, struct vb2_v4l2_buffer *in_vb, struct vb2_v4l2_buffer *out_vb) { struct vim2m_dev *dev = ctx->dev; struct vim2m_q_data *q_data_in, *q_data_out; u8 *p_in, *p_line, *p_in_x[2], *p, *p_out; unsigned int width, height, bytesperline, bytes_per_pixel; unsigned int x, y, y_in, y_out, x_int, x_fract, x_err, x_offset; int start, end, step; q_data_in = get_q_data(ctx, V4L2_BUF_TYPE_VIDEO_OUTPUT); if (!q_data_in) return 0; bytesperline = (q_data_in->width * q_data_in->fmt->depth) >> 3; bytes_per_pixel = q_data_in->fmt->depth >> 3; q_data_out = get_q_data(ctx, V4L2_BUF_TYPE_VIDEO_CAPTURE); if (!q_data_out) return 0; /* As we're doing scaling, use the output dimensions here */ height = q_data_out->height; width = q_data_out->width; p_in = vb2_plane_vaddr(&in_vb->vb2_buf, 0); p_out = vb2_plane_vaddr(&out_vb->vb2_buf, 0); if (!p_in || !p_out) { v4l2_err(&dev->v4l2_dev, "Acquiring kernel pointers to buffers failed\n"); return -EFAULT; } out_vb->sequence = q_data_out->sequence++; in_vb->sequence = q_data_in->sequence++; v4l2_m2m_buf_copy_metadata(in_vb, out_vb, true); if (ctx->mode & MEM2MEM_VFLIP) { start = height - 1; end = -1; step = -1; } else { start = 0; end = height; step = 1; } y_out = 0; /* * When format and resolution are identical, * we can use a faster copy logic */ if (q_data_in->fmt->fourcc == q_data_out->fmt->fourcc && q_data_in->width == q_data_out->width && q_data_in->height == q_data_out->height) { for (y = start; y != end; y += step, y_out++) { p = p_in + (y * bytesperline); if (ctx->mode & MEM2MEM_HFLIP) p += bytesperline - (q_data_in->fmt->depth >> 3); copy_line(q_data_out, p, p_out, ctx->mode & MEM2MEM_HFLIP); p_out += bytesperline; } return 0; } /* Slower algorithm with format conversion, hflip, vflip and scaler */ /* To speed scaler up, use Bresenham for X dimension */ x_int = q_data_in->width / q_data_out->width; x_fract = q_data_in->width % q_data_out->width; for (y = start; y != end; y += step, y_out++) { y_in = (y * q_data_in->height) / q_data_out->height; x_offset = 0; x_err = 0; p_line = p_in + (y_in * bytesperline); if (ctx->mode & MEM2MEM_HFLIP) p_line += bytesperline - (q_data_in->fmt->depth >> 3); p_in_x[0] = p_line; for (x = 0; x < width >> 1; x++) { x_offset += x_int; x_err += x_fract; if (x_err > width) { x_offset++; x_err -= width; } if (ctx->mode & MEM2MEM_HFLIP) p_in_x[1] = p_line - x_offset * bytes_per_pixel; else p_in_x[1] = p_line + x_offset * bytes_per_pixel; copy_two_pixels(q_data_in, q_data_out, p_in_x, &p_out, y_out, ctx->mode & MEM2MEM_HFLIP); /* Calculate the next p_in_x0 */ x_offset += x_int; x_err += x_fract; if (x_err > width) { x_offset++; x_err -= width; } if (ctx->mode & MEM2MEM_HFLIP) p_in_x[0] = p_line - x_offset * bytes_per_pixel; else p_in_x[0] = p_line + x_offset * bytes_per_pixel; } } return 0; } /* * mem2mem callbacks */ /* * job_ready() - check whether an instance is ready to be scheduled to run */ static int job_ready(void *priv) { struct vim2m_ctx *ctx = priv; if (v4l2_m2m_num_src_bufs_ready(ctx->fh.m2m_ctx) < ctx->translen || v4l2_m2m_num_dst_bufs_ready(ctx->fh.m2m_ctx) < ctx->translen) { dprintk(ctx->dev, 1, "Not enough buffers available\n"); return 0; } return 1; } static void job_abort(void *priv) { struct vim2m_ctx *ctx = priv; /* Will cancel the transaction in the next interrupt handler */ ctx->aborting = 1; } /* device_run() - prepares and starts the device * * This simulates all the immediate preparations required before starting * a device. This will be called by the framework when it decides to schedule * a particular instance. */ static void device_run(void *priv) { struct vim2m_ctx *ctx = priv; struct vb2_v4l2_buffer *src_buf, *dst_buf; src_buf = v4l2_m2m_next_src_buf(ctx->fh.m2m_ctx); dst_buf = v4l2_m2m_next_dst_buf(ctx->fh.m2m_ctx); /* Apply request controls if any */ v4l2_ctrl_request_setup(src_buf->vb2_buf.req_obj.req, &ctx->hdl); device_process(ctx, src_buf, dst_buf); /* Complete request controls if any */ v4l2_ctrl_request_complete(src_buf->vb2_buf.req_obj.req, &ctx->hdl); /* Run delayed work, which simulates a hardware irq */ schedule_delayed_work(&ctx->work_run, msecs_to_jiffies(ctx->transtime)); } static void device_work(struct work_struct *w) { struct vim2m_ctx *curr_ctx; struct vim2m_dev *vim2m_dev; struct vb2_v4l2_buffer *src_vb, *dst_vb; curr_ctx = container_of(w, struct vim2m_ctx, work_run.work); vim2m_dev = curr_ctx->dev; src_vb = v4l2_m2m_src_buf_remove(curr_ctx->fh.m2m_ctx); dst_vb = v4l2_m2m_dst_buf_remove(curr_ctx->fh.m2m_ctx); curr_ctx->num_processed++; v4l2_m2m_buf_done(src_vb, VB2_BUF_STATE_DONE); v4l2_m2m_buf_done(dst_vb, VB2_BUF_STATE_DONE); if (curr_ctx->num_processed == curr_ctx->translen || curr_ctx->aborting) { dprintk(curr_ctx->dev, 2, "Finishing capture buffer fill\n"); curr_ctx->num_processed = 0; v4l2_m2m_job_finish(vim2m_dev->m2m_dev, curr_ctx->fh.m2m_ctx); } else { device_run(curr_ctx); } } /* * video ioctls */ static int vidioc_querycap(struct file *file, void *priv, struct v4l2_capability *cap) { strscpy(cap->driver, MEM2MEM_NAME, sizeof(cap->driver)); strscpy(cap->card, MEM2MEM_NAME, sizeof(cap->card)); snprintf(cap->bus_info, sizeof(cap->bus_info), "platform:%s", MEM2MEM_NAME); return 0; } static int enum_fmt(struct v4l2_fmtdesc *f, u32 type) { int i, num; struct vim2m_fmt *fmt; num = 0; for (i = 0; i < NUM_FORMATS; ++i) { if (formats[i].types & type) { /* index-th format of type type found ? */ if (num == f->index) break; /* * Correct type but haven't reached our index yet, * just increment per-type index */ ++num; } } if (i < NUM_FORMATS) { /* Format found */ fmt = &formats[i]; f->pixelformat = fmt->fourcc; return 0; } /* Format not found */ return -EINVAL; } static int vidioc_enum_fmt_vid_cap(struct file *file, void *priv, struct v4l2_fmtdesc *f) { return enum_fmt(f, MEM2MEM_CAPTURE); } static int vidioc_enum_fmt_vid_out(struct file *file, void *priv, struct v4l2_fmtdesc *f) { return enum_fmt(f, MEM2MEM_OUTPUT); } static int vidioc_enum_framesizes(struct file *file, void *priv, struct v4l2_frmsizeenum *fsize) { if (fsize->index != 0) return -EINVAL; if (!find_format(fsize->pixel_format)) return -EINVAL; fsize->type = V4L2_FRMSIZE_TYPE_STEPWISE; fsize->stepwise.min_width = MIN_W; fsize->stepwise.min_height = MIN_H; fsize->stepwise.max_width = MAX_W; fsize->stepwise.max_height = MAX_H; get_alignment(fsize->pixel_format, &fsize->stepwise.step_width, &fsize->stepwise.step_height); return 0; } static int vidioc_g_fmt(struct vim2m_ctx *ctx, struct v4l2_format *f) { struct vb2_queue *vq; struct vim2m_q_data *q_data; vq = v4l2_m2m_get_vq(ctx->fh.m2m_ctx, f->type); if (!vq) return -EINVAL; q_data = get_q_data(ctx, f->type); if (!q_data) return -EINVAL; f->fmt.pix.width = q_data->width; f->fmt.pix.height = q_data->height; f->fmt.pix.field = V4L2_FIELD_NONE; f->fmt.pix.pixelformat = q_data->fmt->fourcc; f->fmt.pix.bytesperline = (q_data->width * q_data->fmt->depth) >> 3; f->fmt.pix.sizeimage = q_data->sizeimage; f->fmt.pix.colorspace = ctx->colorspace; f->fmt.pix.xfer_func = ctx->xfer_func; f->fmt.pix.ycbcr_enc = ctx->ycbcr_enc; f->fmt.pix.quantization = ctx->quant; return 0; } static int vidioc_g_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { return vidioc_g_fmt(file2ctx(file), f); } static int vidioc_g_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { return vidioc_g_fmt(file2ctx(file), f); } static int vidioc_try_fmt(struct v4l2_format *f, struct vim2m_fmt *fmt) { int walign, halign; /* * V4L2 specification specifies the driver corrects the * format struct if any of the dimensions is unsupported */ if (f->fmt.pix.height < MIN_H) f->fmt.pix.height = MIN_H; else if (f->fmt.pix.height > MAX_H) f->fmt.pix.height = MAX_H; if (f->fmt.pix.width < MIN_W) f->fmt.pix.width = MIN_W; else if (f->fmt.pix.width > MAX_W) f->fmt.pix.width = MAX_W; get_alignment(f->fmt.pix.pixelformat, &walign, &halign); f->fmt.pix.width &= ~(walign - 1); f->fmt.pix.height &= ~(halign - 1); f->fmt.pix.bytesperline = (f->fmt.pix.width * fmt->depth) >> 3; f->fmt.pix.sizeimage = f->fmt.pix.height * f->fmt.pix.bytesperline; f->fmt.pix.field = V4L2_FIELD_NONE; return 0; } static int vidioc_try_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { struct vim2m_fmt *fmt; struct vim2m_ctx *ctx = file2ctx(file); fmt = find_format(f->fmt.pix.pixelformat); if (!fmt) { f->fmt.pix.pixelformat = formats[0].fourcc; fmt = find_format(f->fmt.pix.pixelformat); } if (!(fmt->types & MEM2MEM_CAPTURE)) { v4l2_err(&ctx->dev->v4l2_dev, "Fourcc format (0x%08x) invalid.\n", f->fmt.pix.pixelformat); return -EINVAL; } f->fmt.pix.colorspace = ctx->colorspace; f->fmt.pix.xfer_func = ctx->xfer_func; f->fmt.pix.ycbcr_enc = ctx->ycbcr_enc; f->fmt.pix.quantization = ctx->quant; return vidioc_try_fmt(f, fmt); } static int vidioc_try_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct vim2m_fmt *fmt; struct vim2m_ctx *ctx = file2ctx(file); fmt = find_format(f->fmt.pix.pixelformat); if (!fmt) { f->fmt.pix.pixelformat = formats[0].fourcc; fmt = find_format(f->fmt.pix.pixelformat); } if (!(fmt->types & MEM2MEM_OUTPUT)) { v4l2_err(&ctx->dev->v4l2_dev, "Fourcc format (0x%08x) invalid.\n", f->fmt.pix.pixelformat); return -EINVAL; } if (!f->fmt.pix.colorspace) f->fmt.pix.colorspace = V4L2_COLORSPACE_REC709; return vidioc_try_fmt(f, fmt); } static int vidioc_s_fmt(struct vim2m_ctx *ctx, struct v4l2_format *f) { struct vim2m_q_data *q_data; struct vb2_queue *vq; vq = v4l2_m2m_get_vq(ctx->fh.m2m_ctx, f->type); if (!vq) return -EINVAL; q_data = get_q_data(ctx, f->type); if (!q_data) return -EINVAL; if (vb2_is_busy(vq)) { v4l2_err(&ctx->dev->v4l2_dev, "%s queue busy\n", __func__); return -EBUSY; } q_data->fmt = find_format(f->fmt.pix.pixelformat); q_data->width = f->fmt.pix.width; q_data->height = f->fmt.pix.height; q_data->sizeimage = q_data->width * q_data->height * q_data->fmt->depth >> 3; dprintk(ctx->dev, 1, "Format for type %s: %dx%d (%d bpp), fmt: %c%c%c%c\n", type_name(f->type), q_data->width, q_data->height, q_data->fmt->depth, (q_data->fmt->fourcc & 0xff), (q_data->fmt->fourcc >> 8) & 0xff, (q_data->fmt->fourcc >> 16) & 0xff, (q_data->fmt->fourcc >> 24) & 0xff); return 0; } static int vidioc_s_fmt_vid_cap(struct file *file, void *priv, struct v4l2_format *f) { int ret; ret = vidioc_try_fmt_vid_cap(file, priv, f); if (ret) return ret; return vidioc_s_fmt(file2ctx(file), f); } static int vidioc_s_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct vim2m_ctx *ctx = file2ctx(file); int ret; ret = vidioc_try_fmt_vid_out(file, priv, f); if (ret) return ret; ret = vidioc_s_fmt(file2ctx(file), f); if (!ret) { ctx->colorspace = f->fmt.pix.colorspace; ctx->xfer_func = f->fmt.pix.xfer_func; ctx->ycbcr_enc = f->fmt.pix.ycbcr_enc; ctx->quant = f->fmt.pix.quantization; } return ret; } static int vim2m_s_ctrl(struct v4l2_ctrl *ctrl) { struct vim2m_ctx *ctx = container_of(ctrl->handler, struct vim2m_ctx, hdl); switch (ctrl->id) { case V4L2_CID_HFLIP: if (ctrl->val) ctx->mode |= MEM2MEM_HFLIP; else ctx->mode &= ~MEM2MEM_HFLIP; break; case V4L2_CID_VFLIP: if (ctrl->val) ctx->mode |= MEM2MEM_VFLIP; else ctx->mode &= ~MEM2MEM_VFLIP; break; case V4L2_CID_TRANS_TIME_MSEC: ctx->transtime = ctrl->val; if (ctx->transtime < 1) ctx->transtime = 1; break; case V4L2_CID_TRANS_NUM_BUFS: ctx->translen = ctrl->val; break; default: v4l2_err(&ctx->dev->v4l2_dev, "Invalid control\n"); return -EINVAL; } return 0; } static const struct v4l2_ctrl_ops vim2m_ctrl_ops = { .s_ctrl = vim2m_s_ctrl, }; static const struct v4l2_ioctl_ops vim2m_ioctl_ops = { .vidioc_querycap = vidioc_querycap, .vidioc_enum_fmt_vid_cap = vidioc_enum_fmt_vid_cap, .vidioc_enum_framesizes = vidioc_enum_framesizes, .vidioc_g_fmt_vid_cap = vidioc_g_fmt_vid_cap, .vidioc_try_fmt_vid_cap = vidioc_try_fmt_vid_cap, .vidioc_s_fmt_vid_cap = vidioc_s_fmt_vid_cap, .vidioc_enum_fmt_vid_out = vidioc_enum_fmt_vid_out, .vidioc_g_fmt_vid_out = vidioc_g_fmt_vid_out, .vidioc_try_fmt_vid_out = vidioc_try_fmt_vid_out, .vidioc_s_fmt_vid_out = vidioc_s_fmt_vid_out, .vidioc_reqbufs = v4l2_m2m_ioctl_reqbufs, .vidioc_querybuf = v4l2_m2m_ioctl_querybuf, .vidioc_qbuf = v4l2_m2m_ioctl_qbuf, .vidioc_dqbuf = v4l2_m2m_ioctl_dqbuf, .vidioc_prepare_buf = v4l2_m2m_ioctl_prepare_buf, .vidioc_create_bufs = v4l2_m2m_ioctl_create_bufs, .vidioc_expbuf = v4l2_m2m_ioctl_expbuf, .vidioc_streamon = v4l2_m2m_ioctl_streamon, .vidioc_streamoff = v4l2_m2m_ioctl_streamoff, .vidioc_subscribe_event = v4l2_ctrl_subscribe_event, .vidioc_unsubscribe_event = v4l2_event_unsubscribe, }; /* * Queue operations */ static int vim2m_queue_setup(struct vb2_queue *vq, unsigned int *nbuffers, unsigned int *nplanes, unsigned int sizes[], struct device *alloc_devs[]) { struct vim2m_ctx *ctx = vb2_get_drv_priv(vq); struct vim2m_q_data *q_data; unsigned int size, count = *nbuffers; q_data = get_q_data(ctx, vq->type); if (!q_data) return -EINVAL; size = q_data->width * q_data->height * q_data->fmt->depth >> 3; while (size * count > MEM2MEM_VID_MEM_LIMIT) (count)--; *nbuffers = count; if (*nplanes) return sizes[0] < size ? -EINVAL : 0; *nplanes = 1; sizes[0] = size; dprintk(ctx->dev, 1, "%s: get %d buffer(s) of size %d each.\n", type_name(vq->type), count, size); return 0; } static int vim2m_buf_out_validate(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vim2m_ctx *ctx = vb2_get_drv_priv(vb->vb2_queue); if (vbuf->field == V4L2_FIELD_ANY) vbuf->field = V4L2_FIELD_NONE; if (vbuf->field != V4L2_FIELD_NONE) { dprintk(ctx->dev, 1, "%s field isn't supported\n", __func__); return -EINVAL; } return 0; } static int vim2m_buf_prepare(struct vb2_buffer *vb) { struct vim2m_ctx *ctx = vb2_get_drv_priv(vb->vb2_queue); struct vim2m_q_data *q_data; dprintk(ctx->dev, 2, "type: %s\n", type_name(vb->vb2_queue->type)); q_data = get_q_data(ctx, vb->vb2_queue->type); if (!q_data) return -EINVAL; if (vb2_plane_size(vb, 0) < q_data->sizeimage) { dprintk(ctx->dev, 1, "%s data will not fit into plane (%lu < %lu)\n", __func__, vb2_plane_size(vb, 0), (long)q_data->sizeimage); return -EINVAL; } vb2_set_plane_payload(vb, 0, q_data->sizeimage); return 0; } static void vim2m_buf_queue(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vim2m_ctx *ctx = vb2_get_drv_priv(vb->vb2_queue); v4l2_m2m_buf_queue(ctx->fh.m2m_ctx, vbuf); } static int vim2m_start_streaming(struct vb2_queue *q, unsigned int count) { struct vim2m_ctx *ctx = vb2_get_drv_priv(q); struct vim2m_q_data *q_data = get_q_data(ctx, q->type); if (!q_data) return -EINVAL; if (V4L2_TYPE_IS_OUTPUT(q->type)) ctx->aborting = 0; q_data->sequence = 0; return 0; } static void vim2m_stop_streaming(struct vb2_queue *q) { struct vim2m_ctx *ctx = vb2_get_drv_priv(q); struct vb2_v4l2_buffer *vbuf; cancel_delayed_work_sync(&ctx->work_run); for (;;) { if (V4L2_TYPE_IS_OUTPUT(q->type)) vbuf = v4l2_m2m_src_buf_remove(ctx->fh.m2m_ctx); else vbuf = v4l2_m2m_dst_buf_remove(ctx->fh.m2m_ctx); if (!vbuf) return; v4l2_ctrl_request_complete(vbuf->vb2_buf.req_obj.req, &ctx->hdl); v4l2_m2m_buf_done(vbuf, VB2_BUF_STATE_ERROR); } } static void vim2m_buf_request_complete(struct vb2_buffer *vb) { struct vim2m_ctx *ctx = vb2_get_drv_priv(vb->vb2_queue); v4l2_ctrl_request_complete(vb->req_obj.req, &ctx->hdl); } static const struct vb2_ops vim2m_qops = { .queue_setup = vim2m_queue_setup, .buf_out_validate = vim2m_buf_out_validate, .buf_prepare = vim2m_buf_prepare, .buf_queue = vim2m_buf_queue, .start_streaming = vim2m_start_streaming, .stop_streaming = vim2m_stop_streaming, .wait_prepare = vb2_ops_wait_prepare, .wait_finish = vb2_ops_wait_finish, .buf_request_complete = vim2m_buf_request_complete, }; static int queue_init(void *priv, struct vb2_queue *src_vq, struct vb2_queue *dst_vq) { struct vim2m_ctx *ctx = priv; int ret; src_vq->type = V4L2_BUF_TYPE_VIDEO_OUTPUT; src_vq->io_modes = VB2_MMAP | VB2_USERPTR | VB2_DMABUF; src_vq->drv_priv = ctx; src_vq->buf_struct_size = sizeof(struct v4l2_m2m_buffer); src_vq->ops = &vim2m_qops; src_vq->mem_ops = &vb2_vmalloc_memops; src_vq->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_COPY; src_vq->lock = &ctx->vb_mutex; src_vq->supports_requests = true; ret = vb2_queue_init(src_vq); if (ret) return ret; dst_vq->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; dst_vq->io_modes = VB2_MMAP | VB2_USERPTR | VB2_DMABUF; dst_vq->drv_priv = ctx; dst_vq->buf_struct_size = sizeof(struct v4l2_m2m_buffer); dst_vq->ops = &vim2m_qops; dst_vq->mem_ops = &vb2_vmalloc_memops; dst_vq->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_COPY; dst_vq->lock = &ctx->vb_mutex; return vb2_queue_init(dst_vq); } static struct v4l2_ctrl_config vim2m_ctrl_trans_time_msec = { .ops = &vim2m_ctrl_ops, .id = V4L2_CID_TRANS_TIME_MSEC, .name = "Transaction Time (msec)", .type = V4L2_CTRL_TYPE_INTEGER, .min = 1, .max = 10001, .step = 1, }; static const struct v4l2_ctrl_config vim2m_ctrl_trans_num_bufs = { .ops = &vim2m_ctrl_ops, .id = V4L2_CID_TRANS_NUM_BUFS, .name = "Buffers Per Transaction", .type = V4L2_CTRL_TYPE_INTEGER, .def = 1, .min = 1, .max = MEM2MEM_DEF_NUM_BUFS, .step = 1, }; /* * File operations */ static int vim2m_open(struct file *file) { struct vim2m_dev *dev = video_drvdata(file); struct vim2m_ctx *ctx = NULL; struct v4l2_ctrl_handler *hdl; int rc = 0; if (mutex_lock_interruptible(&dev->dev_mutex)) return -ERESTARTSYS; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) { rc = -ENOMEM; goto open_unlock; } v4l2_fh_init(&ctx->fh, video_devdata(file)); file->private_data = &ctx->fh; ctx->dev = dev; hdl = &ctx->hdl; v4l2_ctrl_handler_init(hdl, 4); v4l2_ctrl_new_std(hdl, &vim2m_ctrl_ops, V4L2_CID_HFLIP, 0, 1, 1, 0); v4l2_ctrl_new_std(hdl, &vim2m_ctrl_ops, V4L2_CID_VFLIP, 0, 1, 1, 0); vim2m_ctrl_trans_time_msec.def = default_transtime; v4l2_ctrl_new_custom(hdl, &vim2m_ctrl_trans_time_msec, NULL); v4l2_ctrl_new_custom(hdl, &vim2m_ctrl_trans_num_bufs, NULL); if (hdl->error) { rc = hdl->error; v4l2_ctrl_handler_free(hdl); kfree(ctx); goto open_unlock; } ctx->fh.ctrl_handler = hdl; v4l2_ctrl_handler_setup(hdl); ctx->q_data[V4L2_M2M_SRC].fmt = &formats[0]; ctx->q_data[V4L2_M2M_SRC].width = 640; ctx->q_data[V4L2_M2M_SRC].height = 480; ctx->q_data[V4L2_M2M_SRC].sizeimage = ctx->q_data[V4L2_M2M_SRC].width * ctx->q_data[V4L2_M2M_SRC].height * (ctx->q_data[V4L2_M2M_SRC].fmt->depth >> 3); ctx->q_data[V4L2_M2M_DST] = ctx->q_data[V4L2_M2M_SRC]; ctx->colorspace = V4L2_COLORSPACE_REC709; ctx->fh.m2m_ctx = v4l2_m2m_ctx_init(dev->m2m_dev, ctx, &queue_init); mutex_init(&ctx->vb_mutex); INIT_DELAYED_WORK(&ctx->work_run, device_work); if (IS_ERR(ctx->fh.m2m_ctx)) { rc = PTR_ERR(ctx->fh.m2m_ctx); v4l2_ctrl_handler_free(hdl); v4l2_fh_exit(&ctx->fh); kfree(ctx); goto open_unlock; } v4l2_fh_add(&ctx->fh); atomic_inc(&dev->num_inst); dprintk(dev, 1, "Created instance: %p, m2m_ctx: %p\n", ctx, ctx->fh.m2m_ctx); open_unlock: mutex_unlock(&dev->dev_mutex); return rc; } static int vim2m_release(struct file *file) { struct vim2m_dev *dev = video_drvdata(file); struct vim2m_ctx *ctx = file2ctx(file); dprintk(dev, 1, "Releasing instance %p\n", ctx); v4l2_fh_del(&ctx->fh); v4l2_fh_exit(&ctx->fh); v4l2_ctrl_handler_free(&ctx->hdl); mutex_lock(&dev->dev_mutex); v4l2_m2m_ctx_release(ctx->fh.m2m_ctx); mutex_unlock(&dev->dev_mutex); kfree(ctx); atomic_dec(&dev->num_inst); return 0; } static void vim2m_device_release(struct video_device *vdev) { struct vim2m_dev *dev = container_of(vdev, struct vim2m_dev, vfd); v4l2_device_unregister(&dev->v4l2_dev); v4l2_m2m_release(dev->m2m_dev); #ifdef CONFIG_MEDIA_CONTROLLER media_device_cleanup(&dev->mdev); #endif kfree(dev); } static const struct v4l2_file_operations vim2m_fops = { .owner = THIS_MODULE, .open = vim2m_open, .release = vim2m_release, .poll = v4l2_m2m_fop_poll, .unlocked_ioctl = video_ioctl2, .mmap = v4l2_m2m_fop_mmap, }; static const struct video_device vim2m_videodev = { .name = MEM2MEM_NAME, .vfl_dir = VFL_DIR_M2M, .fops = &vim2m_fops, .ioctl_ops = &vim2m_ioctl_ops, .minor = -1, .release = vim2m_device_release, .device_caps = V4L2_CAP_VIDEO_M2M | V4L2_CAP_STREAMING, }; static const struct v4l2_m2m_ops m2m_ops = { .device_run = device_run, .job_ready = job_ready, .job_abort = job_abort, }; static const struct media_device_ops m2m_media_ops = { .req_validate = vb2_request_validate, .req_queue = v4l2_m2m_request_queue, }; static int vim2m_probe(struct platform_device *pdev) { struct vim2m_dev *dev; struct video_device *vfd; int ret; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; ret = v4l2_device_register(&pdev->dev, &dev->v4l2_dev); if (ret) goto error_free; atomic_set(&dev->num_inst, 0); mutex_init(&dev->dev_mutex); dev->vfd = vim2m_videodev; vfd = &dev->vfd; vfd->lock = &dev->dev_mutex; vfd->v4l2_dev = &dev->v4l2_dev; video_set_drvdata(vfd, dev); v4l2_info(&dev->v4l2_dev, "Device registered as /dev/video%d\n", vfd->num); platform_set_drvdata(pdev, dev); dev->m2m_dev = v4l2_m2m_init(&m2m_ops); if (IS_ERR(dev->m2m_dev)) { v4l2_err(&dev->v4l2_dev, "Failed to init mem2mem device\n"); ret = PTR_ERR(dev->m2m_dev); dev->m2m_dev = NULL; goto error_dev; } #ifdef CONFIG_MEDIA_CONTROLLER dev->mdev.dev = &pdev->dev; strscpy(dev->mdev.model, "vim2m", sizeof(dev->mdev.model)); strscpy(dev->mdev.bus_info, "platform:vim2m", sizeof(dev->mdev.bus_info)); media_device_init(&dev->mdev); dev->mdev.ops = &m2m_media_ops; dev->v4l2_dev.mdev = &dev->mdev; #endif ret = video_register_device(vfd, VFL_TYPE_VIDEO, 0); if (ret) { v4l2_err(&dev->v4l2_dev, "Failed to register video device\n"); goto error_m2m; } #ifdef CONFIG_MEDIA_CONTROLLER ret = v4l2_m2m_register_media_controller(dev->m2m_dev, vfd, MEDIA_ENT_F_PROC_VIDEO_SCALER); if (ret) { v4l2_err(&dev->v4l2_dev, "Failed to init mem2mem media controller\n"); goto error_v4l2; } ret = media_device_register(&dev->mdev); if (ret) { v4l2_err(&dev->v4l2_dev, "Failed to register mem2mem media device\n"); goto error_m2m_mc; } #endif return 0; #ifdef CONFIG_MEDIA_CONTROLLER error_m2m_mc: v4l2_m2m_unregister_media_controller(dev->m2m_dev); #endif error_v4l2: video_unregister_device(&dev->vfd); /* vim2m_device_release called by video_unregister_device to release various objects */ return ret; error_m2m: v4l2_m2m_release(dev->m2m_dev); error_dev: v4l2_device_unregister(&dev->v4l2_dev); error_free: kfree(dev); return ret; } static void vim2m_remove(struct platform_device *pdev) { struct vim2m_dev *dev = platform_get_drvdata(pdev); v4l2_info(&dev->v4l2_dev, "Removing " MEM2MEM_NAME); #ifdef CONFIG_MEDIA_CONTROLLER media_device_unregister(&dev->mdev); v4l2_m2m_unregister_media_controller(dev->m2m_dev); #endif video_unregister_device(&dev->vfd); } static struct platform_driver vim2m_pdrv = { .probe = vim2m_probe, .remove_new = vim2m_remove, .driver = { .name = MEM2MEM_NAME, }, }; static void __exit vim2m_exit(void) { platform_driver_unregister(&vim2m_pdrv); platform_device_unregister(&vim2m_pdev); } static int __init vim2m_init(void) { int ret; ret = platform_device_register(&vim2m_pdev); if (ret) return ret; ret = platform_driver_register(&vim2m_pdrv); if (ret) platform_device_unregister(&vim2m_pdev); return ret; } module_init(vim2m_init); module_exit(vim2m_exit);
8 15 4 15 2 1 2 2 1 2 1 13 14 49 8 8 49 48 8 8 8 49 8 8 8 8 8 8 8 8 8 11 16 10 8 15 15 15 16 16 14 49 49 58 58 57 58 49 49 49 49 49 49 49 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 // SPDX-License-Identifier: GPL-2.0-only /* * IEEE 802.1Q Multiple Registration Protocol (MRP) * * Copyright (c) 2012 Massachusetts Institute of Technology * * Adapted from code in net/802/garp.c * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> */ #include <linux/kernel.h> #include <linux/timer.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/module.h> #include <net/mrp.h> #include <asm/unaligned.h> static unsigned int mrp_join_time __read_mostly = 200; module_param(mrp_join_time, uint, 0644); MODULE_PARM_DESC(mrp_join_time, "Join time in ms (default 200ms)"); static unsigned int mrp_periodic_time __read_mostly = 1000; module_param(mrp_periodic_time, uint, 0644); MODULE_PARM_DESC(mrp_periodic_time, "Periodic time in ms (default 1s)"); MODULE_DESCRIPTION("IEEE 802.1Q Multiple Registration Protocol (MRP)"); MODULE_LICENSE("GPL"); static const u8 mrp_applicant_state_table[MRP_APPLICANT_MAX + 1][MRP_EVENT_MAX + 1] = { [MRP_APPLICANT_VO] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_VP, [MRP_EVENT_LV] = MRP_APPLICANT_VO, [MRP_EVENT_TX] = MRP_APPLICANT_VO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_VO, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AO, [MRP_EVENT_R_IN] = MRP_APPLICANT_VO, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VO, [MRP_EVENT_R_MT] = MRP_APPLICANT_VO, [MRP_EVENT_R_LV] = MRP_APPLICANT_VO, [MRP_EVENT_R_LA] = MRP_APPLICANT_VO, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VO, }, [MRP_APPLICANT_VP] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_VP, [MRP_EVENT_LV] = MRP_APPLICANT_VO, [MRP_EVENT_TX] = MRP_APPLICANT_AA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_VP, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AP, [MRP_EVENT_R_IN] = MRP_APPLICANT_VP, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VP, [MRP_EVENT_R_MT] = MRP_APPLICANT_VP, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VP, }, [MRP_APPLICANT_VN] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_VN, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_AN, [MRP_EVENT_R_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_VN, [MRP_EVENT_R_IN] = MRP_APPLICANT_VN, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_VN, [MRP_EVENT_R_MT] = MRP_APPLICANT_VN, [MRP_EVENT_R_LV] = MRP_APPLICANT_VN, [MRP_EVENT_R_LA] = MRP_APPLICANT_VN, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VN, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_VN, }, [MRP_APPLICANT_AN] = { [MRP_EVENT_NEW] = MRP_APPLICANT_AN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AN, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AN, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_AN, [MRP_EVENT_R_IN] = MRP_APPLICANT_AN, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AN, [MRP_EVENT_R_MT] = MRP_APPLICANT_AN, [MRP_EVENT_R_LV] = MRP_APPLICANT_VN, [MRP_EVENT_R_LA] = MRP_APPLICANT_VN, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VN, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AN, }, [MRP_APPLICANT_AA] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AA, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AA, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QA, [MRP_EVENT_R_IN] = MRP_APPLICANT_AA, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AA, }, [MRP_APPLICANT_QA] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_QA, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_QA, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QA, [MRP_EVENT_R_IN] = MRP_APPLICANT_QA, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_MT] = MRP_APPLICANT_AA, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AA, }, [MRP_APPLICANT_LA] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AA, [MRP_EVENT_LV] = MRP_APPLICANT_LA, [MRP_EVENT_TX] = MRP_APPLICANT_VO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_LA, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_LA, [MRP_EVENT_R_IN] = MRP_APPLICANT_LA, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_LA, [MRP_EVENT_R_MT] = MRP_APPLICANT_LA, [MRP_EVENT_R_LV] = MRP_APPLICANT_LA, [MRP_EVENT_R_LA] = MRP_APPLICANT_LA, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_LA, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_LA, }, [MRP_APPLICANT_AO] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AP, [MRP_EVENT_LV] = MRP_APPLICANT_AO, [MRP_EVENT_TX] = MRP_APPLICANT_AO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AO, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QO, [MRP_EVENT_R_IN] = MRP_APPLICANT_AO, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_LV] = MRP_APPLICANT_VO, [MRP_EVENT_R_LA] = MRP_APPLICANT_VO, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AO, }, [MRP_APPLICANT_QO] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_QP, [MRP_EVENT_LV] = MRP_APPLICANT_QO, [MRP_EVENT_TX] = MRP_APPLICANT_QO, [MRP_EVENT_R_NEW] = MRP_APPLICANT_QO, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QO, [MRP_EVENT_R_IN] = MRP_APPLICANT_QO, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_MT] = MRP_APPLICANT_AO, [MRP_EVENT_R_LV] = MRP_APPLICANT_VO, [MRP_EVENT_R_LA] = MRP_APPLICANT_VO, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VO, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_QO, }, [MRP_APPLICANT_AP] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_AP, [MRP_EVENT_LV] = MRP_APPLICANT_AO, [MRP_EVENT_TX] = MRP_APPLICANT_QA, [MRP_EVENT_R_NEW] = MRP_APPLICANT_AP, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QP, [MRP_EVENT_R_IN] = MRP_APPLICANT_AP, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AP, }, [MRP_APPLICANT_QP] = { [MRP_EVENT_NEW] = MRP_APPLICANT_VN, [MRP_EVENT_JOIN] = MRP_APPLICANT_QP, [MRP_EVENT_LV] = MRP_APPLICANT_QO, [MRP_EVENT_TX] = MRP_APPLICANT_QP, [MRP_EVENT_R_NEW] = MRP_APPLICANT_QP, [MRP_EVENT_R_JOIN_IN] = MRP_APPLICANT_QP, [MRP_EVENT_R_IN] = MRP_APPLICANT_QP, [MRP_EVENT_R_JOIN_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_MT] = MRP_APPLICANT_AP, [MRP_EVENT_R_LV] = MRP_APPLICANT_VP, [MRP_EVENT_R_LA] = MRP_APPLICANT_VP, [MRP_EVENT_REDECLARE] = MRP_APPLICANT_VP, [MRP_EVENT_PERIODIC] = MRP_APPLICANT_AP, }, }; static const u8 mrp_tx_action_table[MRP_APPLICANT_MAX + 1] = { [MRP_APPLICANT_VO] = MRP_TX_ACTION_S_IN_OPTIONAL, [MRP_APPLICANT_VP] = MRP_TX_ACTION_S_JOIN_IN, [MRP_APPLICANT_VN] = MRP_TX_ACTION_S_NEW, [MRP_APPLICANT_AN] = MRP_TX_ACTION_S_NEW, [MRP_APPLICANT_AA] = MRP_TX_ACTION_S_JOIN_IN, [MRP_APPLICANT_QA] = MRP_TX_ACTION_S_JOIN_IN_OPTIONAL, [MRP_APPLICANT_LA] = MRP_TX_ACTION_S_LV, [MRP_APPLICANT_AO] = MRP_TX_ACTION_S_IN_OPTIONAL, [MRP_APPLICANT_QO] = MRP_TX_ACTION_S_IN_OPTIONAL, [MRP_APPLICANT_AP] = MRP_TX_ACTION_S_JOIN_IN, [MRP_APPLICANT_QP] = MRP_TX_ACTION_S_IN_OPTIONAL, }; static void mrp_attrvalue_inc(void *value, u8 len) { u8 *v = (u8 *)value; /* Add 1 to the last byte. If it becomes zero, * go to the previous byte and repeat. */ while (len > 0 && !++v[--len]) ; } static int mrp_attr_cmp(const struct mrp_attr *attr, const void *value, u8 len, u8 type) { if (attr->type != type) return attr->type - type; if (attr->len != len) return attr->len - len; return memcmp(attr->value, value, len); } static struct mrp_attr *mrp_attr_lookup(const struct mrp_applicant *app, const void *value, u8 len, u8 type) { struct rb_node *parent = app->mad.rb_node; struct mrp_attr *attr; int d; while (parent) { attr = rb_entry(parent, struct mrp_attr, node); d = mrp_attr_cmp(attr, value, len, type); if (d > 0) parent = parent->rb_left; else if (d < 0) parent = parent->rb_right; else return attr; } return NULL; } static struct mrp_attr *mrp_attr_create(struct mrp_applicant *app, const void *value, u8 len, u8 type) { struct rb_node *parent = NULL, **p = &app->mad.rb_node; struct mrp_attr *attr; int d; while (*p) { parent = *p; attr = rb_entry(parent, struct mrp_attr, node); d = mrp_attr_cmp(attr, value, len, type); if (d > 0) p = &parent->rb_left; else if (d < 0) p = &parent->rb_right; else { /* The attribute already exists; re-use it. */ return attr; } } attr = kmalloc(sizeof(*attr) + len, GFP_ATOMIC); if (!attr) return attr; attr->state = MRP_APPLICANT_VO; attr->type = type; attr->len = len; memcpy(attr->value, value, len); rb_link_node(&attr->node, parent, p); rb_insert_color(&attr->node, &app->mad); return attr; } static void mrp_attr_destroy(struct mrp_applicant *app, struct mrp_attr *attr) { rb_erase(&attr->node, &app->mad); kfree(attr); } static void mrp_attr_destroy_all(struct mrp_applicant *app) { struct rb_node *node, *next; struct mrp_attr *attr; for (node = rb_first(&app->mad); next = node ? rb_next(node) : NULL, node != NULL; node = next) { attr = rb_entry(node, struct mrp_attr, node); mrp_attr_destroy(app, attr); } } static int mrp_pdu_init(struct mrp_applicant *app) { struct sk_buff *skb; struct mrp_pdu_hdr *ph; skb = alloc_skb(app->dev->mtu + LL_RESERVED_SPACE(app->dev), GFP_ATOMIC); if (!skb) return -ENOMEM; skb->dev = app->dev; skb->protocol = app->app->pkttype.type; skb_reserve(skb, LL_RESERVED_SPACE(app->dev)); skb_reset_network_header(skb); skb_reset_transport_header(skb); ph = __skb_put(skb, sizeof(*ph)); ph->version = app->app->version; app->pdu = skb; return 0; } static int mrp_pdu_append_end_mark(struct mrp_applicant *app) { __be16 *endmark; if (skb_tailroom(app->pdu) < sizeof(*endmark)) return -1; endmark = __skb_put(app->pdu, sizeof(*endmark)); put_unaligned(MRP_END_MARK, endmark); return 0; } static void mrp_pdu_queue(struct mrp_applicant *app) { if (!app->pdu) return; if (mrp_cb(app->pdu)->mh) mrp_pdu_append_end_mark(app); mrp_pdu_append_end_mark(app); dev_hard_header(app->pdu, app->dev, ntohs(app->app->pkttype.type), app->app->group_address, app->dev->dev_addr, app->pdu->len); skb_queue_tail(&app->queue, app->pdu); app->pdu = NULL; } static void mrp_queue_xmit(struct mrp_applicant *app) { struct sk_buff *skb; while ((skb = skb_dequeue(&app->queue))) dev_queue_xmit(skb); } static int mrp_pdu_append_msg_hdr(struct mrp_applicant *app, u8 attrtype, u8 attrlen) { struct mrp_msg_hdr *mh; if (mrp_cb(app->pdu)->mh) { if (mrp_pdu_append_end_mark(app) < 0) return -1; mrp_cb(app->pdu)->mh = NULL; mrp_cb(app->pdu)->vah = NULL; } if (skb_tailroom(app->pdu) < sizeof(*mh)) return -1; mh = __skb_put(app->pdu, sizeof(*mh)); mh->attrtype = attrtype; mh->attrlen = attrlen; mrp_cb(app->pdu)->mh = mh; return 0; } static int mrp_pdu_append_vecattr_hdr(struct mrp_applicant *app, const void *firstattrvalue, u8 attrlen) { struct mrp_vecattr_hdr *vah; if (skb_tailroom(app->pdu) < sizeof(*vah) + attrlen) return -1; vah = __skb_put(app->pdu, sizeof(*vah) + attrlen); put_unaligned(0, &vah->lenflags); memcpy(vah->firstattrvalue, firstattrvalue, attrlen); mrp_cb(app->pdu)->vah = vah; memcpy(mrp_cb(app->pdu)->attrvalue, firstattrvalue, attrlen); return 0; } static int mrp_pdu_append_vecattr_event(struct mrp_applicant *app, const struct mrp_attr *attr, enum mrp_vecattr_event vaevent) { u16 len, pos; u8 *vaevents; int err; again: if (!app->pdu) { err = mrp_pdu_init(app); if (err < 0) return err; } /* If there is no Message header in the PDU, or the Message header is * for a different attribute type, add an EndMark (if necessary) and a * new Message header to the PDU. */ if (!mrp_cb(app->pdu)->mh || mrp_cb(app->pdu)->mh->attrtype != attr->type || mrp_cb(app->pdu)->mh->attrlen != attr->len) { if (mrp_pdu_append_msg_hdr(app, attr->type, attr->len) < 0) goto queue; } /* If there is no VectorAttribute header for this Message in the PDU, * or this attribute's value does not sequentially follow the previous * attribute's value, add a new VectorAttribute header to the PDU. */ if (!mrp_cb(app->pdu)->vah || memcmp(mrp_cb(app->pdu)->attrvalue, attr->value, attr->len)) { if (mrp_pdu_append_vecattr_hdr(app, attr->value, attr->len) < 0) goto queue; } len = be16_to_cpu(get_unaligned(&mrp_cb(app->pdu)->vah->lenflags)); pos = len % 3; /* Events are packed into Vectors in the PDU, three to a byte. Add a * byte to the end of the Vector if necessary. */ if (!pos) { if (skb_tailroom(app->pdu) < sizeof(u8)) goto queue; vaevents = __skb_put(app->pdu, sizeof(u8)); } else { vaevents = (u8 *)(skb_tail_pointer(app->pdu) - sizeof(u8)); } switch (pos) { case 0: *vaevents = vaevent * (__MRP_VECATTR_EVENT_MAX * __MRP_VECATTR_EVENT_MAX); break; case 1: *vaevents += vaevent * __MRP_VECATTR_EVENT_MAX; break; case 2: *vaevents += vaevent; break; default: WARN_ON(1); } /* Increment the length of the VectorAttribute in the PDU, as well as * the value of the next attribute that would continue its Vector. */ put_unaligned(cpu_to_be16(++len), &mrp_cb(app->pdu)->vah->lenflags); mrp_attrvalue_inc(mrp_cb(app->pdu)->attrvalue, attr->len); return 0; queue: mrp_pdu_queue(app); goto again; } static void mrp_attr_event(struct mrp_applicant *app, struct mrp_attr *attr, enum mrp_event event) { enum mrp_applicant_state state; state = mrp_applicant_state_table[attr->state][event]; if (state == MRP_APPLICANT_INVALID) { WARN_ON(1); return; } if (event == MRP_EVENT_TX) { /* When appending the attribute fails, don't update its state * in order to retry at the next TX event. */ switch (mrp_tx_action_table[attr->state]) { case MRP_TX_ACTION_NONE: case MRP_TX_ACTION_S_JOIN_IN_OPTIONAL: case MRP_TX_ACTION_S_IN_OPTIONAL: break; case MRP_TX_ACTION_S_NEW: if (mrp_pdu_append_vecattr_event( app, attr, MRP_VECATTR_EVENT_NEW) < 0) return; break; case MRP_TX_ACTION_S_JOIN_IN: if (mrp_pdu_append_vecattr_event( app, attr, MRP_VECATTR_EVENT_JOIN_IN) < 0) return; break; case MRP_TX_ACTION_S_LV: if (mrp_pdu_append_vecattr_event( app, attr, MRP_VECATTR_EVENT_LV) < 0) return; /* As a pure applicant, sending a leave message * implies that the attribute was unregistered and * can be destroyed. */ mrp_attr_destroy(app, attr); return; default: WARN_ON(1); } } attr->state = state; } int mrp_request_join(const struct net_device *dev, const struct mrp_application *appl, const void *value, u8 len, u8 type) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); struct mrp_applicant *app = rtnl_dereference( port->applicants[appl->type]); struct mrp_attr *attr; if (sizeof(struct mrp_skb_cb) + len > sizeof_field(struct sk_buff, cb)) return -ENOMEM; spin_lock_bh(&app->lock); attr = mrp_attr_create(app, value, len, type); if (!attr) { spin_unlock_bh(&app->lock); return -ENOMEM; } mrp_attr_event(app, attr, MRP_EVENT_JOIN); spin_unlock_bh(&app->lock); return 0; } EXPORT_SYMBOL_GPL(mrp_request_join); void mrp_request_leave(const struct net_device *dev, const struct mrp_application *appl, const void *value, u8 len, u8 type) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); struct mrp_applicant *app = rtnl_dereference( port->applicants[appl->type]); struct mrp_attr *attr; if (sizeof(struct mrp_skb_cb) + len > sizeof_field(struct sk_buff, cb)) return; spin_lock_bh(&app->lock); attr = mrp_attr_lookup(app, value, len, type); if (!attr) { spin_unlock_bh(&app->lock); return; } mrp_attr_event(app, attr, MRP_EVENT_LV); spin_unlock_bh(&app->lock); } EXPORT_SYMBOL_GPL(mrp_request_leave); static void mrp_mad_event(struct mrp_applicant *app, enum mrp_event event) { struct rb_node *node, *next; struct mrp_attr *attr; for (node = rb_first(&app->mad); next = node ? rb_next(node) : NULL, node != NULL; node = next) { attr = rb_entry(node, struct mrp_attr, node); mrp_attr_event(app, attr, event); } } static void mrp_join_timer_arm(struct mrp_applicant *app) { unsigned long delay; delay = get_random_u32_below(msecs_to_jiffies(mrp_join_time)); mod_timer(&app->join_timer, jiffies + delay); } static void mrp_join_timer(struct timer_list *t) { struct mrp_applicant *app = from_timer(app, t, join_timer); spin_lock(&app->lock); mrp_mad_event(app, MRP_EVENT_TX); mrp_pdu_queue(app); spin_unlock(&app->lock); mrp_queue_xmit(app); spin_lock(&app->lock); if (likely(app->active)) mrp_join_timer_arm(app); spin_unlock(&app->lock); } static void mrp_periodic_timer_arm(struct mrp_applicant *app) { mod_timer(&app->periodic_timer, jiffies + msecs_to_jiffies(mrp_periodic_time)); } static void mrp_periodic_timer(struct timer_list *t) { struct mrp_applicant *app = from_timer(app, t, periodic_timer); spin_lock(&app->lock); if (likely(app->active)) { mrp_mad_event(app, MRP_EVENT_PERIODIC); mrp_pdu_queue(app); mrp_periodic_timer_arm(app); } spin_unlock(&app->lock); } static int mrp_pdu_parse_end_mark(struct sk_buff *skb, int *offset) { __be16 endmark; if (skb_copy_bits(skb, *offset, &endmark, sizeof(endmark)) < 0) return -1; if (endmark == MRP_END_MARK) { *offset += sizeof(endmark); return -1; } return 0; } static void mrp_pdu_parse_vecattr_event(struct mrp_applicant *app, struct sk_buff *skb, enum mrp_vecattr_event vaevent) { struct mrp_attr *attr; enum mrp_event event; attr = mrp_attr_lookup(app, mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen, mrp_cb(skb)->mh->attrtype); if (attr == NULL) return; switch (vaevent) { case MRP_VECATTR_EVENT_NEW: event = MRP_EVENT_R_NEW; break; case MRP_VECATTR_EVENT_JOIN_IN: event = MRP_EVENT_R_JOIN_IN; break; case MRP_VECATTR_EVENT_IN: event = MRP_EVENT_R_IN; break; case MRP_VECATTR_EVENT_JOIN_MT: event = MRP_EVENT_R_JOIN_MT; break; case MRP_VECATTR_EVENT_MT: event = MRP_EVENT_R_MT; break; case MRP_VECATTR_EVENT_LV: event = MRP_EVENT_R_LV; break; default: return; } mrp_attr_event(app, attr, event); } static int mrp_pdu_parse_vecattr(struct mrp_applicant *app, struct sk_buff *skb, int *offset) { struct mrp_vecattr_hdr _vah; u16 valen; u8 vaevents, vaevent; mrp_cb(skb)->vah = skb_header_pointer(skb, *offset, sizeof(_vah), &_vah); if (!mrp_cb(skb)->vah) return -1; *offset += sizeof(_vah); if (get_unaligned(&mrp_cb(skb)->vah->lenflags) & MRP_VECATTR_HDR_FLAG_LA) mrp_mad_event(app, MRP_EVENT_R_LA); valen = be16_to_cpu(get_unaligned(&mrp_cb(skb)->vah->lenflags) & MRP_VECATTR_HDR_LEN_MASK); /* The VectorAttribute structure in a PDU carries event information * about one or more attributes having consecutive values. Only the * value for the first attribute is contained in the structure. So * we make a copy of that value, and then increment it each time we * advance to the next event in its Vector. */ if (sizeof(struct mrp_skb_cb) + mrp_cb(skb)->mh->attrlen > sizeof_field(struct sk_buff, cb)) return -1; if (skb_copy_bits(skb, *offset, mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen) < 0) return -1; *offset += mrp_cb(skb)->mh->attrlen; /* In a VectorAttribute, the Vector contains events which are packed * three to a byte. We process one byte of the Vector at a time. */ while (valen > 0) { if (skb_copy_bits(skb, *offset, &vaevents, sizeof(vaevents)) < 0) return -1; *offset += sizeof(vaevents); /* Extract and process the first event. */ vaevent = vaevents / (__MRP_VECATTR_EVENT_MAX * __MRP_VECATTR_EVENT_MAX); if (vaevent >= __MRP_VECATTR_EVENT_MAX) { /* The byte is malformed; stop processing. */ return -1; } mrp_pdu_parse_vecattr_event(app, skb, vaevent); /* If present, extract and process the second event. */ if (!--valen) break; mrp_attrvalue_inc(mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen); vaevents %= (__MRP_VECATTR_EVENT_MAX * __MRP_VECATTR_EVENT_MAX); vaevent = vaevents / __MRP_VECATTR_EVENT_MAX; mrp_pdu_parse_vecattr_event(app, skb, vaevent); /* If present, extract and process the third event. */ if (!--valen) break; mrp_attrvalue_inc(mrp_cb(skb)->attrvalue, mrp_cb(skb)->mh->attrlen); vaevents %= __MRP_VECATTR_EVENT_MAX; vaevent = vaevents; mrp_pdu_parse_vecattr_event(app, skb, vaevent); } return 0; } static int mrp_pdu_parse_msg(struct mrp_applicant *app, struct sk_buff *skb, int *offset) { struct mrp_msg_hdr _mh; mrp_cb(skb)->mh = skb_header_pointer(skb, *offset, sizeof(_mh), &_mh); if (!mrp_cb(skb)->mh) return -1; *offset += sizeof(_mh); if (mrp_cb(skb)->mh->attrtype == 0 || mrp_cb(skb)->mh->attrtype > app->app->maxattr || mrp_cb(skb)->mh->attrlen == 0) return -1; while (skb->len > *offset) { if (mrp_pdu_parse_end_mark(skb, offset) < 0) break; if (mrp_pdu_parse_vecattr(app, skb, offset) < 0) return -1; } return 0; } static int mrp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct mrp_application *appl = container_of(pt, struct mrp_application, pkttype); struct mrp_port *port; struct mrp_applicant *app; struct mrp_pdu_hdr _ph; const struct mrp_pdu_hdr *ph; int offset = skb_network_offset(skb); /* If the interface is in promiscuous mode, drop the packet if * it was unicast to another host. */ if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) goto out; skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) goto out; port = rcu_dereference(dev->mrp_port); if (unlikely(!port)) goto out; app = rcu_dereference(port->applicants[appl->type]); if (unlikely(!app)) goto out; ph = skb_header_pointer(skb, offset, sizeof(_ph), &_ph); if (!ph) goto out; offset += sizeof(_ph); if (ph->version != app->app->version) goto out; spin_lock(&app->lock); while (skb->len > offset) { if (mrp_pdu_parse_end_mark(skb, &offset) < 0) break; if (mrp_pdu_parse_msg(app, skb, &offset) < 0) break; } spin_unlock(&app->lock); out: kfree_skb(skb); return 0; } static int mrp_init_port(struct net_device *dev) { struct mrp_port *port; port = kzalloc(sizeof(*port), GFP_KERNEL); if (!port) return -ENOMEM; rcu_assign_pointer(dev->mrp_port, port); return 0; } static void mrp_release_port(struct net_device *dev) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); unsigned int i; for (i = 0; i <= MRP_APPLICATION_MAX; i++) { if (rtnl_dereference(port->applicants[i])) return; } RCU_INIT_POINTER(dev->mrp_port, NULL); kfree_rcu(port, rcu); } int mrp_init_applicant(struct net_device *dev, struct mrp_application *appl) { struct mrp_applicant *app; int err; ASSERT_RTNL(); if (!rtnl_dereference(dev->mrp_port)) { err = mrp_init_port(dev); if (err < 0) goto err1; } err = -ENOMEM; app = kzalloc(sizeof(*app), GFP_KERNEL); if (!app) goto err2; err = dev_mc_add(dev, appl->group_address); if (err < 0) goto err3; app->dev = dev; app->app = appl; app->mad = RB_ROOT; app->active = true; spin_lock_init(&app->lock); skb_queue_head_init(&app->queue); rcu_assign_pointer(dev->mrp_port->applicants[appl->type], app); timer_setup(&app->join_timer, mrp_join_timer, 0); mrp_join_timer_arm(app); timer_setup(&app->periodic_timer, mrp_periodic_timer, 0); mrp_periodic_timer_arm(app); return 0; err3: kfree(app); err2: mrp_release_port(dev); err1: return err; } EXPORT_SYMBOL_GPL(mrp_init_applicant); void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl) { struct mrp_port *port = rtnl_dereference(dev->mrp_port); struct mrp_applicant *app = rtnl_dereference( port->applicants[appl->type]); ASSERT_RTNL(); RCU_INIT_POINTER(port->applicants[appl->type], NULL); spin_lock_bh(&app->lock); app->active = false; spin_unlock_bh(&app->lock); /* Delete timer and generate a final TX event to flush out * all pending messages before the applicant is gone. */ timer_shutdown_sync(&app->join_timer); timer_shutdown_sync(&app->periodic_timer); spin_lock_bh(&app->lock); mrp_mad_event(app, MRP_EVENT_TX); mrp_attr_destroy_all(app); mrp_pdu_queue(app); spin_unlock_bh(&app->lock); mrp_queue_xmit(app); dev_mc_del(dev, appl->group_address); kfree_rcu(app, rcu); mrp_release_port(dev); } EXPORT_SYMBOL_GPL(mrp_uninit_applicant); int mrp_register_application(struct mrp_application *appl) { appl->pkttype.func = mrp_rcv; dev_add_pack(&appl->pkttype); return 0; } EXPORT_SYMBOL_GPL(mrp_register_application); void mrp_unregister_application(struct mrp_application *appl) { dev_remove_pack(&appl->pkttype); } EXPORT_SYMBOL_GPL(mrp_unregister_application);
1146 3108 893 892 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/backing-dev.h * * low-level device information and state which is propagated up through * to high-level code. */ #ifndef _LINUX_BACKING_DEV_H #define _LINUX_BACKING_DEV_H #include <linux/kernel.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/device.h> #include <linux/writeback.h> #include <linux/backing-dev-defs.h> #include <linux/slab.h> static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi) { kref_get(&bdi->refcnt); return bdi; } struct backing_dev_info *bdi_get_by_id(u64 id); void bdi_put(struct backing_dev_info *bdi); __printf(2, 3) int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...); __printf(2, 0) int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args); void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner); void bdi_unregister(struct backing_dev_info *bdi); struct backing_dev_info *bdi_alloc(int node_id); void wb_start_background_writeback(struct bdi_writeback *wb); void wb_workfn(struct work_struct *work); void wb_wakeup_delayed(struct bdi_writeback *wb); void wb_wait_for_completion(struct wb_completion *done); extern spinlock_t bdi_lock; extern struct list_head bdi_list; extern struct workqueue_struct *bdi_wq; static inline bool wb_has_dirty_io(struct bdi_writeback *wb) { return test_bit(WB_has_dirty_io, &wb->state); } static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi) { /* * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are * any dirty wbs. See wb_update_write_bandwidth(). */ return atomic_long_read(&bdi->tot_write_bandwidth); } static inline void wb_stat_mod(struct bdi_writeback *wb, enum wb_stat_item item, s64 amount) { percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH); } static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { wb_stat_mod(wb, item, 1); } static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { wb_stat_mod(wb, item, -1); } static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { return percpu_counter_read_positive(&wb->stat[item]); } static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item) { return percpu_counter_sum_positive(&wb->stat[item]); } extern void wb_writeout_inc(struct bdi_writeback *wb); /* * maximal error of a stat counter. */ static inline unsigned long wb_stat_error(void) { #ifdef CONFIG_SMP return nr_cpu_ids * WB_STAT_BATCH; #else return 1; #endif } /* BDI ratio is expressed as part per 1000000 for finer granularity. */ #define BDI_RATIO_SCALE 10000 u64 bdi_get_min_bytes(struct backing_dev_info *bdi); u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio); int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); /* * Flags in backing_dev_info::capability * * BDI_CAP_WRITEBACK: Supports dirty page writeback, and dirty pages * should contribute to accounting * BDI_CAP_WRITEBACK_ACCT: Automatically account writeback pages * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold */ #define BDI_CAP_WRITEBACK (1 << 0) #define BDI_CAP_WRITEBACK_ACCT (1 << 1) #define BDI_CAP_STRICTLIMIT (1 << 2) extern struct backing_dev_info noop_backing_dev_info; int bdi_init(struct backing_dev_info *bdi); /** * writeback_in_progress - determine whether there is writeback in progress * @wb: bdi_writeback of interest * * Determine whether there is writeback waiting to be handled against a * bdi_writeback. */ static inline bool writeback_in_progress(struct bdi_writeback *wb) { return test_bit(WB_writeback_running, &wb->state); } struct backing_dev_info *inode_to_bdi(struct inode *inode); static inline bool mapping_can_writeback(struct address_space *mapping) { return inode_to_bdi(mapping->host)->capabilities & BDI_CAP_WRITEBACK; } #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css); struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp); void wb_memcg_offline(struct mem_cgroup *memcg); void wb_blkcg_offline(struct cgroup_subsys_state *css); /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode * @inode: inode of interest * * Cgroup writeback requires support from the filesystem. Also, both memcg and * iocg have to be on the default hierarchy. Test whether all conditions are * met. * * Note that the test result may change dynamically on the same inode * depending on how memcg and iocg are configured. */ static inline bool inode_cgwb_enabled(struct inode *inode) { struct backing_dev_info *bdi = inode_to_bdi(inode); return cgroup_subsys_on_dfl(memory_cgrp_subsys) && cgroup_subsys_on_dfl(io_cgrp_subsys) && (bdi->capabilities & BDI_CAP_WRITEBACK) && (inode->i_sb->s_iflags & SB_I_CGROUPWB); } /** * wb_find_current - find wb for %current on a bdi * @bdi: bdi of interest * * Find the wb of @bdi which matches both the memcg and blkcg of %current. * Must be called under rcu_read_lock() which protects the returend wb. * NULL if not found. */ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) { struct cgroup_subsys_state *memcg_css; struct bdi_writeback *wb; memcg_css = task_css(current, memory_cgrp_id); if (!memcg_css->parent) return &bdi->wb; wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); /* * %current's blkcg equals the effective blkcg of its memcg. No * need to use the relatively expensive cgroup_get_e_css(). */ if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id))) return wb; return NULL; } /** * wb_get_create_current - get or create wb for %current on a bdi * @bdi: bdi of interest * @gfp: allocation mask * * Equivalent to wb_get_create() on %current's memcg. This function is * called from a relatively hot path and optimizes the common cases using * wb_find_current(). */ static inline struct bdi_writeback * wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) { struct bdi_writeback *wb; rcu_read_lock(); wb = wb_find_current(bdi); if (wb && unlikely(!wb_tryget(wb))) wb = NULL; rcu_read_unlock(); if (unlikely(!wb)) { struct cgroup_subsys_state *memcg_css; memcg_css = task_get_css(current, memory_cgrp_id); wb = wb_get_create(bdi, memcg_css, gfp); css_put(memcg_css); } return wb; } /** * inode_to_wb - determine the wb of an inode * @inode: inode of interest * * Returns the wb @inode is currently associated with. The caller must be * holding either @inode->i_lock, the i_pages lock, or the * associated wb's list_lock. */ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(debug_locks && (!lockdep_is_held(&inode->i_lock) && !lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) && !lockdep_is_held(&inode->i_wb->list_lock))); #endif return inode->i_wb; } static inline struct bdi_writeback *inode_to_wb_wbc( struct inode *inode, struct writeback_control *wbc) { /* * If wbc does not have inode attached, it means cgroup writeback was * disabled when wbc started. Just use the default wb in that case. */ return wbc->wb ? wbc->wb : &inode_to_bdi(inode)->wb; } /** * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction * @inode: target inode * @cookie: output param, to be passed to the end function * * The caller wants to access the wb associated with @inode but isn't * holding inode->i_lock, the i_pages lock or wb->list_lock. This * function determines the wb associated with @inode and ensures that the * association doesn't change until the transaction is finished with * unlocked_inode_to_wb_end(). * * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and * can't sleep during the transaction. IRQs may or may not be disabled on * return. */ static inline struct bdi_writeback * unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { rcu_read_lock(); /* * Paired with store_release in inode_switch_wbs_work_fn() and * ensures that we see the new wb if we see cleared I_WB_SWITCH. */ cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH; if (unlikely(cookie->locked)) xa_lock_irqsave(&inode->i_mapping->i_pages, cookie->flags); /* * Protected by either !I_WB_SWITCH + rcu_read_lock() or the i_pages * lock. inode_to_wb() will bark. Deref directly. */ return inode->i_wb; } /** * unlocked_inode_to_wb_end - end inode wb access transaction * @inode: target inode * @cookie: @cookie from unlocked_inode_to_wb_begin() */ static inline void unlocked_inode_to_wb_end(struct inode *inode, struct wb_lock_cookie *cookie) { if (unlikely(cookie->locked)) xa_unlock_irqrestore(&inode->i_mapping->i_pages, cookie->flags); rcu_read_unlock(); } #else /* CONFIG_CGROUP_WRITEBACK */ static inline bool inode_cgwb_enabled(struct inode *inode) { return false; } static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) { return &bdi->wb; } static inline struct bdi_writeback * wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) { return &bdi->wb; } static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { return &inode_to_bdi(inode)->wb; } static inline struct bdi_writeback *inode_to_wb_wbc( struct inode *inode, struct writeback_control *wbc) { return inode_to_wb(inode); } static inline struct bdi_writeback * unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie) { return inode_to_wb(inode); } static inline void unlocked_inode_to_wb_end(struct inode *inode, struct wb_lock_cookie *cookie) { } static inline void wb_memcg_offline(struct mem_cgroup *memcg) { } static inline void wb_blkcg_offline(struct cgroup_subsys_state *css) { } #endif /* CONFIG_CGROUP_WRITEBACK */ const char *bdi_dev_name(struct backing_dev_info *bdi); #endif /* _LINUX_BACKING_DEV_H */
5 5 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 // SPDX-License-Identifier: GPL-2.0-only /* * ebt_dnat * * Authors: * Bart De Schuymer <bdschuym@pandora.be> * * June, 2002 * */ #include <linux/module.h> #include <net/sock.h> #include "../br_private.h" #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_nat.h> static unsigned int ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nat_info *info = par->targinfo; if (skb_ensure_writable(skb, 0)) return EBT_DROP; ether_addr_copy(eth_hdr(skb)->h_dest, info->mac); if (is_multicast_ether_addr(info->mac)) { if (is_broadcast_ether_addr(info->mac)) skb->pkt_type = PACKET_BROADCAST; else skb->pkt_type = PACKET_MULTICAST; } else { const struct net_device *dev; switch (xt_hooknum(par)) { case NF_BR_BROUTING: dev = xt_in(par); break; case NF_BR_PRE_ROUTING: dev = br_port_get_rcu(xt_in(par))->br->dev; break; default: dev = NULL; break; } if (!dev) /* NF_BR_LOCAL_OUT */ return info->target; if (ether_addr_equal(info->mac, dev->dev_addr)) skb->pkt_type = PACKET_HOST; else skb->pkt_type = PACKET_OTHERHOST; } return info->target; } static int ebt_dnat_tg_check(const struct xt_tgchk_param *par) { const struct ebt_nat_info *info = par->targinfo; unsigned int hook_mask; if (BASE_CHAIN && info->target == EBT_RETURN) return -EINVAL; hook_mask = par->hook_mask & ~(1 << NF_BR_NUMHOOKS); if ((strcmp(par->table, "nat") != 0 || (hook_mask & ~((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT)))) && (strcmp(par->table, "broute") != 0 || hook_mask & ~(1 << NF_BR_BROUTING))) return -EINVAL; if (ebt_invalid_target(info->target)) return -EINVAL; return 0; } static struct xt_target ebt_dnat_tg_reg __read_mostly = { .name = "dnat", .revision = 0, .family = NFPROTO_BRIDGE, .hooks = (1 << NF_BR_NUMHOOKS) | (1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT) | (1 << NF_BR_BROUTING), .target = ebt_dnat_tg, .checkentry = ebt_dnat_tg_check, .targetsize = sizeof(struct ebt_nat_info), .me = THIS_MODULE, }; static int __init ebt_dnat_init(void) { return xt_register_target(&ebt_dnat_tg_reg); } static void __exit ebt_dnat_fini(void) { xt_unregister_target(&ebt_dnat_tg_reg); } module_init(ebt_dnat_init); module_exit(ebt_dnat_fini); MODULE_DESCRIPTION("Ebtables: Destination MAC address translation"); MODULE_LICENSE("GPL");
15 1 1 11 21 1 6 15 5 39 21 47 1 5 1 47 53 53 27 4 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 // SPDX-License-Identifier: GPL-2.0 #include <linux/syscalls.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/exportfs.h> #include <linux/fs_struct.h> #include <linux/fsnotify.h> #include <linux/personality.h> #include <linux/uaccess.h> #include <linux/compat.h> #include "internal.h" #include "mount.h" static long do_sys_name_to_handle(const struct path *path, struct file_handle __user *ufh, int __user *mnt_id, int fh_flags) { long retval; struct file_handle f_handle; int handle_dwords, handle_bytes; struct file_handle *handle = NULL; /* * We need to make sure whether the file system support decoding of * the file handle if decodeable file handle was requested. */ if (!exportfs_can_encode_fh(path->dentry->d_sb->s_export_op, fh_flags)) return -EOPNOTSUPP; if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) return -EFAULT; if (f_handle.handle_bytes > MAX_HANDLE_SZ) return -EINVAL; handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, GFP_KERNEL); if (!handle) return -ENOMEM; /* convert handle size to multiple of sizeof(u32) */ handle_dwords = f_handle.handle_bytes >> 2; /* we ask for a non connectable maybe decodeable file handle */ retval = exportfs_encode_fh(path->dentry, (struct fid *)handle->f_handle, &handle_dwords, fh_flags); handle->handle_type = retval; /* convert handle size to bytes */ handle_bytes = handle_dwords * sizeof(u32); handle->handle_bytes = handle_bytes; if ((handle->handle_bytes > f_handle.handle_bytes) || (retval == FILEID_INVALID) || (retval < 0)) { /* As per old exportfs_encode_fh documentation * we could return ENOSPC to indicate overflow * But file system returned 255 always. So handle * both the values */ if (retval == FILEID_INVALID || retval == -ENOSPC) retval = -EOVERFLOW; /* * set the handle size to zero so we copy only * non variable part of the file_handle */ handle_bytes = 0; } else retval = 0; /* copy the mount id */ if (put_user(real_mount(path->mnt)->mnt_id, mnt_id) || copy_to_user(ufh, handle, sizeof(struct file_handle) + handle_bytes)) retval = -EFAULT; kfree(handle); return retval; } /** * sys_name_to_handle_at: convert name to handle * @dfd: directory relative to which name is interpreted if not absolute * @name: name that should be converted to handle. * @handle: resulting file handle * @mnt_id: mount id of the file system containing the file * @flag: flag value to indicate whether to follow symlink or not * and whether a decodable file handle is required. * * @handle->handle_size indicate the space available to store the * variable part of the file handle in bytes. If there is not * enough space, the field is updated to return the minimum * value required. */ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name, struct file_handle __user *, handle, int __user *, mnt_id, int, flag) { struct path path; int lookup_flags; int fh_flags; int err; if (flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH | AT_HANDLE_FID)) return -EINVAL; lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0; fh_flags = (flag & AT_HANDLE_FID) ? EXPORT_FH_FID : 0; if (flag & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; err = user_path_at(dfd, name, lookup_flags, &path); if (!err) { err = do_sys_name_to_handle(&path, handle, mnt_id, fh_flags); path_put(&path); } return err; } static struct vfsmount *get_vfsmount_from_fd(int fd) { struct vfsmount *mnt; if (fd == AT_FDCWD) { struct fs_struct *fs = current->fs; spin_lock(&fs->lock); mnt = mntget(fs->pwd.mnt); spin_unlock(&fs->lock); } else { struct fd f = fdget(fd); if (!f.file) return ERR_PTR(-EBADF); mnt = mntget(f.file->f_path.mnt); fdput(f); } return mnt; } static int vfs_dentry_acceptable(void *context, struct dentry *dentry) { return 1; } static int do_handle_to_path(int mountdirfd, struct file_handle *handle, struct path *path) { int retval = 0; int handle_dwords; path->mnt = get_vfsmount_from_fd(mountdirfd); if (IS_ERR(path->mnt)) { retval = PTR_ERR(path->mnt); goto out_err; } /* change the handle size to multiple of sizeof(u32) */ handle_dwords = handle->handle_bytes >> 2; path->dentry = exportfs_decode_fh(path->mnt, (struct fid *)handle->f_handle, handle_dwords, handle->handle_type, vfs_dentry_acceptable, NULL); if (IS_ERR(path->dentry)) { retval = PTR_ERR(path->dentry); goto out_mnt; } return 0; out_mnt: mntput(path->mnt); out_err: return retval; } static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, struct path *path) { int retval = 0; struct file_handle f_handle; struct file_handle *handle = NULL; /* * With handle we don't look at the execute bit on the * directory. Ideally we would like CAP_DAC_SEARCH. * But we don't have that */ if (!capable(CAP_DAC_READ_SEARCH)) { retval = -EPERM; goto out_err; } if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) { retval = -EFAULT; goto out_err; } if ((f_handle.handle_bytes > MAX_HANDLE_SZ) || (f_handle.handle_bytes == 0)) { retval = -EINVAL; goto out_err; } handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes, GFP_KERNEL); if (!handle) { retval = -ENOMEM; goto out_err; } /* copy the full handle */ *handle = f_handle; if (copy_from_user(&handle->f_handle, &ufh->f_handle, f_handle.handle_bytes)) { retval = -EFAULT; goto out_handle; } retval = do_handle_to_path(mountdirfd, handle, path); out_handle: kfree(handle); out_err: return retval; } static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, int open_flag) { long retval = 0; struct path path; struct file *file; int fd; retval = handle_to_path(mountdirfd, ufh, &path); if (retval) return retval; fd = get_unused_fd_flags(open_flag); if (fd < 0) { path_put(&path); return fd; } file = file_open_root(&path, "", open_flag, 0); if (IS_ERR(file)) { put_unused_fd(fd); retval = PTR_ERR(file); } else { retval = fd; fd_install(fd, file); } path_put(&path); return retval; } /** * sys_open_by_handle_at: Open the file handle * @mountdirfd: directory file descriptor * @handle: file handle to be opened * @flags: open flags. * * @mountdirfd indicate the directory file descriptor * of the mount point. file handle is decoded relative * to the vfsmount pointed by the @mountdirfd. @flags * value is same as the open(2) flags. */ SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, struct file_handle __user *, handle, int, flags) { long ret; if (force_o_largefile()) flags |= O_LARGEFILE; ret = do_handle_open(mountdirfd, handle, flags); return ret; } #ifdef CONFIG_COMPAT /* * Exactly like fs/open.c:sys_open_by_handle_at(), except that it * doesn't set the O_LARGEFILE flag. */ COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, struct file_handle __user *, handle, int, flags) { return do_handle_open(mountdirfd, handle, flags); } #endif
34 33 33 33 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 /* SPDX-License-Identifier: GPL-2.0 * * page_pool/helpers.h * Author: Jesper Dangaard Brouer <netoptimizer@brouer.com> * Copyright (C) 2016 Red Hat, Inc. */ /** * DOC: page_pool allocator * * The page_pool allocator is optimized for recycling page or page fragment used * by skb packet and xdp frame. * * Basic use involves replacing any alloc_pages() calls with page_pool_alloc(), * which allocate memory with or without page splitting depending on the * requested memory size. * * If the driver knows that it always requires full pages or its allocations are * always smaller than half a page, it can use one of the more specific API * calls: * * 1. page_pool_alloc_pages(): allocate memory without page splitting when * driver knows that the memory it need is always bigger than half of the page * allocated from page pool. There is no cache line dirtying for 'struct page' * when a page is recycled back to the page pool. * * 2. page_pool_alloc_frag(): allocate memory with page splitting when driver * knows that the memory it need is always smaller than or equal to half of the * page allocated from page pool. Page splitting enables memory saving and thus * avoids TLB/cache miss for data access, but there also is some cost to * implement page splitting, mainly some cache line dirtying/bouncing for * 'struct page' and atomic operation for page->pp_ref_count. * * The API keeps track of in-flight pages, in order to let API users know when * it is safe to free a page_pool object, the API users must call * page_pool_put_page() or page_pool_free_va() to free the page_pool object, or * attach the page_pool object to a page_pool-aware object like skbs marked with * skb_mark_for_recycle(). * * page_pool_put_page() may be called multiple times on the same page if a page * is split into multiple fragments. For the last fragment, it will either * recycle the page, or in case of page->_refcount > 1, it will release the DMA * mapping and in-flight state accounting. * * dma_sync_single_range_for_device() is only called for the last fragment when * page_pool is created with PP_FLAG_DMA_SYNC_DEV flag, so it depends on the * last freed fragment to do the sync_for_device operation for all fragments in * the same page when a page is split. The API user must setup pool->p.max_len * and pool->p.offset correctly and ensure that page_pool_put_page() is called * with dma_sync_size being -1 for fragment API. */ #ifndef _NET_PAGE_POOL_HELPERS_H #define _NET_PAGE_POOL_HELPERS_H #include <net/page_pool/types.h> #ifdef CONFIG_PAGE_POOL_STATS /* Deprecated driver-facing API, use netlink instead */ int page_pool_ethtool_stats_get_count(void); u8 *page_pool_ethtool_stats_get_strings(u8 *data); u64 *page_pool_ethtool_stats_get(u64 *data, void *stats); bool page_pool_get_stats(const struct page_pool *pool, struct page_pool_stats *stats); #else static inline int page_pool_ethtool_stats_get_count(void) { return 0; } static inline u8 *page_pool_ethtool_stats_get_strings(u8 *data) { return data; } static inline u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) { return data; } #endif /** * page_pool_dev_alloc_pages() - allocate a page. * @pool: pool from which to allocate * * Get a page from the page allocator or page_pool caches. */ static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_pages(pool, gfp); } /** * page_pool_dev_alloc_frag() - allocate a page fragment. * @pool: pool from which to allocate * @offset: offset to the allocated page * @size: requested size * * Get a page fragment from the page allocator or page_pool caches. * * Return: * Return allocated page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, unsigned int *offset, unsigned int size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_frag(pool, offset, size, gfp); } static inline struct page *page_pool_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size, gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; struct page *page; if ((*size << 1) > max_size) { *size = max_size; *offset = 0; return page_pool_alloc_pages(pool, gfp); } page = page_pool_alloc_frag(pool, offset, *size, gfp); if (unlikely(!page)) return NULL; /* There is very likely not enough space for another fragment, so append * the remaining size to the current fragment to avoid truesize * underestimate problem. */ if (pool->frag_offset + *size > max_size) { *size = max_size - *offset; pool->frag_offset = max_size; } return page; } /** * page_pool_dev_alloc() - allocate a page or a page fragment. * @pool: pool from which to allocate * @offset: offset to the allocated page * @size: in as the requested size, out as the allocated size * * Get a page or a page fragment from the page allocator or page_pool caches * depending on the requested size in order to allocate memory with least memory * utilization and performance penalty. * * Return: * Return allocated page or page fragment, otherwise return NULL. */ static inline struct page *page_pool_dev_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc(pool, offset, size, gfp); } static inline void *page_pool_alloc_va(struct page_pool *pool, unsigned int *size, gfp_t gfp) { unsigned int offset; struct page *page; /* Mask off __GFP_HIGHMEM to ensure we can use page_address() */ page = page_pool_alloc(pool, &offset, size, gfp & ~__GFP_HIGHMEM); if (unlikely(!page)) return NULL; return page_address(page) + offset; } /** * page_pool_dev_alloc_va() - allocate a page or a page fragment and return its * va. * @pool: pool from which to allocate * @size: in as the requested size, out as the allocated size * * This is just a thin wrapper around the page_pool_alloc() API, and * it returns va of the allocated page or page fragment. * * Return: * Return the va for the allocated page or page fragment, otherwise return NULL. */ static inline void *page_pool_dev_alloc_va(struct page_pool *pool, unsigned int *size) { gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN); return page_pool_alloc_va(pool, size, gfp); } /** * page_pool_get_dma_dir() - Retrieve the stored DMA direction. * @pool: pool from which page was allocated * * Get the stored dma direction. A driver might decide to store this locally * and avoid the extra cache line from page_pool to determine the direction. */ static inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) { return pool->p.dma_dir; } /** * page_pool_fragment_page() - split a fresh page into fragments * @page: page to split * @nr: references to set * * pp_ref_count represents the number of outstanding references to the page, * which will be freed using page_pool APIs (rather than page allocator APIs * like put_page()). Such references are usually held by page_pool-aware * objects like skbs marked for page pool recycling. * * This helper allows the caller to take (set) multiple references to a * freshly allocated page. The page must be freshly allocated (have a * pp_ref_count of 1). This is commonly done by drivers and * "fragment allocators" to save atomic operations - either when they know * upfront how many references they will need; or to take MAX references and * return the unused ones with a single atomic dec(), instead of performing * multiple atomic inc() operations. */ static inline void page_pool_fragment_page(struct page *page, long nr) { atomic_long_set(&page->pp_ref_count, nr); } static inline long page_pool_unref_page(struct page *page, long nr) { long ret; /* If nr == pp_ref_count then we have cleared all remaining * references to the page: * 1. 'n == 1': no need to actually overwrite it. * 2. 'n != 1': overwrite it with one, which is the rare case * for pp_ref_count draining. * * The main advantage to doing this is that not only we avoid a atomic * update, as an atomic_read is generally a much cheaper operation than * an atomic update, especially when dealing with a page that may be * referenced by only 2 or 3 users; but also unify the pp_ref_count * handling by ensuring all pages have partitioned into only 1 piece * initially, and only overwrite it when the page is partitioned into * more than one piece. */ if (atomic_long_read(&page->pp_ref_count) == nr) { /* As we have ensured nr is always one for constant case using * the BUILD_BUG_ON(), only need to handle the non-constant case * here for pp_ref_count draining, which is a rare case. */ BUILD_BUG_ON(__builtin_constant_p(nr) && nr != 1); if (!__builtin_constant_p(nr)) atomic_long_set(&page->pp_ref_count, 1); return 0; } ret = atomic_long_sub_return(nr, &page->pp_ref_count); WARN_ON(ret < 0); /* We are the last user here too, reset pp_ref_count back to 1 to * ensure all pages have been partitioned into 1 piece initially, * this should be the rare case when the last two fragment users call * page_pool_unref_page() currently. */ if (unlikely(!ret)) atomic_long_set(&page->pp_ref_count, 1); return ret; } static inline void page_pool_ref_page(struct page *page) { atomic_long_inc(&page->pp_ref_count); } static inline bool page_pool_is_last_ref(struct page *page) { /* If page_pool_unref_page() returns 0, we were the last user */ return page_pool_unref_page(page, 1) == 0; } /** * page_pool_put_page() - release a reference to a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * @dma_sync_size: how much of the page may have been touched by the device * @allow_direct: released by the consumer, allow lockless caching * * The outcome of this depends on the page refcnt. If the driver bumps * the refcnt > 1 this will unmap the page. If the page refcnt is 1 * the allocator owns the page and will try to recycle it in one of the pool * caches. If PP_FLAG_DMA_SYNC_DEV is set, the page will be synced for_device * using dma_sync_single_range_for_device(). */ static inline void page_pool_put_page(struct page_pool *pool, struct page *page, unsigned int dma_sync_size, bool allow_direct) { /* When page_pool isn't compiled-in, net/core/xdp.c doesn't * allow registering MEM_TYPE_PAGE_POOL, but shield linker. */ #ifdef CONFIG_PAGE_POOL if (!page_pool_is_last_ref(page)) return; page_pool_put_unrefed_page(pool, page, dma_sync_size, allow_direct); #endif } /** * page_pool_put_full_page() - release a reference on a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * @allow_direct: released by the consumer, allow lockless caching * * Similar to page_pool_put_page(), but will DMA sync the entire memory area * as configured in &page_pool_params.max_len. */ static inline void page_pool_put_full_page(struct page_pool *pool, struct page *page, bool allow_direct) { page_pool_put_page(pool, page, -1, allow_direct); } /** * page_pool_recycle_direct() - release a reference on a page pool page * @pool: pool from which page was allocated * @page: page to release a reference on * * Similar to page_pool_put_full_page() but caller must guarantee safe context * (e.g NAPI), since it will recycle the page directly into the pool fast cache. */ static inline void page_pool_recycle_direct(struct page_pool *pool, struct page *page) { page_pool_put_full_page(pool, page, true); } #define PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA \ (sizeof(dma_addr_t) > sizeof(unsigned long)) /** * page_pool_free_va() - free a va into the page_pool * @pool: pool from which va was allocated * @va: va to be freed * @allow_direct: freed by the consumer, allow lockless caching * * Free a va allocated from page_pool_allo_va(). */ static inline void page_pool_free_va(struct page_pool *pool, void *va, bool allow_direct) { page_pool_put_page(pool, virt_to_head_page(va), -1, allow_direct); } /** * page_pool_get_dma_addr() - Retrieve the stored DMA address. * @page: page allocated from a page pool * * Fetch the DMA address of the page. The page pool to which the page belongs * must had been created with PP_FLAG_DMA_MAP. */ static inline dma_addr_t page_pool_get_dma_addr(struct page *page) { dma_addr_t ret = page->dma_addr; if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) ret <<= PAGE_SHIFT; return ret; } static inline bool page_pool_set_dma_addr(struct page *page, dma_addr_t addr) { if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) { page->dma_addr = addr >> PAGE_SHIFT; /* We assume page alignment to shave off bottom bits, * if this "compression" doesn't work we need to drop. */ return addr != (dma_addr_t)page->dma_addr << PAGE_SHIFT; } page->dma_addr = addr; return false; } static inline bool page_pool_put(struct page_pool *pool) { return refcount_dec_and_test(&pool->user_cnt); } static inline void page_pool_nid_changed(struct page_pool *pool, int new_nid) { if (unlikely(pool->p.nid != new_nid)) page_pool_update_nid(pool, new_nid); } #endif /* _NET_PAGE_POOL_HELPERS_H */
14 14 7 3 4 11 7 3 1 1 1 1 1 17 17 4 3 1 2 2 12 12 13 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 // SPDX-License-Identifier: GPL-2.0-or-later /* * OSS compatible sequencer driver * * seq_oss_readq.c - MIDI input queue * * Copyright (C) 1998,99 Takashi Iwai <tiwai@suse.de> */ #include "seq_oss_readq.h" #include "seq_oss_event.h" #include <sound/seq_oss_legacy.h> #include "../seq_lock.h" #include <linux/wait.h> #include <linux/slab.h> /* * constants */ //#define SNDRV_SEQ_OSS_MAX_TIMEOUT (unsigned long)(-1) #define SNDRV_SEQ_OSS_MAX_TIMEOUT (HZ * 3600) /* * prototypes */ /* * create a read queue */ struct seq_oss_readq * snd_seq_oss_readq_new(struct seq_oss_devinfo *dp, int maxlen) { struct seq_oss_readq *q; q = kzalloc(sizeof(*q), GFP_KERNEL); if (!q) return NULL; q->q = kcalloc(maxlen, sizeof(union evrec), GFP_KERNEL); if (!q->q) { kfree(q); return NULL; } q->maxlen = maxlen; q->qlen = 0; q->head = q->tail = 0; init_waitqueue_head(&q->midi_sleep); spin_lock_init(&q->lock); q->pre_event_timeout = SNDRV_SEQ_OSS_MAX_TIMEOUT; q->input_time = (unsigned long)-1; return q; } /* * delete the read queue */ void snd_seq_oss_readq_delete(struct seq_oss_readq *q) { if (q) { kfree(q->q); kfree(q); } } /* * reset the read queue */ void snd_seq_oss_readq_clear(struct seq_oss_readq *q) { if (q->qlen) { q->qlen = 0; q->head = q->tail = 0; } /* if someone sleeping, wake'em up */ wake_up(&q->midi_sleep); q->input_time = (unsigned long)-1; } /* * put a midi byte */ int snd_seq_oss_readq_puts(struct seq_oss_readq *q, int dev, unsigned char *data, int len) { union evrec rec; int result; memset(&rec, 0, sizeof(rec)); rec.c[0] = SEQ_MIDIPUTC; rec.c[2] = dev; while (len-- > 0) { rec.c[1] = *data++; result = snd_seq_oss_readq_put_event(q, &rec); if (result < 0) return result; } return 0; } /* * put MIDI sysex bytes; the event buffer may be chained, thus it has * to be expanded via snd_seq_dump_var_event(). */ struct readq_sysex_ctx { struct seq_oss_readq *readq; int dev; }; static int readq_dump_sysex(void *ptr, void *buf, int count) { struct readq_sysex_ctx *ctx = ptr; return snd_seq_oss_readq_puts(ctx->readq, ctx->dev, buf, count); } int snd_seq_oss_readq_sysex(struct seq_oss_readq *q, int dev, struct snd_seq_event *ev) { struct readq_sysex_ctx ctx = { .readq = q, .dev = dev }; if ((ev->flags & SNDRV_SEQ_EVENT_LENGTH_MASK) != SNDRV_SEQ_EVENT_LENGTH_VARIABLE) return 0; return snd_seq_dump_var_event(ev, readq_dump_sysex, &ctx); } /* * copy an event to input queue: * return zero if enqueued */ int snd_seq_oss_readq_put_event(struct seq_oss_readq *q, union evrec *ev) { unsigned long flags; spin_lock_irqsave(&q->lock, flags); if (q->qlen >= q->maxlen - 1) { spin_unlock_irqrestore(&q->lock, flags); return -ENOMEM; } memcpy(&q->q[q->tail], ev, sizeof(*ev)); q->tail = (q->tail + 1) % q->maxlen; q->qlen++; /* wake up sleeper */ wake_up(&q->midi_sleep); spin_unlock_irqrestore(&q->lock, flags); return 0; } /* * pop queue * caller must hold lock */ int snd_seq_oss_readq_pick(struct seq_oss_readq *q, union evrec *rec) { if (q->qlen == 0) return -EAGAIN; memcpy(rec, &q->q[q->head], sizeof(*rec)); return 0; } /* * sleep until ready */ void snd_seq_oss_readq_wait(struct seq_oss_readq *q) { wait_event_interruptible_timeout(q->midi_sleep, (q->qlen > 0 || q->head == q->tail), q->pre_event_timeout); } /* * drain one record * caller must hold lock */ void snd_seq_oss_readq_free(struct seq_oss_readq *q) { if (q->qlen > 0) { q->head = (q->head + 1) % q->maxlen; q->qlen--; } } /* * polling/select: * return non-zero if readq is not empty. */ unsigned int snd_seq_oss_readq_poll(struct seq_oss_readq *q, struct file *file, poll_table *wait) { poll_wait(file, &q->midi_sleep, wait); return q->qlen; } /* * put a timestamp */ int snd_seq_oss_readq_put_timestamp(struct seq_oss_readq *q, unsigned long curt, int seq_mode) { if (curt != q->input_time) { union evrec rec; memset(&rec, 0, sizeof(rec)); switch (seq_mode) { case SNDRV_SEQ_OSS_MODE_SYNTH: rec.echo = (curt << 8) | SEQ_WAIT; snd_seq_oss_readq_put_event(q, &rec); break; case SNDRV_SEQ_OSS_MODE_MUSIC: rec.t.code = EV_TIMING; rec.t.cmd = TMR_WAIT_ABS; rec.t.time = curt; snd_seq_oss_readq_put_event(q, &rec); break; } q->input_time = curt; } return 0; } #ifdef CONFIG_SND_PROC_FS /* * proc interface */ void snd_seq_oss_readq_info_read(struct seq_oss_readq *q, struct snd_info_buffer *buf) { snd_iprintf(buf, " read queue [%s] length = %d : tick = %ld\n", (waitqueue_active(&q->midi_sleep) ? "sleeping":"running"), q->qlen, q->input_time); } #endif /* CONFIG_SND_PROC_FS */
2 1 1 3 2 1 1 2 4 1 3 2 2 1 2 3 3 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/sched/em_meta.c Metadata ematch * * Authors: Thomas Graf <tgraf@suug.ch> * * ========================================================================== * * The metadata ematch compares two meta objects where each object * represents either a meta value stored in the kernel or a static * value provided by userspace. The objects are not provided by * userspace itself but rather a definition providing the information * to build them. Every object is of a certain type which must be * equal to the object it is being compared to. * * The definition of a objects conists of the type (meta type), a * identifier (meta id) and additional type specific information. * The meta id is either TCF_META_TYPE_VALUE for values provided by * userspace or a index to the meta operations table consisting of * function pointers to type specific meta data collectors returning * the value of the requested meta value. * * lvalue rvalue * +-----------+ +-----------+ * | type: INT | | type: INT | * def | id: DEV | | id: VALUE | * | data: | | data: 3 | * +-----------+ +-----------+ * | | * ---> meta_ops[INT][DEV](...) | * | | * ----------- | * V V * +-----------+ +-----------+ * | type: INT | | type: INT | * obj | id: DEV | | id: VALUE | * | data: 2 |<--data got filled out | data: 3 | * +-----------+ +-----------+ * | | * --------------> 2 equals 3 <-------------- * * This is a simplified schema, the complexity varies depending * on the meta type. Obviously, the length of the data must also * be provided for non-numeric types. * * Additionally, type dependent modifiers such as shift operators * or mask may be applied to extend the functionality. As of now, * the variable length type supports shifting the byte string to * the right, eating up any number of octets and thus supporting * wildcard interface name comparisons such as "ppp%" matching * ppp0..9. * * NOTE: Certain meta values depend on other subsystems and are * only available if that subsystem is enabled in the kernel. */ #include <linux/slab.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/loadavg.h> #include <linux/string.h> #include <linux/skbuff.h> #include <linux/random.h> #include <linux/if_vlan.h> #include <linux/tc_ematch/tc_em_meta.h> #include <net/dst.h> #include <net/route.h> #include <net/pkt_cls.h> #include <net/sock.h> struct meta_obj { unsigned long value; unsigned int len; }; struct meta_value { struct tcf_meta_val hdr; unsigned long val; unsigned int len; }; struct meta_match { struct meta_value lvalue; struct meta_value rvalue; }; static inline int meta_id(struct meta_value *v) { return TCF_META_ID(v->hdr.kind); } static inline int meta_type(struct meta_value *v) { return TCF_META_TYPE(v->hdr.kind); } #define META_COLLECTOR(FUNC) static void meta_##FUNC(struct sk_buff *skb, \ struct tcf_pkt_info *info, struct meta_value *v, \ struct meta_obj *dst, int *err) /************************************************************************** * System status & misc **************************************************************************/ META_COLLECTOR(int_random) { get_random_bytes(&dst->value, sizeof(dst->value)); } static inline unsigned long fixed_loadavg(int load) { int rnd_load = load + (FIXED_1/200); int rnd_frac = ((rnd_load & (FIXED_1-1)) * 100) >> FSHIFT; return ((rnd_load >> FSHIFT) * 100) + rnd_frac; } META_COLLECTOR(int_loadavg_0) { dst->value = fixed_loadavg(avenrun[0]); } META_COLLECTOR(int_loadavg_1) { dst->value = fixed_loadavg(avenrun[1]); } META_COLLECTOR(int_loadavg_2) { dst->value = fixed_loadavg(avenrun[2]); } /************************************************************************** * Device names & indices **************************************************************************/ static inline int int_dev(struct net_device *dev, struct meta_obj *dst) { if (unlikely(dev == NULL)) return -1; dst->value = dev->ifindex; return 0; } static inline int var_dev(struct net_device *dev, struct meta_obj *dst) { if (unlikely(dev == NULL)) return -1; dst->value = (unsigned long) dev->name; dst->len = strlen(dev->name); return 0; } META_COLLECTOR(int_dev) { *err = int_dev(skb->dev, dst); } META_COLLECTOR(var_dev) { *err = var_dev(skb->dev, dst); } /************************************************************************** * vlan tag **************************************************************************/ META_COLLECTOR(int_vlan_tag) { unsigned short tag; if (skb_vlan_tag_present(skb)) dst->value = skb_vlan_tag_get(skb); else if (!__vlan_get_tag(skb, &tag)) dst->value = tag; else *err = -1; } /************************************************************************** * skb attributes **************************************************************************/ META_COLLECTOR(int_priority) { dst->value = skb->priority; } META_COLLECTOR(int_protocol) { /* Let userspace take care of the byte ordering */ dst->value = skb_protocol(skb, false); } META_COLLECTOR(int_pkttype) { dst->value = skb->pkt_type; } META_COLLECTOR(int_pktlen) { dst->value = skb->len; } META_COLLECTOR(int_datalen) { dst->value = skb->data_len; } META_COLLECTOR(int_maclen) { dst->value = skb->mac_len; } META_COLLECTOR(int_rxhash) { dst->value = skb_get_hash(skb); } /************************************************************************** * Netfilter **************************************************************************/ META_COLLECTOR(int_mark) { dst->value = skb->mark; } /************************************************************************** * Traffic Control **************************************************************************/ META_COLLECTOR(int_tcindex) { dst->value = skb->tc_index; } /************************************************************************** * Routing **************************************************************************/ META_COLLECTOR(int_rtclassid) { if (unlikely(skb_dst(skb) == NULL)) *err = -1; else #ifdef CONFIG_IP_ROUTE_CLASSID dst->value = skb_dst(skb)->tclassid; #else dst->value = 0; #endif } META_COLLECTOR(int_rtiif) { if (unlikely(skb_rtable(skb) == NULL)) *err = -1; else dst->value = inet_iif(skb); } /************************************************************************** * Socket Attributes **************************************************************************/ #define skip_nonlocal(skb) \ (unlikely(skb->sk == NULL)) META_COLLECTOR(int_sk_family) { if (skip_nonlocal(skb)) { *err = -1; return; } dst->value = skb->sk->sk_family; } META_COLLECTOR(int_sk_state) { if (skip_nonlocal(skb)) { *err = -1; return; } dst->value = skb->sk->sk_state; } META_COLLECTOR(int_sk_reuse) { if (skip_nonlocal(skb)) { *err = -1; return; } dst->value = skb->sk->sk_reuse; } META_COLLECTOR(int_sk_bound_if) { if (skip_nonlocal(skb)) { *err = -1; return; } /* No error if bound_dev_if is 0, legal userspace check */ dst->value = skb->sk->sk_bound_dev_if; } META_COLLECTOR(var_sk_bound_if) { int bound_dev_if; if (skip_nonlocal(skb)) { *err = -1; return; } bound_dev_if = READ_ONCE(skb->sk->sk_bound_dev_if); if (bound_dev_if == 0) { dst->value = (unsigned long) "any"; dst->len = 3; } else { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(skb->sk), bound_dev_if); *err = var_dev(dev, dst); rcu_read_unlock(); } } META_COLLECTOR(int_sk_refcnt) { if (skip_nonlocal(skb)) { *err = -1; return; } dst->value = refcount_read(&skb->sk->sk_refcnt); } META_COLLECTOR(int_sk_rcvbuf) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_rcvbuf; } META_COLLECTOR(int_sk_shutdown) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_shutdown; } META_COLLECTOR(int_sk_proto) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_protocol; } META_COLLECTOR(int_sk_type) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_type; } META_COLLECTOR(int_sk_rmem_alloc) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk_rmem_alloc_get(sk); } META_COLLECTOR(int_sk_wmem_alloc) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk_wmem_alloc_get(sk); } META_COLLECTOR(int_sk_omem_alloc) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = atomic_read(&sk->sk_omem_alloc); } META_COLLECTOR(int_sk_rcv_qlen) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_receive_queue.qlen; } META_COLLECTOR(int_sk_snd_qlen) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_write_queue.qlen; } META_COLLECTOR(int_sk_wmem_queued) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = READ_ONCE(sk->sk_wmem_queued); } META_COLLECTOR(int_sk_fwd_alloc) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk_forward_alloc_get(sk); } META_COLLECTOR(int_sk_sndbuf) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_sndbuf; } META_COLLECTOR(int_sk_alloc) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = (__force int) sk->sk_allocation; } META_COLLECTOR(int_sk_hash) { if (skip_nonlocal(skb)) { *err = -1; return; } dst->value = skb->sk->sk_hash; } META_COLLECTOR(int_sk_lingertime) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = READ_ONCE(sk->sk_lingertime) / HZ; } META_COLLECTOR(int_sk_err_qlen) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_error_queue.qlen; } META_COLLECTOR(int_sk_ack_bl) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = READ_ONCE(sk->sk_ack_backlog); } META_COLLECTOR(int_sk_max_ack_bl) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = READ_ONCE(sk->sk_max_ack_backlog); } META_COLLECTOR(int_sk_prio) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = READ_ONCE(sk->sk_priority); } META_COLLECTOR(int_sk_rcvlowat) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = READ_ONCE(sk->sk_rcvlowat); } META_COLLECTOR(int_sk_rcvtimeo) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = READ_ONCE(sk->sk_rcvtimeo) / HZ; } META_COLLECTOR(int_sk_sndtimeo) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = READ_ONCE(sk->sk_sndtimeo) / HZ; } META_COLLECTOR(int_sk_sendmsg_off) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_frag.offset; } META_COLLECTOR(int_sk_write_pend) { const struct sock *sk = skb_to_full_sk(skb); if (!sk) { *err = -1; return; } dst->value = sk->sk_write_pending; } /************************************************************************** * Meta value collectors assignment table **************************************************************************/ struct meta_ops { void (*get)(struct sk_buff *, struct tcf_pkt_info *, struct meta_value *, struct meta_obj *, int *); }; #define META_ID(name) TCF_META_ID_##name #define META_FUNC(name) { .get = meta_##name } /* Meta value operations table listing all meta value collectors and * assigns them to a type and meta id. */ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX + 1][TCF_META_ID_MAX + 1] = { [TCF_META_TYPE_VAR] = { [META_ID(DEV)] = META_FUNC(var_dev), [META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if), }, [TCF_META_TYPE_INT] = { [META_ID(RANDOM)] = META_FUNC(int_random), [META_ID(LOADAVG_0)] = META_FUNC(int_loadavg_0), [META_ID(LOADAVG_1)] = META_FUNC(int_loadavg_1), [META_ID(LOADAVG_2)] = META_FUNC(int_loadavg_2), [META_ID(DEV)] = META_FUNC(int_dev), [META_ID(PRIORITY)] = META_FUNC(int_priority), [META_ID(PROTOCOL)] = META_FUNC(int_protocol), [META_ID(PKTTYPE)] = META_FUNC(int_pkttype), [META_ID(PKTLEN)] = META_FUNC(int_pktlen), [META_ID(DATALEN)] = META_FUNC(int_datalen), [META_ID(MACLEN)] = META_FUNC(int_maclen), [META_ID(NFMARK)] = META_FUNC(int_mark), [META_ID(TCINDEX)] = META_FUNC(int_tcindex), [META_ID(RTCLASSID)] = META_FUNC(int_rtclassid), [META_ID(RTIIF)] = META_FUNC(int_rtiif), [META_ID(SK_FAMILY)] = META_FUNC(int_sk_family), [META_ID(SK_STATE)] = META_FUNC(int_sk_state), [META_ID(SK_REUSE)] = META_FUNC(int_sk_reuse), [META_ID(SK_BOUND_IF)] = META_FUNC(int_sk_bound_if), [META_ID(SK_REFCNT)] = META_FUNC(int_sk_refcnt), [META_ID(SK_RCVBUF)] = META_FUNC(int_sk_rcvbuf), [META_ID(SK_SNDBUF)] = META_FUNC(int_sk_sndbuf), [META_ID(SK_SHUTDOWN)] = META_FUNC(int_sk_shutdown), [META_ID(SK_PROTO)] = META_FUNC(int_sk_proto), [META_ID(SK_TYPE)] = META_FUNC(int_sk_type), [META_ID(SK_RMEM_ALLOC)] = META_FUNC(int_sk_rmem_alloc), [META_ID(SK_WMEM_ALLOC)] = META_FUNC(int_sk_wmem_alloc), [META_ID(SK_OMEM_ALLOC)] = META_FUNC(int_sk_omem_alloc), [META_ID(SK_WMEM_QUEUED)] = META_FUNC(int_sk_wmem_queued), [META_ID(SK_RCV_QLEN)] = META_FUNC(int_sk_rcv_qlen), [META_ID(SK_SND_QLEN)] = META_FUNC(int_sk_snd_qlen), [META_ID(SK_ERR_QLEN)] = META_FUNC(int_sk_err_qlen), [META_ID(SK_FORWARD_ALLOCS)] = META_FUNC(int_sk_fwd_alloc), [META_ID(SK_ALLOCS)] = META_FUNC(int_sk_alloc), [META_ID(SK_HASH)] = META_FUNC(int_sk_hash), [META_ID(SK_LINGERTIME)] = META_FUNC(int_sk_lingertime), [META_ID(SK_ACK_BACKLOG)] = META_FUNC(int_sk_ack_bl), [META_ID(SK_MAX_ACK_BACKLOG)] = META_FUNC(int_sk_max_ack_bl), [META_ID(SK_PRIO)] = META_FUNC(int_sk_prio), [META_ID(SK_RCVLOWAT)] = META_FUNC(int_sk_rcvlowat), [META_ID(SK_RCVTIMEO)] = META_FUNC(int_sk_rcvtimeo), [META_ID(SK_SNDTIMEO)] = META_FUNC(int_sk_sndtimeo), [META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off), [META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend), [META_ID(VLAN_TAG)] = META_FUNC(int_vlan_tag), [META_ID(RXHASH)] = META_FUNC(int_rxhash), } }; static inline struct meta_ops *meta_ops(struct meta_value *val) { return &__meta_ops[meta_type(val)][meta_id(val)]; } /************************************************************************** * Type specific operations for TCF_META_TYPE_VAR **************************************************************************/ static int meta_var_compare(struct meta_obj *a, struct meta_obj *b) { int r = a->len - b->len; if (r == 0) r = memcmp((void *) a->value, (void *) b->value, a->len); return r; } static int meta_var_change(struct meta_value *dst, struct nlattr *nla) { int len = nla_len(nla); dst->val = (unsigned long)kmemdup(nla_data(nla), len, GFP_KERNEL); if (dst->val == 0UL) return -ENOMEM; dst->len = len; return 0; } static void meta_var_destroy(struct meta_value *v) { kfree((void *) v->val); } static void meta_var_apply_extras(struct meta_value *v, struct meta_obj *dst) { int shift = v->hdr.shift; if (shift && shift < dst->len) dst->len -= shift; } static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv) { if (v->val && v->len && nla_put(skb, tlv, v->len, (void *) v->val)) goto nla_put_failure; return 0; nla_put_failure: return -1; } /************************************************************************** * Type specific operations for TCF_META_TYPE_INT **************************************************************************/ static int meta_int_compare(struct meta_obj *a, struct meta_obj *b) { /* Let gcc optimize it, the unlikely is not really based on * some numbers but jump free code for mismatches seems * more logical. */ if (unlikely(a->value == b->value)) return 0; else if (a->value < b->value) return -1; else return 1; } static int meta_int_change(struct meta_value *dst, struct nlattr *nla) { if (nla_len(nla) >= sizeof(unsigned long)) { dst->val = *(unsigned long *) nla_data(nla); dst->len = sizeof(unsigned long); } else if (nla_len(nla) == sizeof(u32)) { dst->val = nla_get_u32(nla); dst->len = sizeof(u32); } else return -EINVAL; return 0; } static void meta_int_apply_extras(struct meta_value *v, struct meta_obj *dst) { if (v->hdr.shift) dst->value >>= v->hdr.shift; if (v->val) dst->value &= v->val; } static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv) { if (v->len == sizeof(unsigned long)) { if (nla_put(skb, tlv, sizeof(unsigned long), &v->val)) goto nla_put_failure; } else if (v->len == sizeof(u32)) { if (nla_put_u32(skb, tlv, v->val)) goto nla_put_failure; } return 0; nla_put_failure: return -1; } /************************************************************************** * Type specific operations table **************************************************************************/ struct meta_type_ops { void (*destroy)(struct meta_value *); int (*compare)(struct meta_obj *, struct meta_obj *); int (*change)(struct meta_value *, struct nlattr *); void (*apply_extras)(struct meta_value *, struct meta_obj *); int (*dump)(struct sk_buff *, struct meta_value *, int); }; static const struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = { [TCF_META_TYPE_VAR] = { .destroy = meta_var_destroy, .compare = meta_var_compare, .change = meta_var_change, .apply_extras = meta_var_apply_extras, .dump = meta_var_dump }, [TCF_META_TYPE_INT] = { .compare = meta_int_compare, .change = meta_int_change, .apply_extras = meta_int_apply_extras, .dump = meta_int_dump } }; static inline const struct meta_type_ops *meta_type_ops(struct meta_value *v) { return &__meta_type_ops[meta_type(v)]; } /************************************************************************** * Core **************************************************************************/ static int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info, struct meta_value *v, struct meta_obj *dst) { int err = 0; if (meta_id(v) == TCF_META_ID_VALUE) { dst->value = v->val; dst->len = v->len; return 0; } meta_ops(v)->get(skb, info, v, dst, &err); if (err < 0) return err; if (meta_type_ops(v)->apply_extras) meta_type_ops(v)->apply_extras(v, dst); return 0; } static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m, struct tcf_pkt_info *info) { int r; struct meta_match *meta = (struct meta_match *) m->data; struct meta_obj l_value, r_value; if (meta_get(skb, info, &meta->lvalue, &l_value) < 0 || meta_get(skb, info, &meta->rvalue, &r_value) < 0) return 0; r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value); switch (meta->lvalue.hdr.op) { case TCF_EM_OPND_EQ: return !r; case TCF_EM_OPND_LT: return r < 0; case TCF_EM_OPND_GT: return r > 0; } return 0; } static void meta_delete(struct meta_match *meta) { if (meta) { const struct meta_type_ops *ops = meta_type_ops(&meta->lvalue); if (ops && ops->destroy) { ops->destroy(&meta->lvalue); ops->destroy(&meta->rvalue); } } kfree(meta); } static inline int meta_change_data(struct meta_value *dst, struct nlattr *nla) { if (nla) { if (nla_len(nla) == 0) return -EINVAL; return meta_type_ops(dst)->change(dst, nla); } return 0; } static inline int meta_is_supported(struct meta_value *val) { return !meta_id(val) || meta_ops(val)->get; } static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = { [TCA_EM_META_HDR] = { .len = sizeof(struct tcf_meta_hdr) }, }; static int em_meta_change(struct net *net, void *data, int len, struct tcf_ematch *m) { int err; struct nlattr *tb[TCA_EM_META_MAX + 1]; struct tcf_meta_hdr *hdr; struct meta_match *meta = NULL; err = nla_parse_deprecated(tb, TCA_EM_META_MAX, data, len, meta_policy, NULL); if (err < 0) goto errout; err = -EINVAL; if (tb[TCA_EM_META_HDR] == NULL) goto errout; hdr = nla_data(tb[TCA_EM_META_HDR]); if (TCF_META_TYPE(hdr->left.kind) != TCF_META_TYPE(hdr->right.kind) || TCF_META_TYPE(hdr->left.kind) > TCF_META_TYPE_MAX || TCF_META_ID(hdr->left.kind) > TCF_META_ID_MAX || TCF_META_ID(hdr->right.kind) > TCF_META_ID_MAX) goto errout; meta = kzalloc(sizeof(*meta), GFP_KERNEL); if (meta == NULL) { err = -ENOMEM; goto errout; } memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left)); memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right)); if (!meta_is_supported(&meta->lvalue) || !meta_is_supported(&meta->rvalue)) { err = -EOPNOTSUPP; goto errout; } if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE]) < 0 || meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE]) < 0) goto errout; m->datalen = sizeof(*meta); m->data = (unsigned long) meta; err = 0; errout: if (err && meta) meta_delete(meta); return err; } static void em_meta_destroy(struct tcf_ematch *m) { if (m) meta_delete((struct meta_match *) m->data); } static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em) { struct meta_match *meta = (struct meta_match *) em->data; struct tcf_meta_hdr hdr; const struct meta_type_ops *ops; memset(&hdr, 0, sizeof(hdr)); memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left)); memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right)); if (nla_put(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr)) goto nla_put_failure; ops = meta_type_ops(&meta->lvalue); if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 || ops->dump(skb, &meta->rvalue, TCA_EM_META_RVALUE) < 0) goto nla_put_failure; return 0; nla_put_failure: return -1; } static struct tcf_ematch_ops em_meta_ops = { .kind = TCF_EM_META, .change = em_meta_change, .match = em_meta_match, .destroy = em_meta_destroy, .dump = em_meta_dump, .owner = THIS_MODULE, .link = LIST_HEAD_INIT(em_meta_ops.link) }; static int __init init_em_meta(void) { return tcf_em_register(&em_meta_ops); } static void __exit exit_em_meta(void) { tcf_em_unregister(&em_meta_ops); } MODULE_LICENSE("GPL"); module_init(init_em_meta); module_exit(exit_em_meta); MODULE_ALIAS_TCF_EMATCH(TCF_EM_META);
372 442 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 // SPDX-License-Identifier: GPL-2.0 /* * All the USB notify logic * * (C) Copyright 2005 Greg Kroah-Hartman <gregkh@suse.de> * * notifier functions originally based on those in kernel/sys.c * but fixed up to not be so broken. * * Released under the GPLv2 only. */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/notifier.h> #include <linux/usb.h> #include <linux/mutex.h> #include "usb.h" static BLOCKING_NOTIFIER_HEAD(usb_notifier_list); /** * usb_register_notify - register a notifier callback whenever a usb change happens * @nb: pointer to the notifier block for the callback events. * * These changes are either USB devices or busses being added or removed. */ void usb_register_notify(struct notifier_block *nb) { blocking_notifier_chain_register(&usb_notifier_list, nb); } EXPORT_SYMBOL_GPL(usb_register_notify); /** * usb_unregister_notify - unregister a notifier callback * @nb: pointer to the notifier block for the callback events. * * usb_register_notify() must have been previously called for this function * to work properly. */ void usb_unregister_notify(struct notifier_block *nb) { blocking_notifier_chain_unregister(&usb_notifier_list, nb); } EXPORT_SYMBOL_GPL(usb_unregister_notify); void usb_notify_add_device(struct usb_device *udev) { blocking_notifier_call_chain(&usb_notifier_list, USB_DEVICE_ADD, udev); } void usb_notify_remove_device(struct usb_device *udev) { blocking_notifier_call_chain(&usb_notifier_list, USB_DEVICE_REMOVE, udev); } void usb_notify_add_bus(struct usb_bus *ubus) { blocking_notifier_call_chain(&usb_notifier_list, USB_BUS_ADD, ubus); } void usb_notify_remove_bus(struct usb_bus *ubus) { blocking_notifier_call_chain(&usb_notifier_list, USB_BUS_REMOVE, ubus); }
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 // SPDX-License-Identifier: GPL-2.0 #include "hfa384x_usb.c" #include "prism2mgmt.c" #include "prism2mib.c" #include "prism2sta.c" #include "prism2fw.c" #define PRISM_DEV(vid, pid, name) \ { USB_DEVICE(vid, pid), \ .driver_info = (unsigned long)name } static const struct usb_device_id usb_prism_tbl[] = { PRISM_DEV(0x04bb, 0x0922, "IOData AirPort WN-B11/USBS"), PRISM_DEV(0x07aa, 0x0012, "Corega USB Wireless LAN Stick-11"), PRISM_DEV(0x09aa, 0x3642, "Prism2.x 11Mbps USB WLAN Adapter"), PRISM_DEV(0x1668, 0x0408, "Actiontec Prism2.5 11Mbps USB WLAN Adapter"), PRISM_DEV(0x1668, 0x0421, "Actiontec Prism2.5 11Mbps USB WLAN Adapter"), PRISM_DEV(0x1915, 0x2236, "Linksys WUSB11v3.0 11Mbps USB WLAN Adapter"), PRISM_DEV(0x066b, 0x2212, "Linksys WUSB11v2.5 11Mbps USB WLAN Adapter"), PRISM_DEV(0x066b, 0x2213, "Linksys WUSB12v1.1 11Mbps USB WLAN Adapter"), PRISM_DEV(0x0411, 0x0016, "Melco WLI-USB-S11 11Mbps WLAN Adapter"), PRISM_DEV(0x08de, 0x7a01, "PRISM25 USB IEEE 802.11 Mini Adapter"), PRISM_DEV(0x8086, 0x1111, "Intel PRO/Wireless 2011B USB LAN Adapter"), PRISM_DEV(0x0d8e, 0x7a01, "PRISM25 IEEE 802.11 Mini USB Adapter"), PRISM_DEV(0x045e, 0x006e, "Microsoft MN510 USB Wireless Adapter"), PRISM_DEV(0x0967, 0x0204, "Acer Warplink USB Adapter"), PRISM_DEV(0x0cde, 0x0002, "Z-Com 725/726 Prism2.5 USB/USB Integrated"), PRISM_DEV(0x0cde, 0x0005, "Z-Com Xl735 USB Wireless 802.11b Adapter"), PRISM_DEV(0x413c, 0x8100, "Dell TrueMobile 1180 USB Wireless Adapter"), PRISM_DEV(0x0b3b, 0x1601, "ALLNET 0193 11Mbps USB WLAN Adapter"), PRISM_DEV(0x0b3b, 0x1602, "ZyXEL ZyAIR B200 USB Wireless Adapter"), PRISM_DEV(0x0baf, 0x00eb, "USRobotics USR1120 USB Wireless Adapter"), PRISM_DEV(0x0411, 0x0027, "Melco WLI-USB-KS11G 11Mbps WLAN Adapter"), PRISM_DEV(0x04f1, 0x3009, "JVC MP-XP7250 Builtin USB WLAN Adapter"), PRISM_DEV(0x0846, 0x4110, "NetGear MA111"), PRISM_DEV(0x03f3, 0x0020, "Adaptec AWN-8020 USB WLAN Adapter"), PRISM_DEV(0x2821, 0x3300, "ASUS-WL140 / Hawking HighDB USB Wireless Adapter"), PRISM_DEV(0x2001, 0x3700, "DWL-122 USB Wireless Adapter"), PRISM_DEV(0x2001, 0x3702, "DWL-120 Rev F USB Wireless Adapter"), PRISM_DEV(0x50c2, 0x4013, "Averatec USB WLAN Adapter"), PRISM_DEV(0x2c02, 0x14ea, "Planex GW-US11H USB WLAN Adapter"), PRISM_DEV(0x124a, 0x168b, "Airvast PRISM3 USB WLAN Adapter"), PRISM_DEV(0x083a, 0x3503, "T-Sinus 111 USB WLAN Adapter"), PRISM_DEV(0x0411, 0x0044, "Melco WLI-USB-KB11 11Mbps WLAN Adapter"), PRISM_DEV(0x1668, 0x6106, "ROPEX FreeLan USB 802.11b Adapter"), PRISM_DEV(0x124a, 0x4017, "Pheenet WL-503IA USB 802.11b Adapter"), PRISM_DEV(0x0bb2, 0x0302, "Ambit Microsystems Corp."), PRISM_DEV(0x9016, 0x182d, "Sitecom WL-022 USB 802.11b Adapter"), PRISM_DEV(0x0543, 0x0f01, "ViewSonic Airsync USB Adapter 11Mbps (Prism2.5)"), PRISM_DEV(0x067c, 0x1022, "Siemens SpeedStream 1022 11Mbps USB WLAN Adapter"), PRISM_DEV(0x049f, 0x0033, "Compaq/Intel W100 PRO/Wireless 11Mbps multiport WLAN Adapter"), { } /* terminator */ }; MODULE_DEVICE_TABLE(usb, usb_prism_tbl); static int prism2sta_probe_usb(struct usb_interface *interface, const struct usb_device_id *id) { struct usb_device *dev; struct usb_endpoint_descriptor *bulk_in, *bulk_out; struct usb_host_interface *iface_desc = interface->cur_altsetting; struct wlandevice *wlandev = NULL; struct hfa384x *hw = NULL; int result = 0; result = usb_find_common_endpoints(iface_desc, &bulk_in, &bulk_out, NULL, NULL); if (result) goto failed; dev = interface_to_usbdev(interface); wlandev = create_wlan(); if (!wlandev) { dev_err(&interface->dev, "Memory allocation failure.\n"); result = -EIO; goto failed; } hw = wlandev->priv; if (wlan_setup(wlandev, &interface->dev) != 0) { dev_err(&interface->dev, "wlan_setup() failed.\n"); result = -EIO; goto failed; } /* Initialize the hw data */ hw->endp_in = usb_rcvbulkpipe(dev, bulk_in->bEndpointAddress); hw->endp_out = usb_sndbulkpipe(dev, bulk_out->bEndpointAddress); hfa384x_create(hw, dev); hw->wlandev = wlandev; /* Register the wlandev, this gets us a name and registers the * linux netdevice. */ SET_NETDEV_DEV(wlandev->netdev, &interface->dev); /* Do a chip-level reset on the MAC */ if (prism2_doreset) { result = hfa384x_corereset(hw, prism2_reset_holdtime, prism2_reset_settletime, 0); if (result != 0) { result = -EIO; dev_err(&interface->dev, "hfa384x_corereset() failed.\n"); goto failed_reset; } } usb_get_dev(dev); wlandev->msdstate = WLAN_MSD_HWPRESENT; /* Try and load firmware, then enable card before we register */ prism2_fwtry(dev, wlandev); prism2sta_ifstate(wlandev, P80211ENUM_ifstate_enable); if (register_wlandev(wlandev) != 0) { dev_err(&interface->dev, "register_wlandev() failed.\n"); result = -EIO; goto failed_register; } goto done; failed_register: usb_put_dev(dev); failed_reset: wlan_unsetup(wlandev); failed: kfree(wlandev); kfree(hw); wlandev = NULL; done: usb_set_intfdata(interface, wlandev); return result; } static void prism2sta_disconnect_usb(struct usb_interface *interface) { struct wlandevice *wlandev; wlandev = usb_get_intfdata(interface); if (wlandev) { LIST_HEAD(cleanlist); struct hfa384x_usbctlx *ctlx, *temp; unsigned long flags; struct hfa384x *hw = wlandev->priv; if (!hw) goto exit; spin_lock_irqsave(&hw->ctlxq.lock, flags); p80211netdev_hwremoved(wlandev); list_splice_init(&hw->ctlxq.reapable, &cleanlist); list_splice_init(&hw->ctlxq.completing, &cleanlist); list_splice_init(&hw->ctlxq.pending, &cleanlist); list_splice_init(&hw->ctlxq.active, &cleanlist); spin_unlock_irqrestore(&hw->ctlxq.lock, flags); /* There's no hardware to shutdown, but the driver * might have some tasks that must be stopped before * we can tear everything down. */ prism2sta_ifstate(wlandev, P80211ENUM_ifstate_disable); timer_shutdown_sync(&hw->throttle); timer_shutdown_sync(&hw->reqtimer); timer_shutdown_sync(&hw->resptimer); /* Unlink all the URBs. This "removes the wheels" * from the entire CTLX handling mechanism. */ usb_kill_urb(&hw->rx_urb); usb_kill_urb(&hw->tx_urb); usb_kill_urb(&hw->ctlx_urb); cancel_work_sync(&hw->completion_bh); cancel_work_sync(&hw->reaper_bh); cancel_work_sync(&hw->link_bh); cancel_work_sync(&hw->commsqual_bh); cancel_work_sync(&hw->usb_work); /* Now we complete any outstanding commands * and tell everyone who is waiting for their * responses that we have shut down. */ list_for_each_entry(ctlx, &cleanlist, list) complete(&ctlx->done); /* Give any outstanding synchronous commands * a chance to complete. All they need to do * is "wake up", so that's easy. * (I'd like a better way to do this, really.) */ msleep(100); /* Now delete the CTLXs, because no-one else can now. */ list_for_each_entry_safe(ctlx, temp, &cleanlist, list) kfree(ctlx); /* Unhook the wlandev */ unregister_wlandev(wlandev); wlan_unsetup(wlandev); usb_put_dev(hw->usb); hfa384x_destroy(hw); kfree(hw); kfree(wlandev); } exit: usb_set_intfdata(interface, NULL); } #ifdef CONFIG_PM static int prism2sta_suspend(struct usb_interface *interface, pm_message_t message) { struct hfa384x *hw = NULL; struct wlandevice *wlandev; wlandev = usb_get_intfdata(interface); if (!wlandev) return -ENODEV; hw = wlandev->priv; if (!hw) return -ENODEV; prism2sta_ifstate(wlandev, P80211ENUM_ifstate_disable); usb_kill_urb(&hw->rx_urb); usb_kill_urb(&hw->tx_urb); usb_kill_urb(&hw->ctlx_urb); return 0; } static int prism2sta_resume(struct usb_interface *interface) { int result = 0; struct hfa384x *hw = NULL; struct wlandevice *wlandev; wlandev = usb_get_intfdata(interface); if (!wlandev) return -ENODEV; hw = wlandev->priv; if (!hw) return -ENODEV; /* Do a chip-level reset on the MAC */ if (prism2_doreset) { result = hfa384x_corereset(hw, prism2_reset_holdtime, prism2_reset_settletime, 0); if (result != 0) { unregister_wlandev(wlandev); hfa384x_destroy(hw); dev_err(&interface->dev, "hfa384x_corereset() failed.\n"); kfree(wlandev); kfree(hw); wlandev = NULL; return -ENODEV; } } prism2sta_ifstate(wlandev, P80211ENUM_ifstate_enable); return 0; } #else #define prism2sta_suspend NULL #define prism2sta_resume NULL #endif /* CONFIG_PM */ static struct usb_driver prism2_usb_driver = { .name = "prism2_usb", .probe = prism2sta_probe_usb, .disconnect = prism2sta_disconnect_usb, .id_table = usb_prism_tbl, .suspend = prism2sta_suspend, .resume = prism2sta_resume, .reset_resume = prism2sta_resume, /* fops, minor? */ }; module_usb_driver(prism2_usb_driver);
470 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 // SPDX-License-Identifier: GPL-2.0-or-later /* * (C) 2010 Pablo Neira Ayuso <pablo@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/netfilter.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/moduleparam.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_timestamp.h> static bool nf_ct_tstamp __read_mostly; module_param_named(tstamp, nf_ct_tstamp, bool, 0644); MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping."); void nf_conntrack_tstamp_pernet_init(struct net *net) { net->ct.sysctl_tstamp = nf_ct_tstamp; }
1 1928 1926 1 1 1 1 19 1 1 17 1 2 2 2 2 1 1 1 1 1 1 10 1 1 2 1 1 1 2 1 1 1 1 1 6 2 2 1 1 1 1 1 7 7 2 1 2 4 4 1 2 2 2 1 1 2 1 1 3 1 2 3 3 33 15 2 1 9 1 1 1 8 1 1 7 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) * Copyright (C) Terry Dawson VK2KTJ (terry@animats.net) * Copyright (C) Tomi Manninen OH2BNS (oh2bns@sral.fi) */ #include <linux/capability.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/spinlock.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/stat.h> #include <net/net_namespace.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/termios.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/notifier.h> #include <net/rose.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/tcp_states.h> #include <net/ip.h> #include <net/arp.h> static int rose_ndevs = 10; int sysctl_rose_restart_request_timeout = ROSE_DEFAULT_T0; int sysctl_rose_call_request_timeout = ROSE_DEFAULT_T1; int sysctl_rose_reset_request_timeout = ROSE_DEFAULT_T2; int sysctl_rose_clear_request_timeout = ROSE_DEFAULT_T3; int sysctl_rose_no_activity_timeout = ROSE_DEFAULT_IDLE; int sysctl_rose_ack_hold_back_timeout = ROSE_DEFAULT_HB; int sysctl_rose_routing_control = ROSE_DEFAULT_ROUTING; int sysctl_rose_link_fail_timeout = ROSE_DEFAULT_FAIL_TIMEOUT; int sysctl_rose_maximum_vcs = ROSE_DEFAULT_MAXVC; int sysctl_rose_window_size = ROSE_DEFAULT_WINDOW_SIZE; static HLIST_HEAD(rose_list); static DEFINE_SPINLOCK(rose_list_lock); static const struct proto_ops rose_proto_ops; ax25_address rose_callsign; /* * ROSE network devices are virtual network devices encapsulating ROSE * frames into AX.25 which will be sent through an AX.25 device, so form a * special "super class" of normal net devices; split their locks off into a * separate class since they always nest. */ static struct lock_class_key rose_netdev_xmit_lock_key; static struct lock_class_key rose_netdev_addr_lock_key; static void rose_set_lockdep_one(struct net_device *dev, struct netdev_queue *txq, void *_unused) { lockdep_set_class(&txq->_xmit_lock, &rose_netdev_xmit_lock_key); } static void rose_set_lockdep_key(struct net_device *dev) { lockdep_set_class(&dev->addr_list_lock, &rose_netdev_addr_lock_key); netdev_for_each_tx_queue(dev, rose_set_lockdep_one, NULL); } /* * Convert a ROSE address into text. */ char *rose2asc(char *buf, const rose_address *addr) { if (addr->rose_addr[0] == 0x00 && addr->rose_addr[1] == 0x00 && addr->rose_addr[2] == 0x00 && addr->rose_addr[3] == 0x00 && addr->rose_addr[4] == 0x00) { strcpy(buf, "*"); } else { sprintf(buf, "%02X%02X%02X%02X%02X", addr->rose_addr[0] & 0xFF, addr->rose_addr[1] & 0xFF, addr->rose_addr[2] & 0xFF, addr->rose_addr[3] & 0xFF, addr->rose_addr[4] & 0xFF); } return buf; } /* * Compare two ROSE addresses, 0 == equal. */ int rosecmp(const rose_address *addr1, const rose_address *addr2) { int i; for (i = 0; i < 5; i++) if (addr1->rose_addr[i] != addr2->rose_addr[i]) return 1; return 0; } /* * Compare two ROSE addresses for only mask digits, 0 == equal. */ int rosecmpm(const rose_address *addr1, const rose_address *addr2, unsigned short mask) { unsigned int i, j; if (mask > 10) return 1; for (i = 0; i < mask; i++) { j = i / 2; if ((i % 2) != 0) { if ((addr1->rose_addr[j] & 0x0F) != (addr2->rose_addr[j] & 0x0F)) return 1; } else { if ((addr1->rose_addr[j] & 0xF0) != (addr2->rose_addr[j] & 0xF0)) return 1; } } return 0; } /* * Socket removal during an interrupt is now safe. */ static void rose_remove_socket(struct sock *sk) { spin_lock_bh(&rose_list_lock); sk_del_node_init(sk); spin_unlock_bh(&rose_list_lock); } /* * Kill all bound sockets on a broken link layer connection to a * particular neighbour. */ void rose_kill_by_neigh(struct rose_neigh *neigh) { struct sock *s; spin_lock_bh(&rose_list_lock); sk_for_each(s, &rose_list) { struct rose_sock *rose = rose_sk(s); if (rose->neighbour == neigh) { rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); rose->neighbour->use--; rose->neighbour = NULL; } } spin_unlock_bh(&rose_list_lock); } /* * Kill all bound sockets on a dropped device. */ static void rose_kill_by_device(struct net_device *dev) { struct sock *sk, *array[16]; struct rose_sock *rose; bool rescan; int i, cnt; start: rescan = false; cnt = 0; spin_lock_bh(&rose_list_lock); sk_for_each(sk, &rose_list) { rose = rose_sk(sk); if (rose->device == dev) { if (cnt == ARRAY_SIZE(array)) { rescan = true; break; } sock_hold(sk); array[cnt++] = sk; } } spin_unlock_bh(&rose_list_lock); for (i = 0; i < cnt; i++) { sk = array[cnt]; rose = rose_sk(sk); lock_sock(sk); spin_lock_bh(&rose_list_lock); if (rose->device == dev) { rose_disconnect(sk, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); if (rose->neighbour) rose->neighbour->use--; netdev_put(rose->device, &rose->dev_tracker); rose->device = NULL; } spin_unlock_bh(&rose_list_lock); release_sock(sk); sock_put(sk); cond_resched(); } if (rescan) goto start; } /* * Handle device status changes. */ static int rose_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (event != NETDEV_DOWN) return NOTIFY_DONE; switch (dev->type) { case ARPHRD_ROSE: rose_kill_by_device(dev); break; case ARPHRD_AX25: rose_link_device_down(dev); rose_rt_device_down(dev); break; } return NOTIFY_DONE; } /* * Add a socket to the bound sockets list. */ static void rose_insert_socket(struct sock *sk) { spin_lock_bh(&rose_list_lock); sk_add_node(sk, &rose_list); spin_unlock_bh(&rose_list_lock); } /* * Find a socket that wants to accept the Call Request we just * received. */ static struct sock *rose_find_listener(rose_address *addr, ax25_address *call) { struct sock *s; spin_lock_bh(&rose_list_lock); sk_for_each(s, &rose_list) { struct rose_sock *rose = rose_sk(s); if (!rosecmp(&rose->source_addr, addr) && !ax25cmp(&rose->source_call, call) && !rose->source_ndigis && s->sk_state == TCP_LISTEN) goto found; } sk_for_each(s, &rose_list) { struct rose_sock *rose = rose_sk(s); if (!rosecmp(&rose->source_addr, addr) && !ax25cmp(&rose->source_call, &null_ax25_address) && s->sk_state == TCP_LISTEN) goto found; } s = NULL; found: spin_unlock_bh(&rose_list_lock); return s; } /* * Find a connected ROSE socket given my LCI and device. */ struct sock *rose_find_socket(unsigned int lci, struct rose_neigh *neigh) { struct sock *s; spin_lock_bh(&rose_list_lock); sk_for_each(s, &rose_list) { struct rose_sock *rose = rose_sk(s); if (rose->lci == lci && rose->neighbour == neigh) goto found; } s = NULL; found: spin_unlock_bh(&rose_list_lock); return s; } /* * Find a unique LCI for a given device. */ unsigned int rose_new_lci(struct rose_neigh *neigh) { int lci; if (neigh->dce_mode) { for (lci = 1; lci <= sysctl_rose_maximum_vcs; lci++) if (rose_find_socket(lci, neigh) == NULL && rose_route_free_lci(lci, neigh) == NULL) return lci; } else { for (lci = sysctl_rose_maximum_vcs; lci > 0; lci--) if (rose_find_socket(lci, neigh) == NULL && rose_route_free_lci(lci, neigh) == NULL) return lci; } return 0; } /* * Deferred destroy. */ void rose_destroy_socket(struct sock *); /* * Handler for deferred kills. */ static void rose_destroy_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); rose_destroy_socket(sk); } /* * This is called from user mode and the timers. Thus it protects itself * against interrupt users but doesn't worry about being called during * work. Once it is removed from the queue no interrupt or bottom half * will touch it and we are (fairly 8-) ) safe. */ void rose_destroy_socket(struct sock *sk) { struct sk_buff *skb; rose_remove_socket(sk); rose_stop_heartbeat(sk); rose_stop_idletimer(sk); rose_stop_timer(sk); rose_clear_queues(sk); /* Flush the queues */ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (skb->sk != sk) { /* A pending connection */ /* Queue the unaccepted socket for death */ sock_set_flag(skb->sk, SOCK_DEAD); rose_start_heartbeat(skb->sk); rose_sk(skb->sk)->state = ROSE_STATE_0; } kfree_skb(skb); } if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ timer_setup(&sk->sk_timer, rose_destroy_timer, 0); sk->sk_timer.expires = jiffies + 10 * HZ; add_timer(&sk->sk_timer); } else sock_put(sk); } /* * Handling for system calls applied via the various interfaces to a * ROSE socket object. */ static int rose_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); int opt; if (level != SOL_ROSE) return -ENOPROTOOPT; if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&opt, optval, sizeof(int))) return -EFAULT; switch (optname) { case ROSE_DEFER: rose->defer = opt ? 1 : 0; return 0; case ROSE_T1: if (opt < 1) return -EINVAL; rose->t1 = opt * HZ; return 0; case ROSE_T2: if (opt < 1) return -EINVAL; rose->t2 = opt * HZ; return 0; case ROSE_T3: if (opt < 1) return -EINVAL; rose->t3 = opt * HZ; return 0; case ROSE_HOLDBACK: if (opt < 1) return -EINVAL; rose->hb = opt * HZ; return 0; case ROSE_IDLE: if (opt < 0) return -EINVAL; rose->idle = opt * 60 * HZ; return 0; case ROSE_QBITINCL: rose->qbitincl = opt ? 1 : 0; return 0; default: return -ENOPROTOOPT; } } static int rose_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); int val = 0; int len; if (level != SOL_ROSE) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case ROSE_DEFER: val = rose->defer; break; case ROSE_T1: val = rose->t1 / HZ; break; case ROSE_T2: val = rose->t2 / HZ; break; case ROSE_T3: val = rose->t3 / HZ; break; case ROSE_HOLDBACK: val = rose->hb / HZ; break; case ROSE_IDLE: val = rose->idle / (60 * HZ); break; case ROSE_QBITINCL: val = rose->qbitincl; break; default: return -ENOPROTOOPT; } len = min_t(unsigned int, len, sizeof(int)); if (put_user(len, optlen)) return -EFAULT; return copy_to_user(optval, &val, len) ? -EFAULT : 0; } static int rose_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; lock_sock(sk); if (sock->state != SS_UNCONNECTED) { release_sock(sk); return -EINVAL; } if (sk->sk_state != TCP_LISTEN) { struct rose_sock *rose = rose_sk(sk); rose->dest_ndigis = 0; memset(&rose->dest_addr, 0, ROSE_ADDR_LEN); memset(&rose->dest_call, 0, AX25_ADDR_LEN); memset(rose->dest_digis, 0, AX25_ADDR_LEN * ROSE_MAX_DIGIS); sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; release_sock(sk); return 0; } release_sock(sk); return -EOPNOTSUPP; } static struct proto rose_proto = { .name = "ROSE", .owner = THIS_MODULE, .obj_size = sizeof(struct rose_sock), }; static int rose_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct rose_sock *rose; if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; if (sock->type != SOCK_SEQPACKET || protocol != 0) return -ESOCKTNOSUPPORT; sk = sk_alloc(net, PF_ROSE, GFP_ATOMIC, &rose_proto, kern); if (sk == NULL) return -ENOMEM; rose = rose_sk(sk); sock_init_data(sock, sk); skb_queue_head_init(&rose->ack_queue); #ifdef M_BIT skb_queue_head_init(&rose->frag_queue); rose->fraglen = 0; #endif sock->ops = &rose_proto_ops; sk->sk_protocol = protocol; timer_setup(&rose->timer, NULL, 0); timer_setup(&rose->idletimer, NULL, 0); rose->t1 = msecs_to_jiffies(sysctl_rose_call_request_timeout); rose->t2 = msecs_to_jiffies(sysctl_rose_reset_request_timeout); rose->t3 = msecs_to_jiffies(sysctl_rose_clear_request_timeout); rose->hb = msecs_to_jiffies(sysctl_rose_ack_hold_back_timeout); rose->idle = msecs_to_jiffies(sysctl_rose_no_activity_timeout); rose->state = ROSE_STATE_0; return 0; } static struct sock *rose_make_new(struct sock *osk) { struct sock *sk; struct rose_sock *rose, *orose; if (osk->sk_type != SOCK_SEQPACKET) return NULL; sk = sk_alloc(sock_net(osk), PF_ROSE, GFP_ATOMIC, &rose_proto, 0); if (sk == NULL) return NULL; rose = rose_sk(sk); sock_init_data(NULL, sk); skb_queue_head_init(&rose->ack_queue); #ifdef M_BIT skb_queue_head_init(&rose->frag_queue); rose->fraglen = 0; #endif sk->sk_type = osk->sk_type; sk->sk_priority = READ_ONCE(osk->sk_priority); sk->sk_protocol = osk->sk_protocol; sk->sk_rcvbuf = osk->sk_rcvbuf; sk->sk_sndbuf = osk->sk_sndbuf; sk->sk_state = TCP_ESTABLISHED; sock_copy_flags(sk, osk); timer_setup(&rose->timer, NULL, 0); timer_setup(&rose->idletimer, NULL, 0); orose = rose_sk(osk); rose->t1 = orose->t1; rose->t2 = orose->t2; rose->t3 = orose->t3; rose->hb = orose->hb; rose->idle = orose->idle; rose->defer = orose->defer; rose->device = orose->device; if (rose->device) netdev_hold(rose->device, &rose->dev_tracker, GFP_ATOMIC); rose->qbitincl = orose->qbitincl; return sk; } static int rose_release(struct socket *sock) { struct sock *sk = sock->sk; struct rose_sock *rose; if (sk == NULL) return 0; sock_hold(sk); sock_orphan(sk); lock_sock(sk); rose = rose_sk(sk); switch (rose->state) { case ROSE_STATE_0: release_sock(sk); rose_disconnect(sk, 0, -1, -1); lock_sock(sk); rose_destroy_socket(sk); break; case ROSE_STATE_2: rose->neighbour->use--; release_sock(sk); rose_disconnect(sk, 0, -1, -1); lock_sock(sk); rose_destroy_socket(sk); break; case ROSE_STATE_1: case ROSE_STATE_3: case ROSE_STATE_4: case ROSE_STATE_5: rose_clear_queues(sk); rose_stop_idletimer(sk); rose_write_internal(sk, ROSE_CLEAR_REQUEST); rose_start_t3timer(sk); rose->state = ROSE_STATE_2; sk->sk_state = TCP_CLOSE; sk->sk_shutdown |= SEND_SHUTDOWN; sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); sock_set_flag(sk, SOCK_DESTROY); break; default: break; } spin_lock_bh(&rose_list_lock); netdev_put(rose->device, &rose->dev_tracker); rose->device = NULL; spin_unlock_bh(&rose_list_lock); sock->sk = NULL; release_sock(sk); sock_put(sk); return 0; } static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr; struct net_device *dev; ax25_address *source; ax25_uid_assoc *user; int n; if (!sock_flag(sk, SOCK_ZAPPED)) return -EINVAL; if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose)) return -EINVAL; if (addr->srose_family != AF_ROSE) return -EINVAL; if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1) return -EINVAL; if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS) return -EINVAL; if ((dev = rose_dev_get(&addr->srose_addr)) == NULL) return -EADDRNOTAVAIL; source = &addr->srose_call; user = ax25_findbyuid(current_euid()); if (user) { rose->source_call = user->call; ax25_uid_put(user); } else { if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { dev_put(dev); return -EACCES; } rose->source_call = *source; } rose->source_addr = addr->srose_addr; rose->device = dev; netdev_tracker_alloc(rose->device, &rose->dev_tracker, GFP_KERNEL); rose->source_ndigis = addr->srose_ndigis; if (addr_len == sizeof(struct full_sockaddr_rose)) { struct full_sockaddr_rose *full_addr = (struct full_sockaddr_rose *)uaddr; for (n = 0 ; n < addr->srose_ndigis ; n++) rose->source_digis[n] = full_addr->srose_digis[n]; } else { if (rose->source_ndigis == 1) { rose->source_digis[0] = addr->srose_digi; } } rose_insert_socket(sk); sock_reset_flag(sk, SOCK_ZAPPED); return 0; } static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr; unsigned char cause, diagnostic; ax25_uid_assoc *user; int n, err = 0; if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose)) return -EINVAL; if (addr->srose_family != AF_ROSE) return -EINVAL; if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1) return -EINVAL; if ((unsigned int) addr->srose_ndigis > ROSE_MAX_DIGIS) return -EINVAL; /* Source + Destination digis should not exceed ROSE_MAX_DIGIS */ if ((rose->source_ndigis + addr->srose_ndigis) > ROSE_MAX_DIGIS) return -EINVAL; lock_sock(sk); if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { /* Connect completed during a ERESTARTSYS event */ sock->state = SS_CONNECTED; goto out_release; } if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { sock->state = SS_UNCONNECTED; err = -ECONNREFUSED; goto out_release; } if (sk->sk_state == TCP_ESTABLISHED) { /* No reconnect on a seqpacket socket */ err = -EISCONN; goto out_release; } sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; rose->neighbour = rose_get_neigh(&addr->srose_addr, &cause, &diagnostic, 0); if (!rose->neighbour) { err = -ENETUNREACH; goto out_release; } rose->lci = rose_new_lci(rose->neighbour); if (!rose->lci) { err = -ENETUNREACH; goto out_release; } if (sock_flag(sk, SOCK_ZAPPED)) { /* Must bind first - autobinding in this may or may not work */ struct net_device *dev; sock_reset_flag(sk, SOCK_ZAPPED); dev = rose_dev_first(); if (!dev) { err = -ENETUNREACH; goto out_release; } user = ax25_findbyuid(current_euid()); if (!user) { err = -EINVAL; dev_put(dev); goto out_release; } memcpy(&rose->source_addr, dev->dev_addr, ROSE_ADDR_LEN); rose->source_call = user->call; rose->device = dev; netdev_tracker_alloc(rose->device, &rose->dev_tracker, GFP_KERNEL); ax25_uid_put(user); rose_insert_socket(sk); /* Finish the bind */ } rose->dest_addr = addr->srose_addr; rose->dest_call = addr->srose_call; rose->rand = ((long)rose & 0xFFFF) + rose->lci; rose->dest_ndigis = addr->srose_ndigis; if (addr_len == sizeof(struct full_sockaddr_rose)) { struct full_sockaddr_rose *full_addr = (struct full_sockaddr_rose *)uaddr; for (n = 0 ; n < addr->srose_ndigis ; n++) rose->dest_digis[n] = full_addr->srose_digis[n]; } else { if (rose->dest_ndigis == 1) { rose->dest_digis[0] = addr->srose_digi; } } /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; sk->sk_state = TCP_SYN_SENT; rose->state = ROSE_STATE_1; rose->neighbour->use++; rose_write_internal(sk, ROSE_CALL_REQUEST); rose_start_heartbeat(sk); rose_start_t1timer(sk); /* Now the loop */ if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) { err = -EINPROGRESS; goto out_release; } /* * A Connect Ack with Choke or timeout or failed routing will go to * closed. */ if (sk->sk_state == TCP_SYN_SENT) { DEFINE_WAIT(wait); for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); if (sk->sk_state != TCP_SYN_SENT) break; if (!signal_pending(current)) { release_sock(sk); schedule(); lock_sock(sk); continue; } err = -ERESTARTSYS; break; } finish_wait(sk_sleep(sk), &wait); if (err) goto out_release; } if (sk->sk_state != TCP_ESTABLISHED) { sock->state = SS_UNCONNECTED; err = sock_error(sk); /* Always set at this point */ goto out_release; } sock->state = SS_CONNECTED; out_release: release_sock(sk); return err; } static int rose_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sk_buff *skb; struct sock *newsk; DEFINE_WAIT(wait); struct sock *sk; int err = 0; if ((sk = sock->sk) == NULL) return -EINVAL; lock_sock(sk); if (sk->sk_type != SOCK_SEQPACKET) { err = -EOPNOTSUPP; goto out_release; } if (sk->sk_state != TCP_LISTEN) { err = -EINVAL; goto out_release; } /* * The write queue this time is holding sockets ready to use * hooked into the SABM we saved */ for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); skb = skb_dequeue(&sk->sk_receive_queue); if (skb) break; if (flags & O_NONBLOCK) { err = -EWOULDBLOCK; break; } if (!signal_pending(current)) { release_sock(sk); schedule(); lock_sock(sk); continue; } err = -ERESTARTSYS; break; } finish_wait(sk_sleep(sk), &wait); if (err) goto out_release; newsk = skb->sk; sock_graft(newsk, newsock); /* Now attach up the new socket */ skb->sk = NULL; kfree_skb(skb); sk_acceptq_removed(sk); out_release: release_sock(sk); return err; } static int rose_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct full_sockaddr_rose *srose = (struct full_sockaddr_rose *)uaddr; struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); int n; memset(srose, 0, sizeof(*srose)); if (peer != 0) { if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; srose->srose_family = AF_ROSE; srose->srose_addr = rose->dest_addr; srose->srose_call = rose->dest_call; srose->srose_ndigis = rose->dest_ndigis; for (n = 0; n < rose->dest_ndigis; n++) srose->srose_digis[n] = rose->dest_digis[n]; } else { srose->srose_family = AF_ROSE; srose->srose_addr = rose->source_addr; srose->srose_call = rose->source_call; srose->srose_ndigis = rose->source_ndigis; for (n = 0; n < rose->source_ndigis; n++) srose->srose_digis[n] = rose->source_digis[n]; } return sizeof(struct full_sockaddr_rose); } int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct rose_neigh *neigh, unsigned int lci) { struct sock *sk; struct sock *make; struct rose_sock *make_rose; struct rose_facilities_struct facilities; int n; skb->sk = NULL; /* Initially we don't know who it's for */ /* * skb->data points to the rose frame start */ memset(&facilities, 0x00, sizeof(struct rose_facilities_struct)); if (!rose_parse_facilities(skb->data + ROSE_CALL_REQ_FACILITIES_OFF, skb->len - ROSE_CALL_REQ_FACILITIES_OFF, &facilities)) { rose_transmit_clear_request(neigh, lci, ROSE_INVALID_FACILITY, 76); return 0; } sk = rose_find_listener(&facilities.source_addr, &facilities.source_call); /* * We can't accept the Call Request. */ if (sk == NULL || sk_acceptq_is_full(sk) || (make = rose_make_new(sk)) == NULL) { rose_transmit_clear_request(neigh, lci, ROSE_NETWORK_CONGESTION, 120); return 0; } skb->sk = make; make->sk_state = TCP_ESTABLISHED; make_rose = rose_sk(make); make_rose->lci = lci; make_rose->dest_addr = facilities.dest_addr; make_rose->dest_call = facilities.dest_call; make_rose->dest_ndigis = facilities.dest_ndigis; for (n = 0 ; n < facilities.dest_ndigis ; n++) make_rose->dest_digis[n] = facilities.dest_digis[n]; make_rose->source_addr = facilities.source_addr; make_rose->source_call = facilities.source_call; make_rose->source_ndigis = facilities.source_ndigis; for (n = 0 ; n < facilities.source_ndigis ; n++) make_rose->source_digis[n] = facilities.source_digis[n]; make_rose->neighbour = neigh; make_rose->device = dev; /* Caller got a reference for us. */ netdev_tracker_alloc(make_rose->device, &make_rose->dev_tracker, GFP_ATOMIC); make_rose->facilities = facilities; make_rose->neighbour->use++; if (rose_sk(sk)->defer) { make_rose->state = ROSE_STATE_5; } else { rose_write_internal(make, ROSE_CALL_ACCEPTED); make_rose->state = ROSE_STATE_3; rose_start_idletimer(make); } make_rose->condition = 0x00; make_rose->vs = 0; make_rose->va = 0; make_rose->vr = 0; make_rose->vl = 0; sk_acceptq_added(sk); rose_insert_socket(make); skb_queue_head(&sk->sk_receive_queue, skb); rose_start_heartbeat(make); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); return 1; } static int rose_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); DECLARE_SOCKADDR(struct sockaddr_rose *, usrose, msg->msg_name); int err; struct full_sockaddr_rose srose; struct sk_buff *skb; unsigned char *asmptr; int n, size, qbit = 0; if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT)) return -EINVAL; if (sock_flag(sk, SOCK_ZAPPED)) return -EADDRNOTAVAIL; if (sk->sk_shutdown & SEND_SHUTDOWN) { send_sig(SIGPIPE, current, 0); return -EPIPE; } if (rose->neighbour == NULL || rose->device == NULL) return -ENETUNREACH; if (usrose != NULL) { if (msg->msg_namelen != sizeof(struct sockaddr_rose) && msg->msg_namelen != sizeof(struct full_sockaddr_rose)) return -EINVAL; memset(&srose, 0, sizeof(struct full_sockaddr_rose)); memcpy(&srose, usrose, msg->msg_namelen); if (rosecmp(&rose->dest_addr, &srose.srose_addr) != 0 || ax25cmp(&rose->dest_call, &srose.srose_call) != 0) return -EISCONN; if (srose.srose_ndigis != rose->dest_ndigis) return -EISCONN; if (srose.srose_ndigis == rose->dest_ndigis) { for (n = 0 ; n < srose.srose_ndigis ; n++) if (ax25cmp(&rose->dest_digis[n], &srose.srose_digis[n])) return -EISCONN; } if (srose.srose_family != AF_ROSE) return -EINVAL; } else { if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; srose.srose_family = AF_ROSE; srose.srose_addr = rose->dest_addr; srose.srose_call = rose->dest_call; srose.srose_ndigis = rose->dest_ndigis; for (n = 0 ; n < rose->dest_ndigis ; n++) srose.srose_digis[n] = rose->dest_digis[n]; } /* Build a packet */ /* Sanity check the packet size */ if (len > 65535) return -EMSGSIZE; size = len + AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN; if ((skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, &err)) == NULL) return err; skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN); /* * Put the data on the end */ skb_reset_transport_header(skb); skb_put(skb, len); err = memcpy_from_msg(skb_transport_header(skb), msg, len); if (err) { kfree_skb(skb); return err; } /* * If the Q BIT Include socket option is in force, the first * byte of the user data is the logical value of the Q Bit. */ if (rose->qbitincl) { qbit = skb->data[0]; skb_pull(skb, 1); } /* * Push down the ROSE header */ asmptr = skb_push(skb, ROSE_MIN_LEN); /* Build a ROSE Network header */ asmptr[0] = ((rose->lci >> 8) & 0x0F) | ROSE_GFI; asmptr[1] = (rose->lci >> 0) & 0xFF; asmptr[2] = ROSE_DATA; if (qbit) asmptr[0] |= ROSE_Q_BIT; if (sk->sk_state != TCP_ESTABLISHED) { kfree_skb(skb); return -ENOTCONN; } #ifdef M_BIT #define ROSE_PACLEN (256-ROSE_MIN_LEN) if (skb->len - ROSE_MIN_LEN > ROSE_PACLEN) { unsigned char header[ROSE_MIN_LEN]; struct sk_buff *skbn; int frontlen; int lg; /* Save a copy of the Header */ skb_copy_from_linear_data(skb, header, ROSE_MIN_LEN); skb_pull(skb, ROSE_MIN_LEN); frontlen = skb_headroom(skb); while (skb->len > 0) { if ((skbn = sock_alloc_send_skb(sk, frontlen + ROSE_PACLEN, 0, &err)) == NULL) { kfree_skb(skb); return err; } skbn->sk = sk; skbn->free = 1; skbn->arp = 1; skb_reserve(skbn, frontlen); lg = (ROSE_PACLEN > skb->len) ? skb->len : ROSE_PACLEN; /* Copy the user data */ skb_copy_from_linear_data(skb, skb_put(skbn, lg), lg); skb_pull(skb, lg); /* Duplicate the Header */ skb_push(skbn, ROSE_MIN_LEN); skb_copy_to_linear_data(skbn, header, ROSE_MIN_LEN); if (skb->len > 0) skbn->data[2] |= M_BIT; skb_queue_tail(&sk->sk_write_queue, skbn); /* Throw it on the queue */ } skb->free = 1; kfree_skb(skb); } else { skb_queue_tail(&sk->sk_write_queue, skb); /* Throw it on the queue */ } #else skb_queue_tail(&sk->sk_write_queue, skb); /* Shove it onto the queue */ #endif rose_kick(sk); return len; } static int rose_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); size_t copied; unsigned char *asmptr; struct sk_buff *skb; int n, er, qbit; /* * This works for seqpacket too. The receiver has ordered the queue for * us! We do one quick check first though */ if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; /* Now we can treat all alike */ skb = skb_recv_datagram(sk, flags, &er); if (!skb) return er; qbit = (skb->data[0] & ROSE_Q_BIT) == ROSE_Q_BIT; skb_pull(skb, ROSE_MIN_LEN); if (rose->qbitincl) { asmptr = skb_push(skb, 1); *asmptr = qbit; } skb_reset_transport_header(skb); copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } skb_copy_datagram_msg(skb, 0, msg, copied); if (msg->msg_name) { struct sockaddr_rose *srose; DECLARE_SOCKADDR(struct full_sockaddr_rose *, full_srose, msg->msg_name); memset(msg->msg_name, 0, sizeof(struct full_sockaddr_rose)); srose = msg->msg_name; srose->srose_family = AF_ROSE; srose->srose_addr = rose->dest_addr; srose->srose_call = rose->dest_call; srose->srose_ndigis = rose->dest_ndigis; for (n = 0 ; n < rose->dest_ndigis ; n++) full_srose->srose_digis[n] = rose->dest_digis[n]; msg->msg_namelen = sizeof(struct full_sockaddr_rose); } skb_free_datagram(sk, skb); return copied; } static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); void __user *argp = (void __user *)arg; switch (cmd) { case TIOCOUTQ: { long amount; amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (amount < 0) amount = 0; return put_user(amount, (unsigned int __user *) argp); } case TIOCINQ: { struct sk_buff *skb; long amount = 0L; spin_lock_irq(&sk->sk_receive_queue.lock); if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) amount = skb->len; spin_unlock_irq(&sk->sk_receive_queue.lock); return put_user(amount, (unsigned int __user *) argp); } case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: return -EINVAL; case SIOCADDRT: case SIOCDELRT: case SIOCRSCLRRT: if (!capable(CAP_NET_ADMIN)) return -EPERM; return rose_rt_ioctl(cmd, argp); case SIOCRSGCAUSE: { struct rose_cause_struct rose_cause; rose_cause.cause = rose->cause; rose_cause.diagnostic = rose->diagnostic; return copy_to_user(argp, &rose_cause, sizeof(struct rose_cause_struct)) ? -EFAULT : 0; } case SIOCRSSCAUSE: { struct rose_cause_struct rose_cause; if (copy_from_user(&rose_cause, argp, sizeof(struct rose_cause_struct))) return -EFAULT; rose->cause = rose_cause.cause; rose->diagnostic = rose_cause.diagnostic; return 0; } case SIOCRSSL2CALL: if (!capable(CAP_NET_ADMIN)) return -EPERM; if (ax25cmp(&rose_callsign, &null_ax25_address) != 0) ax25_listen_release(&rose_callsign, NULL); if (copy_from_user(&rose_callsign, argp, sizeof(ax25_address))) return -EFAULT; if (ax25cmp(&rose_callsign, &null_ax25_address) != 0) return ax25_listen_register(&rose_callsign, NULL); return 0; case SIOCRSGL2CALL: return copy_to_user(argp, &rose_callsign, sizeof(ax25_address)) ? -EFAULT : 0; case SIOCRSACCEPT: if (rose->state == ROSE_STATE_5) { rose_write_internal(sk, ROSE_CALL_ACCEPTED); rose_start_idletimer(sk); rose->condition = 0x00; rose->vs = 0; rose->va = 0; rose->vr = 0; rose->vl = 0; rose->state = ROSE_STATE_3; } return 0; default: return -ENOIOCTLCMD; } return 0; } #ifdef CONFIG_PROC_FS static void *rose_info_start(struct seq_file *seq, loff_t *pos) __acquires(rose_list_lock) { spin_lock_bh(&rose_list_lock); return seq_hlist_start_head(&rose_list, *pos); } static void *rose_info_next(struct seq_file *seq, void *v, loff_t *pos) { return seq_hlist_next(v, &rose_list, pos); } static void rose_info_stop(struct seq_file *seq, void *v) __releases(rose_list_lock) { spin_unlock_bh(&rose_list_lock); } static int rose_info_show(struct seq_file *seq, void *v) { char buf[11], rsbuf[11]; if (v == SEQ_START_TOKEN) seq_puts(seq, "dest_addr dest_call src_addr src_call dev lci neigh st vs vr va t t1 t2 t3 hb idle Snd-Q Rcv-Q inode\n"); else { struct sock *s = sk_entry(v); struct rose_sock *rose = rose_sk(s); const char *devname, *callsign; const struct net_device *dev = rose->device; if (!dev) devname = "???"; else devname = dev->name; seq_printf(seq, "%-10s %-9s ", rose2asc(rsbuf, &rose->dest_addr), ax2asc(buf, &rose->dest_call)); if (ax25cmp(&rose->source_call, &null_ax25_address) == 0) callsign = "??????-?"; else callsign = ax2asc(buf, &rose->source_call); seq_printf(seq, "%-10s %-9s %-5s %3.3X %05d %d %d %d %d %3lu %3lu %3lu %3lu %3lu %3lu/%03lu %5d %5d %ld\n", rose2asc(rsbuf, &rose->source_addr), callsign, devname, rose->lci & 0x0FFF, (rose->neighbour) ? rose->neighbour->number : 0, rose->state, rose->vs, rose->vr, rose->va, ax25_display_timer(&rose->timer) / HZ, rose->t1 / HZ, rose->t2 / HZ, rose->t3 / HZ, rose->hb / HZ, ax25_display_timer(&rose->idletimer) / (60 * HZ), rose->idle / (60 * HZ), sk_wmem_alloc_get(s), sk_rmem_alloc_get(s), s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : 0L); } return 0; } static const struct seq_operations rose_info_seqops = { .start = rose_info_start, .next = rose_info_next, .stop = rose_info_stop, .show = rose_info_show, }; #endif /* CONFIG_PROC_FS */ static const struct net_proto_family rose_family_ops = { .family = PF_ROSE, .create = rose_create, .owner = THIS_MODULE, }; static const struct proto_ops rose_proto_ops = { .family = PF_ROSE, .owner = THIS_MODULE, .release = rose_release, .bind = rose_bind, .connect = rose_connect, .socketpair = sock_no_socketpair, .accept = rose_accept, .getname = rose_getname, .poll = datagram_poll, .ioctl = rose_ioctl, .gettstamp = sock_gettstamp, .listen = rose_listen, .shutdown = sock_no_shutdown, .setsockopt = rose_setsockopt, .getsockopt = rose_getsockopt, .sendmsg = rose_sendmsg, .recvmsg = rose_recvmsg, .mmap = sock_no_mmap, }; static struct notifier_block rose_dev_notifier = { .notifier_call = rose_device_event, }; static struct net_device **dev_rose; static struct ax25_protocol rose_pid = { .pid = AX25_P_ROSE, .func = rose_route_frame }; static struct ax25_linkfail rose_linkfail_notifier = { .func = rose_link_failed }; static int __init rose_proto_init(void) { int i; int rc; if (rose_ndevs > 0x7FFFFFFF/sizeof(struct net_device *)) { printk(KERN_ERR "ROSE: rose_proto_init - rose_ndevs parameter too large\n"); rc = -EINVAL; goto out; } rc = proto_register(&rose_proto, 0); if (rc != 0) goto out; rose_callsign = null_ax25_address; dev_rose = kcalloc(rose_ndevs, sizeof(struct net_device *), GFP_KERNEL); if (dev_rose == NULL) { printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate device structure\n"); rc = -ENOMEM; goto out_proto_unregister; } for (i = 0; i < rose_ndevs; i++) { struct net_device *dev; char name[IFNAMSIZ]; sprintf(name, "rose%d", i); dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, rose_setup); if (!dev) { printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate memory\n"); rc = -ENOMEM; goto fail; } rc = register_netdev(dev); if (rc) { printk(KERN_ERR "ROSE: netdevice registration failed\n"); free_netdev(dev); goto fail; } rose_set_lockdep_key(dev); dev_rose[i] = dev; } sock_register(&rose_family_ops); register_netdevice_notifier(&rose_dev_notifier); ax25_register_pid(&rose_pid); ax25_linkfail_register(&rose_linkfail_notifier); #ifdef CONFIG_SYSCTL rose_register_sysctl(); #endif rose_loopback_init(); rose_add_loopback_neigh(); proc_create_seq("rose", 0444, init_net.proc_net, &rose_info_seqops); proc_create_seq("rose_neigh", 0444, init_net.proc_net, &rose_neigh_seqops); proc_create_seq("rose_nodes", 0444, init_net.proc_net, &rose_node_seqops); proc_create_seq("rose_routes", 0444, init_net.proc_net, &rose_route_seqops); out: return rc; fail: while (--i >= 0) { unregister_netdev(dev_rose[i]); free_netdev(dev_rose[i]); } kfree(dev_rose); out_proto_unregister: proto_unregister(&rose_proto); goto out; } module_init(rose_proto_init); module_param(rose_ndevs, int, 0); MODULE_PARM_DESC(rose_ndevs, "number of ROSE devices"); MODULE_AUTHOR("Jonathan Naylor G4KLX <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The amateur radio ROSE network layer protocol"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_ROSE); static void __exit rose_exit(void) { int i; remove_proc_entry("rose", init_net.proc_net); remove_proc_entry("rose_neigh", init_net.proc_net); remove_proc_entry("rose_nodes", init_net.proc_net); remove_proc_entry("rose_routes", init_net.proc_net); rose_loopback_clear(); rose_rt_free(); ax25_protocol_release(AX25_P_ROSE); ax25_linkfail_release(&rose_linkfail_notifier); if (ax25cmp(&rose_callsign, &null_ax25_address) != 0) ax25_listen_release(&rose_callsign, NULL); #ifdef CONFIG_SYSCTL rose_unregister_sysctl(); #endif unregister_netdevice_notifier(&rose_dev_notifier); sock_unregister(PF_ROSE); for (i = 0; i < rose_ndevs; i++) { struct net_device *dev = dev_rose[i]; if (dev) { unregister_netdev(dev); free_netdev(dev); } } kfree(dev_rose); proto_unregister(&rose_proto); } module_exit(rose_exit);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 // SPDX-License-Identifier: GPL-2.0 /* * io_uring opcode handling table */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/io_uring.h> #include "io_uring.h" #include "opdef.h" #include "refs.h" #include "tctx.h" #include "sqpoll.h" #include "fdinfo.h" #include "kbuf.h" #include "rsrc.h" #include "xattr.h" #include "nop.h" #include "fs.h" #include "splice.h" #include "sync.h" #include "advise.h" #include "openclose.h" #include "uring_cmd.h" #include "epoll.h" #include "statx.h" #include "net.h" #include "msg_ring.h" #include "timeout.h" #include "poll.h" #include "cancel.h" #include "rw.h" #include "waitid.h" #include "futex.h" static int io_no_issue(struct io_kiocb *req, unsigned int issue_flags) { WARN_ON_ONCE(1); return -ECANCELED; } static __maybe_unused int io_eopnotsupp_prep(struct io_kiocb *kiocb, const struct io_uring_sqe *sqe) { return -EOPNOTSUPP; } const struct io_issue_def io_issue_defs[] = { [IORING_OP_NOP] = { .audit_skip = 1, .iopoll = 1, .prep = io_nop_prep, .issue = io_nop, }, [IORING_OP_READV] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, .vectored = 1, .prep = io_prep_rwv, .issue = io_read, }, [IORING_OP_WRITEV] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, .vectored = 1, .prep = io_prep_rwv, .issue = io_write, }, [IORING_OP_FSYNC] = { .needs_file = 1, .audit_skip = 1, .prep = io_fsync_prep, .issue = io_fsync, }, [IORING_OP_READ_FIXED] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, .prep = io_prep_rw_fixed, .issue = io_read, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, .prep = io_prep_rw_fixed, .issue = io_write, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, .prep = io_poll_add_prep, .issue = io_poll_add, }, [IORING_OP_POLL_REMOVE] = { .audit_skip = 1, .prep = io_poll_remove_prep, .issue = io_poll_remove, }, [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, .audit_skip = 1, .prep = io_sfr_prep, .issue = io_sync_file_range, }, [IORING_OP_SENDMSG] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .ioprio = 1, .manual_alloc = 1, #if defined(CONFIG_NET) .prep = io_sendmsg_prep, .issue = io_sendmsg, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_RECVMSG] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, .ioprio = 1, .manual_alloc = 1, #if defined(CONFIG_NET) .prep = io_recvmsg_prep, .issue = io_recvmsg, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_TIMEOUT] = { .audit_skip = 1, .prep = io_timeout_prep, .issue = io_timeout, }, [IORING_OP_TIMEOUT_REMOVE] = { /* used by timeout updates' prep() */ .audit_skip = 1, .prep = io_timeout_remove_prep, .issue = io_timeout_remove, }, [IORING_OP_ACCEPT] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .poll_exclusive = 1, .ioprio = 1, /* used for flags */ #if defined(CONFIG_NET) .prep = io_accept_prep, .issue = io_accept, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_ASYNC_CANCEL] = { .audit_skip = 1, .prep = io_async_cancel_prep, .issue = io_async_cancel, }, [IORING_OP_LINK_TIMEOUT] = { .audit_skip = 1, .prep = io_link_timeout_prep, .issue = io_no_issue, }, [IORING_OP_CONNECT] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, #if defined(CONFIG_NET) .prep = io_connect_prep, .issue = io_connect, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_FALLOCATE] = { .needs_file = 1, .prep = io_fallocate_prep, .issue = io_fallocate, }, [IORING_OP_OPENAT] = { .prep = io_openat_prep, .issue = io_openat, }, [IORING_OP_CLOSE] = { .prep = io_close_prep, .issue = io_close, }, [IORING_OP_FILES_UPDATE] = { .audit_skip = 1, .iopoll = 1, .prep = io_files_update_prep, .issue = io_files_update, }, [IORING_OP_STATX] = { .audit_skip = 1, .prep = io_statx_prep, .issue = io_statx, }, [IORING_OP_READ] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, .prep = io_prep_rw, .issue = io_read, }, [IORING_OP_WRITE] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .plug = 1, .audit_skip = 1, .ioprio = 1, .iopoll = 1, .iopoll_queue = 1, .prep = io_prep_rw, .issue = io_write, }, [IORING_OP_FADVISE] = { .needs_file = 1, .audit_skip = 1, .prep = io_fadvise_prep, .issue = io_fadvise, }, [IORING_OP_MADVISE] = { .audit_skip = 1, .prep = io_madvise_prep, .issue = io_madvise, }, [IORING_OP_SEND] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .audit_skip = 1, .ioprio = 1, .manual_alloc = 1, #if defined(CONFIG_NET) .prep = io_sendmsg_prep, .issue = io_send, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_RECV] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, .audit_skip = 1, .ioprio = 1, #if defined(CONFIG_NET) .prep = io_recvmsg_prep, .issue = io_recv, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_OPENAT2] = { .prep = io_openat2_prep, .issue = io_openat2, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, .audit_skip = 1, #if defined(CONFIG_EPOLL) .prep = io_epoll_ctl_prep, .issue = io_epoll_ctl, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_SPLICE] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, .prep = io_splice_prep, .issue = io_splice, }, [IORING_OP_PROVIDE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, .prep = io_provide_buffers_prep, .issue = io_provide_buffers, }, [IORING_OP_REMOVE_BUFFERS] = { .audit_skip = 1, .iopoll = 1, .prep = io_remove_buffers_prep, .issue = io_remove_buffers, }, [IORING_OP_TEE] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, .audit_skip = 1, .prep = io_tee_prep, .issue = io_tee, }, [IORING_OP_SHUTDOWN] = { .needs_file = 1, #if defined(CONFIG_NET) .prep = io_shutdown_prep, .issue = io_shutdown, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_RENAMEAT] = { .prep = io_renameat_prep, .issue = io_renameat, }, [IORING_OP_UNLINKAT] = { .prep = io_unlinkat_prep, .issue = io_unlinkat, }, [IORING_OP_MKDIRAT] = { .prep = io_mkdirat_prep, .issue = io_mkdirat, }, [IORING_OP_SYMLINKAT] = { .prep = io_symlinkat_prep, .issue = io_symlinkat, }, [IORING_OP_LINKAT] = { .prep = io_linkat_prep, .issue = io_linkat, }, [IORING_OP_MSG_RING] = { .needs_file = 1, .iopoll = 1, .prep = io_msg_ring_prep, .issue = io_msg_ring, }, [IORING_OP_FSETXATTR] = { .needs_file = 1, .prep = io_fsetxattr_prep, .issue = io_fsetxattr, }, [IORING_OP_SETXATTR] = { .prep = io_setxattr_prep, .issue = io_setxattr, }, [IORING_OP_FGETXATTR] = { .needs_file = 1, .prep = io_fgetxattr_prep, .issue = io_fgetxattr, }, [IORING_OP_GETXATTR] = { .prep = io_getxattr_prep, .issue = io_getxattr, }, [IORING_OP_SOCKET] = { .audit_skip = 1, #if defined(CONFIG_NET) .prep = io_socket_prep, .issue = io_socket, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_URING_CMD] = { .needs_file = 1, .plug = 1, .iopoll = 1, .iopoll_queue = 1, .prep = io_uring_cmd_prep, .issue = io_uring_cmd, }, [IORING_OP_SEND_ZC] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .audit_skip = 1, .ioprio = 1, .manual_alloc = 1, #if defined(CONFIG_NET) .prep = io_send_zc_prep, .issue = io_send_zc, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_SENDMSG_ZC] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .ioprio = 1, .manual_alloc = 1, #if defined(CONFIG_NET) .prep = io_send_zc_prep, .issue = io_sendmsg_zc, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_READ_MULTISHOT] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, .audit_skip = 1, .prep = io_read_mshot_prep, .issue = io_read_mshot, }, [IORING_OP_WAITID] = { .prep = io_waitid_prep, .issue = io_waitid, }, [IORING_OP_FUTEX_WAIT] = { #if defined(CONFIG_FUTEX) .prep = io_futex_prep, .issue = io_futex_wait, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_FUTEX_WAKE] = { #if defined(CONFIG_FUTEX) .prep = io_futex_prep, .issue = io_futex_wake, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_FUTEX_WAITV] = { #if defined(CONFIG_FUTEX) .prep = io_futexv_prep, .issue = io_futexv_wait, #else .prep = io_eopnotsupp_prep, #endif }, [IORING_OP_FIXED_FD_INSTALL] = { .needs_file = 1, .prep = io_install_fixed_fd_prep, .issue = io_install_fixed_fd, }, }; const struct io_cold_def io_cold_defs[] = { [IORING_OP_NOP] = { .name = "NOP", }, [IORING_OP_READV] = { .async_size = sizeof(struct io_async_rw), .name = "READV", .prep_async = io_readv_prep_async, .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_WRITEV] = { .async_size = sizeof(struct io_async_rw), .name = "WRITEV", .prep_async = io_writev_prep_async, .cleanup = io_readv_writev_cleanup, .fail = io_rw_fail, }, [IORING_OP_FSYNC] = { .name = "FSYNC", }, [IORING_OP_READ_FIXED] = { .async_size = sizeof(struct io_async_rw), .name = "READ_FIXED", .fail = io_rw_fail, }, [IORING_OP_WRITE_FIXED] = { .async_size = sizeof(struct io_async_rw), .name = "WRITE_FIXED", .fail = io_rw_fail, }, [IORING_OP_POLL_ADD] = { .name = "POLL_ADD", }, [IORING_OP_POLL_REMOVE] = { .name = "POLL_REMOVE", }, [IORING_OP_SYNC_FILE_RANGE] = { .name = "SYNC_FILE_RANGE", }, [IORING_OP_SENDMSG] = { .name = "SENDMSG", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep_async = io_sendmsg_prep_async, .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif }, [IORING_OP_RECVMSG] = { .name = "RECVMSG", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep_async = io_recvmsg_prep_async, .cleanup = io_sendmsg_recvmsg_cleanup, .fail = io_sendrecv_fail, #endif }, [IORING_OP_TIMEOUT] = { .async_size = sizeof(struct io_timeout_data), .name = "TIMEOUT", }, [IORING_OP_TIMEOUT_REMOVE] = { .name = "TIMEOUT_REMOVE", }, [IORING_OP_ACCEPT] = { .name = "ACCEPT", }, [IORING_OP_ASYNC_CANCEL] = { .name = "ASYNC_CANCEL", }, [IORING_OP_LINK_TIMEOUT] = { .async_size = sizeof(struct io_timeout_data), .name = "LINK_TIMEOUT", }, [IORING_OP_CONNECT] = { .name = "CONNECT", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_connect), .prep_async = io_connect_prep_async, #endif }, [IORING_OP_FALLOCATE] = { .name = "FALLOCATE", }, [IORING_OP_OPENAT] = { .name = "OPENAT", .cleanup = io_open_cleanup, }, [IORING_OP_CLOSE] = { .name = "CLOSE", }, [IORING_OP_FILES_UPDATE] = { .name = "FILES_UPDATE", }, [IORING_OP_STATX] = { .name = "STATX", .cleanup = io_statx_cleanup, }, [IORING_OP_READ] = { .async_size = sizeof(struct io_async_rw), .name = "READ", .fail = io_rw_fail, }, [IORING_OP_WRITE] = { .async_size = sizeof(struct io_async_rw), .name = "WRITE", .fail = io_rw_fail, }, [IORING_OP_FADVISE] = { .name = "FADVISE", }, [IORING_OP_MADVISE] = { .name = "MADVISE", }, [IORING_OP_SEND] = { .name = "SEND", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .fail = io_sendrecv_fail, .prep_async = io_send_prep_async, #endif }, [IORING_OP_RECV] = { .name = "RECV", #if defined(CONFIG_NET) .fail = io_sendrecv_fail, #endif }, [IORING_OP_OPENAT2] = { .name = "OPENAT2", .cleanup = io_open_cleanup, }, [IORING_OP_EPOLL_CTL] = { .name = "EPOLL", }, [IORING_OP_SPLICE] = { .name = "SPLICE", }, [IORING_OP_PROVIDE_BUFFERS] = { .name = "PROVIDE_BUFFERS", }, [IORING_OP_REMOVE_BUFFERS] = { .name = "REMOVE_BUFFERS", }, [IORING_OP_TEE] = { .name = "TEE", }, [IORING_OP_SHUTDOWN] = { .name = "SHUTDOWN", }, [IORING_OP_RENAMEAT] = { .name = "RENAMEAT", .cleanup = io_renameat_cleanup, }, [IORING_OP_UNLINKAT] = { .name = "UNLINKAT", .cleanup = io_unlinkat_cleanup, }, [IORING_OP_MKDIRAT] = { .name = "MKDIRAT", .cleanup = io_mkdirat_cleanup, }, [IORING_OP_SYMLINKAT] = { .name = "SYMLINKAT", .cleanup = io_link_cleanup, }, [IORING_OP_LINKAT] = { .name = "LINKAT", .cleanup = io_link_cleanup, }, [IORING_OP_MSG_RING] = { .name = "MSG_RING", .cleanup = io_msg_ring_cleanup, }, [IORING_OP_FSETXATTR] = { .name = "FSETXATTR", .cleanup = io_xattr_cleanup, }, [IORING_OP_SETXATTR] = { .name = "SETXATTR", .cleanup = io_xattr_cleanup, }, [IORING_OP_FGETXATTR] = { .name = "FGETXATTR", .cleanup = io_xattr_cleanup, }, [IORING_OP_GETXATTR] = { .name = "GETXATTR", .cleanup = io_xattr_cleanup, }, [IORING_OP_SOCKET] = { .name = "SOCKET", }, [IORING_OP_URING_CMD] = { .name = "URING_CMD", .async_size = 2 * sizeof(struct io_uring_sqe), .prep_async = io_uring_cmd_prep_async, }, [IORING_OP_SEND_ZC] = { .name = "SEND_ZC", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep_async = io_send_prep_async, .cleanup = io_send_zc_cleanup, .fail = io_sendrecv_fail, #endif }, [IORING_OP_SENDMSG_ZC] = { .name = "SENDMSG_ZC", #if defined(CONFIG_NET) .async_size = sizeof(struct io_async_msghdr), .prep_async = io_sendmsg_prep_async, .cleanup = io_send_zc_cleanup, .fail = io_sendrecv_fail, #endif }, [IORING_OP_READ_MULTISHOT] = { .name = "READ_MULTISHOT", }, [IORING_OP_WAITID] = { .name = "WAITID", .async_size = sizeof(struct io_waitid_async), }, [IORING_OP_FUTEX_WAIT] = { .name = "FUTEX_WAIT", }, [IORING_OP_FUTEX_WAKE] = { .name = "FUTEX_WAKE", }, [IORING_OP_FUTEX_WAITV] = { .name = "FUTEX_WAITV", }, [IORING_OP_FIXED_FD_INSTALL] = { .name = "FIXED_FD_INSTALL", }, }; const char *io_uring_get_opcode(u8 opcode) { if (opcode < IORING_OP_LAST) return io_cold_defs[opcode].name; return "INVALID"; } void __init io_uring_optable_init(void) { int i; BUILD_BUG_ON(ARRAY_SIZE(io_cold_defs) != IORING_OP_LAST); BUILD_BUG_ON(ARRAY_SIZE(io_issue_defs) != IORING_OP_LAST); for (i = 0; i < ARRAY_SIZE(io_issue_defs); i++) { BUG_ON(!io_issue_defs[i].prep); if (io_issue_defs[i].prep != io_eopnotsupp_prep) BUG_ON(!io_issue_defs[i].issue); WARN_ON_ONCE(!io_cold_defs[i].name); } }
9 9 9 76 76 1 73 52 52 73 48 48 48 39 39 14 13 8 21 22 22 22 15 22 15 15 22 22 9 10 14 10 4 6 1 1 1 1 1 1 1 1 4 1 1 1 1 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 /* * * Copyright IBM Corporation, 2012 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> * * Cgroup v2 * Copyright (C) 2019 Red Hat, Inc. * Author: Giuseppe Scrivano <gscrivan@redhat.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2.1 of the GNU Lesser General Public License * as published by the Free Software Foundation. * * This program is distributed in the hope that it would be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * */ #include <linux/cgroup.h> #include <linux/page_counter.h> #include <linux/slab.h> #include <linux/hugetlb.h> #include <linux/hugetlb_cgroup.h> #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) static struct hugetlb_cgroup *root_h_cgroup __read_mostly; static inline struct page_counter * __hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx, bool rsvd) { if (rsvd) return &h_cg->rsvd_hugepage[idx]; return &h_cg->hugepage[idx]; } static inline struct page_counter * hugetlb_cgroup_counter_from_cgroup(struct hugetlb_cgroup *h_cg, int idx) { return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, false); } static inline struct page_counter * hugetlb_cgroup_counter_from_cgroup_rsvd(struct hugetlb_cgroup *h_cg, int idx) { return __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, true); } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) { return s ? container_of(s, struct hugetlb_cgroup, css) : NULL; } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) { return hugetlb_cgroup_from_css(task_css(task, hugetlb_cgrp_id)); } static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) { return (h_cg == root_h_cgroup); } static inline struct hugetlb_cgroup * parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) { return hugetlb_cgroup_from_css(h_cg->css.parent); } static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) { struct hstate *h; for_each_hstate(h) { if (page_counter_read( hugetlb_cgroup_counter_from_cgroup(h_cg, hstate_index(h)))) return true; } return false; } static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, struct hugetlb_cgroup *parent_h_cgroup) { int idx; for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) { struct page_counter *fault_parent = NULL; struct page_counter *rsvd_parent = NULL; unsigned long limit; int ret; if (parent_h_cgroup) { fault_parent = hugetlb_cgroup_counter_from_cgroup( parent_h_cgroup, idx); rsvd_parent = hugetlb_cgroup_counter_from_cgroup_rsvd( parent_h_cgroup, idx); } page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx), fault_parent); page_counter_init( hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), rsvd_parent); limit = round_down(PAGE_COUNTER_MAX, pages_per_huge_page(&hstates[idx])); ret = page_counter_set_max( hugetlb_cgroup_counter_from_cgroup(h_cgroup, idx), limit); VM_BUG_ON(ret); ret = page_counter_set_max( hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx), limit); VM_BUG_ON(ret); } } static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup) { int node; for_each_node(node) kfree(h_cgroup->nodeinfo[node]); kfree(h_cgroup); } static struct cgroup_subsys_state * hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); struct hugetlb_cgroup *h_cgroup; int node; h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids), GFP_KERNEL); if (!h_cgroup) return ERR_PTR(-ENOMEM); if (!parent_h_cgroup) root_h_cgroup = h_cgroup; /* * TODO: this routine can waste much memory for nodes which will * never be onlined. It's better to use memory hotplug callback * function. */ for_each_node(node) { /* Set node_to_alloc to NUMA_NO_NODE for offline nodes. */ int node_to_alloc = node_state(node, N_NORMAL_MEMORY) ? node : NUMA_NO_NODE; h_cgroup->nodeinfo[node] = kzalloc_node(sizeof(struct hugetlb_cgroup_per_node), GFP_KERNEL, node_to_alloc); if (!h_cgroup->nodeinfo[node]) goto fail_alloc_nodeinfo; } hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); return &h_cgroup->css; fail_alloc_nodeinfo: hugetlb_cgroup_free(h_cgroup); return ERR_PTR(-ENOMEM); } static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) { hugetlb_cgroup_free(hugetlb_cgroup_from_css(css)); } /* * Should be called with hugetlb_lock held. * Since we are holding hugetlb_lock, pages cannot get moved from * active list or uncharged from the cgroup, So no need to get * page reference and test for page active here. This function * cannot fail. */ static void hugetlb_cgroup_move_parent(int idx, struct hugetlb_cgroup *h_cg, struct page *page) { unsigned int nr_pages; struct page_counter *counter; struct hugetlb_cgroup *page_hcg; struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(h_cg); struct folio *folio = page_folio(page); page_hcg = hugetlb_cgroup_from_folio(folio); /* * We can have pages in active list without any cgroup * ie, hugepage with less than 3 pages. We can safely * ignore those pages. */ if (!page_hcg || page_hcg != h_cg) goto out; nr_pages = compound_nr(page); if (!parent) { parent = root_h_cgroup; /* root has no limit */ page_counter_charge(&parent->hugepage[idx], nr_pages); } counter = &h_cg->hugepage[idx]; /* Take the pages off the local counter */ page_counter_cancel(counter, nr_pages); set_hugetlb_cgroup(folio, parent); out: return; } /* * Force the hugetlb cgroup to empty the hugetlb resources by moving them to * the parent cgroup. */ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) { struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); struct hstate *h; struct page *page; do { for_each_hstate(h) { spin_lock_irq(&hugetlb_lock); list_for_each_entry(page, &h->hugepage_activelist, lru) hugetlb_cgroup_move_parent(hstate_index(h), h_cg, page); spin_unlock_irq(&hugetlb_lock); } cond_resched(); } while (hugetlb_cgroup_have_usage(h_cg)); } static inline void hugetlb_event(struct hugetlb_cgroup *hugetlb, int idx, enum hugetlb_memory_event event) { atomic_long_inc(&hugetlb->events_local[idx][event]); cgroup_file_notify(&hugetlb->events_local_file[idx]); do { atomic_long_inc(&hugetlb->events[idx][event]); cgroup_file_notify(&hugetlb->events_file[idx]); } while ((hugetlb = parent_hugetlb_cgroup(hugetlb)) && !hugetlb_cgroup_is_root(hugetlb)); } static int __hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr, bool rsvd) { int ret = 0; struct page_counter *counter; struct hugetlb_cgroup *h_cg = NULL; if (hugetlb_cgroup_disabled()) goto done; again: rcu_read_lock(); h_cg = hugetlb_cgroup_from_task(current); if (!css_tryget(&h_cg->css)) { rcu_read_unlock(); goto again; } rcu_read_unlock(); if (!page_counter_try_charge( __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages, &counter)) { ret = -ENOMEM; hugetlb_event(h_cg, idx, HUGETLB_MAX); css_put(&h_cg->css); goto done; } /* Reservations take a reference to the css because they do not get * reparented. */ if (!rsvd) css_put(&h_cg->css); done: *ptr = h_cg; return ret; } int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, false); } int hugetlb_cgroup_charge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup **ptr) { return __hugetlb_cgroup_charge_cgroup(idx, nr_pages, ptr, true); } /* Should be called with hugetlb_lock held */ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio, bool rsvd) { if (hugetlb_cgroup_disabled() || !h_cg) return; __set_hugetlb_cgroup(folio, h_cg, rsvd); if (!rsvd) { unsigned long usage = h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; /* * This write is not atomic due to fetching usage and writing * to it, but that's fine because we call this with * hugetlb_lock held anyway. */ WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], usage + nr_pages); } } void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio) { __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, false); } void hugetlb_cgroup_commit_charge_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct folio *folio) { __hugetlb_cgroup_commit_charge(idx, nr_pages, h_cg, folio, true); } /* * Should be called with hugetlb_lock held */ static void __hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio, bool rsvd) { struct hugetlb_cgroup *h_cg; if (hugetlb_cgroup_disabled()) return; lockdep_assert_held(&hugetlb_lock); h_cg = __hugetlb_cgroup_from_folio(folio, rsvd); if (unlikely(!h_cg)) return; __set_hugetlb_cgroup(folio, NULL, rsvd); page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); if (rsvd) css_put(&h_cg->css); else { unsigned long usage = h_cg->nodeinfo[folio_nid(folio)]->usage[idx]; /* * This write is not atomic due to fetching usage and writing * to it, but that's fine because we call this with * hugetlb_lock held anyway. */ WRITE_ONCE(h_cg->nodeinfo[folio_nid(folio)]->usage[idx], usage - nr_pages); } } void hugetlb_cgroup_uncharge_folio(int idx, unsigned long nr_pages, struct folio *folio) { __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, false); } void hugetlb_cgroup_uncharge_folio_rsvd(int idx, unsigned long nr_pages, struct folio *folio) { __hugetlb_cgroup_uncharge_folio(idx, nr_pages, folio, true); } static void __hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, bool rsvd) { if (hugetlb_cgroup_disabled() || !h_cg) return; page_counter_uncharge(__hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); if (rsvd) css_put(&h_cg->css); } void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, false); } void hugetlb_cgroup_uncharge_cgroup_rsvd(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg) { __hugetlb_cgroup_uncharge_cgroup(idx, nr_pages, h_cg, true); } void hugetlb_cgroup_uncharge_counter(struct resv_map *resv, unsigned long start, unsigned long end) { if (hugetlb_cgroup_disabled() || !resv || !resv->reservation_counter || !resv->css) return; page_counter_uncharge(resv->reservation_counter, (end - start) * resv->pages_per_hpage); css_put(resv->css); } void hugetlb_cgroup_uncharge_file_region(struct resv_map *resv, struct file_region *rg, unsigned long nr_pages, bool region_del) { if (hugetlb_cgroup_disabled() || !resv || !rg || !nr_pages) return; if (rg->reservation_counter && resv->pages_per_hpage && !resv->reservation_counter) { page_counter_uncharge(rg->reservation_counter, nr_pages * resv->pages_per_hpage); /* * Only do css_put(rg->css) when we delete the entire region * because one file_region must hold exactly one css reference. */ if (region_del) css_put(rg->css); } } enum { RES_USAGE, RES_RSVD_USAGE, RES_LIMIT, RES_RSVD_LIMIT, RES_MAX_USAGE, RES_RSVD_MAX_USAGE, RES_FAILCNT, RES_RSVD_FAILCNT, }; static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy) { int nid; struct cftype *cft = seq_cft(seq); int idx = MEMFILE_IDX(cft->private); bool legacy = MEMFILE_ATTR(cft->private); struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); struct cgroup_subsys_state *css; unsigned long usage; if (legacy) { /* Add up usage across all nodes for the non-hierarchical total. */ usage = 0; for_each_node_state(nid, N_MEMORY) usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]); seq_printf(seq, "total=%lu", usage * PAGE_SIZE); /* Simply print the per-node usage for the non-hierarchical total. */ for_each_node_state(nid, N_MEMORY) seq_printf(seq, " N%d=%lu", nid, READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) * PAGE_SIZE); seq_putc(seq, '\n'); } /* * The hierarchical total is pretty much the value recorded by the * counter, so use that. */ seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "", page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE); /* * For each node, transverse the css tree to obtain the hierarchical * node usage. */ for_each_node_state(nid, N_MEMORY) { usage = 0; rcu_read_lock(); css_for_each_descendant_pre(css, &h_cg->css) { usage += READ_ONCE(hugetlb_cgroup_from_css(css) ->nodeinfo[nid] ->usage[idx]); } rcu_read_unlock(); seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE); } seq_putc(seq, '\n'); return 0; } static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { struct page_counter *counter; struct page_counter *rsvd_counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); counter = &h_cg->hugepage[MEMFILE_IDX(cft->private)]; rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(cft->private)]; switch (MEMFILE_ATTR(cft->private)) { case RES_USAGE: return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_RSVD_USAGE: return (u64)page_counter_read(rsvd_counter) * PAGE_SIZE; case RES_LIMIT: return (u64)counter->max * PAGE_SIZE; case RES_RSVD_LIMIT: return (u64)rsvd_counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_RSVD_MAX_USAGE: return (u64)rsvd_counter->watermark * PAGE_SIZE; case RES_FAILCNT: return counter->failcnt; case RES_RSVD_FAILCNT: return rsvd_counter->failcnt; default: BUG(); } } static int hugetlb_cgroup_read_u64_max(struct seq_file *seq, void *v) { int idx; u64 val; struct cftype *cft = seq_cft(seq); unsigned long limit; struct page_counter *counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); idx = MEMFILE_IDX(cft->private); counter = &h_cg->hugepage[idx]; limit = round_down(PAGE_COUNTER_MAX, pages_per_huge_page(&hstates[idx])); switch (MEMFILE_ATTR(cft->private)) { case RES_RSVD_USAGE: counter = &h_cg->rsvd_hugepage[idx]; fallthrough; case RES_USAGE: val = (u64)page_counter_read(counter); seq_printf(seq, "%llu\n", val * PAGE_SIZE); break; case RES_RSVD_LIMIT: counter = &h_cg->rsvd_hugepage[idx]; fallthrough; case RES_LIMIT: val = (u64)counter->max; if (val == limit) seq_puts(seq, "max\n"); else seq_printf(seq, "%llu\n", val * PAGE_SIZE); break; default: BUG(); } return 0; } static DEFINE_MUTEX(hugetlb_limit_mutex); static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, const char *max) { int ret, idx; unsigned long nr_pages; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); bool rsvd = false; if (hugetlb_cgroup_is_root(h_cg)) /* Can't set limit on root */ return -EINVAL; buf = strstrip(buf); ret = page_counter_memparse(buf, max, &nr_pages); if (ret) return ret; idx = MEMFILE_IDX(of_cft(of)->private); nr_pages = round_down(nr_pages, pages_per_huge_page(&hstates[idx])); switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_RSVD_LIMIT: rsvd = true; fallthrough; case RES_LIMIT: mutex_lock(&hugetlb_limit_mutex); ret = page_counter_set_max( __hugetlb_cgroup_counter_from_cgroup(h_cg, idx, rsvd), nr_pages); mutex_unlock(&hugetlb_limit_mutex); break; default: ret = -EINVAL; break; } return ret ?: nbytes; } static ssize_t hugetlb_cgroup_write_legacy(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return hugetlb_cgroup_write(of, buf, nbytes, off, "-1"); } static ssize_t hugetlb_cgroup_write_dfl(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { return hugetlb_cgroup_write(of, buf, nbytes, off, "max"); } static ssize_t hugetlb_cgroup_reset(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { int ret = 0; struct page_counter *counter, *rsvd_counter; struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(of_css(of)); counter = &h_cg->hugepage[MEMFILE_IDX(of_cft(of)->private)]; rsvd_counter = &h_cg->rsvd_hugepage[MEMFILE_IDX(of_cft(of)->private)]; switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_MAX_USAGE: page_counter_reset_watermark(counter); break; case RES_RSVD_MAX_USAGE: page_counter_reset_watermark(rsvd_counter); break; case RES_FAILCNT: counter->failcnt = 0; break; case RES_RSVD_FAILCNT: rsvd_counter->failcnt = 0; break; default: ret = -EINVAL; break; } return ret ?: nbytes; } static char *mem_fmt(char *buf, int size, unsigned long hsize) { if (hsize >= SZ_1G) snprintf(buf, size, "%luGB", hsize / SZ_1G); else if (hsize >= SZ_1M) snprintf(buf, size, "%luMB", hsize / SZ_1M); else snprintf(buf, size, "%luKB", hsize / SZ_1K); return buf; } static int __hugetlb_events_show(struct seq_file *seq, bool local) { int idx; long max; struct cftype *cft = seq_cft(seq); struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); idx = MEMFILE_IDX(cft->private); if (local) max = atomic_long_read(&h_cg->events_local[idx][HUGETLB_MAX]); else max = atomic_long_read(&h_cg->events[idx][HUGETLB_MAX]); seq_printf(seq, "max %lu\n", max); return 0; } static int hugetlb_events_show(struct seq_file *seq, void *v) { return __hugetlb_events_show(seq, false); } static int hugetlb_events_local_show(struct seq_file *seq, void *v) { return __hugetlb_events_show(seq, true); } static void __init __hugetlb_cgroup_file_dfl_init(int idx) { char buf[32]; struct cftype *cft; struct hstate *h = &hstates[idx]; /* format the size */ mem_fmt(buf, sizeof(buf), huge_page_size(h)); /* Add the limit file */ cft = &h->cgroup_files_dfl[0]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max", buf); cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); cft->seq_show = hugetlb_cgroup_read_u64_max; cft->write = hugetlb_cgroup_write_dfl; cft->flags = CFTYPE_NOT_ON_ROOT; /* Add the reservation limit file */ cft = &h->cgroup_files_dfl[1]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max", buf); cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT); cft->seq_show = hugetlb_cgroup_read_u64_max; cft->write = hugetlb_cgroup_write_dfl; cft->flags = CFTYPE_NOT_ON_ROOT; /* Add the current usage file */ cft = &h->cgroup_files_dfl[2]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.current", buf); cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); cft->seq_show = hugetlb_cgroup_read_u64_max; cft->flags = CFTYPE_NOT_ON_ROOT; /* Add the current reservation usage file */ cft = &h->cgroup_files_dfl[3]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.current", buf); cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE); cft->seq_show = hugetlb_cgroup_read_u64_max; cft->flags = CFTYPE_NOT_ON_ROOT; /* Add the events file */ cft = &h->cgroup_files_dfl[4]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf); cft->private = MEMFILE_PRIVATE(idx, 0); cft->seq_show = hugetlb_events_show; cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]); cft->flags = CFTYPE_NOT_ON_ROOT; /* Add the events.local file */ cft = &h->cgroup_files_dfl[5]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events.local", buf); cft->private = MEMFILE_PRIVATE(idx, 0); cft->seq_show = hugetlb_events_local_show; cft->file_offset = offsetof(struct hugetlb_cgroup, events_local_file[idx]); cft->flags = CFTYPE_NOT_ON_ROOT; /* Add the numa stat file */ cft = &h->cgroup_files_dfl[6]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); cft->private = MEMFILE_PRIVATE(idx, 0); cft->seq_show = hugetlb_cgroup_read_numa_stat; cft->flags = CFTYPE_NOT_ON_ROOT; /* NULL terminate the last cft */ cft = &h->cgroup_files_dfl[7]; memset(cft, 0, sizeof(*cft)); WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, h->cgroup_files_dfl)); } static void __init __hugetlb_cgroup_file_legacy_init(int idx) { char buf[32]; struct cftype *cft; struct hstate *h = &hstates[idx]; /* format the size */ mem_fmt(buf, sizeof(buf), huge_page_size(h)); /* Add the limit file */ cft = &h->cgroup_files_legacy[0]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); cft->read_u64 = hugetlb_cgroup_read_u64; cft->write = hugetlb_cgroup_write_legacy; /* Add the reservation limit file */ cft = &h->cgroup_files_legacy[1]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.limit_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_LIMIT); cft->read_u64 = hugetlb_cgroup_read_u64; cft->write = hugetlb_cgroup_write_legacy; /* Add the usage file */ cft = &h->cgroup_files_legacy[2]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the reservation usage file */ cft = &h->cgroup_files_legacy[3]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_USAGE); cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the MAX usage file */ cft = &h->cgroup_files_legacy[4]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the MAX reservation usage file */ cft = &h->cgroup_files_legacy[5]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.max_usage_in_bytes", buf); cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_MAX_USAGE); cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the failcntfile */ cft = &h->cgroup_files_legacy[6]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the reservation failcntfile */ cft = &h->cgroup_files_legacy[7]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.rsvd.failcnt", buf); cft->private = MEMFILE_PRIVATE(idx, RES_RSVD_FAILCNT); cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; /* Add the numa stat file */ cft = &h->cgroup_files_legacy[8]; snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); cft->private = MEMFILE_PRIVATE(idx, 1); cft->seq_show = hugetlb_cgroup_read_numa_stat; /* NULL terminate the last cft */ cft = &h->cgroup_files_legacy[9]; memset(cft, 0, sizeof(*cft)); WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, h->cgroup_files_legacy)); } static void __init __hugetlb_cgroup_file_init(int idx) { __hugetlb_cgroup_file_dfl_init(idx); __hugetlb_cgroup_file_legacy_init(idx); } void __init hugetlb_cgroup_file_init(void) { struct hstate *h; for_each_hstate(h) __hugetlb_cgroup_file_init(hstate_index(h)); } /* * hugetlb_lock will make sure a parallel cgroup rmdir won't happen * when we migrate hugepages */ void hugetlb_cgroup_migrate(struct folio *old_folio, struct folio *new_folio) { struct hugetlb_cgroup *h_cg; struct hugetlb_cgroup *h_cg_rsvd; struct hstate *h = folio_hstate(old_folio); if (hugetlb_cgroup_disabled()) return; spin_lock_irq(&hugetlb_lock); h_cg = hugetlb_cgroup_from_folio(old_folio); h_cg_rsvd = hugetlb_cgroup_from_folio_rsvd(old_folio); set_hugetlb_cgroup(old_folio, NULL); set_hugetlb_cgroup_rsvd(old_folio, NULL); /* move the h_cg details to new cgroup */ set_hugetlb_cgroup(new_folio, h_cg); set_hugetlb_cgroup_rsvd(new_folio, h_cg_rsvd); list_move(&new_folio->lru, &h->hugepage_activelist); spin_unlock_irq(&hugetlb_lock); return; } static struct cftype hugetlb_files[] = { {} /* terminate */ }; struct cgroup_subsys hugetlb_cgrp_subsys = { .css_alloc = hugetlb_cgroup_css_alloc, .css_offline = hugetlb_cgroup_css_offline, .css_free = hugetlb_cgroup_css_free, .dfl_cftypes = hugetlb_files, .legacy_cftypes = hugetlb_files, };
15 4 8 7 1 6 5 6 13 12 1 11 10 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 // SPDX-License-Identifier: GPL-2.0-only /* * This is a module which is used for setting the MSS option in TCP packets. * * Copyright (C) 2000 Marc Boucher <marc@mbsi.ca> * Copyright (C) 2007 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/gfp.h> #include <linux/ipv6.h> #include <linux/tcp.h> #include <net/dst.h> #include <net/flow.h> #include <net/ipv6.h> #include <net/route.h> #include <net/tcp.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_tcpudp.h> #include <linux/netfilter/xt_TCPMSS.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Marc Boucher <marc@mbsi.ca>"); MODULE_DESCRIPTION("Xtables: TCP Maximum Segment Size (MSS) adjustment"); MODULE_ALIAS("ipt_TCPMSS"); MODULE_ALIAS("ip6t_TCPMSS"); static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset) { /* Beware zero-length options: make finite progress */ if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0) return 1; else return opt[offset+1]; } static u_int32_t tcpmss_reverse_mtu(struct net *net, const struct sk_buff *skb, unsigned int family) { struct flowi fl; struct rtable *rt = NULL; u_int32_t mtu = ~0U; if (family == PF_INET) { struct flowi4 *fl4 = &fl.u.ip4; memset(fl4, 0, sizeof(*fl4)); fl4->daddr = ip_hdr(skb)->saddr; } else { struct flowi6 *fl6 = &fl.u.ip6; memset(fl6, 0, sizeof(*fl6)); fl6->daddr = ipv6_hdr(skb)->saddr; } nf_route(net, (struct dst_entry **)&rt, &fl, false, family); if (rt != NULL) { mtu = dst_mtu(&rt->dst); dst_release(&rt->dst); } return mtu; } static int tcpmss_mangle_packet(struct sk_buff *skb, const struct xt_action_param *par, unsigned int family, unsigned int tcphoff, unsigned int minlen) { const struct xt_tcpmss_info *info = par->targinfo; struct tcphdr *tcph; int len, tcp_hdrlen; unsigned int i; __be16 oldval; u16 newmss; u8 *opt; /* This is a fragment, no TCP header is available */ if (par->fragoff != 0) return 0; if (skb_ensure_writable(skb, skb->len)) return -1; len = skb->len - tcphoff; if (len < (int)sizeof(struct tcphdr)) return -1; tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); tcp_hdrlen = tcph->doff * 4; if (len < tcp_hdrlen || tcp_hdrlen < sizeof(struct tcphdr)) return -1; if (info->mss == XT_TCPMSS_CLAMP_PMTU) { struct net *net = xt_net(par); unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family); unsigned int min_mtu = min(dst_mtu(skb_dst(skb)), in_mtu); if (min_mtu <= minlen) { net_err_ratelimited("unknown or invalid path-MTU (%u)\n", min_mtu); return -1; } newmss = min_mtu - minlen; } else newmss = info->mss; opt = (u_int8_t *)tcph; for (i = sizeof(struct tcphdr); i <= tcp_hdrlen - TCPOLEN_MSS; i += optlen(opt, i)) { if (opt[i] == TCPOPT_MSS && opt[i+1] == TCPOLEN_MSS) { u_int16_t oldmss; oldmss = (opt[i+2] << 8) | opt[i+3]; /* Never increase MSS, even when setting it, as * doing so results in problems for hosts that rely * on MSS being set correctly. */ if (oldmss <= newmss) return 0; opt[i+2] = (newmss & 0xff00) >> 8; opt[i+3] = newmss & 0x00ff; inet_proto_csum_replace2(&tcph->check, skb, htons(oldmss), htons(newmss), false); return 0; } } /* There is data after the header so the option can't be added * without moving it, and doing so may make the SYN packet * itself too large. Accept the packet unmodified instead. */ if (len > tcp_hdrlen) return 0; /* tcph->doff has 4 bits, do not wrap it to 0 */ if (tcp_hdrlen >= 15 * 4) return 0; /* * MSS Option not found ?! add it.. */ if (skb_tailroom(skb) < TCPOLEN_MSS) { if (pskb_expand_head(skb, 0, TCPOLEN_MSS - skb_tailroom(skb), GFP_ATOMIC)) return -1; tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); } skb_put(skb, TCPOLEN_MSS); /* * IPv4: RFC 1122 states "If an MSS option is not received at * connection setup, TCP MUST assume a default send MSS of 536". * IPv6: RFC 2460 states IPv6 has a minimum MTU of 1280 and a minimum * length IPv6 header of 60, ergo the default MSS value is 1220 * Since no MSS was provided, we must use the default values */ if (xt_family(par) == NFPROTO_IPV4) newmss = min(newmss, (u16)536); else newmss = min(newmss, (u16)1220); opt = (u_int8_t *)tcph + sizeof(struct tcphdr); memmove(opt + TCPOLEN_MSS, opt, len - sizeof(struct tcphdr)); inet_proto_csum_replace2(&tcph->check, skb, htons(len), htons(len + TCPOLEN_MSS), true); opt[0] = TCPOPT_MSS; opt[1] = TCPOLEN_MSS; opt[2] = (newmss & 0xff00) >> 8; opt[3] = newmss & 0x00ff; inet_proto_csum_replace4(&tcph->check, skb, 0, *((__be32 *)opt), false); oldval = ((__be16 *)tcph)[6]; tcph->doff += TCPOLEN_MSS/4; inet_proto_csum_replace2(&tcph->check, skb, oldval, ((__be16 *)tcph)[6], false); return TCPOLEN_MSS; } static unsigned int tcpmss_tg4(struct sk_buff *skb, const struct xt_action_param *par) { struct iphdr *iph = ip_hdr(skb); __be16 newlen; int ret; ret = tcpmss_mangle_packet(skb, par, PF_INET, iph->ihl * 4, sizeof(*iph) + sizeof(struct tcphdr)); if (ret < 0) return NF_DROP; if (ret > 0) { iph = ip_hdr(skb); newlen = htons(ntohs(iph->tot_len) + ret); csum_replace2(&iph->check, iph->tot_len, newlen); iph->tot_len = newlen; } return XT_CONTINUE; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static unsigned int tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); u8 nexthdr; __be16 frag_off, oldlen, newlen; int tcphoff; int ret; nexthdr = ipv6h->nexthdr; tcphoff = ipv6_skip_exthdr(skb, sizeof(*ipv6h), &nexthdr, &frag_off); if (tcphoff < 0) return NF_DROP; ret = tcpmss_mangle_packet(skb, par, PF_INET6, tcphoff, sizeof(*ipv6h) + sizeof(struct tcphdr)); if (ret < 0) return NF_DROP; if (ret > 0) { ipv6h = ipv6_hdr(skb); oldlen = ipv6h->payload_len; newlen = htons(ntohs(oldlen) + ret); if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)oldlen), (__force __wsum)newlen); ipv6h->payload_len = newlen; } return XT_CONTINUE; } #endif /* Must specify -p tcp --syn */ static inline bool find_syn_match(const struct xt_entry_match *m) { const struct xt_tcp *tcpinfo = (const struct xt_tcp *)m->data; if (strcmp(m->u.kernel.match->name, "tcp") == 0 && tcpinfo->flg_cmp & TCPHDR_SYN && !(tcpinfo->invflags & XT_TCP_INV_FLAGS)) return true; return false; } static int tcpmss_tg4_check(const struct xt_tgchk_param *par) { const struct xt_tcpmss_info *info = par->targinfo; const struct ipt_entry *e = par->entryinfo; const struct xt_entry_match *ematch; if (info->mss == XT_TCPMSS_CLAMP_PMTU && (par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } if (par->nft_compat) return 0; xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; pr_info_ratelimited("Only works on TCP SYN packets\n"); return -EINVAL; } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) static int tcpmss_tg6_check(const struct xt_tgchk_param *par) { const struct xt_tcpmss_info *info = par->targinfo; const struct ip6t_entry *e = par->entryinfo; const struct xt_entry_match *ematch; if (info->mss == XT_TCPMSS_CLAMP_PMTU && (par->hook_mask & ~((1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_POST_ROUTING))) != 0) { pr_info_ratelimited("path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); return -EINVAL; } if (par->nft_compat) return 0; xt_ematch_foreach(ematch, e) if (find_syn_match(ematch)) return 0; pr_info_ratelimited("Only works on TCP SYN packets\n"); return -EINVAL; } #endif static struct xt_target tcpmss_tg_reg[] __read_mostly = { { .family = NFPROTO_IPV4, .name = "TCPMSS", .checkentry = tcpmss_tg4_check, .target = tcpmss_tg4, .targetsize = sizeof(struct xt_tcpmss_info), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .family = NFPROTO_IPV6, .name = "TCPMSS", .checkentry = tcpmss_tg6_check, .target = tcpmss_tg6, .targetsize = sizeof(struct xt_tcpmss_info), .proto = IPPROTO_TCP, .me = THIS_MODULE, }, #endif }; static int __init tcpmss_tg_init(void) { return xt_register_targets(tcpmss_tg_reg, ARRAY_SIZE(tcpmss_tg_reg)); } static void __exit tcpmss_tg_exit(void) { xt_unregister_targets(tcpmss_tg_reg, ARRAY_SIZE(tcpmss_tg_reg)); } module_init(tcpmss_tg_init); module_exit(tcpmss_tg_exit);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2005 The University of Waikato, Hamilton, New Zealand. * Copyright (c) 2005 Ian McDonald <ian.mcdonald@jandi.co.nz> * Copyright (c) 2005 Arnaldo Carvalho de Melo <acme@conectiva.com.br> * Copyright (c) 2003 Nils-Erik Mattsson, Joacim Haggmark, Magnus Erixzon */ #include <linux/module.h> #include "../../dccp.h" #include "tfrc.h" #define TFRC_CALC_X_ARRSIZE 500 #define TFRC_CALC_X_SPLIT 50000 /* 0.05 * 1000000, details below */ #define TFRC_SMALLEST_P (TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE) /* TFRC TCP Reno Throughput Equation Lookup Table for f(p) The following two-column lookup table implements a part of the TCP throughput equation from [RFC 3448, sec. 3.1]: s X_calc = -------------------------------------------------------------- R * sqrt(2*b*p/3) + (3 * t_RTO * sqrt(3*b*p/8) * (p + 32*p^3)) Where: X is the transmit rate in bytes/second s is the packet size in bytes R is the round trip time in seconds p is the loss event rate, between 0 and 1.0, of the number of loss events as a fraction of the number of packets transmitted t_RTO is the TCP retransmission timeout value in seconds b is the number of packets acknowledged by a single TCP ACK We can assume that b = 1 and t_RTO is 4 * R. The equation now becomes: s X_calc = ------------------------------------------------------- R * sqrt(p*2/3) + (12 * R * sqrt(p*3/8) * (p + 32*p^3)) which we can break down into: s X_calc = --------- R * f(p) where f(p) is given for 0 < p <= 1 by: f(p) = sqrt(2*p/3) + 12 * sqrt(3*p/8) * (p + 32*p^3) Since this is kernel code, floating-point arithmetic is avoided in favour of integer arithmetic. This means that nearly all fractional parameters are scaled by 1000000: * the parameters p and R * the return result f(p) The lookup table therefore actually tabulates the following function g(q): g(q) = 1000000 * f(q/1000000) Hence, when p <= 1, q must be less than or equal to 1000000. To achieve finer granularity for the practically more relevant case of small values of p (up to 5%), the second column is used; the first one ranges up to 100%. This split corresponds to the value of q = TFRC_CALC_X_SPLIT. At the same time this also determines the smallest resolution possible with this lookup table: TFRC_SMALLEST_P = TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE The entire table is generated by: for(i=0; i < TFRC_CALC_X_ARRSIZE; i++) { lookup[i][0] = g((i+1) * 1000000/TFRC_CALC_X_ARRSIZE); lookup[i][1] = g((i+1) * TFRC_CALC_X_SPLIT/TFRC_CALC_X_ARRSIZE); } With the given configuration, we have, with M = TFRC_CALC_X_ARRSIZE-1, lookup[0][0] = g(1000000/(M+1)) = 1000000 * f(0.2%) lookup[M][0] = g(1000000) = 1000000 * f(100%) lookup[0][1] = g(TFRC_SMALLEST_P) = 1000000 * f(0.01%) lookup[M][1] = g(TFRC_CALC_X_SPLIT) = 1000000 * f(5%) In summary, the two columns represent f(p) for the following ranges: * The first column is for 0.002 <= p <= 1.0 * The second column is for 0.0001 <= p <= 0.05 Where the columns overlap, the second (finer-grained) is given preference, i.e. the first column is used only for p >= 0.05. */ static const u32 tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE][2] = { { 37172, 8172 }, { 53499, 11567 }, { 66664, 14180 }, { 78298, 16388 }, { 89021, 18339 }, { 99147, 20108 }, { 108858, 21738 }, { 118273, 23260 }, { 127474, 24693 }, { 136520, 26052 }, { 145456, 27348 }, { 154316, 28589 }, { 163130, 29783 }, { 171919, 30935 }, { 180704, 32049 }, { 189502, 33130 }, { 198328, 34180 }, { 207194, 35202 }, { 216114, 36198 }, { 225097, 37172 }, { 234153, 38123 }, { 243294, 39055 }, { 252527, 39968 }, { 261861, 40864 }, { 271305, 41743 }, { 280866, 42607 }, { 290553, 43457 }, { 300372, 44293 }, { 310333, 45117 }, { 320441, 45929 }, { 330705, 46729 }, { 341131, 47518 }, { 351728, 48297 }, { 362501, 49066 }, { 373460, 49826 }, { 384609, 50577 }, { 395958, 51320 }, { 407513, 52054 }, { 419281, 52780 }, { 431270, 53499 }, { 443487, 54211 }, { 455940, 54916 }, { 468635, 55614 }, { 481581, 56306 }, { 494785, 56991 }, { 508254, 57671 }, { 521996, 58345 }, { 536019, 59014 }, { 550331, 59677 }, { 564939, 60335 }, { 579851, 60988 }, { 595075, 61636 }, { 610619, 62279 }, { 626491, 62918 }, { 642700, 63553 }, { 659253, 64183 }, { 676158, 64809 }, { 693424, 65431 }, { 711060, 66050 }, { 729073, 66664 }, { 747472, 67275 }, { 766266, 67882 }, { 785464, 68486 }, { 805073, 69087 }, { 825103, 69684 }, { 845562, 70278 }, { 866460, 70868 }, { 887805, 71456 }, { 909606, 72041 }, { 931873, 72623 }, { 954614, 73202 }, { 977839, 73778 }, { 1001557, 74352 }, { 1025777, 74923 }, { 1050508, 75492 }, { 1075761, 76058 }, { 1101544, 76621 }, { 1127867, 77183 }, { 1154739, 77741 }, { 1182172, 78298 }, { 1210173, 78852 }, { 1238753, 79405 }, { 1267922, 79955 }, { 1297689, 80503 }, { 1328066, 81049 }, { 1359060, 81593 }, { 1390684, 82135 }, { 1422947, 82675 }, { 1455859, 83213 }, { 1489430, 83750 }, { 1523671, 84284 }, { 1558593, 84817 }, { 1594205, 85348 }, { 1630518, 85878 }, { 1667543, 86406 }, { 1705290, 86932 }, { 1743770, 87457 }, { 1782994, 87980 }, { 1822973, 88501 }, { 1863717, 89021 }, { 1905237, 89540 }, { 1947545, 90057 }, { 1990650, 90573 }, { 2034566, 91087 }, { 2079301, 91600 }, { 2124869, 92111 }, { 2171279, 92622 }, { 2218543, 93131 }, { 2266673, 93639 }, { 2315680, 94145 }, { 2365575, 94650 }, { 2416371, 95154 }, { 2468077, 95657 }, { 2520707, 96159 }, { 2574271, 96660 }, { 2628782, 97159 }, { 2684250, 97658 }, { 2740689, 98155 }, { 2798110, 98651 }, { 2856524, 99147 }, { 2915944, 99641 }, { 2976382, 100134 }, { 3037850, 100626 }, { 3100360, 101117 }, { 3163924, 101608 }, { 3228554, 102097 }, { 3294263, 102586 }, { 3361063, 103073 }, { 3428966, 103560 }, { 3497984, 104045 }, { 3568131, 104530 }, { 3639419, 105014 }, { 3711860, 105498 }, { 3785467, 105980 }, { 3860253, 106462 }, { 3936229, 106942 }, { 4013410, 107422 }, { 4091808, 107902 }, { 4171435, 108380 }, { 4252306, 108858 }, { 4334431, 109335 }, { 4417825, 109811 }, { 4502501, 110287 }, { 4588472, 110762 }, { 4675750, 111236 }, { 4764349, 111709 }, { 4854283, 112182 }, { 4945564, 112654 }, { 5038206, 113126 }, { 5132223, 113597 }, { 5227627, 114067 }, { 5324432, 114537 }, { 5422652, 115006 }, { 5522299, 115474 }, { 5623389, 115942 }, { 5725934, 116409 }, { 5829948, 116876 }, { 5935446, 117342 }, { 6042439, 117808 }, { 6150943, 118273 }, { 6260972, 118738 }, { 6372538, 119202 }, { 6485657, 119665 }, { 6600342, 120128 }, { 6716607, 120591 }, { 6834467, 121053 }, { 6953935, 121514 }, { 7075025, 121976 }, { 7197752, 122436 }, { 7322131, 122896 }, { 7448175, 123356 }, { 7575898, 123815 }, { 7705316, 124274 }, { 7836442, 124733 }, { 7969291, 125191 }, { 8103877, 125648 }, { 8240216, 126105 }, { 8378321, 126562 }, { 8518208, 127018 }, { 8659890, 127474 }, { 8803384, 127930 }, { 8948702, 128385 }, { 9095861, 128840 }, { 9244875, 129294 }, { 9395760, 129748 }, { 9548529, 130202 }, { 9703198, 130655 }, { 9859782, 131108 }, { 10018296, 131561 }, { 10178755, 132014 }, { 10341174, 132466 }, { 10505569, 132917 }, { 10671954, 133369 }, { 10840345, 133820 }, { 11010757, 134271 }, { 11183206, 134721 }, { 11357706, 135171 }, { 11534274, 135621 }, { 11712924, 136071 }, { 11893673, 136520 }, { 12076536, 136969 }, { 12261527, 137418 }, { 12448664, 137867 }, { 12637961, 138315 }, { 12829435, 138763 }, { 13023101, 139211 }, { 13218974, 139658 }, { 13417071, 140106 }, { 13617407, 140553 }, { 13819999, 140999 }, { 14024862, 141446 }, { 14232012, 141892 }, { 14441465, 142339 }, { 14653238, 142785 }, { 14867346, 143230 }, { 15083805, 143676 }, { 15302632, 144121 }, { 15523842, 144566 }, { 15747453, 145011 }, { 15973479, 145456 }, { 16201939, 145900 }, { 16432847, 146345 }, { 16666221, 146789 }, { 16902076, 147233 }, { 17140429, 147677 }, { 17381297, 148121 }, { 17624696, 148564 }, { 17870643, 149007 }, { 18119154, 149451 }, { 18370247, 149894 }, { 18623936, 150336 }, { 18880241, 150779 }, { 19139176, 151222 }, { 19400759, 151664 }, { 19665007, 152107 }, { 19931936, 152549 }, { 20201564, 152991 }, { 20473907, 153433 }, { 20748982, 153875 }, { 21026807, 154316 }, { 21307399, 154758 }, { 21590773, 155199 }, { 21876949, 155641 }, { 22165941, 156082 }, { 22457769, 156523 }, { 22752449, 156964 }, { 23049999, 157405 }, { 23350435, 157846 }, { 23653774, 158287 }, { 23960036, 158727 }, { 24269236, 159168 }, { 24581392, 159608 }, { 24896521, 160049 }, { 25214642, 160489 }, { 25535772, 160929 }, { 25859927, 161370 }, { 26187127, 161810 }, { 26517388, 162250 }, { 26850728, 162690 }, { 27187165, 163130 }, { 27526716, 163569 }, { 27869400, 164009 }, { 28215234, 164449 }, { 28564236, 164889 }, { 28916423, 165328 }, { 29271815, 165768 }, { 29630428, 166208 }, { 29992281, 166647 }, { 30357392, 167087 }, { 30725779, 167526 }, { 31097459, 167965 }, { 31472452, 168405 }, { 31850774, 168844 }, { 32232445, 169283 }, { 32617482, 169723 }, { 33005904, 170162 }, { 33397730, 170601 }, { 33792976, 171041 }, { 34191663, 171480 }, { 34593807, 171919 }, { 34999428, 172358 }, { 35408544, 172797 }, { 35821174, 173237 }, { 36237335, 173676 }, { 36657047, 174115 }, { 37080329, 174554 }, { 37507197, 174993 }, { 37937673, 175433 }, { 38371773, 175872 }, { 38809517, 176311 }, { 39250924, 176750 }, { 39696012, 177190 }, { 40144800, 177629 }, { 40597308, 178068 }, { 41053553, 178507 }, { 41513554, 178947 }, { 41977332, 179386 }, { 42444904, 179825 }, { 42916290, 180265 }, { 43391509, 180704 }, { 43870579, 181144 }, { 44353520, 181583 }, { 44840352, 182023 }, { 45331092, 182462 }, { 45825761, 182902 }, { 46324378, 183342 }, { 46826961, 183781 }, { 47333531, 184221 }, { 47844106, 184661 }, { 48358706, 185101 }, { 48877350, 185541 }, { 49400058, 185981 }, { 49926849, 186421 }, { 50457743, 186861 }, { 50992759, 187301 }, { 51531916, 187741 }, { 52075235, 188181 }, { 52622735, 188622 }, { 53174435, 189062 }, { 53730355, 189502 }, { 54290515, 189943 }, { 54854935, 190383 }, { 55423634, 190824 }, { 55996633, 191265 }, { 56573950, 191706 }, { 57155606, 192146 }, { 57741621, 192587 }, { 58332014, 193028 }, { 58926806, 193470 }, { 59526017, 193911 }, { 60129666, 194352 }, { 60737774, 194793 }, { 61350361, 195235 }, { 61967446, 195677 }, { 62589050, 196118 }, { 63215194, 196560 }, { 63845897, 197002 }, { 64481179, 197444 }, { 65121061, 197886 }, { 65765563, 198328 }, { 66414705, 198770 }, { 67068508, 199213 }, { 67726992, 199655 }, { 68390177, 200098 }, { 69058085, 200540 }, { 69730735, 200983 }, { 70408147, 201426 }, { 71090343, 201869 }, { 71777343, 202312 }, { 72469168, 202755 }, { 73165837, 203199 }, { 73867373, 203642 }, { 74573795, 204086 }, { 75285124, 204529 }, { 76001380, 204973 }, { 76722586, 205417 }, { 77448761, 205861 }, { 78179926, 206306 }, { 78916102, 206750 }, { 79657310, 207194 }, { 80403571, 207639 }, { 81154906, 208084 }, { 81911335, 208529 }, { 82672880, 208974 }, { 83439562, 209419 }, { 84211402, 209864 }, { 84988421, 210309 }, { 85770640, 210755 }, { 86558080, 211201 }, { 87350762, 211647 }, { 88148708, 212093 }, { 88951938, 212539 }, { 89760475, 212985 }, { 90574339, 213432 }, { 91393551, 213878 }, { 92218133, 214325 }, { 93048107, 214772 }, { 93883493, 215219 }, { 94724314, 215666 }, { 95570590, 216114 }, { 96422343, 216561 }, { 97279594, 217009 }, { 98142366, 217457 }, { 99010679, 217905 }, { 99884556, 218353 }, { 100764018, 218801 }, { 101649086, 219250 }, { 102539782, 219698 }, { 103436128, 220147 }, { 104338146, 220596 }, { 105245857, 221046 }, { 106159284, 221495 }, { 107078448, 221945 }, { 108003370, 222394 }, { 108934074, 222844 }, { 109870580, 223294 }, { 110812910, 223745 }, { 111761087, 224195 }, { 112715133, 224646 }, { 113675069, 225097 }, { 114640918, 225548 }, { 115612702, 225999 }, { 116590442, 226450 }, { 117574162, 226902 }, { 118563882, 227353 }, { 119559626, 227805 }, { 120561415, 228258 }, { 121569272, 228710 }, { 122583219, 229162 }, { 123603278, 229615 }, { 124629471, 230068 }, { 125661822, 230521 }, { 126700352, 230974 }, { 127745083, 231428 }, { 128796039, 231882 }, { 129853241, 232336 }, { 130916713, 232790 }, { 131986475, 233244 }, { 133062553, 233699 }, { 134144966, 234153 }, { 135233739, 234608 }, { 136328894, 235064 }, { 137430453, 235519 }, { 138538440, 235975 }, { 139652876, 236430 }, { 140773786, 236886 }, { 141901190, 237343 }, { 143035113, 237799 }, { 144175576, 238256 }, { 145322604, 238713 }, { 146476218, 239170 }, { 147636442, 239627 }, { 148803298, 240085 }, { 149976809, 240542 }, { 151156999, 241000 }, { 152343890, 241459 }, { 153537506, 241917 }, { 154737869, 242376 }, { 155945002, 242835 }, { 157158929, 243294 }, { 158379673, 243753 }, { 159607257, 244213 }, { 160841704, 244673 }, { 162083037, 245133 }, { 163331279, 245593 }, { 164586455, 246054 }, { 165848586, 246514 }, { 167117696, 246975 }, { 168393810, 247437 }, { 169676949, 247898 }, { 170967138, 248360 }, { 172264399, 248822 }, { 173568757, 249284 }, { 174880235, 249747 }, { 176198856, 250209 }, { 177524643, 250672 }, { 178857621, 251136 }, { 180197813, 251599 }, { 181545242, 252063 }, { 182899933, 252527 }, { 184261908, 252991 }, { 185631191, 253456 }, { 187007807, 253920 }, { 188391778, 254385 }, { 189783129, 254851 }, { 191181884, 255316 }, { 192588065, 255782 }, { 194001698, 256248 }, { 195422805, 256714 }, { 196851411, 257181 }, { 198287540, 257648 }, { 199731215, 258115 }, { 201182461, 258582 }, { 202641302, 259050 }, { 204107760, 259518 }, { 205581862, 259986 }, { 207063630, 260454 }, { 208553088, 260923 }, { 210050262, 261392 }, { 211555174, 261861 }, { 213067849, 262331 }, { 214588312, 262800 }, { 216116586, 263270 }, { 217652696, 263741 }, { 219196666, 264211 }, { 220748520, 264682 }, { 222308282, 265153 }, { 223875978, 265625 }, { 225451630, 266097 }, { 227035265, 266569 }, { 228626905, 267041 }, { 230226576, 267514 }, { 231834302, 267986 }, { 233450107, 268460 }, { 235074016, 268933 }, { 236706054, 269407 }, { 238346244, 269881 }, { 239994613, 270355 }, { 241651183, 270830 }, { 243315981, 271305 } }; /* return largest index i such that fval <= lookup[i][small] */ static inline u32 tfrc_binsearch(u32 fval, u8 small) { u32 try, low = 0, high = TFRC_CALC_X_ARRSIZE - 1; while (low < high) { try = (low + high) / 2; if (fval <= tfrc_calc_x_lookup[try][small]) high = try; else low = try + 1; } return high; } /** * tfrc_calc_x - Calculate the send rate as per section 3.1 of RFC3448 * @s: packet size in bytes * @R: RTT scaled by 1000000 (i.e., microseconds) * @p: loss ratio estimate scaled by 1000000 * * Returns X_calc in bytes per second (not scaled). */ u32 tfrc_calc_x(u16 s, u32 R, u32 p) { u16 index; u32 f; u64 result; /* check against invalid parameters and divide-by-zero */ BUG_ON(p > 1000000); /* p must not exceed 100% */ BUG_ON(p == 0); /* f(0) = 0, divide by zero */ if (R == 0) { /* possible divide by zero */ DCCP_CRIT("WARNING: RTT is 0, returning maximum X_calc."); return ~0U; } if (p <= TFRC_CALC_X_SPLIT) { /* 0.0000 < p <= 0.05 */ if (p < TFRC_SMALLEST_P) { /* 0.0000 < p < 0.0001 */ DCCP_WARN("Value of p (%d) below resolution. " "Substituting %d\n", p, TFRC_SMALLEST_P); index = 0; } else /* 0.0001 <= p <= 0.05 */ index = p/TFRC_SMALLEST_P - 1; f = tfrc_calc_x_lookup[index][1]; } else { /* 0.05 < p <= 1.00 */ index = p/(1000000/TFRC_CALC_X_ARRSIZE) - 1; f = tfrc_calc_x_lookup[index][0]; } /* * Compute X = s/(R*f(p)) in bytes per second. * Since f(p) and R are both scaled by 1000000, we need to multiply by * 1000000^2. To avoid overflow, the result is computed in two stages. * This works under almost all reasonable operational conditions, for a * wide range of parameters. Yet, should some strange combination of * parameters result in overflow, the use of scaled_div32 will catch * this and return UINT_MAX - which is a logically adequate consequence. */ result = scaled_div(s, R); return scaled_div32(result, f); } /** * tfrc_calc_x_reverse_lookup - try to find p given f(p) * @fvalue: function value to match, scaled by 1000000 * * Returns closest match for p, also scaled by 1000000 */ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) { int index; if (fvalue == 0) /* f(p) = 0 whenever p = 0 */ return 0; /* Error cases. */ if (fvalue < tfrc_calc_x_lookup[0][1]) { DCCP_WARN("fvalue %u smaller than resolution\n", fvalue); return TFRC_SMALLEST_P; } if (fvalue > tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][0]) { DCCP_WARN("fvalue %u exceeds bounds!\n", fvalue); return 1000000; } if (fvalue <= tfrc_calc_x_lookup[TFRC_CALC_X_ARRSIZE - 1][1]) { index = tfrc_binsearch(fvalue, 1); return (index + 1) * TFRC_CALC_X_SPLIT / TFRC_CALC_X_ARRSIZE; } /* else ... it must be in the coarse-grained column */ index = tfrc_binsearch(fvalue, 0); return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; } /** * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% * @loss_event_rate: loss event rate to invert * When @loss_event_rate is large, there is a chance that p is truncated to 0. * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. */ u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) { if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ return 0; if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ return 1000000; return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); }
21 21 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 // SPDX-License-Identifier: GPL-2.0-or-later /* * compress.c - NTFS kernel compressed attributes handling. * Part of the Linux-NTFS project. * * Copyright (c) 2001-2004 Anton Altaparmakov * Copyright (c) 2002 Richard Russon */ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/blkdev.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include "attrib.h" #include "inode.h" #include "debug.h" #include "ntfs.h" /** * ntfs_compression_constants - enum of constants used in the compression code */ typedef enum { /* Token types and access mask. */ NTFS_SYMBOL_TOKEN = 0, NTFS_PHRASE_TOKEN = 1, NTFS_TOKEN_MASK = 1, /* Compression sub-block constants. */ NTFS_SB_SIZE_MASK = 0x0fff, NTFS_SB_SIZE = 0x1000, NTFS_SB_IS_COMPRESSED = 0x8000, /* * The maximum compression block size is by definition 16 * the cluster * size, with the maximum supported cluster size being 4kiB. Thus the * maximum compression buffer size is 64kiB, so we use this when * initializing the compression buffer. */ NTFS_MAX_CB_SIZE = 64 * 1024, } ntfs_compression_constants; /* * ntfs_compression_buffer - one buffer for the decompression engine */ static u8 *ntfs_compression_buffer; /* * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer */ static DEFINE_SPINLOCK(ntfs_cb_lock); /** * allocate_compression_buffers - allocate the decompression buffers * * Caller has to hold the ntfs_lock mutex. * * Return 0 on success or -ENOMEM if the allocations failed. */ int allocate_compression_buffers(void) { BUG_ON(ntfs_compression_buffer); ntfs_compression_buffer = vmalloc(NTFS_MAX_CB_SIZE); if (!ntfs_compression_buffer) return -ENOMEM; return 0; } /** * free_compression_buffers - free the decompression buffers * * Caller has to hold the ntfs_lock mutex. */ void free_compression_buffers(void) { BUG_ON(!ntfs_compression_buffer); vfree(ntfs_compression_buffer); ntfs_compression_buffer = NULL; } /** * zero_partial_compressed_page - zero out of bounds compressed page region */ static void zero_partial_compressed_page(struct page *page, const s64 initialized_size) { u8 *kp = page_address(page); unsigned int kp_ofs; ntfs_debug("Zeroing page region outside initialized size."); if (((s64)page->index << PAGE_SHIFT) >= initialized_size) { clear_page(kp); return; } kp_ofs = initialized_size & ~PAGE_MASK; memset(kp + kp_ofs, 0, PAGE_SIZE - kp_ofs); return; } /** * handle_bounds_compressed_page - test for&handle out of bounds compressed page */ static inline void handle_bounds_compressed_page(struct page *page, const loff_t i_size, const s64 initialized_size) { if ((page->index >= (initialized_size >> PAGE_SHIFT)) && (initialized_size < i_size)) zero_partial_compressed_page(page, initialized_size); return; } /** * ntfs_decompress - decompress a compression block into an array of pages * @dest_pages: destination array of pages * @completed_pages: scratch space to track completed pages * @dest_index: current index into @dest_pages (IN/OUT) * @dest_ofs: current offset within @dest_pages[@dest_index] (IN/OUT) * @dest_max_index: maximum index into @dest_pages (IN) * @dest_max_ofs: maximum offset within @dest_pages[@dest_max_index] (IN) * @xpage: the target page (-1 if none) (IN) * @xpage_done: set to 1 if xpage was completed successfully (IN/OUT) * @cb_start: compression block to decompress (IN) * @cb_size: size of compression block @cb_start in bytes (IN) * @i_size: file size when we started the read (IN) * @initialized_size: initialized file size when we started the read (IN) * * The caller must have disabled preemption. ntfs_decompress() reenables it when * the critical section is finished. * * This decompresses the compression block @cb_start into the array of * destination pages @dest_pages starting at index @dest_index into @dest_pages * and at offset @dest_pos into the page @dest_pages[@dest_index]. * * When the page @dest_pages[@xpage] is completed, @xpage_done is set to 1. * If xpage is -1 or @xpage has not been completed, @xpage_done is not modified. * * @cb_start is a pointer to the compression block which needs decompressing * and @cb_size is the size of @cb_start in bytes (8-64kiB). * * Return 0 if success or -EOVERFLOW on error in the compressed stream. * @xpage_done indicates whether the target page (@dest_pages[@xpage]) was * completed during the decompression of the compression block (@cb_start). * * Warning: This function *REQUIRES* PAGE_SIZE >= 4096 or it will blow up * unpredicatbly! You have been warned! * * Note to hackers: This function may not sleep until it has finished accessing * the compression block @cb_start as it is a per-CPU buffer. */ static int ntfs_decompress(struct page *dest_pages[], int completed_pages[], int *dest_index, int *dest_ofs, const int dest_max_index, const int dest_max_ofs, const int xpage, char *xpage_done, u8 *const cb_start, const u32 cb_size, const loff_t i_size, const s64 initialized_size) { /* * Pointers into the compressed data, i.e. the compression block (cb), * and the therein contained sub-blocks (sb). */ u8 *cb_end = cb_start + cb_size; /* End of cb. */ u8 *cb = cb_start; /* Current position in cb. */ u8 *cb_sb_start; /* Beginning of the current sb in the cb. */ u8 *cb_sb_end; /* End of current sb / beginning of next sb. */ /* Variables for uncompressed data / destination. */ struct page *dp; /* Current destination page being worked on. */ u8 *dp_addr; /* Current pointer into dp. */ u8 *dp_sb_start; /* Start of current sub-block in dp. */ u8 *dp_sb_end; /* End of current sb in dp (dp_sb_start + NTFS_SB_SIZE). */ u16 do_sb_start; /* @dest_ofs when starting this sub-block. */ u16 do_sb_end; /* @dest_ofs of end of this sb (do_sb_start + NTFS_SB_SIZE). */ /* Variables for tag and token parsing. */ u8 tag; /* Current tag. */ int token; /* Loop counter for the eight tokens in tag. */ int nr_completed_pages = 0; /* Default error code. */ int err = -EOVERFLOW; ntfs_debug("Entering, cb_size = 0x%x.", cb_size); do_next_sb: ntfs_debug("Beginning sub-block at offset = 0x%zx in the cb.", cb - cb_start); /* * Have we reached the end of the compression block or the end of the * decompressed data? The latter can happen for example if the current * position in the compression block is one byte before its end so the * first two checks do not detect it. */ if (cb == cb_end || !le16_to_cpup((le16*)cb) || (*dest_index == dest_max_index && *dest_ofs == dest_max_ofs)) { int i; ntfs_debug("Completed. Returning success (0)."); err = 0; return_error: /* We can sleep from now on, so we drop lock. */ spin_unlock(&ntfs_cb_lock); /* Second stage: finalize completed pages. */ if (nr_completed_pages > 0) { for (i = 0; i < nr_completed_pages; i++) { int di = completed_pages[i]; dp = dest_pages[di]; /* * If we are outside the initialized size, zero * the out of bounds page range. */ handle_bounds_compressed_page(dp, i_size, initialized_size); flush_dcache_page(dp); kunmap(dp); SetPageUptodate(dp); unlock_page(dp); if (di == xpage) *xpage_done = 1; else put_page(dp); dest_pages[di] = NULL; } } return err; } /* Setup offsets for the current sub-block destination. */ do_sb_start = *dest_ofs; do_sb_end = do_sb_start + NTFS_SB_SIZE; /* Check that we are still within allowed boundaries. */ if (*dest_index == dest_max_index && do_sb_end > dest_max_ofs) goto return_overflow; /* Does the minimum size of a compressed sb overflow valid range? */ if (cb + 6 > cb_end) goto return_overflow; /* Setup the current sub-block source pointers and validate range. */ cb_sb_start = cb; cb_sb_end = cb_sb_start + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK) + 3; if (cb_sb_end > cb_end) goto return_overflow; /* Get the current destination page. */ dp = dest_pages[*dest_index]; if (!dp) { /* No page present. Skip decompression of this sub-block. */ cb = cb_sb_end; /* Advance destination position to next sub-block. */ *dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_MASK; if (!*dest_ofs && (++*dest_index > dest_max_index)) goto return_overflow; goto do_next_sb; } /* We have a valid destination page. Setup the destination pointers. */ dp_addr = (u8*)page_address(dp) + do_sb_start; /* Now, we are ready to process the current sub-block (sb). */ if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) { ntfs_debug("Found uncompressed sub-block."); /* This sb is not compressed, just copy it into destination. */ /* Advance source position to first data byte. */ cb += 2; /* An uncompressed sb must be full size. */ if (cb_sb_end - cb != NTFS_SB_SIZE) goto return_overflow; /* Copy the block and advance the source position. */ memcpy(dp_addr, cb, NTFS_SB_SIZE); cb += NTFS_SB_SIZE; /* Advance destination position to next sub-block. */ *dest_ofs += NTFS_SB_SIZE; if (!(*dest_ofs &= ~PAGE_MASK)) { finalize_page: /* * First stage: add current page index to array of * completed pages. */ completed_pages[nr_completed_pages++] = *dest_index; if (++*dest_index > dest_max_index) goto return_overflow; } goto do_next_sb; } ntfs_debug("Found compressed sub-block."); /* This sb is compressed, decompress it into destination. */ /* Setup destination pointers. */ dp_sb_start = dp_addr; dp_sb_end = dp_sb_start + NTFS_SB_SIZE; /* Forward to the first tag in the sub-block. */ cb += 2; do_next_tag: if (cb == cb_sb_end) { /* Check if the decompressed sub-block was not full-length. */ if (dp_addr < dp_sb_end) { int nr_bytes = do_sb_end - *dest_ofs; ntfs_debug("Filling incomplete sub-block with " "zeroes."); /* Zero remainder and update destination position. */ memset(dp_addr, 0, nr_bytes); *dest_ofs += nr_bytes; } /* We have finished the current sub-block. */ if (!(*dest_ofs &= ~PAGE_MASK)) goto finalize_page; goto do_next_sb; } /* Check we are still in range. */ if (cb > cb_sb_end || dp_addr > dp_sb_end) goto return_overflow; /* Get the next tag and advance to first token. */ tag = *cb++; /* Parse the eight tokens described by the tag. */ for (token = 0; token < 8; token++, tag >>= 1) { u16 lg, pt, length, max_non_overlap; register u16 i; u8 *dp_back_addr; /* Check if we are done / still in range. */ if (cb >= cb_sb_end || dp_addr > dp_sb_end) break; /* Determine token type and parse appropriately.*/ if ((tag & NTFS_TOKEN_MASK) == NTFS_SYMBOL_TOKEN) { /* * We have a symbol token, copy the symbol across, and * advance the source and destination positions. */ *dp_addr++ = *cb++; ++*dest_ofs; /* Continue with the next token. */ continue; } /* * We have a phrase token. Make sure it is not the first tag in * the sb as this is illegal and would confuse the code below. */ if (dp_addr == dp_sb_start) goto return_overflow; /* * Determine the number of bytes to go back (p) and the number * of bytes to copy (l). We use an optimized algorithm in which * we first calculate log2(current destination position in sb), * which allows determination of l and p in O(1) rather than * O(n). We just need an arch-optimized log2() function now. */ lg = 0; for (i = *dest_ofs - do_sb_start - 1; i >= 0x10; i >>= 1) lg++; /* Get the phrase token into i. */ pt = le16_to_cpup((le16*)cb); /* * Calculate starting position of the byte sequence in * the destination using the fact that p = (pt >> (12 - lg)) + 1 * and make sure we don't go too far back. */ dp_back_addr = dp_addr - (pt >> (12 - lg)) - 1; if (dp_back_addr < dp_sb_start) goto return_overflow; /* Now calculate the length of the byte sequence. */ length = (pt & (0xfff >> lg)) + 3; /* Advance destination position and verify it is in range. */ *dest_ofs += length; if (*dest_ofs > do_sb_end) goto return_overflow; /* The number of non-overlapping bytes. */ max_non_overlap = dp_addr - dp_back_addr; if (length <= max_non_overlap) { /* The byte sequence doesn't overlap, just copy it. */ memcpy(dp_addr, dp_back_addr, length); /* Advance destination pointer. */ dp_addr += length; } else { /* * The byte sequence does overlap, copy non-overlapping * part and then do a slow byte by byte copy for the * overlapping part. Also, advance the destination * pointer. */ memcpy(dp_addr, dp_back_addr, max_non_overlap); dp_addr += max_non_overlap; dp_back_addr += max_non_overlap; length -= max_non_overlap; while (length--) *dp_addr++ = *dp_back_addr++; } /* Advance source position and continue with the next token. */ cb += 2; } /* No tokens left in the current tag. Continue with the next tag. */ goto do_next_tag; return_overflow: ntfs_error(NULL, "Failed. Returning -EOVERFLOW."); goto return_error; } /** * ntfs_read_compressed_block - read a compressed block into the page cache * @page: locked page in the compression block(s) we need to read * * When we are called the page has already been verified to be locked and the * attribute is known to be non-resident, not encrypted, but compressed. * * 1. Determine which compression block(s) @page is in. * 2. Get hold of all pages corresponding to this/these compression block(s). * 3. Read the (first) compression block. * 4. Decompress it into the corresponding pages. * 5. Throw the compressed data away and proceed to 3. for the next compression * block or return success if no more compression blocks left. * * Warning: We have to be careful what we do about existing pages. They might * have been written to so that we would lose data if we were to just overwrite * them with the out-of-date uncompressed data. * * FIXME: For PAGE_SIZE > cb_size we are not doing the Right Thing(TM) at * the end of the file I think. We need to detect this case and zero the out * of bounds remainder of the page in question and mark it as handled. At the * moment we would just return -EIO on such a page. This bug will only become * apparent if pages are above 8kiB and the NTFS volume only uses 512 byte * clusters so is probably not going to be seen by anyone. Still this should * be fixed. (AIA) * * FIXME: Again for PAGE_SIZE > cb_size we are screwing up both in * handling sparse and compressed cbs. (AIA) * * FIXME: At the moment we don't do any zeroing out in the case that * initialized_size is less than data_size. This should be safe because of the * nature of the compression algorithm used. Just in case we check and output * an error message in read inode if the two sizes are not equal for a * compressed file. (AIA) */ int ntfs_read_compressed_block(struct page *page) { loff_t i_size; s64 initialized_size; struct address_space *mapping = page->mapping; ntfs_inode *ni = NTFS_I(mapping->host); ntfs_volume *vol = ni->vol; struct super_block *sb = vol->sb; runlist_element *rl; unsigned long flags, block_size = sb->s_blocksize; unsigned char block_size_bits = sb->s_blocksize_bits; u8 *cb, *cb_pos, *cb_end; struct buffer_head **bhs; unsigned long offset, index = page->index; u32 cb_size = ni->itype.compressed.block_size; u64 cb_size_mask = cb_size - 1UL; VCN vcn; LCN lcn; /* The first wanted vcn (minimum alignment is PAGE_SIZE). */ VCN start_vcn = (((s64)index << PAGE_SHIFT) & ~cb_size_mask) >> vol->cluster_size_bits; /* * The first vcn after the last wanted vcn (minimum alignment is again * PAGE_SIZE. */ VCN end_vcn = ((((s64)(index + 1UL) << PAGE_SHIFT) + cb_size - 1) & ~cb_size_mask) >> vol->cluster_size_bits; /* Number of compression blocks (cbs) in the wanted vcn range. */ unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits >> ni->itype.compressed.block_size_bits; /* * Number of pages required to store the uncompressed data from all * compression blocks (cbs) overlapping @page. Due to alignment * guarantees of start_vcn and end_vcn, no need to round up here. */ unsigned int nr_pages = (end_vcn - start_vcn) << vol->cluster_size_bits >> PAGE_SHIFT; unsigned int xpage, max_page, cur_page, cur_ofs, i; unsigned int cb_clusters, cb_max_ofs; int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0; struct page **pages; int *completed_pages; unsigned char xpage_done = 0; ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = " "%i.", index, cb_size, nr_pages); /* * Bad things happen if we get here for anything that is not an * unnamed $DATA attribute. */ BUG_ON(ni->type != AT_DATA); BUG_ON(ni->name_len); pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); completed_pages = kmalloc_array(nr_pages + 1, sizeof(int), GFP_NOFS); /* Allocate memory to store the buffer heads we need. */ bhs_size = cb_size / block_size * sizeof(struct buffer_head *); bhs = kmalloc(bhs_size, GFP_NOFS); if (unlikely(!pages || !bhs || !completed_pages)) { kfree(bhs); kfree(pages); kfree(completed_pages); unlock_page(page); ntfs_error(vol->sb, "Failed to allocate internal buffers."); return -ENOMEM; } /* * We have already been given one page, this is the one we must do. * Once again, the alignment guarantees keep it simple. */ offset = start_vcn << vol->cluster_size_bits >> PAGE_SHIFT; xpage = index - offset; pages[xpage] = page; /* * The remaining pages need to be allocated and inserted into the page * cache, alignment guarantees keep all the below much simpler. (-8 */ read_lock_irqsave(&ni->size_lock, flags); i_size = i_size_read(VFS_I(ni)); initialized_size = ni->initialized_size; read_unlock_irqrestore(&ni->size_lock, flags); max_page = ((i_size + PAGE_SIZE - 1) >> PAGE_SHIFT) - offset; /* Is the page fully outside i_size? (truncate in progress) */ if (xpage >= max_page) { kfree(bhs); kfree(pages); kfree(completed_pages); zero_user(page, 0, PAGE_SIZE); ntfs_debug("Compressed read outside i_size - truncated?"); SetPageUptodate(page); unlock_page(page); return 0; } if (nr_pages < max_page) max_page = nr_pages; for (i = 0; i < max_page; i++, offset++) { if (i != xpage) pages[i] = grab_cache_page_nowait(mapping, offset); page = pages[i]; if (page) { /* * We only (re)read the page if it isn't already read * in and/or dirty or we would be losing data or at * least wasting our time. */ if (!PageDirty(page) && (!PageUptodate(page) || PageError(page))) { ClearPageError(page); kmap(page); continue; } unlock_page(page); put_page(page); pages[i] = NULL; } } /* * We have the runlist, and all the destination pages we need to fill. * Now read the first compression block. */ cur_page = 0; cur_ofs = 0; cb_clusters = ni->itype.compressed.block_clusters; do_next_cb: nr_cbs--; nr_bhs = 0; /* Read all cb buffer heads one cluster at a time. */ rl = NULL; for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn; vcn++) { bool is_retry = false; if (!rl) { lock_retry_remap: down_read(&ni->runlist.lock); rl = ni->runlist.rl; } if (likely(rl != NULL)) { /* Seek to element containing target vcn. */ while (rl->length && rl[1].vcn <= vcn) rl++; lcn = ntfs_rl_vcn_to_lcn(rl, vcn); } else lcn = LCN_RL_NOT_MAPPED; ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.", (unsigned long long)vcn, (unsigned long long)lcn); if (lcn < 0) { /* * When we reach the first sparse cluster we have * finished with the cb. */ if (lcn == LCN_HOLE) break; if (is_retry || lcn != LCN_RL_NOT_MAPPED) goto rl_err; is_retry = true; /* * Attempt to map runlist, dropping lock for the * duration. */ up_read(&ni->runlist.lock); if (!ntfs_map_runlist(ni, vcn)) goto lock_retry_remap; goto map_rl_err; } block = lcn << vol->cluster_size_bits >> block_size_bits; /* Read the lcn from device in chunks of block_size bytes. */ max_block = block + (vol->cluster_size >> block_size_bits); do { ntfs_debug("block = 0x%x.", block); if (unlikely(!(bhs[nr_bhs] = sb_getblk(sb, block)))) goto getblk_err; nr_bhs++; } while (++block < max_block); } /* Release the lock if we took it. */ if (rl) up_read(&ni->runlist.lock); /* Setup and initiate io on all buffer heads. */ for (i = 0; i < nr_bhs; i++) { struct buffer_head *tbh = bhs[i]; if (!trylock_buffer(tbh)) continue; if (unlikely(buffer_uptodate(tbh))) { unlock_buffer(tbh); continue; } get_bh(tbh); tbh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, tbh); } /* Wait for io completion on all buffer heads. */ for (i = 0; i < nr_bhs; i++) { struct buffer_head *tbh = bhs[i]; if (buffer_uptodate(tbh)) continue; wait_on_buffer(tbh); /* * We need an optimization barrier here, otherwise we start * hitting the below fixup code when accessing a loopback * mounted ntfs partition. This indicates either there is a * race condition in the loop driver or, more likely, gcc * overoptimises the code without the barrier and it doesn't * do the Right Thing(TM). */ barrier(); if (unlikely(!buffer_uptodate(tbh))) { ntfs_warning(vol->sb, "Buffer is unlocked but not " "uptodate! Unplugging the disk queue " "and rescheduling."); get_bh(tbh); io_schedule(); put_bh(tbh); if (unlikely(!buffer_uptodate(tbh))) goto read_err; ntfs_warning(vol->sb, "Buffer is now uptodate. Good."); } } /* * Get the compression buffer. We must not sleep any more * until we are finished with it. */ spin_lock(&ntfs_cb_lock); cb = ntfs_compression_buffer; BUG_ON(!cb); cb_pos = cb; cb_end = cb + cb_size; /* Copy the buffer heads into the contiguous buffer. */ for (i = 0; i < nr_bhs; i++) { memcpy(cb_pos, bhs[i]->b_data, block_size); cb_pos += block_size; } /* Just a precaution. */ if (cb_pos + 2 <= cb + cb_size) *(u16*)cb_pos = 0; /* Reset cb_pos back to the beginning. */ cb_pos = cb; /* We now have both source (if present) and destination. */ ntfs_debug("Successfully read the compression block."); /* The last page and maximum offset within it for the current cb. */ cb_max_page = (cur_page << PAGE_SHIFT) + cur_ofs + cb_size; cb_max_ofs = cb_max_page & ~PAGE_MASK; cb_max_page >>= PAGE_SHIFT; /* Catch end of file inside a compression block. */ if (cb_max_page > max_page) cb_max_page = max_page; if (vcn == start_vcn - cb_clusters) { /* Sparse cb, zero out page range overlapping the cb. */ ntfs_debug("Found sparse compression block."); /* We can sleep from now on, so we drop lock. */ spin_unlock(&ntfs_cb_lock); if (cb_max_ofs) cb_max_page--; for (; cur_page < cb_max_page; cur_page++) { page = pages[cur_page]; if (page) { if (likely(!cur_ofs)) clear_page(page_address(page)); else memset(page_address(page) + cur_ofs, 0, PAGE_SIZE - cur_ofs); flush_dcache_page(page); kunmap(page); SetPageUptodate(page); unlock_page(page); if (cur_page == xpage) xpage_done = 1; else put_page(page); pages[cur_page] = NULL; } cb_pos += PAGE_SIZE - cur_ofs; cur_ofs = 0; if (cb_pos >= cb_end) break; } /* If we have a partial final page, deal with it now. */ if (cb_max_ofs && cb_pos < cb_end) { page = pages[cur_page]; if (page) memset(page_address(page) + cur_ofs, 0, cb_max_ofs - cur_ofs); /* * No need to update cb_pos at this stage: * cb_pos += cb_max_ofs - cur_ofs; */ cur_ofs = cb_max_ofs; } } else if (vcn == start_vcn) { /* We can't sleep so we need two stages. */ unsigned int cur2_page = cur_page; unsigned int cur_ofs2 = cur_ofs; u8 *cb_pos2 = cb_pos; ntfs_debug("Found uncompressed compression block."); /* Uncompressed cb, copy it to the destination pages. */ /* * TODO: As a big optimization, we could detect this case * before we read all the pages and use block_read_full_folio() * on all full pages instead (we still have to treat partial * pages especially but at least we are getting rid of the * synchronous io for the majority of pages. * Or if we choose not to do the read-ahead/-behind stuff, we * could just return block_read_full_folio(pages[xpage]) as long * as PAGE_SIZE <= cb_size. */ if (cb_max_ofs) cb_max_page--; /* First stage: copy data into destination pages. */ for (; cur_page < cb_max_page; cur_page++) { page = pages[cur_page]; if (page) memcpy(page_address(page) + cur_ofs, cb_pos, PAGE_SIZE - cur_ofs); cb_pos += PAGE_SIZE - cur_ofs; cur_ofs = 0; if (cb_pos >= cb_end) break; } /* If we have a partial final page, deal with it now. */ if (cb_max_ofs && cb_pos < cb_end) { page = pages[cur_page]; if (page) memcpy(page_address(page) + cur_ofs, cb_pos, cb_max_ofs - cur_ofs); cb_pos += cb_max_ofs - cur_ofs; cur_ofs = cb_max_ofs; } /* We can sleep from now on, so drop lock. */ spin_unlock(&ntfs_cb_lock); /* Second stage: finalize pages. */ for (; cur2_page < cb_max_page; cur2_page++) { page = pages[cur2_page]; if (page) { /* * If we are outside the initialized size, zero * the out of bounds page range. */ handle_bounds_compressed_page(page, i_size, initialized_size); flush_dcache_page(page); kunmap(page); SetPageUptodate(page); unlock_page(page); if (cur2_page == xpage) xpage_done = 1; else put_page(page); pages[cur2_page] = NULL; } cb_pos2 += PAGE_SIZE - cur_ofs2; cur_ofs2 = 0; if (cb_pos2 >= cb_end) break; } } else { /* Compressed cb, decompress it into the destination page(s). */ unsigned int prev_cur_page = cur_page; ntfs_debug("Found compressed compression block."); err = ntfs_decompress(pages, completed_pages, &cur_page, &cur_ofs, cb_max_page, cb_max_ofs, xpage, &xpage_done, cb_pos, cb_size - (cb_pos - cb), i_size, initialized_size); /* * We can sleep from now on, lock already dropped by * ntfs_decompress(). */ if (err) { ntfs_error(vol->sb, "ntfs_decompress() failed in inode " "0x%lx with error code %i. Skipping " "this compression block.", ni->mft_no, -err); /* Release the unfinished pages. */ for (; prev_cur_page < cur_page; prev_cur_page++) { page = pages[prev_cur_page]; if (page) { flush_dcache_page(page); kunmap(page); unlock_page(page); if (prev_cur_page != xpage) put_page(page); pages[prev_cur_page] = NULL; } } } } /* Release the buffer heads. */ for (i = 0; i < nr_bhs; i++) brelse(bhs[i]); /* Do we have more work to do? */ if (nr_cbs) goto do_next_cb; /* We no longer need the list of buffer heads. */ kfree(bhs); /* Clean up if we have any pages left. Should never happen. */ for (cur_page = 0; cur_page < max_page; cur_page++) { page = pages[cur_page]; if (page) { ntfs_error(vol->sb, "Still have pages left! " "Terminating them with extreme " "prejudice. Inode 0x%lx, page index " "0x%lx.", ni->mft_no, page->index); flush_dcache_page(page); kunmap(page); unlock_page(page); if (cur_page != xpage) put_page(page); pages[cur_page] = NULL; } } /* We no longer need the list of pages. */ kfree(pages); kfree(completed_pages); /* If we have completed the requested page, we return success. */ if (likely(xpage_done)) return 0; ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ? "EOVERFLOW" : (!err ? "EIO" : "unknown error")); return err < 0 ? err : -EIO; read_err: ntfs_error(vol->sb, "IO error while reading compressed data."); /* Release the buffer heads. */ for (i = 0; i < nr_bhs; i++) brelse(bhs[i]); goto err_out; map_rl_err: ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read " "compression block."); goto err_out; rl_err: up_read(&ni->runlist.lock); ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. Cannot read " "compression block."); goto err_out; getblk_err: up_read(&ni->runlist.lock); ntfs_error(vol->sb, "getblk() failed. Cannot read compression block."); err_out: kfree(bhs); for (i = cur_page; i < max_page; i++) { page = pages[i]; if (page) { flush_dcache_page(page); kunmap(page); unlock_page(page); if (i != xpage) put_page(page); } } kfree(pages); kfree(completed_pages); return -EIO; }
233 24 2 2 3 3 2 5 4 7 6 6 237 9 8 39 194 194 1 2 4 195 227 179 1 1 255 255 254 33 25 22 252 248 224 253 1 2 1 1 1 21 18 212 24 231 228 3 3 1 3 1 3 231 23 1 1 1 1 230 5 1 178 170 164 173 254 23 203 178 253 228 6 205 199 2 4 204 217 254 4 4 4 4 4 9 4 8 2 10 5 7 2 2 17 2 9 2 6 2 2 4 55 6 1 41 7 43 7 48 20 28 38 4 38 4 33 9 37 4 40 2 37 5 38 4 38 4 41 1 39 3 37 5 30 12 3 39 48 45 1 33 12 13 13 44 33 44 1 43 44 36 4 4 1 7 8 44 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 // SPDX-License-Identifier: GPL-2.0-only /* * net/sched/sch_netem.c Network emulator * * Many of the algorithms and ideas for this came from * NIST Net which is not copyrighted. * * Authors: Stephen Hemminger <shemminger@osdl.org> * Catalin(ux aka Dino) BOIE <catab at umbrella dot ro> */ #include <linux/mm.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/rtnetlink.h> #include <linux/reciprocal_div.h> #include <linux/rbtree.h> #include <net/gso.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/inet_ecn.h> #define VERSION "1.3" /* Network Emulation Queuing algorithm. ==================================== Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based Network Emulation Tool [2] Luigi Rizzo, DummyNet for FreeBSD ---------------------------------------------------------------- This started out as a simple way to delay outgoing packets to test TCP but has grown to include most of the functionality of a full blown network emulator like NISTnet. It can delay packets and add random jitter (and correlation). The random distribution can be loaded from a table as well to provide normal, Pareto, or experimental curves. Packet loss, duplication, and reordering can also be emulated. This qdisc does not do classification that can be handled in layering other disciplines. It does not need to do bandwidth control either since that can be handled by using token bucket or other rate control. Correlated Loss Generator models Added generation of correlated loss according to the "Gilbert-Elliot" model, a 4-state markov model. References: [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general and intuitive loss model for packet networks and its implementation in the Netem module in the Linux kernel", available in [1] Authors: Stefano Salsano <stefano.salsano at uniroma2.it Fabio Ludovici <fabio.ludovici at yahoo.it> */ struct disttable { u32 size; s16 table[] __counted_by(size); }; struct netem_sched_data { /* internal t(ime)fifo qdisc uses t_root and sch->limit */ struct rb_root t_root; /* a linear queue; reduces rbtree rebalancing when jitter is low */ struct sk_buff *t_head; struct sk_buff *t_tail; /* optional qdisc for classful handling (NULL at netem init) */ struct Qdisc *qdisc; struct qdisc_watchdog watchdog; s64 latency; s64 jitter; u32 loss; u32 ecn; u32 limit; u32 counter; u32 gap; u32 duplicate; u32 reorder; u32 corrupt; u64 rate; s32 packet_overhead; u32 cell_size; struct reciprocal_value cell_size_reciprocal; s32 cell_overhead; struct crndstate { u32 last; u32 rho; } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; struct prng { u64 seed; struct rnd_state prng_state; } prng; struct disttable *delay_dist; enum { CLG_RANDOM, CLG_4_STATES, CLG_GILB_ELL, } loss_model; enum { TX_IN_GAP_PERIOD = 1, TX_IN_BURST_PERIOD, LOST_IN_GAP_PERIOD, LOST_IN_BURST_PERIOD, } _4_state_model; enum { GOOD_STATE = 1, BAD_STATE, } GE_state_model; /* Correlated Loss Generation models */ struct clgstate { /* state of the Markov chain */ u8 state; /* 4-states and Gilbert-Elliot models */ u32 a1; /* p13 for 4-states or p for GE */ u32 a2; /* p31 for 4-states or r for GE */ u32 a3; /* p32 for 4-states or h for GE */ u32 a4; /* p14 for 4-states or 1-k for GE */ u32 a5; /* p23 used only in 4-states */ } clg; struct tc_netem_slot slot_config; struct slotstate { u64 slot_next; s32 packets_left; s32 bytes_left; } slot; struct disttable *slot_dist; }; /* Time stamp put into socket buffer control block * Only valid when skbs are in our internal t(ime)fifo queue. * * As skb->rbnode uses same storage than skb->next, skb->prev and skb->tstamp, * and skb->next & skb->prev are scratch space for a qdisc, * we save skb->tstamp value in skb->cb[] before destroying it. */ struct netem_skb_cb { u64 time_to_send; }; static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) { /* we assume we can use skb next/prev/tstamp as storage for rb_node */ qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb)); return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data; } /* init_crandom - initialize correlated random number generator * Use entropy source for initial seed. */ static void init_crandom(struct crndstate *state, unsigned long rho) { state->rho = rho; state->last = get_random_u32(); } /* get_crandom - correlated random number generator * Next number depends on last value. * rho is scaled to avoid floating point. */ static u32 get_crandom(struct crndstate *state, struct prng *p) { u64 value, rho; unsigned long answer; struct rnd_state *s = &p->prng_state; if (!state || state->rho == 0) /* no correlation */ return prandom_u32_state(s); value = prandom_u32_state(s); rho = (u64)state->rho + 1; answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32; state->last = answer; return answer; } /* loss_4state - 4-state model loss generator * Generates losses according to the 4-state Markov chain adopted in * the GI (General and Intuitive) loss model. */ static bool loss_4state(struct netem_sched_data *q) { struct clgstate *clg = &q->clg; u32 rnd = prandom_u32_state(&q->prng.prng_state); /* * Makes a comparison between rnd and the transition * probabilities outgoing from the current state, then decides the * next state and if the next packet has to be transmitted or lost. * The four states correspond to: * TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period * LOST_IN_GAP_PERIOD => isolated losses within a gap period * LOST_IN_BURST_PERIOD => lost packets within a burst period * TX_IN_BURST_PERIOD => successfully transmitted packets within a burst period */ switch (clg->state) { case TX_IN_GAP_PERIOD: if (rnd < clg->a4) { clg->state = LOST_IN_GAP_PERIOD; return true; } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) { clg->state = LOST_IN_BURST_PERIOD; return true; } else if (clg->a1 + clg->a4 < rnd) { clg->state = TX_IN_GAP_PERIOD; } break; case TX_IN_BURST_PERIOD: if (rnd < clg->a5) { clg->state = LOST_IN_BURST_PERIOD; return true; } else { clg->state = TX_IN_BURST_PERIOD; } break; case LOST_IN_BURST_PERIOD: if (rnd < clg->a3) clg->state = TX_IN_BURST_PERIOD; else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) { clg->state = TX_IN_GAP_PERIOD; } else if (clg->a2 + clg->a3 < rnd) { clg->state = LOST_IN_BURST_PERIOD; return true; } break; case LOST_IN_GAP_PERIOD: clg->state = TX_IN_GAP_PERIOD; break; } return false; } /* loss_gilb_ell - Gilbert-Elliot model loss generator * Generates losses according to the Gilbert-Elliot loss model or * its special cases (Gilbert or Simple Gilbert) * * Makes a comparison between random number and the transition * probabilities outgoing from the current state, then decides the * next state. A second random number is extracted and the comparison * with the loss probability of the current state decides if the next * packet will be transmitted or lost. */ static bool loss_gilb_ell(struct netem_sched_data *q) { struct clgstate *clg = &q->clg; struct rnd_state *s = &q->prng.prng_state; switch (clg->state) { case GOOD_STATE: if (prandom_u32_state(s) < clg->a1) clg->state = BAD_STATE; if (prandom_u32_state(s) < clg->a4) return true; break; case BAD_STATE: if (prandom_u32_state(s) < clg->a2) clg->state = GOOD_STATE; if (prandom_u32_state(s) > clg->a3) return true; } return false; } static bool loss_event(struct netem_sched_data *q) { switch (q->loss_model) { case CLG_RANDOM: /* Random packet drop 0 => none, ~0 => all */ return q->loss && q->loss >= get_crandom(&q->loss_cor, &q->prng); case CLG_4_STATES: /* 4state loss model algorithm (used also for GI model) * Extracts a value from the markov 4 state loss generator, * if it is 1 drops a packet and if needed writes the event in * the kernel logs */ return loss_4state(q); case CLG_GILB_ELL: /* Gilbert-Elliot loss model algorithm * Extracts a value from the Gilbert-Elliot loss generator, * if it is 1 drops a packet and if needed writes the event in * the kernel logs */ return loss_gilb_ell(q); } return false; /* not reached */ } /* tabledist - return a pseudo-randomly distributed value with mean mu and * std deviation sigma. Uses table lookup to approximate the desired * distribution, and a uniformly-distributed pseudo-random source. */ static s64 tabledist(s64 mu, s32 sigma, struct crndstate *state, struct prng *prng, const struct disttable *dist) { s64 x; long t; u32 rnd; if (sigma == 0) return mu; rnd = get_crandom(state, prng); /* default uniform distribution */ if (dist == NULL) return ((rnd % (2 * (u32)sigma)) + mu) - sigma; t = dist->table[rnd % dist->size]; x = (sigma % NETEM_DIST_SCALE) * t; if (x >= 0) x += NETEM_DIST_SCALE/2; else x -= NETEM_DIST_SCALE/2; return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; } static u64 packet_time_ns(u64 len, const struct netem_sched_data *q) { len += q->packet_overhead; if (q->cell_size) { u32 cells = reciprocal_divide(len, q->cell_size_reciprocal); if (len > cells * q->cell_size) /* extra cell needed for remainder */ cells++; len = cells * (q->cell_size + q->cell_overhead); } return div64_u64(len * NSEC_PER_SEC, q->rate); } static void tfifo_reset(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); struct rb_node *p = rb_first(&q->t_root); while (p) { struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); rb_erase(&skb->rbnode, &q->t_root); rtnl_kfree_skbs(skb, skb); } rtnl_kfree_skbs(q->t_head, q->t_tail); q->t_head = NULL; q->t_tail = NULL; } static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); u64 tnext = netem_skb_cb(nskb)->time_to_send; if (!q->t_tail || tnext >= netem_skb_cb(q->t_tail)->time_to_send) { if (q->t_tail) q->t_tail->next = nskb; else q->t_head = nskb; q->t_tail = nskb; } else { struct rb_node **p = &q->t_root.rb_node, *parent = NULL; while (*p) { struct sk_buff *skb; parent = *p; skb = rb_to_skb(parent); if (tnext >= netem_skb_cb(skb)->time_to_send) p = &parent->rb_right; else p = &parent->rb_left; } rb_link_node(&nskb->rbnode, parent, p); rb_insert_color(&nskb->rbnode, &q->t_root); } sch->q.qlen++; } /* netem can't properly corrupt a megapacket (like we get from GSO), so instead * when we statistically choose to corrupt one, we instead segment it, returning * the first packet to be corrupted, and re-enqueue the remaining frames */ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct sk_buff *segs; netdev_features_t features = netif_skb_features(skb); segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { qdisc_drop(skb, sch, to_free); return NULL; } consume_skb(skb); return segs; } /* * Insert one skb into qdisc. * Note: parent depends on return value to account for queue length. * NET_XMIT_DROP: queue length didn't change. * NET_XMIT_SUCCESS: one skb was queued. */ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct netem_sched_data *q = qdisc_priv(sch); /* We don't fill cb now as skb_unshare() may invalidate it */ struct netem_skb_cb *cb; struct sk_buff *skb2; struct sk_buff *segs = NULL; unsigned int prev_len = qdisc_pkt_len(skb); int count = 1; int rc = NET_XMIT_SUCCESS; int rc_drop = NET_XMIT_DROP; /* Do not fool qdisc_drop_all() */ skb->prev = NULL; /* Random duplication */ if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor, &q->prng)) ++count; /* Drop packet? */ if (loss_event(q)) { if (q->ecn && INET_ECN_set_ce(skb)) qdisc_qstats_drop(sch); /* mark packet */ else --count; } if (count == 0) { qdisc_qstats_drop(sch); __qdisc_drop(skb, to_free); return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; } /* If a delay is expected, orphan the skb. (orphaning usually takes * place at TX completion time, so _before_ the link transit delay) */ if (q->latency || q->jitter || q->rate) skb_orphan_partial(skb); /* * If we need to duplicate packet, then re-insert at top of the * qdisc tree, since parent queuer expects that only one * skb will be queued. */ if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { struct Qdisc *rootq = qdisc_root_bh(sch); u32 dupsave = q->duplicate; /* prevent duplicating a dup... */ q->duplicate = 0; rootq->enqueue(skb2, rootq, to_free); q->duplicate = dupsave; rc_drop = NET_XMIT_SUCCESS; } /* * Randomized packet corruption. * Make copy if needed since we are modifying * If packet is going to be hardware checksummed, then * do it now in software before we mangle it. */ if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor, &q->prng)) { if (skb_is_gso(skb)) { skb = netem_segment(skb, sch, to_free); if (!skb) return rc_drop; segs = skb->next; skb_mark_not_on_list(skb); qdisc_skb_cb(skb)->pkt_len = skb->len; } skb = skb_unshare(skb, GFP_ATOMIC); if (unlikely(!skb)) { qdisc_qstats_drop(sch); goto finish_segs; } if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb)) { qdisc_drop(skb, sch, to_free); skb = NULL; goto finish_segs; } skb->data[get_random_u32_below(skb_headlen(skb))] ^= 1<<get_random_u32_below(8); } if (unlikely(sch->q.qlen >= sch->limit)) { /* re-link segs, so that qdisc_drop_all() frees them all */ skb->next = segs; qdisc_drop_all(skb, sch, to_free); return rc_drop; } qdisc_qstats_backlog_inc(sch, skb); cb = netem_skb_cb(skb); if (q->gap == 0 || /* not doing reordering */ q->counter < q->gap - 1 || /* inside last reordering gap */ q->reorder < get_crandom(&q->reorder_cor, &q->prng)) { u64 now; s64 delay; delay = tabledist(q->latency, q->jitter, &q->delay_cor, &q->prng, q->delay_dist); now = ktime_get_ns(); if (q->rate) { struct netem_skb_cb *last = NULL; if (sch->q.tail) last = netem_skb_cb(sch->q.tail); if (q->t_root.rb_node) { struct sk_buff *t_skb; struct netem_skb_cb *t_last; t_skb = skb_rb_last(&q->t_root); t_last = netem_skb_cb(t_skb); if (!last || t_last->time_to_send > last->time_to_send) last = t_last; } if (q->t_tail) { struct netem_skb_cb *t_last = netem_skb_cb(q->t_tail); if (!last || t_last->time_to_send > last->time_to_send) last = t_last; } if (last) { /* * Last packet in queue is reference point (now), * calculate this time bonus and subtract * from delay. */ delay -= last->time_to_send - now; delay = max_t(s64, 0, delay); now = last->time_to_send; } delay += packet_time_ns(qdisc_pkt_len(skb), q); } cb->time_to_send = now + delay; ++q->counter; tfifo_enqueue(skb, sch); } else { /* * Do re-ordering by putting one out of N packets at the front * of the queue. */ cb->time_to_send = ktime_get_ns(); q->counter = 0; __qdisc_enqueue_head(skb, &sch->q); sch->qstats.requeues++; } finish_segs: if (segs) { unsigned int len, last_len; int nb; len = skb ? skb->len : 0; nb = skb ? 1 : 0; while (segs) { skb2 = segs->next; skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; last_len = segs->len; rc = qdisc_enqueue(segs, sch, to_free); if (rc != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(rc)) qdisc_qstats_drop(sch); } else { nb++; len += last_len; } segs = skb2; } /* Parent qdiscs accounted for 1 skb of size @prev_len */ qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len)); } else if (!skb) { return NET_XMIT_DROP; } return NET_XMIT_SUCCESS; } /* Delay the next round with a new future slot with a * correct number of bytes and packets. */ static void get_slot_next(struct netem_sched_data *q, u64 now) { s64 next_delay; if (!q->slot_dist) next_delay = q->slot_config.min_delay + (get_random_u32() * (q->slot_config.max_delay - q->slot_config.min_delay) >> 32); else next_delay = tabledist(q->slot_config.dist_delay, (s32)(q->slot_config.dist_jitter), NULL, &q->prng, q->slot_dist); q->slot.slot_next = now + next_delay; q->slot.packets_left = q->slot_config.max_packets; q->slot.bytes_left = q->slot_config.max_bytes; } static struct sk_buff *netem_peek(struct netem_sched_data *q) { struct sk_buff *skb = skb_rb_first(&q->t_root); u64 t1, t2; if (!skb) return q->t_head; if (!q->t_head) return skb; t1 = netem_skb_cb(skb)->time_to_send; t2 = netem_skb_cb(q->t_head)->time_to_send; if (t1 < t2) return skb; return q->t_head; } static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb) { if (skb == q->t_head) { q->t_head = skb->next; if (!q->t_head) q->t_tail = NULL; } else { rb_erase(&skb->rbnode, &q->t_root); } } static struct sk_buff *netem_dequeue(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); struct sk_buff *skb; tfifo_dequeue: skb = __qdisc_dequeue_head(&sch->q); if (skb) { qdisc_qstats_backlog_dec(sch, skb); deliver: qdisc_bstats_update(sch, skb); return skb; } skb = netem_peek(q); if (skb) { u64 time_to_send; u64 now = ktime_get_ns(); /* if more time remaining? */ time_to_send = netem_skb_cb(skb)->time_to_send; if (q->slot.slot_next && q->slot.slot_next < time_to_send) get_slot_next(q, now); if (time_to_send <= now && q->slot.slot_next <= now) { netem_erase_head(q, skb); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); skb->next = NULL; skb->prev = NULL; /* skb->dev shares skb->rbnode area, * we need to restore its value. */ skb->dev = qdisc_dev(sch); if (q->slot.slot_next) { q->slot.packets_left--; q->slot.bytes_left -= qdisc_pkt_len(skb); if (q->slot.packets_left <= 0 || q->slot.bytes_left <= 0) get_slot_next(q, now); } if (q->qdisc) { unsigned int pkt_len = qdisc_pkt_len(skb); struct sk_buff *to_free = NULL; int err; err = qdisc_enqueue(skb, q->qdisc, &to_free); kfree_skb_list(to_free); if (err != NET_XMIT_SUCCESS && net_xmit_drop_count(err)) { qdisc_qstats_drop(sch); qdisc_tree_reduce_backlog(sch, 1, pkt_len); } goto tfifo_dequeue; } goto deliver; } if (q->qdisc) { skb = q->qdisc->ops->dequeue(q->qdisc); if (skb) goto deliver; } qdisc_watchdog_schedule_ns(&q->watchdog, max(time_to_send, q->slot.slot_next)); } if (q->qdisc) { skb = q->qdisc->ops->dequeue(q->qdisc); if (skb) goto deliver; } return NULL; } static void netem_reset(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); qdisc_reset_queue(sch); tfifo_reset(sch); if (q->qdisc) qdisc_reset(q->qdisc); qdisc_watchdog_cancel(&q->watchdog); } static void dist_free(struct disttable *d) { kvfree(d); } /* * Distribution data is a variable size payload containing * signed 16 bit values. */ static int get_dist_table(struct disttable **tbl, const struct nlattr *attr) { size_t n = nla_len(attr)/sizeof(__s16); const __s16 *data = nla_data(attr); struct disttable *d; int i; if (!n || n > NETEM_DIST_MAX) return -EINVAL; d = kvmalloc(struct_size(d, table, n), GFP_KERNEL); if (!d) return -ENOMEM; d->size = n; for (i = 0; i < n; i++) d->table[i] = data[i]; *tbl = d; return 0; } static void get_slot(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_slot *c = nla_data(attr); q->slot_config = *c; if (q->slot_config.max_packets == 0) q->slot_config.max_packets = INT_MAX; if (q->slot_config.max_bytes == 0) q->slot_config.max_bytes = INT_MAX; /* capping dist_jitter to the range acceptable by tabledist() */ q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter)); q->slot.packets_left = q->slot_config.max_packets; q->slot.bytes_left = q->slot_config.max_bytes; if (q->slot_config.min_delay | q->slot_config.max_delay | q->slot_config.dist_jitter) q->slot.slot_next = ktime_get_ns(); else q->slot.slot_next = 0; } static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_corr *c = nla_data(attr); init_crandom(&q->delay_cor, c->delay_corr); init_crandom(&q->loss_cor, c->loss_corr); init_crandom(&q->dup_cor, c->dup_corr); } static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_reorder *r = nla_data(attr); q->reorder = r->probability; init_crandom(&q->reorder_cor, r->correlation); } static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_corrupt *r = nla_data(attr); q->corrupt = r->probability; init_crandom(&q->corrupt_cor, r->correlation); } static void get_rate(struct netem_sched_data *q, const struct nlattr *attr) { const struct tc_netem_rate *r = nla_data(attr); q->rate = r->rate; q->packet_overhead = r->packet_overhead; q->cell_size = r->cell_size; q->cell_overhead = r->cell_overhead; if (q->cell_size) q->cell_size_reciprocal = reciprocal_value(q->cell_size); else q->cell_size_reciprocal = (struct reciprocal_value) { 0 }; } static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr) { const struct nlattr *la; int rem; nla_for_each_nested(la, attr, rem) { u16 type = nla_type(la); switch (type) { case NETEM_LOSS_GI: { const struct tc_netem_gimodel *gi = nla_data(la); if (nla_len(la) < sizeof(struct tc_netem_gimodel)) { pr_info("netem: incorrect gi model size\n"); return -EINVAL; } q->loss_model = CLG_4_STATES; q->clg.state = TX_IN_GAP_PERIOD; q->clg.a1 = gi->p13; q->clg.a2 = gi->p31; q->clg.a3 = gi->p32; q->clg.a4 = gi->p14; q->clg.a5 = gi->p23; break; } case NETEM_LOSS_GE: { const struct tc_netem_gemodel *ge = nla_data(la); if (nla_len(la) < sizeof(struct tc_netem_gemodel)) { pr_info("netem: incorrect ge model size\n"); return -EINVAL; } q->loss_model = CLG_GILB_ELL; q->clg.state = GOOD_STATE; q->clg.a1 = ge->p; q->clg.a2 = ge->r; q->clg.a3 = ge->h; q->clg.a4 = ge->k1; break; } default: pr_info("netem: unknown loss type %u\n", type); return -EINVAL; } } return 0; } static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = { [TCA_NETEM_CORR] = { .len = sizeof(struct tc_netem_corr) }, [TCA_NETEM_REORDER] = { .len = sizeof(struct tc_netem_reorder) }, [TCA_NETEM_CORRUPT] = { .len = sizeof(struct tc_netem_corrupt) }, [TCA_NETEM_RATE] = { .len = sizeof(struct tc_netem_rate) }, [TCA_NETEM_LOSS] = { .type = NLA_NESTED }, [TCA_NETEM_ECN] = { .type = NLA_U32 }, [TCA_NETEM_RATE64] = { .type = NLA_U64 }, [TCA_NETEM_LATENCY64] = { .type = NLA_S64 }, [TCA_NETEM_JITTER64] = { .type = NLA_S64 }, [TCA_NETEM_SLOT] = { .len = sizeof(struct tc_netem_slot) }, [TCA_NETEM_PRNG_SEED] = { .type = NLA_U64 }, }; static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, const struct nla_policy *policy, int len) { int nested_len = nla_len(nla) - NLA_ALIGN(len); if (nested_len < 0) { pr_info("netem: invalid attributes len %d\n", nested_len); return -EINVAL; } if (nested_len >= nla_attr_size(0)) return nla_parse_deprecated(tb, maxtype, nla_data(nla) + NLA_ALIGN(len), nested_len, policy, NULL); memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); return 0; } /* Parse netlink message to set options */ static int netem_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct netem_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_NETEM_MAX + 1]; struct disttable *delay_dist = NULL; struct disttable *slot_dist = NULL; struct tc_netem_qopt *qopt; struct clgstate old_clg; int old_loss_model = CLG_RANDOM; int ret; qopt = nla_data(opt); ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt)); if (ret < 0) return ret; if (tb[TCA_NETEM_DELAY_DIST]) { ret = get_dist_table(&delay_dist, tb[TCA_NETEM_DELAY_DIST]); if (ret) goto table_free; } if (tb[TCA_NETEM_SLOT_DIST]) { ret = get_dist_table(&slot_dist, tb[TCA_NETEM_SLOT_DIST]); if (ret) goto table_free; } sch_tree_lock(sch); /* backup q->clg and q->loss_model */ old_clg = q->clg; old_loss_model = q->loss_model; if (tb[TCA_NETEM_LOSS]) { ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]); if (ret) { q->loss_model = old_loss_model; q->clg = old_clg; goto unlock; } } else { q->loss_model = CLG_RANDOM; } if (delay_dist) swap(q->delay_dist, delay_dist); if (slot_dist) swap(q->slot_dist, slot_dist); sch->limit = qopt->limit; q->latency = PSCHED_TICKS2NS(qopt->latency); q->jitter = PSCHED_TICKS2NS(qopt->jitter); q->limit = qopt->limit; q->gap = qopt->gap; q->counter = 0; q->loss = qopt->loss; q->duplicate = qopt->duplicate; /* for compatibility with earlier versions. * if gap is set, need to assume 100% probability */ if (q->gap) q->reorder = ~0; if (tb[TCA_NETEM_CORR]) get_correlation(q, tb[TCA_NETEM_CORR]); if (tb[TCA_NETEM_REORDER]) get_reorder(q, tb[TCA_NETEM_REORDER]); if (tb[TCA_NETEM_CORRUPT]) get_corrupt(q, tb[TCA_NETEM_CORRUPT]); if (tb[TCA_NETEM_RATE]) get_rate(q, tb[TCA_NETEM_RATE]); if (tb[TCA_NETEM_RATE64]) q->rate = max_t(u64, q->rate, nla_get_u64(tb[TCA_NETEM_RATE64])); if (tb[TCA_NETEM_LATENCY64]) q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]); if (tb[TCA_NETEM_JITTER64]) q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]); if (tb[TCA_NETEM_ECN]) q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]); if (tb[TCA_NETEM_SLOT]) get_slot(q, tb[TCA_NETEM_SLOT]); /* capping jitter to the range acceptable by tabledist() */ q->jitter = min_t(s64, abs(q->jitter), INT_MAX); if (tb[TCA_NETEM_PRNG_SEED]) q->prng.seed = nla_get_u64(tb[TCA_NETEM_PRNG_SEED]); else q->prng.seed = get_random_u64(); prandom_seed_state(&q->prng.prng_state, q->prng.seed); unlock: sch_tree_unlock(sch); table_free: dist_free(delay_dist); dist_free(slot_dist); return ret; } static int netem_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct netem_sched_data *q = qdisc_priv(sch); int ret; qdisc_watchdog_init(&q->watchdog, sch); if (!opt) return -EINVAL; q->loss_model = CLG_RANDOM; ret = netem_change(sch, opt, extack); if (ret) pr_info("netem: change failed\n"); return ret; } static void netem_destroy(struct Qdisc *sch) { struct netem_sched_data *q = qdisc_priv(sch); qdisc_watchdog_cancel(&q->watchdog); if (q->qdisc) qdisc_put(q->qdisc); dist_free(q->delay_dist); dist_free(q->slot_dist); } static int dump_loss_model(const struct netem_sched_data *q, struct sk_buff *skb) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, TCA_NETEM_LOSS); if (nest == NULL) goto nla_put_failure; switch (q->loss_model) { case CLG_RANDOM: /* legacy loss model */ nla_nest_cancel(skb, nest); return 0; /* no data */ case CLG_4_STATES: { struct tc_netem_gimodel gi = { .p13 = q->clg.a1, .p31 = q->clg.a2, .p32 = q->clg.a3, .p14 = q->clg.a4, .p23 = q->clg.a5, }; if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi)) goto nla_put_failure; break; } case CLG_GILB_ELL: { struct tc_netem_gemodel ge = { .p = q->clg.a1, .r = q->clg.a2, .h = q->clg.a3, .k1 = q->clg.a4, }; if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge)) goto nla_put_failure; break; } } nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) { const struct netem_sched_data *q = qdisc_priv(sch); struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb); struct tc_netem_qopt qopt; struct tc_netem_corr cor; struct tc_netem_reorder reorder; struct tc_netem_corrupt corrupt; struct tc_netem_rate rate; struct tc_netem_slot slot; qopt.latency = min_t(psched_time_t, PSCHED_NS2TICKS(q->latency), UINT_MAX); qopt.jitter = min_t(psched_time_t, PSCHED_NS2TICKS(q->jitter), UINT_MAX); qopt.limit = q->limit; qopt.loss = q->loss; qopt.gap = q->gap; qopt.duplicate = q->duplicate; if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt)) goto nla_put_failure; if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency)) goto nla_put_failure; if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter)) goto nla_put_failure; cor.delay_corr = q->delay_cor.rho; cor.loss_corr = q->loss_cor.rho; cor.dup_corr = q->dup_cor.rho; if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor)) goto nla_put_failure; reorder.probability = q->reorder; reorder.correlation = q->reorder_cor.rho; if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder)) goto nla_put_failure; corrupt.probability = q->corrupt; corrupt.correlation = q->corrupt_cor.rho; if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt)) goto nla_put_failure; if (q->rate >= (1ULL << 32)) { if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate, TCA_NETEM_PAD)) goto nla_put_failure; rate.rate = ~0U; } else { rate.rate = q->rate; } rate.packet_overhead = q->packet_overhead; rate.cell_size = q->cell_size; rate.cell_overhead = q->cell_overhead; if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate)) goto nla_put_failure; if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn)) goto nla_put_failure; if (dump_loss_model(q, skb) != 0) goto nla_put_failure; if (q->slot_config.min_delay | q->slot_config.max_delay | q->slot_config.dist_jitter) { slot = q->slot_config; if (slot.max_packets == INT_MAX) slot.max_packets = 0; if (slot.max_bytes == INT_MAX) slot.max_bytes = 0; if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot)) goto nla_put_failure; } if (nla_put_u64_64bit(skb, TCA_NETEM_PRNG_SEED, q->prng.seed, TCA_NETEM_PAD)) goto nla_put_failure; return nla_nest_end(skb, nla); nla_put_failure: nlmsg_trim(skb, nla); return -1; } static int netem_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct netem_sched_data *q = qdisc_priv(sch); if (cl != 1 || !q->qdisc) /* only one class */ return -ENOENT; tcm->tcm_handle |= TC_H_MIN(1); tcm->tcm_info = q->qdisc->handle; return 0; } static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct netem_sched_data *q = qdisc_priv(sch); *old = qdisc_replace(sch, new, &q->qdisc); return 0; } static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg) { struct netem_sched_data *q = qdisc_priv(sch); return q->qdisc; } static unsigned long netem_find(struct Qdisc *sch, u32 classid) { return 1; } static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { if (!tc_qdisc_stats_dump(sch, 1, walker)) return; } } static const struct Qdisc_class_ops netem_class_ops = { .graft = netem_graft, .leaf = netem_leaf, .find = netem_find, .walk = netem_walk, .dump = netem_dump_class, }; static struct Qdisc_ops netem_qdisc_ops __read_mostly = { .id = "netem", .cl_ops = &netem_class_ops, .priv_size = sizeof(struct netem_sched_data), .enqueue = netem_enqueue, .dequeue = netem_dequeue, .peek = qdisc_peek_dequeued, .init = netem_init, .reset = netem_reset, .destroy = netem_destroy, .change = netem_change, .dump = netem_dump, .owner = THIS_MODULE, }; static int __init netem_module_init(void) { pr_info("netem: version " VERSION "\n"); return register_qdisc(&netem_qdisc_ops); } static void __exit netem_module_exit(void) { unregister_qdisc(&netem_qdisc_ops); } module_init(netem_module_init) module_exit(netem_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Network characteristics emulator qdisc");
11 7 2 2 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 // SPDX-License-Identifier: GPL-2.0-only #include <linux/netlink.h> #include <linux/rtnetlink.h> #include <linux/types.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <linux/in6.h> #include <net/ip.h> int rtm_getroute_parse_ip_proto(struct nlattr *attr, u8 *ip_proto, u8 family, struct netlink_ext_ack *extack) { *ip_proto = nla_get_u8(attr); switch (*ip_proto) { case IPPROTO_TCP: case IPPROTO_UDP: return 0; case IPPROTO_ICMP: if (family != AF_INET) break; return 0; #if IS_ENABLED(CONFIG_IPV6) case IPPROTO_ICMPV6: if (family != AF_INET6) break; return 0; #endif } NL_SET_ERR_MSG(extack, "Unsupported ip proto"); return -EOPNOTSUPP; } EXPORT_SYMBOL_GPL(rtm_getroute_parse_ip_proto);
4 5 25 33 25 16 44 37 47 13 41 10 44 28 26 35 14 43 39 10 10 10 1 9 6 5 8 9 3 2 1 206 113 197 506 92 469 3 3 3 3 425 524 523 521 520 522 524 522 524 520 545 547 444 443 98 98 80 78 3 3 522 524 83 75 75 75 4 71 546 544 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 // SPDX-License-Identifier: GPL-2.0 /* * drivers/usb/core/usb.c * * (C) Copyright Linus Torvalds 1999 * (C) Copyright Johannes Erdfelt 1999-2001 * (C) Copyright Andreas Gal 1999 * (C) Copyright Gregory P. Smith 1999 * (C) Copyright Deti Fliegl 1999 (new USB architecture) * (C) Copyright Randy Dunlap 2000 * (C) Copyright David Brownell 2000-2004 * (C) Copyright Yggdrasil Computing, Inc. 2000 * (usb_device_id matching changes by Adam J. Richter) * (C) Copyright Greg Kroah-Hartman 2002-2003 * * Released under the GPLv2 only. * * NOTE! This is not actually a driver at all, rather this is * just a collection of helper routines that implement the * generic USB things that the real drivers can use.. * * Think of this as a "USB library" rather than anything else, * with no callbacks. Callbacks are evil. */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/of.h> #include <linux/string.h> #include <linux/bitops.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/errno.h> #include <linux/usb.h> #include <linux/usb/hcd.h> #include <linux/mutex.h> #include <linux/workqueue.h> #include <linux/debugfs.h> #include <linux/usb/of.h> #include <asm/io.h> #include <linux/scatterlist.h> #include <linux/mm.h> #include <linux/dma-mapping.h> #include "hub.h" const char *usbcore_name = "usbcore"; static bool nousb; /* Disable USB when built into kernel image */ module_param(nousb, bool, 0444); /* * for external read access to <nousb> */ int usb_disabled(void) { return nousb; } EXPORT_SYMBOL_GPL(usb_disabled); #ifdef CONFIG_PM /* Default delay value, in seconds */ static int usb_autosuspend_delay = CONFIG_USB_AUTOSUSPEND_DELAY; module_param_named(autosuspend, usb_autosuspend_delay, int, 0644); MODULE_PARM_DESC(autosuspend, "default autosuspend delay"); #else #define usb_autosuspend_delay 0 #endif static bool match_endpoint(struct usb_endpoint_descriptor *epd, struct usb_endpoint_descriptor **bulk_in, struct usb_endpoint_descriptor **bulk_out, struct usb_endpoint_descriptor **int_in, struct usb_endpoint_descriptor **int_out) { switch (usb_endpoint_type(epd)) { case USB_ENDPOINT_XFER_BULK: if (usb_endpoint_dir_in(epd)) { if (bulk_in && !*bulk_in) { *bulk_in = epd; break; } } else { if (bulk_out && !*bulk_out) { *bulk_out = epd; break; } } return false; case USB_ENDPOINT_XFER_INT: if (usb_endpoint_dir_in(epd)) { if (int_in && !*int_in) { *int_in = epd; break; } } else { if (int_out && !*int_out) { *int_out = epd; break; } } return false; default: return false; } return (!bulk_in || *bulk_in) && (!bulk_out || *bulk_out) && (!int_in || *int_in) && (!int_out || *int_out); } /** * usb_find_common_endpoints() -- look up common endpoint descriptors * @alt: alternate setting to search * @bulk_in: pointer to descriptor pointer, or NULL * @bulk_out: pointer to descriptor pointer, or NULL * @int_in: pointer to descriptor pointer, or NULL * @int_out: pointer to descriptor pointer, or NULL * * Search the alternate setting's endpoint descriptors for the first bulk-in, * bulk-out, interrupt-in and interrupt-out endpoints and return them in the * provided pointers (unless they are NULL). * * If a requested endpoint is not found, the corresponding pointer is set to * NULL. * * Return: Zero if all requested descriptors were found, or -ENXIO otherwise. */ int usb_find_common_endpoints(struct usb_host_interface *alt, struct usb_endpoint_descriptor **bulk_in, struct usb_endpoint_descriptor **bulk_out, struct usb_endpoint_descriptor **int_in, struct usb_endpoint_descriptor **int_out) { struct usb_endpoint_descriptor *epd; int i; if (bulk_in) *bulk_in = NULL; if (bulk_out) *bulk_out = NULL; if (int_in) *int_in = NULL; if (int_out) *int_out = NULL; for (i = 0; i < alt->desc.bNumEndpoints; ++i) { epd = &alt->endpoint[i].desc; if (match_endpoint(epd, bulk_in, bulk_out, int_in, int_out)) return 0; } return -ENXIO; } EXPORT_SYMBOL_GPL(usb_find_common_endpoints); /** * usb_find_common_endpoints_reverse() -- look up common endpoint descriptors * @alt: alternate setting to search * @bulk_in: pointer to descriptor pointer, or NULL * @bulk_out: pointer to descriptor pointer, or NULL * @int_in: pointer to descriptor pointer, or NULL * @int_out: pointer to descriptor pointer, or NULL * * Search the alternate setting's endpoint descriptors for the last bulk-in, * bulk-out, interrupt-in and interrupt-out endpoints and return them in the * provided pointers (unless they are NULL). * * If a requested endpoint is not found, the corresponding pointer is set to * NULL. * * Return: Zero if all requested descriptors were found, or -ENXIO otherwise. */ int usb_find_common_endpoints_reverse(struct usb_host_interface *alt, struct usb_endpoint_descriptor **bulk_in, struct usb_endpoint_descriptor **bulk_out, struct usb_endpoint_descriptor **int_in, struct usb_endpoint_descriptor **int_out) { struct usb_endpoint_descriptor *epd; int i; if (bulk_in) *bulk_in = NULL; if (bulk_out) *bulk_out = NULL; if (int_in) *int_in = NULL; if (int_out) *int_out = NULL; for (i = alt->desc.bNumEndpoints - 1; i >= 0; --i) { epd = &alt->endpoint[i].desc; if (match_endpoint(epd, bulk_in, bulk_out, int_in, int_out)) return 0; } return -ENXIO; } EXPORT_SYMBOL_GPL(usb_find_common_endpoints_reverse); /** * usb_find_endpoint() - Given an endpoint address, search for the endpoint's * usb_host_endpoint structure in an interface's current altsetting. * @intf: the interface whose current altsetting should be searched * @ep_addr: the endpoint address (number and direction) to find * * Search the altsetting's list of endpoints for one with the specified address. * * Return: Pointer to the usb_host_endpoint if found, %NULL otherwise. */ static const struct usb_host_endpoint *usb_find_endpoint( const struct usb_interface *intf, unsigned int ep_addr) { int n; const struct usb_host_endpoint *ep; n = intf->cur_altsetting->desc.bNumEndpoints; ep = intf->cur_altsetting->endpoint; for (; n > 0; (--n, ++ep)) { if (ep->desc.bEndpointAddress == ep_addr) return ep; } return NULL; } /** * usb_check_bulk_endpoints - Check whether an interface's current altsetting * contains a set of bulk endpoints with the given addresses. * @intf: the interface whose current altsetting should be searched * @ep_addrs: 0-terminated array of the endpoint addresses (number and * direction) to look for * * Search for endpoints with the specified addresses and check their types. * * Return: %true if all the endpoints are found and are bulk, %false otherwise. */ bool usb_check_bulk_endpoints( const struct usb_interface *intf, const u8 *ep_addrs) { const struct usb_host_endpoint *ep; for (; *ep_addrs; ++ep_addrs) { ep = usb_find_endpoint(intf, *ep_addrs); if (!ep || !usb_endpoint_xfer_bulk(&ep->desc)) return false; } return true; } EXPORT_SYMBOL_GPL(usb_check_bulk_endpoints); /** * usb_check_int_endpoints - Check whether an interface's current altsetting * contains a set of interrupt endpoints with the given addresses. * @intf: the interface whose current altsetting should be searched * @ep_addrs: 0-terminated array of the endpoint addresses (number and * direction) to look for * * Search for endpoints with the specified addresses and check their types. * * Return: %true if all the endpoints are found and are interrupt, * %false otherwise. */ bool usb_check_int_endpoints( const struct usb_interface *intf, const u8 *ep_addrs) { const struct usb_host_endpoint *ep; for (; *ep_addrs; ++ep_addrs) { ep = usb_find_endpoint(intf, *ep_addrs); if (!ep || !usb_endpoint_xfer_int(&ep->desc)) return false; } return true; } EXPORT_SYMBOL_GPL(usb_check_int_endpoints); /** * usb_find_alt_setting() - Given a configuration, find the alternate setting * for the given interface. * @config: the configuration to search (not necessarily the current config). * @iface_num: interface number to search in * @alt_num: alternate interface setting number to search for. * * Search the configuration's interface cache for the given alt setting. * * Return: The alternate setting, if found. %NULL otherwise. */ struct usb_host_interface *usb_find_alt_setting( struct usb_host_config *config, unsigned int iface_num, unsigned int alt_num) { struct usb_interface_cache *intf_cache = NULL; int i; if (!config) return NULL; for (i = 0; i < config->desc.bNumInterfaces; i++) { if (config->intf_cache[i]->altsetting[0].desc.bInterfaceNumber == iface_num) { intf_cache = config->intf_cache[i]; break; } } if (!intf_cache) return NULL; for (i = 0; i < intf_cache->num_altsetting; i++) if (intf_cache->altsetting[i].desc.bAlternateSetting == alt_num) return &intf_cache->altsetting[i]; printk(KERN_DEBUG "Did not find alt setting %u for intf %u, " "config %u\n", alt_num, iface_num, config->desc.bConfigurationValue); return NULL; } EXPORT_SYMBOL_GPL(usb_find_alt_setting); /** * usb_ifnum_to_if - get the interface object with a given interface number * @dev: the device whose current configuration is considered * @ifnum: the desired interface * * This walks the device descriptor for the currently active configuration * to find the interface object with the particular interface number. * * Note that configuration descriptors are not required to assign interface * numbers sequentially, so that it would be incorrect to assume that * the first interface in that descriptor corresponds to interface zero. * This routine helps device drivers avoid such mistakes. * However, you should make sure that you do the right thing with any * alternate settings available for this interfaces. * * Don't call this function unless you are bound to one of the interfaces * on this device or you have locked the device! * * Return: A pointer to the interface that has @ifnum as interface number, * if found. %NULL otherwise. */ struct usb_interface *usb_ifnum_to_if(const struct usb_device *dev, unsigned ifnum) { struct usb_host_config *config = dev->actconfig; int i; if (!config) return NULL; for (i = 0; i < config->desc.bNumInterfaces; i++) if (config->interface[i]->altsetting[0] .desc.bInterfaceNumber == ifnum) return config->interface[i]; return NULL; } EXPORT_SYMBOL_GPL(usb_ifnum_to_if); /** * usb_altnum_to_altsetting - get the altsetting structure with a given alternate setting number. * @intf: the interface containing the altsetting in question * @altnum: the desired alternate setting number * * This searches the altsetting array of the specified interface for * an entry with the correct bAlternateSetting value. * * Note that altsettings need not be stored sequentially by number, so * it would be incorrect to assume that the first altsetting entry in * the array corresponds to altsetting zero. This routine helps device * drivers avoid such mistakes. * * Don't call this function unless you are bound to the intf interface * or you have locked the device! * * Return: A pointer to the entry of the altsetting array of @intf that * has @altnum as the alternate setting number. %NULL if not found. */ struct usb_host_interface *usb_altnum_to_altsetting( const struct usb_interface *intf, unsigned int altnum) { int i; for (i = 0; i < intf->num_altsetting; i++) { if (intf->altsetting[i].desc.bAlternateSetting == altnum) return &intf->altsetting[i]; } return NULL; } EXPORT_SYMBOL_GPL(usb_altnum_to_altsetting); struct find_interface_arg { int minor; struct device_driver *drv; }; static int __find_interface(struct device *dev, const void *data) { const struct find_interface_arg *arg = data; struct usb_interface *intf; if (!is_usb_interface(dev)) return 0; if (dev->driver != arg->drv) return 0; intf = to_usb_interface(dev); return intf->minor == arg->minor; } /** * usb_find_interface - find usb_interface pointer for driver and device * @drv: the driver whose current configuration is considered * @minor: the minor number of the desired device * * This walks the bus device list and returns a pointer to the interface * with the matching minor and driver. Note, this only works for devices * that share the USB major number. * * Return: A pointer to the interface with the matching major and @minor. */ struct usb_interface *usb_find_interface(struct usb_driver *drv, int minor) { struct find_interface_arg argb; struct device *dev; argb.minor = minor; argb.drv = &drv->driver; dev = bus_find_device(&usb_bus_type, NULL, &argb, __find_interface); /* Drop reference count from bus_find_device */ put_device(dev); return dev ? to_usb_interface(dev) : NULL; } EXPORT_SYMBOL_GPL(usb_find_interface); struct each_dev_arg { void *data; int (*fn)(struct usb_device *, void *); }; static int __each_dev(struct device *dev, void *data) { struct each_dev_arg *arg = (struct each_dev_arg *)data; /* There are struct usb_interface on the same bus, filter them out */ if (!is_usb_device(dev)) return 0; return arg->fn(to_usb_device(dev), arg->data); } /** * usb_for_each_dev - iterate over all USB devices in the system * @data: data pointer that will be handed to the callback function * @fn: callback function to be called for each USB device * * Iterate over all USB devices and call @fn for each, passing it @data. If it * returns anything other than 0, we break the iteration prematurely and return * that value. */ int usb_for_each_dev(void *data, int (*fn)(struct usb_device *, void *)) { struct each_dev_arg arg = {data, fn}; return bus_for_each_dev(&usb_bus_type, NULL, &arg, __each_dev); } EXPORT_SYMBOL_GPL(usb_for_each_dev); /** * usb_release_dev - free a usb device structure when all users of it are finished. * @dev: device that's been disconnected * * Will be called only by the device core when all users of this usb device are * done. */ static void usb_release_dev(struct device *dev) { struct usb_device *udev; struct usb_hcd *hcd; udev = to_usb_device(dev); hcd = bus_to_hcd(udev->bus); usb_destroy_configuration(udev); usb_release_bos_descriptor(udev); of_node_put(dev->of_node); usb_put_hcd(hcd); kfree(udev->product); kfree(udev->manufacturer); kfree(udev->serial); kfree(udev); } static int usb_dev_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct usb_device *usb_dev; usb_dev = to_usb_device(dev); if (add_uevent_var(env, "BUSNUM=%03d", usb_dev->bus->busnum)) return -ENOMEM; if (add_uevent_var(env, "DEVNUM=%03d", usb_dev->devnum)) return -ENOMEM; return 0; } #ifdef CONFIG_PM /* USB device Power-Management thunks. * There's no need to distinguish here between quiescing a USB device * and powering it down; the generic_suspend() routine takes care of * it by skipping the usb_port_suspend() call for a quiesce. And for * USB interfaces there's no difference at all. */ static int usb_dev_prepare(struct device *dev) { return 0; /* Implement eventually? */ } static void usb_dev_complete(struct device *dev) { /* Currently used only for rebinding interfaces */ usb_resume_complete(dev); } static int usb_dev_suspend(struct device *dev) { return usb_suspend(dev, PMSG_SUSPEND); } static int usb_dev_resume(struct device *dev) { return usb_resume(dev, PMSG_RESUME); } static int usb_dev_freeze(struct device *dev) { return usb_suspend(dev, PMSG_FREEZE); } static int usb_dev_thaw(struct device *dev) { return usb_resume(dev, PMSG_THAW); } static int usb_dev_poweroff(struct device *dev) { return usb_suspend(dev, PMSG_HIBERNATE); } static int usb_dev_restore(struct device *dev) { return usb_resume(dev, PMSG_RESTORE); } static const struct dev_pm_ops usb_device_pm_ops = { .prepare = usb_dev_prepare, .complete = usb_dev_complete, .suspend = usb_dev_suspend, .resume = usb_dev_resume, .freeze = usb_dev_freeze, .thaw = usb_dev_thaw, .poweroff = usb_dev_poweroff, .restore = usb_dev_restore, .runtime_suspend = usb_runtime_suspend, .runtime_resume = usb_runtime_resume, .runtime_idle = usb_runtime_idle, }; #endif /* CONFIG_PM */ static char *usb_devnode(const struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid) { const struct usb_device *usb_dev; usb_dev = to_usb_device(dev); return kasprintf(GFP_KERNEL, "bus/usb/%03d/%03d", usb_dev->bus->busnum, usb_dev->devnum); } struct device_type usb_device_type = { .name = "usb_device", .release = usb_release_dev, .uevent = usb_dev_uevent, .devnode = usb_devnode, #ifdef CONFIG_PM .pm = &usb_device_pm_ops, #endif }; static bool usb_dev_authorized(struct usb_device *dev, struct usb_hcd *hcd) { struct usb_hub *hub; if (!dev->parent) return true; /* Root hub always ok [and always wired] */ switch (hcd->dev_policy) { case USB_DEVICE_AUTHORIZE_NONE: default: return false; case USB_DEVICE_AUTHORIZE_ALL: return true; case USB_DEVICE_AUTHORIZE_INTERNAL: hub = usb_hub_to_struct_hub(dev->parent); return hub->ports[dev->portnum - 1]->connect_type == USB_PORT_CONNECT_TYPE_HARD_WIRED; } } /** * usb_alloc_dev - usb device constructor (usbcore-internal) * @parent: hub to which device is connected; null to allocate a root hub * @bus: bus used to access the device * @port1: one-based index of port; ignored for root hubs * * Context: task context, might sleep. * * Only hub drivers (including virtual root hub drivers for host * controllers) should ever call this. * * This call may not be used in a non-sleeping context. * * Return: On success, a pointer to the allocated usb device. %NULL on * failure. */ struct usb_device *usb_alloc_dev(struct usb_device *parent, struct usb_bus *bus, unsigned port1) { struct usb_device *dev; struct usb_hcd *usb_hcd = bus_to_hcd(bus); unsigned raw_port = port1; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return NULL; if (!usb_get_hcd(usb_hcd)) { kfree(dev); return NULL; } /* Root hubs aren't true devices, so don't allocate HCD resources */ if (usb_hcd->driver->alloc_dev && parent && !usb_hcd->driver->alloc_dev(usb_hcd, dev)) { usb_put_hcd(bus_to_hcd(bus)); kfree(dev); return NULL; } device_initialize(&dev->dev); dev->dev.bus = &usb_bus_type; dev->dev.type = &usb_device_type; dev->dev.groups = usb_device_groups; set_dev_node(&dev->dev, dev_to_node(bus->sysdev)); dev->state = USB_STATE_ATTACHED; dev->lpm_disable_count = 1; atomic_set(&dev->urbnum, 0); INIT_LIST_HEAD(&dev->ep0.urb_list); dev->ep0.desc.bLength = USB_DT_ENDPOINT_SIZE; dev->ep0.desc.bDescriptorType = USB_DT_ENDPOINT; /* ep0 maxpacket comes later, from device descriptor */ usb_enable_endpoint(dev, &dev->ep0, false); dev->can_submit = 1; /* Save readable and stable topology id, distinguishing devices * by location for diagnostics, tools, driver model, etc. The * string is a path along hub ports, from the root. Each device's * dev->devpath will be stable until USB is re-cabled, and hubs * are often labeled with these port numbers. The name isn't * as stable: bus->busnum changes easily from modprobe order, * cardbus or pci hotplugging, and so on. */ if (unlikely(!parent)) { dev->devpath[0] = '0'; dev->route = 0; dev->dev.parent = bus->controller; device_set_of_node_from_dev(&dev->dev, bus->sysdev); dev_set_name(&dev->dev, "usb%d", bus->busnum); } else { /* match any labeling on the hubs; it's one-based */ if (parent->devpath[0] == '0') { snprintf(dev->devpath, sizeof dev->devpath, "%d", port1); /* Root ports are not counted in route string */ dev->route = 0; } else { snprintf(dev->devpath, sizeof dev->devpath, "%s.%d", parent->devpath, port1); /* Route string assumes hubs have less than 16 ports */ if (port1 < 15) dev->route = parent->route + (port1 << ((parent->level - 1)*4)); else dev->route = parent->route + (15 << ((parent->level - 1)*4)); } dev->dev.parent = &parent->dev; dev_set_name(&dev->dev, "%d-%s", bus->busnum, dev->devpath); if (!parent->parent) { /* device under root hub's port */ raw_port = usb_hcd_find_raw_port_number(usb_hcd, port1); } dev->dev.of_node = usb_of_get_device_node(parent, raw_port); /* hub driver sets up TT records */ } dev->portnum = port1; dev->bus = bus; dev->parent = parent; INIT_LIST_HEAD(&dev->filelist); #ifdef CONFIG_PM pm_runtime_set_autosuspend_delay(&dev->dev, usb_autosuspend_delay * 1000); dev->connect_time = jiffies; dev->active_duration = -jiffies; #endif dev->authorized = usb_dev_authorized(dev, usb_hcd); return dev; } EXPORT_SYMBOL_GPL(usb_alloc_dev); /** * usb_get_dev - increments the reference count of the usb device structure * @dev: the device being referenced * * Each live reference to a device should be refcounted. * * Drivers for USB interfaces should normally record such references in * their probe() methods, when they bind to an interface, and release * them by calling usb_put_dev(), in their disconnect() methods. * However, if a driver does not access the usb_device structure after * its disconnect() method returns then refcounting is not necessary, * because the USB core guarantees that a usb_device will not be * deallocated until after all of its interface drivers have been unbound. * * Return: A pointer to the device with the incremented reference counter. */ struct usb_device *usb_get_dev(struct usb_device *dev) { if (dev) get_device(&dev->dev); return dev; } EXPORT_SYMBOL_GPL(usb_get_dev); /** * usb_put_dev - release a use of the usb device structure * @dev: device that's been disconnected * * Must be called when a user of a device is finished with it. When the last * user of the device calls this function, the memory of the device is freed. */ void usb_put_dev(struct usb_device *dev) { if (dev) put_device(&dev->dev); } EXPORT_SYMBOL_GPL(usb_put_dev); /** * usb_get_intf - increments the reference count of the usb interface structure * @intf: the interface being referenced * * Each live reference to a interface must be refcounted. * * Drivers for USB interfaces should normally record such references in * their probe() methods, when they bind to an interface, and release * them by calling usb_put_intf(), in their disconnect() methods. * However, if a driver does not access the usb_interface structure after * its disconnect() method returns then refcounting is not necessary, * because the USB core guarantees that a usb_interface will not be * deallocated until after its driver has been unbound. * * Return: A pointer to the interface with the incremented reference counter. */ struct usb_interface *usb_get_intf(struct usb_interface *intf) { if (intf) get_device(&intf->dev); return intf; } EXPORT_SYMBOL_GPL(usb_get_intf); /** * usb_put_intf - release a use of the usb interface structure * @intf: interface that's been decremented * * Must be called when a user of an interface is finished with it. When the * last user of the interface calls this function, the memory of the interface * is freed. */ void usb_put_intf(struct usb_interface *intf) { if (intf) put_device(&intf->dev); } EXPORT_SYMBOL_GPL(usb_put_intf); /** * usb_intf_get_dma_device - acquire a reference on the usb interface's DMA endpoint * @intf: the usb interface * * While a USB device cannot perform DMA operations by itself, many USB * controllers can. A call to usb_intf_get_dma_device() returns the DMA endpoint * for the given USB interface, if any. The returned device structure must be * released with put_device(). * * See also usb_get_dma_device(). * * Returns: A reference to the usb interface's DMA endpoint; or NULL if none * exists. */ struct device *usb_intf_get_dma_device(struct usb_interface *intf) { struct usb_device *udev = interface_to_usbdev(intf); struct device *dmadev; if (!udev->bus) return NULL; dmadev = get_device(udev->bus->sysdev); if (!dmadev || !dmadev->dma_mask) { put_device(dmadev); return NULL; } return dmadev; } EXPORT_SYMBOL_GPL(usb_intf_get_dma_device); /* USB device locking * * USB devices and interfaces are locked using the semaphore in their * embedded struct device. The hub driver guarantees that whenever a * device is connected or disconnected, drivers are called with the * USB device locked as well as their particular interface. * * Complications arise when several devices are to be locked at the same * time. Only hub-aware drivers that are part of usbcore ever have to * do this; nobody else needs to worry about it. The rule for locking * is simple: * * When locking both a device and its parent, always lock the * parent first. */ /** * usb_lock_device_for_reset - cautiously acquire the lock for a usb device structure * @udev: device that's being locked * @iface: interface bound to the driver making the request (optional) * * Attempts to acquire the device lock, but fails if the device is * NOTATTACHED or SUSPENDED, or if iface is specified and the interface * is neither BINDING nor BOUND. Rather than sleeping to wait for the * lock, the routine polls repeatedly. This is to prevent deadlock with * disconnect; in some drivers (such as usb-storage) the disconnect() * or suspend() method will block waiting for a device reset to complete. * * Return: A negative error code for failure, otherwise 0. */ int usb_lock_device_for_reset(struct usb_device *udev, const struct usb_interface *iface) { unsigned long jiffies_expire = jiffies + HZ; if (udev->state == USB_STATE_NOTATTACHED) return -ENODEV; if (udev->state == USB_STATE_SUSPENDED) return -EHOSTUNREACH; if (iface && (iface->condition == USB_INTERFACE_UNBINDING || iface->condition == USB_INTERFACE_UNBOUND)) return -EINTR; while (!usb_trylock_device(udev)) { /* If we can't acquire the lock after waiting one second, * we're probably deadlocked */ if (time_after(jiffies, jiffies_expire)) return -EBUSY; msleep(15); if (udev->state == USB_STATE_NOTATTACHED) return -ENODEV; if (udev->state == USB_STATE_SUSPENDED) return -EHOSTUNREACH; if (iface && (iface->condition == USB_INTERFACE_UNBINDING || iface->condition == USB_INTERFACE_UNBOUND)) return -EINTR; } return 0; } EXPORT_SYMBOL_GPL(usb_lock_device_for_reset); /** * usb_get_current_frame_number - return current bus frame number * @dev: the device whose bus is being queried * * Return: The current frame number for the USB host controller used * with the given USB device. This can be used when scheduling * isochronous requests. * * Note: Different kinds of host controller have different "scheduling * horizons". While one type might support scheduling only 32 frames * into the future, others could support scheduling up to 1024 frames * into the future. * */ int usb_get_current_frame_number(struct usb_device *dev) { return usb_hcd_get_frame_number(dev); } EXPORT_SYMBOL_GPL(usb_get_current_frame_number); /*-------------------------------------------------------------------*/ /* * __usb_get_extra_descriptor() finds a descriptor of specific type in the * extra field of the interface and endpoint descriptor structs. */ int __usb_get_extra_descriptor(char *buffer, unsigned size, unsigned char type, void **ptr, size_t minsize) { struct usb_descriptor_header *header; while (size >= sizeof(struct usb_descriptor_header)) { header = (struct usb_descriptor_header *)buffer; if (header->bLength < 2 || header->bLength > size) { printk(KERN_ERR "%s: bogus descriptor, type %d length %d\n", usbcore_name, header->bDescriptorType, header->bLength); return -1; } if (header->bDescriptorType == type && header->bLength >= minsize) { *ptr = header; return 0; } buffer += header->bLength; size -= header->bLength; } return -1; } EXPORT_SYMBOL_GPL(__usb_get_extra_descriptor); /** * usb_alloc_coherent - allocate dma-consistent buffer for URB_NO_xxx_DMA_MAP * @dev: device the buffer will be used with * @size: requested buffer size * @mem_flags: affect whether allocation may block * @dma: used to return DMA address of buffer * * Return: Either null (indicating no buffer could be allocated), or the * cpu-space pointer to a buffer that may be used to perform DMA to the * specified device. Such cpu-space buffers are returned along with the DMA * address (through the pointer provided). * * Note: * These buffers are used with URB_NO_xxx_DMA_MAP set in urb->transfer_flags * to avoid behaviors like using "DMA bounce buffers", or thrashing IOMMU * hardware during URB completion/resubmit. The implementation varies between * platforms, depending on details of how DMA will work to this device. * Using these buffers also eliminates cacheline sharing problems on * architectures where CPU caches are not DMA-coherent. On systems without * bus-snooping caches, these buffers are uncached. * * When the buffer is no longer used, free it with usb_free_coherent(). */ void *usb_alloc_coherent(struct usb_device *dev, size_t size, gfp_t mem_flags, dma_addr_t *dma) { if (!dev || !dev->bus) return NULL; return hcd_buffer_alloc(dev->bus, size, mem_flags, dma); } EXPORT_SYMBOL_GPL(usb_alloc_coherent); /** * usb_free_coherent - free memory allocated with usb_alloc_coherent() * @dev: device the buffer was used with * @size: requested buffer size * @addr: CPU address of buffer * @dma: DMA address of buffer * * This reclaims an I/O buffer, letting it be reused. The memory must have * been allocated using usb_alloc_coherent(), and the parameters must match * those provided in that allocation request. */ void usb_free_coherent(struct usb_device *dev, size_t size, void *addr, dma_addr_t dma) { if (!dev || !dev->bus) return; if (!addr) return; hcd_buffer_free(dev->bus, size, addr, dma); } EXPORT_SYMBOL_GPL(usb_free_coherent); /* * Notifications of device and interface registration */ static int usb_bus_notify(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; switch (action) { case BUS_NOTIFY_ADD_DEVICE: if (dev->type == &usb_device_type) (void) usb_create_sysfs_dev_files(to_usb_device(dev)); else if (dev->type == &usb_if_device_type) usb_create_sysfs_intf_files(to_usb_interface(dev)); break; case BUS_NOTIFY_DEL_DEVICE: if (dev->type == &usb_device_type) usb_remove_sysfs_dev_files(to_usb_device(dev)); else if (dev->type == &usb_if_device_type) usb_remove_sysfs_intf_files(to_usb_interface(dev)); break; } return 0; } static struct notifier_block usb_bus_nb = { .notifier_call = usb_bus_notify, }; static void usb_debugfs_init(void) { debugfs_create_file("devices", 0444, usb_debug_root, NULL, &usbfs_devices_fops); } static void usb_debugfs_cleanup(void) { debugfs_lookup_and_remove("devices", usb_debug_root); } /* * Init */ static int __init usb_init(void) { int retval; if (usb_disabled()) { pr_info("%s: USB support disabled\n", usbcore_name); return 0; } usb_init_pool_max(); usb_debugfs_init(); usb_acpi_register(); retval = bus_register(&usb_bus_type); if (retval) goto bus_register_failed; retval = bus_register_notifier(&usb_bus_type, &usb_bus_nb); if (retval) goto bus_notifier_failed; retval = usb_major_init(); if (retval) goto major_init_failed; retval = class_register(&usbmisc_class); if (retval) goto class_register_failed; retval = usb_register(&usbfs_driver); if (retval) goto driver_register_failed; retval = usb_devio_init(); if (retval) goto usb_devio_init_failed; retval = usb_hub_init(); if (retval) goto hub_init_failed; retval = usb_register_device_driver(&usb_generic_driver, THIS_MODULE); if (!retval) goto out; usb_hub_cleanup(); hub_init_failed: usb_devio_cleanup(); usb_devio_init_failed: usb_deregister(&usbfs_driver); driver_register_failed: class_unregister(&usbmisc_class); class_register_failed: usb_major_cleanup(); major_init_failed: bus_unregister_notifier(&usb_bus_type, &usb_bus_nb); bus_notifier_failed: bus_unregister(&usb_bus_type); bus_register_failed: usb_acpi_unregister(); usb_debugfs_cleanup(); out: return retval; } /* * Cleanup */ static void __exit usb_exit(void) { /* This will matter if shutdown/reboot does exitcalls. */ if (usb_disabled()) return; usb_release_quirk_list(); usb_deregister_device_driver(&usb_generic_driver); usb_major_cleanup(); usb_deregister(&usbfs_driver); usb_devio_cleanup(); usb_hub_cleanup(); class_unregister(&usbmisc_class); bus_unregister_notifier(&usb_bus_type, &usb_bus_nb); bus_unregister(&usb_bus_type); usb_acpi_unregister(); usb_debugfs_cleanup(); idr_destroy(&usb_bus_idr); } subsys_initcall(usb_init); module_exit(usb_exit); MODULE_LICENSE("GPL");
18 1 6 6 3 3 3 2 3 3 3 2 2 2 4 4 1 1 3 3 6 6 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 /* * Copyright (c) 2018 Cumulus Networks. All rights reserved. * Copyright (c) 2018 David Ahern <dsa@cumulusnetworks.com> * Copyright (c) 2019 Mellanox Technologies. All rights reserved. * * This software is licensed under the GNU General License Version 2, * June 1991 as shown in the file COPYING in the top-level directory of this * source tree. * * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. */ #include <linux/debugfs.h> #include <linux/device.h> #include <linux/etherdevice.h> #include <linux/inet.h> #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/random.h> #include <linux/rtnetlink.h> #include <linux/workqueue.h> #include <net/devlink.h> #include <net/ip.h> #include <net/flow_offload.h> #include <uapi/linux/devlink.h> #include <uapi/linux/ip.h> #include <uapi/linux/udp.h> #include "netdevsim.h" static unsigned int nsim_dev_port_index(enum nsim_dev_port_type type, unsigned int port_index) { switch (type) { case NSIM_DEV_PORT_TYPE_VF: port_index = NSIM_DEV_VF_PORT_INDEX_BASE + port_index; break; case NSIM_DEV_PORT_TYPE_PF: break; } return port_index; } static inline unsigned int nsim_dev_port_index_to_vf_index(unsigned int port_index) { return port_index - NSIM_DEV_VF_PORT_INDEX_BASE; } static struct dentry *nsim_dev_ddir; unsigned int nsim_dev_get_vfs(struct nsim_dev *nsim_dev) { WARN_ON(!lockdep_rtnl_is_held() && !devl_lock_is_held(priv_to_devlink(nsim_dev))); return nsim_dev->nsim_bus_dev->num_vfs; } static void nsim_bus_dev_set_vfs(struct nsim_bus_dev *nsim_bus_dev, unsigned int num_vfs) { rtnl_lock(); nsim_bus_dev->num_vfs = num_vfs; rtnl_unlock(); } #define NSIM_DEV_DUMMY_REGION_SIZE (1024 * 32) static int nsim_dev_take_snapshot(struct devlink *devlink, const struct devlink_region_ops *ops, struct netlink_ext_ack *extack, u8 **data) { void *dummy_data; dummy_data = kmalloc(NSIM_DEV_DUMMY_REGION_SIZE, GFP_KERNEL); if (!dummy_data) return -ENOMEM; get_random_bytes(dummy_data, NSIM_DEV_DUMMY_REGION_SIZE); *data = dummy_data; return 0; } static ssize_t nsim_dev_take_snapshot_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) { struct nsim_dev *nsim_dev = file->private_data; struct devlink *devlink; u8 *dummy_data; int err; u32 id; devlink = priv_to_devlink(nsim_dev); err = nsim_dev_take_snapshot(devlink, NULL, NULL, &dummy_data); if (err) return err; err = devlink_region_snapshot_id_get(devlink, &id); if (err) { pr_err("Failed to get snapshot id\n"); kfree(dummy_data); return err; } err = devlink_region_snapshot_create(nsim_dev->dummy_region, dummy_data, id); devlink_region_snapshot_id_put(devlink, id); if (err) { pr_err("Failed to create region snapshot\n"); kfree(dummy_data); return err; } return count; } static const struct file_operations nsim_dev_take_snapshot_fops = { .open = simple_open, .write = nsim_dev_take_snapshot_write, .llseek = generic_file_llseek, .owner = THIS_MODULE, }; static ssize_t nsim_dev_trap_fa_cookie_read(struct file *file, char __user *data, size_t count, loff_t *ppos) { struct nsim_dev *nsim_dev = file->private_data; struct flow_action_cookie *fa_cookie; unsigned int buf_len; ssize_t ret; char *buf; spin_lock(&nsim_dev->fa_cookie_lock); fa_cookie = nsim_dev->fa_cookie; if (!fa_cookie) { ret = -EINVAL; goto errout; } buf_len = fa_cookie->cookie_len * 2; buf = kmalloc(buf_len, GFP_ATOMIC); if (!buf) { ret = -ENOMEM; goto errout; } bin2hex(buf, fa_cookie->cookie, fa_cookie->cookie_len); spin_unlock(&nsim_dev->fa_cookie_lock); ret = simple_read_from_buffer(data, count, ppos, buf, buf_len); kfree(buf); return ret; errout: spin_unlock(&nsim_dev->fa_cookie_lock); return ret; } static ssize_t nsim_dev_trap_fa_cookie_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) { struct nsim_dev *nsim_dev = file->private_data; struct flow_action_cookie *fa_cookie; size_t cookie_len; ssize_t ret; char *buf; if (*ppos != 0) return -EINVAL; cookie_len = (count - 1) / 2; if ((count - 1) % 2) return -EINVAL; buf = memdup_user(data, count); if (IS_ERR(buf)) return PTR_ERR(buf); fa_cookie = kmalloc(sizeof(*fa_cookie) + cookie_len, GFP_KERNEL | __GFP_NOWARN); if (!fa_cookie) { ret = -ENOMEM; goto free_buf; } fa_cookie->cookie_len = cookie_len; ret = hex2bin(fa_cookie->cookie, buf, cookie_len); if (ret) goto free_fa_cookie; kfree(buf); spin_lock(&nsim_dev->fa_cookie_lock); kfree(nsim_dev->fa_cookie); nsim_dev->fa_cookie = fa_cookie; spin_unlock(&nsim_dev->fa_cookie_lock); return count; free_fa_cookie: kfree(fa_cookie); free_buf: kfree(buf); return ret; } static const struct file_operations nsim_dev_trap_fa_cookie_fops = { .open = simple_open, .read = nsim_dev_trap_fa_cookie_read, .write = nsim_dev_trap_fa_cookie_write, .llseek = generic_file_llseek, .owner = THIS_MODULE, }; static ssize_t nsim_bus_dev_max_vfs_read(struct file *file, char __user *data, size_t count, loff_t *ppos) { struct nsim_dev *nsim_dev = file->private_data; char buf[11]; ssize_t len; len = scnprintf(buf, sizeof(buf), "%u\n", READ_ONCE(nsim_dev->nsim_bus_dev->max_vfs)); return simple_read_from_buffer(data, count, ppos, buf, len); } static ssize_t nsim_bus_dev_max_vfs_write(struct file *file, const char __user *data, size_t count, loff_t *ppos) { struct nsim_vf_config *vfconfigs; struct nsim_dev *nsim_dev; char buf[10]; ssize_t ret; u32 val; if (*ppos != 0) return 0; if (count >= sizeof(buf)) return -ENOSPC; ret = copy_from_user(buf, data, count); if (ret) return -EFAULT; buf[count] = '\0'; ret = kstrtouint(buf, 10, &val); if (ret) return -EINVAL; /* max_vfs limited by the maximum number of provided port indexes */ if (val > NSIM_DEV_VF_PORT_INDEX_MAX - NSIM_DEV_VF_PORT_INDEX_BASE) return -ERANGE; vfconfigs = kcalloc(val, sizeof(struct nsim_vf_config), GFP_KERNEL | __GFP_NOWARN); if (!vfconfigs) return -ENOMEM; nsim_dev = file->private_data; devl_lock(priv_to_devlink(nsim_dev)); /* Reject if VFs are configured */ if (nsim_dev_get_vfs(nsim_dev)) { ret = -EBUSY; } else { swap(nsim_dev->vfconfigs, vfconfigs); WRITE_ONCE(nsim_dev->nsim_bus_dev->max_vfs, val); *ppos += count; ret = count; } devl_unlock(priv_to_devlink(nsim_dev)); kfree(vfconfigs); return ret; } static const struct file_operations nsim_dev_max_vfs_fops = { .open = simple_open, .read = nsim_bus_dev_max_vfs_read, .write = nsim_bus_dev_max_vfs_write, .llseek = generic_file_llseek, .owner = THIS_MODULE, }; static int nsim_dev_debugfs_init(struct nsim_dev *nsim_dev) { char dev_ddir_name[sizeof(DRV_NAME) + 10]; int err; sprintf(dev_ddir_name, DRV_NAME "%u", nsim_dev->nsim_bus_dev->dev.id); nsim_dev->ddir = debugfs_create_dir(dev_ddir_name, nsim_dev_ddir); if (IS_ERR(nsim_dev->ddir)) return PTR_ERR(nsim_dev->ddir); nsim_dev->ports_ddir = debugfs_create_dir("ports", nsim_dev->ddir); if (IS_ERR(nsim_dev->ports_ddir)) { err = PTR_ERR(nsim_dev->ports_ddir); goto err_ddir; } debugfs_create_bool("fw_update_status", 0600, nsim_dev->ddir, &nsim_dev->fw_update_status); debugfs_create_u32("fw_update_overwrite_mask", 0600, nsim_dev->ddir, &nsim_dev->fw_update_overwrite_mask); debugfs_create_u32("max_macs", 0600, nsim_dev->ddir, &nsim_dev->max_macs); debugfs_create_bool("test1", 0600, nsim_dev->ddir, &nsim_dev->test1); nsim_dev->take_snapshot = debugfs_create_file("take_snapshot", 0200, nsim_dev->ddir, nsim_dev, &nsim_dev_take_snapshot_fops); debugfs_create_bool("dont_allow_reload", 0600, nsim_dev->ddir, &nsim_dev->dont_allow_reload); debugfs_create_bool("fail_reload", 0600, nsim_dev->ddir, &nsim_dev->fail_reload); debugfs_create_file("trap_flow_action_cookie", 0600, nsim_dev->ddir, nsim_dev, &nsim_dev_trap_fa_cookie_fops); debugfs_create_bool("fail_trap_group_set", 0600, nsim_dev->ddir, &nsim_dev->fail_trap_group_set); debugfs_create_bool("fail_trap_policer_set", 0600, nsim_dev->ddir, &nsim_dev->fail_trap_policer_set); debugfs_create_bool("fail_trap_policer_counter_get", 0600, nsim_dev->ddir, &nsim_dev->fail_trap_policer_counter_get); /* caution, dev_max_vfs write takes devlink lock */ debugfs_create_file("max_vfs", 0600, nsim_dev->ddir, nsim_dev, &nsim_dev_max_vfs_fops); nsim_dev->nodes_ddir = debugfs_create_dir("rate_nodes", nsim_dev->ddir); if (IS_ERR(nsim_dev->nodes_ddir)) { err = PTR_ERR(nsim_dev->nodes_ddir); goto err_ports_ddir; } debugfs_create_bool("fail_trap_drop_counter_get", 0600, nsim_dev->ddir, &nsim_dev->fail_trap_drop_counter_get); nsim_udp_tunnels_debugfs_create(nsim_dev); return 0; err_ports_ddir: debugfs_remove_recursive(nsim_dev->ports_ddir); err_ddir: debugfs_remove_recursive(nsim_dev->ddir); return err; } static void nsim_dev_debugfs_exit(struct nsim_dev *nsim_dev) { debugfs_remove_recursive(nsim_dev->nodes_ddir); debugfs_remove_recursive(nsim_dev->ports_ddir); debugfs_remove_recursive(nsim_dev->ddir); } static ssize_t nsim_dev_rate_parent_read(struct file *file, char __user *data, size_t count, loff_t *ppos) { char **name_ptr = file->private_data; size_t len; if (!*name_ptr) return 0; len = strlen(*name_ptr); return simple_read_from_buffer(data, count, ppos, *name_ptr, len); } static const struct file_operations nsim_dev_rate_parent_fops = { .open = simple_open, .read = nsim_dev_rate_parent_read, .llseek = generic_file_llseek, .owner = THIS_MODULE, }; static int nsim_dev_port_debugfs_init(struct nsim_dev *nsim_dev, struct nsim_dev_port *nsim_dev_port) { struct nsim_bus_dev *nsim_bus_dev = nsim_dev->nsim_bus_dev; unsigned int port_index = nsim_dev_port->port_index; char port_ddir_name[16]; char dev_link_name[32]; sprintf(port_ddir_name, "%u", port_index); nsim_dev_port->ddir = debugfs_create_dir(port_ddir_name, nsim_dev->ports_ddir); if (IS_ERR(nsim_dev_port->ddir)) return PTR_ERR(nsim_dev_port->ddir); sprintf(dev_link_name, "../../../" DRV_NAME "%u", nsim_bus_dev->dev.id); if (nsim_dev_port_is_vf(nsim_dev_port)) { unsigned int vf_id = nsim_dev_port_index_to_vf_index(port_index); debugfs_create_u16("tx_share", 0400, nsim_dev_port->ddir, &nsim_dev->vfconfigs[vf_id].min_tx_rate); debugfs_create_u16("tx_max", 0400, nsim_dev_port->ddir, &nsim_dev->vfconfigs[vf_id].max_tx_rate); nsim_dev_port->rate_parent = debugfs_create_file("rate_parent", 0400, nsim_dev_port->ddir, &nsim_dev_port->parent_name, &nsim_dev_rate_parent_fops); } debugfs_create_symlink("dev", nsim_dev_port->ddir, dev_link_name); return 0; } static void nsim_dev_port_debugfs_exit(struct nsim_dev_port *nsim_dev_port) { debugfs_remove_recursive(nsim_dev_port->ddir); } static int nsim_dev_resources_register(struct devlink *devlink) { struct devlink_resource_size_params params = { .size_max = (u64)-1, .size_granularity = 1, .unit = DEVLINK_RESOURCE_UNIT_ENTRY }; int err; /* Resources for IPv4 */ err = devl_resource_register(devlink, "IPv4", (u64)-1, NSIM_RESOURCE_IPV4, DEVLINK_RESOURCE_ID_PARENT_TOP, &params); if (err) { pr_err("Failed to register IPv4 top resource\n"); goto err_out; } err = devl_resource_register(devlink, "fib", (u64)-1, NSIM_RESOURCE_IPV4_FIB, NSIM_RESOURCE_IPV4, &params); if (err) { pr_err("Failed to register IPv4 FIB resource\n"); goto err_out; } err = devl_resource_register(devlink, "fib-rules", (u64)-1, NSIM_RESOURCE_IPV4_FIB_RULES, NSIM_RESOURCE_IPV4, &params); if (err) { pr_err("Failed to register IPv4 FIB rules resource\n"); goto err_out; } /* Resources for IPv6 */ err = devl_resource_register(devlink, "IPv6", (u64)-1, NSIM_RESOURCE_IPV6, DEVLINK_RESOURCE_ID_PARENT_TOP, &params); if (err) { pr_err("Failed to register IPv6 top resource\n"); goto err_out; } err = devl_resource_register(devlink, "fib", (u64)-1, NSIM_RESOURCE_IPV6_FIB, NSIM_RESOURCE_IPV6, &params); if (err) { pr_err("Failed to register IPv6 FIB resource\n"); goto err_out; } err = devl_resource_register(devlink, "fib-rules", (u64)-1, NSIM_RESOURCE_IPV6_FIB_RULES, NSIM_RESOURCE_IPV6, &params); if (err) { pr_err("Failed to register IPv6 FIB rules resource\n"); goto err_out; } /* Resources for nexthops */ err = devl_resource_register(devlink, "nexthops", (u64)-1, NSIM_RESOURCE_NEXTHOPS, DEVLINK_RESOURCE_ID_PARENT_TOP, &params); if (err) { pr_err("Failed to register NEXTHOPS resource\n"); goto err_out; } return 0; err_out: devl_resources_unregister(devlink); return err; } enum nsim_devlink_param_id { NSIM_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX, NSIM_DEVLINK_PARAM_ID_TEST1, }; static const struct devlink_param nsim_devlink_params[] = { DEVLINK_PARAM_GENERIC(MAX_MACS, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL, NULL), DEVLINK_PARAM_DRIVER(NSIM_DEVLINK_PARAM_ID_TEST1, "test1", DEVLINK_PARAM_TYPE_BOOL, BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL, NULL), }; static void nsim_devlink_set_params_init_values(struct nsim_dev *nsim_dev, struct devlink *devlink) { union devlink_param_value value; value.vu32 = nsim_dev->max_macs; devl_param_driverinit_value_set(devlink, DEVLINK_PARAM_GENERIC_ID_MAX_MACS, value); value.vbool = nsim_dev->test1; devl_param_driverinit_value_set(devlink, NSIM_DEVLINK_PARAM_ID_TEST1, value); } static void nsim_devlink_param_load_driverinit_values(struct devlink *devlink) { struct nsim_dev *nsim_dev = devlink_priv(devlink); union devlink_param_value saved_value; int err; err = devl_param_driverinit_value_get(devlink, DEVLINK_PARAM_GENERIC_ID_MAX_MACS, &saved_value); if (!err) nsim_dev->max_macs = saved_value.vu32; err = devl_param_driverinit_value_get(devlink, NSIM_DEVLINK_PARAM_ID_TEST1, &saved_value); if (!err) nsim_dev->test1 = saved_value.vbool; } #define NSIM_DEV_DUMMY_REGION_SNAPSHOT_MAX 16 static const struct devlink_region_ops dummy_region_ops = { .name = "dummy", .destructor = &kfree, .snapshot = nsim_dev_take_snapshot, }; static int nsim_dev_dummy_region_init(struct nsim_dev *nsim_dev, struct devlink *devlink) { nsim_dev->dummy_region = devl_region_create(devlink, &dummy_region_ops, NSIM_DEV_DUMMY_REGION_SNAPSHOT_MAX, NSIM_DEV_DUMMY_REGION_SIZE); return PTR_ERR_OR_ZERO(nsim_dev->dummy_region); } static void nsim_dev_dummy_region_exit(struct nsim_dev *nsim_dev) { devl_region_destroy(nsim_dev->dummy_region); } static int __nsim_dev_port_add(struct nsim_dev *nsim_dev, enum nsim_dev_port_type type, unsigned int port_index); static void __nsim_dev_port_del(struct nsim_dev_port *nsim_dev_port); static int nsim_esw_legacy_enable(struct nsim_dev *nsim_dev, struct netlink_ext_ack *extack) { struct devlink *devlink = priv_to_devlink(nsim_dev); struct nsim_dev_port *nsim_dev_port, *tmp; devl_rate_nodes_destroy(devlink); list_for_each_entry_safe(nsim_dev_port, tmp, &nsim_dev->port_list, list) if (nsim_dev_port_is_vf(nsim_dev_port)) __nsim_dev_port_del(nsim_dev_port); nsim_dev->esw_mode = DEVLINK_ESWITCH_MODE_LEGACY; return 0; } static int nsim_esw_switchdev_enable(struct nsim_dev *nsim_dev, struct netlink_ext_ack *extack) { struct nsim_dev_port *nsim_dev_port, *tmp; int i, err; for (i = 0; i < nsim_dev_get_vfs(nsim_dev); i++) { err = __nsim_dev_port_add(nsim_dev, NSIM_DEV_PORT_TYPE_VF, i); if (err) { NL_SET_ERR_MSG_MOD(extack, "Failed to initialize VFs' netdevsim ports"); pr_err("Failed to initialize VF id=%d. %d.\n", i, err); goto err_port_add_vfs; } } nsim_dev->esw_mode = DEVLINK_ESWITCH_MODE_SWITCHDEV; return 0; err_port_add_vfs: list_for_each_entry_safe(nsim_dev_port, tmp, &nsim_dev->port_list, list) if (nsim_dev_port_is_vf(nsim_dev_port)) __nsim_dev_port_del(nsim_dev_port); return err; } static int nsim_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); if (mode == nsim_dev->esw_mode) return 0; if (mode == DEVLINK_ESWITCH_MODE_LEGACY) return nsim_esw_legacy_enable(nsim_dev, extack); if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV) return nsim_esw_switchdev_enable(nsim_dev, extack); return -EINVAL; } static int nsim_devlink_eswitch_mode_get(struct devlink *devlink, u16 *mode) { struct nsim_dev *nsim_dev = devlink_priv(devlink); *mode = nsim_dev->esw_mode; return 0; } struct nsim_trap_item { void *trap_ctx; enum devlink_trap_action action; }; struct nsim_trap_data { struct delayed_work trap_report_dw; struct nsim_trap_item *trap_items_arr; u64 *trap_policers_cnt_arr; u64 trap_pkt_cnt; struct nsim_dev *nsim_dev; spinlock_t trap_lock; /* Protects trap_items_arr */ }; /* All driver-specific traps must be documented in * Documentation/networking/devlink/netdevsim.rst */ enum { NSIM_TRAP_ID_BASE = DEVLINK_TRAP_GENERIC_ID_MAX, NSIM_TRAP_ID_FID_MISS, }; #define NSIM_TRAP_NAME_FID_MISS "fid_miss" #define NSIM_TRAP_METADATA DEVLINK_TRAP_METADATA_TYPE_F_IN_PORT #define NSIM_TRAP_DROP(_id, _group_id) \ DEVLINK_TRAP_GENERIC(DROP, DROP, _id, \ DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ NSIM_TRAP_METADATA) #define NSIM_TRAP_DROP_EXT(_id, _group_id, _metadata) \ DEVLINK_TRAP_GENERIC(DROP, DROP, _id, \ DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ NSIM_TRAP_METADATA | (_metadata)) #define NSIM_TRAP_EXCEPTION(_id, _group_id) \ DEVLINK_TRAP_GENERIC(EXCEPTION, TRAP, _id, \ DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ NSIM_TRAP_METADATA) #define NSIM_TRAP_CONTROL(_id, _group_id, _action) \ DEVLINK_TRAP_GENERIC(CONTROL, _action, _id, \ DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ NSIM_TRAP_METADATA) #define NSIM_TRAP_DRIVER_EXCEPTION(_id, _group_id) \ DEVLINK_TRAP_DRIVER(EXCEPTION, TRAP, NSIM_TRAP_ID_##_id, \ NSIM_TRAP_NAME_##_id, \ DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \ NSIM_TRAP_METADATA) #define NSIM_DEV_TRAP_POLICER_MIN_RATE 1 #define NSIM_DEV_TRAP_POLICER_MAX_RATE 8000 #define NSIM_DEV_TRAP_POLICER_MIN_BURST 8 #define NSIM_DEV_TRAP_POLICER_MAX_BURST 65536 #define NSIM_TRAP_POLICER(_id, _rate, _burst) \ DEVLINK_TRAP_POLICER(_id, _rate, _burst, \ NSIM_DEV_TRAP_POLICER_MAX_RATE, \ NSIM_DEV_TRAP_POLICER_MIN_RATE, \ NSIM_DEV_TRAP_POLICER_MAX_BURST, \ NSIM_DEV_TRAP_POLICER_MIN_BURST) static const struct devlink_trap_policer nsim_trap_policers_arr[] = { NSIM_TRAP_POLICER(1, 1000, 128), NSIM_TRAP_POLICER(2, 2000, 256), NSIM_TRAP_POLICER(3, 3000, 512), }; static const struct devlink_trap_group nsim_trap_groups_arr[] = { DEVLINK_TRAP_GROUP_GENERIC(L2_DROPS, 0), DEVLINK_TRAP_GROUP_GENERIC(L3_DROPS, 1), DEVLINK_TRAP_GROUP_GENERIC(L3_EXCEPTIONS, 1), DEVLINK_TRAP_GROUP_GENERIC(BUFFER_DROPS, 2), DEVLINK_TRAP_GROUP_GENERIC(ACL_DROPS, 3), DEVLINK_TRAP_GROUP_GENERIC(MC_SNOOPING, 3), }; static const struct devlink_trap nsim_traps_arr[] = { NSIM_TRAP_DROP(SMAC_MC, L2_DROPS), NSIM_TRAP_DROP(VLAN_TAG_MISMATCH, L2_DROPS), NSIM_TRAP_DROP(INGRESS_VLAN_FILTER, L2_DROPS), NSIM_TRAP_DROP(INGRESS_STP_FILTER, L2_DROPS), NSIM_TRAP_DROP(EMPTY_TX_LIST, L2_DROPS), NSIM_TRAP_DROP(PORT_LOOPBACK_FILTER, L2_DROPS), NSIM_TRAP_DRIVER_EXCEPTION(FID_MISS, L2_DROPS), NSIM_TRAP_DROP(BLACKHOLE_ROUTE, L3_DROPS), NSIM_TRAP_EXCEPTION(TTL_ERROR, L3_EXCEPTIONS), NSIM_TRAP_DROP(TAIL_DROP, BUFFER_DROPS), NSIM_TRAP_DROP_EXT(INGRESS_FLOW_ACTION_DROP, ACL_DROPS, DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE), NSIM_TRAP_DROP_EXT(EGRESS_FLOW_ACTION_DROP, ACL_DROPS, DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE), NSIM_TRAP_CONTROL(IGMP_QUERY, MC_SNOOPING, MIRROR), NSIM_TRAP_CONTROL(IGMP_V1_REPORT, MC_SNOOPING, TRAP), }; #define NSIM_TRAP_L4_DATA_LEN 100 static struct sk_buff *nsim_dev_trap_skb_build(void) { int tot_len, data_len = NSIM_TRAP_L4_DATA_LEN; struct sk_buff *skb; struct udphdr *udph; struct ethhdr *eth; struct iphdr *iph; skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); if (!skb) return NULL; tot_len = sizeof(struct iphdr) + sizeof(struct udphdr) + data_len; skb_reset_mac_header(skb); eth = skb_put(skb, sizeof(struct ethhdr)); eth_random_addr(eth->h_dest); eth_random_addr(eth->h_source); eth->h_proto = htons(ETH_P_IP); skb->protocol = htons(ETH_P_IP); skb_set_network_header(skb, skb->len); iph = skb_put(skb, sizeof(struct iphdr)); iph->protocol = IPPROTO_UDP; iph->saddr = in_aton("192.0.2.1"); iph->daddr = in_aton("198.51.100.1"); iph->version = 0x4; iph->frag_off = 0; iph->ihl = 0x5; iph->tot_len = htons(tot_len); iph->ttl = 100; iph->check = 0; iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); skb_set_transport_header(skb, skb->len); udph = skb_put_zero(skb, sizeof(struct udphdr) + data_len); get_random_bytes(&udph->source, sizeof(u16)); get_random_bytes(&udph->dest, sizeof(u16)); udph->len = htons(sizeof(struct udphdr) + data_len); return skb; } static void nsim_dev_trap_report(struct nsim_dev_port *nsim_dev_port) { struct nsim_dev *nsim_dev = nsim_dev_port->ns->nsim_dev; struct devlink *devlink = priv_to_devlink(nsim_dev); struct nsim_trap_data *nsim_trap_data; int i; nsim_trap_data = nsim_dev->trap_data; spin_lock(&nsim_trap_data->trap_lock); for (i = 0; i < ARRAY_SIZE(nsim_traps_arr); i++) { struct flow_action_cookie *fa_cookie = NULL; struct nsim_trap_item *nsim_trap_item; struct sk_buff *skb; bool has_fa_cookie; has_fa_cookie = nsim_traps_arr[i].metadata_cap & DEVLINK_TRAP_METADATA_TYPE_F_FA_COOKIE; nsim_trap_item = &nsim_trap_data->trap_items_arr[i]; if (nsim_trap_item->action == DEVLINK_TRAP_ACTION_DROP) continue; skb = nsim_dev_trap_skb_build(); if (!skb) continue; skb->dev = nsim_dev_port->ns->netdev; /* Trapped packets are usually passed to devlink in softIRQ, * but in this case they are generated in a workqueue. Disable * softIRQs to prevent lockdep from complaining about * "incosistent lock state". */ spin_lock_bh(&nsim_dev->fa_cookie_lock); fa_cookie = has_fa_cookie ? nsim_dev->fa_cookie : NULL; devlink_trap_report(devlink, skb, nsim_trap_item->trap_ctx, &nsim_dev_port->devlink_port, fa_cookie); spin_unlock_bh(&nsim_dev->fa_cookie_lock); consume_skb(skb); } spin_unlock(&nsim_trap_data->trap_lock); } #define NSIM_TRAP_REPORT_INTERVAL_MS 100 static void nsim_dev_trap_report_work(struct work_struct *work) { struct nsim_trap_data *nsim_trap_data; struct nsim_dev_port *nsim_dev_port; struct nsim_dev *nsim_dev; nsim_trap_data = container_of(work, struct nsim_trap_data, trap_report_dw.work); nsim_dev = nsim_trap_data->nsim_dev; /* For each running port and enabled packet trap, generate a UDP * packet with a random 5-tuple and report it. */ if (!devl_trylock(priv_to_devlink(nsim_dev))) { schedule_delayed_work(&nsim_dev->trap_data->trap_report_dw, 0); return; } list_for_each_entry(nsim_dev_port, &nsim_dev->port_list, list) { if (!netif_running(nsim_dev_port->ns->netdev)) continue; nsim_dev_trap_report(nsim_dev_port); } devl_unlock(priv_to_devlink(nsim_dev)); schedule_delayed_work(&nsim_dev->trap_data->trap_report_dw, msecs_to_jiffies(NSIM_TRAP_REPORT_INTERVAL_MS)); } static int nsim_dev_traps_init(struct devlink *devlink) { size_t policers_count = ARRAY_SIZE(nsim_trap_policers_arr); struct nsim_dev *nsim_dev = devlink_priv(devlink); struct nsim_trap_data *nsim_trap_data; int err; nsim_trap_data = kzalloc(sizeof(*nsim_trap_data), GFP_KERNEL); if (!nsim_trap_data) return -ENOMEM; nsim_trap_data->trap_items_arr = kcalloc(ARRAY_SIZE(nsim_traps_arr), sizeof(struct nsim_trap_item), GFP_KERNEL); if (!nsim_trap_data->trap_items_arr) { err = -ENOMEM; goto err_trap_data_free; } nsim_trap_data->trap_policers_cnt_arr = kcalloc(policers_count, sizeof(u64), GFP_KERNEL); if (!nsim_trap_data->trap_policers_cnt_arr) { err = -ENOMEM; goto err_trap_items_free; } /* The lock is used to protect the action state of the registered * traps. The value is written by user and read in delayed work when * iterating over all the traps. */ spin_lock_init(&nsim_trap_data->trap_lock); nsim_trap_data->nsim_dev = nsim_dev; nsim_dev->trap_data = nsim_trap_data; err = devl_trap_policers_register(devlink, nsim_trap_policers_arr, policers_count); if (err) goto err_trap_policers_cnt_free; err = devl_trap_groups_register(devlink, nsim_trap_groups_arr, ARRAY_SIZE(nsim_trap_groups_arr)); if (err) goto err_trap_policers_unregister; err = devl_traps_register(devlink, nsim_traps_arr, ARRAY_SIZE(nsim_traps_arr), NULL); if (err) goto err_trap_groups_unregister; INIT_DELAYED_WORK(&nsim_dev->trap_data->trap_report_dw, nsim_dev_trap_report_work); schedule_delayed_work(&nsim_dev->trap_data->trap_report_dw, msecs_to_jiffies(NSIM_TRAP_REPORT_INTERVAL_MS)); return 0; err_trap_groups_unregister: devl_trap_groups_unregister(devlink, nsim_trap_groups_arr, ARRAY_SIZE(nsim_trap_groups_arr)); err_trap_policers_unregister: devl_trap_policers_unregister(devlink, nsim_trap_policers_arr, ARRAY_SIZE(nsim_trap_policers_arr)); err_trap_policers_cnt_free: kfree(nsim_trap_data->trap_policers_cnt_arr); err_trap_items_free: kfree(nsim_trap_data->trap_items_arr); err_trap_data_free: kfree(nsim_trap_data); return err; } static void nsim_dev_traps_exit(struct devlink *devlink) { struct nsim_dev *nsim_dev = devlink_priv(devlink); /* caution, trap work takes devlink lock */ cancel_delayed_work_sync(&nsim_dev->trap_data->trap_report_dw); devl_traps_unregister(devlink, nsim_traps_arr, ARRAY_SIZE(nsim_traps_arr)); devl_trap_groups_unregister(devlink, nsim_trap_groups_arr, ARRAY_SIZE(nsim_trap_groups_arr)); devl_trap_policers_unregister(devlink, nsim_trap_policers_arr, ARRAY_SIZE(nsim_trap_policers_arr)); kfree(nsim_dev->trap_data->trap_policers_cnt_arr); kfree(nsim_dev->trap_data->trap_items_arr); kfree(nsim_dev->trap_data); } static int nsim_dev_reload_create(struct nsim_dev *nsim_dev, struct netlink_ext_ack *extack); static void nsim_dev_reload_destroy(struct nsim_dev *nsim_dev); static int nsim_dev_reload_down(struct devlink *devlink, bool netns_change, enum devlink_reload_action action, enum devlink_reload_limit limit, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); if (nsim_dev->dont_allow_reload) { /* For testing purposes, user set debugfs dont_allow_reload * value to true. So forbid it. */ NL_SET_ERR_MSG_MOD(extack, "User forbid the reload for testing purposes"); return -EOPNOTSUPP; } nsim_dev_reload_destroy(nsim_dev); return 0; } static int nsim_dev_reload_up(struct devlink *devlink, enum devlink_reload_action action, enum devlink_reload_limit limit, u32 *actions_performed, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); if (nsim_dev->fail_reload) { /* For testing purposes, user set debugfs fail_reload * value to true. Fail right away. */ NL_SET_ERR_MSG_MOD(extack, "User setup the reload to fail for testing purposes"); return -EINVAL; } *actions_performed = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT); return nsim_dev_reload_create(nsim_dev, extack); } static int nsim_dev_info_get(struct devlink *devlink, struct devlink_info_req *req, struct netlink_ext_ack *extack) { int err; err = devlink_info_version_stored_put_ext(req, "fw.mgmt", "10.20.30", DEVLINK_INFO_VERSION_TYPE_COMPONENT); if (err) return err; return devlink_info_version_running_put_ext(req, "fw.mgmt", "10.20.30", DEVLINK_INFO_VERSION_TYPE_COMPONENT); } #define NSIM_DEV_FLASH_SIZE 500000 #define NSIM_DEV_FLASH_CHUNK_SIZE 1000 #define NSIM_DEV_FLASH_CHUNK_TIME_MS 10 static int nsim_dev_flash_update(struct devlink *devlink, struct devlink_flash_update_params *params, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); int i; if ((params->overwrite_mask & ~nsim_dev->fw_update_overwrite_mask) != 0) return -EOPNOTSUPP; if (nsim_dev->fw_update_status) { devlink_flash_update_status_notify(devlink, "Preparing to flash", params->component, 0, 0); } for (i = 0; i < NSIM_DEV_FLASH_SIZE / NSIM_DEV_FLASH_CHUNK_SIZE; i++) { if (nsim_dev->fw_update_status) devlink_flash_update_status_notify(devlink, "Flashing", params->component, i * NSIM_DEV_FLASH_CHUNK_SIZE, NSIM_DEV_FLASH_SIZE); msleep(NSIM_DEV_FLASH_CHUNK_TIME_MS); } if (nsim_dev->fw_update_status) { devlink_flash_update_status_notify(devlink, "Flashing", params->component, NSIM_DEV_FLASH_SIZE, NSIM_DEV_FLASH_SIZE); devlink_flash_update_timeout_notify(devlink, "Flash select", params->component, 81); devlink_flash_update_status_notify(devlink, "Flashing done", params->component, 0, 0); } return 0; } static struct nsim_trap_item * nsim_dev_trap_item_lookup(struct nsim_dev *nsim_dev, u16 trap_id) { struct nsim_trap_data *nsim_trap_data = nsim_dev->trap_data; int i; for (i = 0; i < ARRAY_SIZE(nsim_traps_arr); i++) { if (nsim_traps_arr[i].id == trap_id) return &nsim_trap_data->trap_items_arr[i]; } return NULL; } static int nsim_dev_devlink_trap_init(struct devlink *devlink, const struct devlink_trap *trap, void *trap_ctx) { struct nsim_dev *nsim_dev = devlink_priv(devlink); struct nsim_trap_item *nsim_trap_item; nsim_trap_item = nsim_dev_trap_item_lookup(nsim_dev, trap->id); if (WARN_ON(!nsim_trap_item)) return -ENOENT; nsim_trap_item->trap_ctx = trap_ctx; nsim_trap_item->action = trap->init_action; return 0; } static int nsim_dev_devlink_trap_action_set(struct devlink *devlink, const struct devlink_trap *trap, enum devlink_trap_action action, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); struct nsim_trap_item *nsim_trap_item; nsim_trap_item = nsim_dev_trap_item_lookup(nsim_dev, trap->id); if (WARN_ON(!nsim_trap_item)) return -ENOENT; spin_lock(&nsim_dev->trap_data->trap_lock); nsim_trap_item->action = action; spin_unlock(&nsim_dev->trap_data->trap_lock); return 0; } static int nsim_dev_devlink_trap_group_set(struct devlink *devlink, const struct devlink_trap_group *group, const struct devlink_trap_policer *policer, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); if (nsim_dev->fail_trap_group_set) return -EINVAL; return 0; } static int nsim_dev_devlink_trap_policer_set(struct devlink *devlink, const struct devlink_trap_policer *policer, u64 rate, u64 burst, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(devlink); if (nsim_dev->fail_trap_policer_set) { NL_SET_ERR_MSG_MOD(extack, "User setup the operation to fail for testing purposes"); return -EINVAL; } return 0; } static int nsim_dev_devlink_trap_policer_counter_get(struct devlink *devlink, const struct devlink_trap_policer *policer, u64 *p_drops) { struct nsim_dev *nsim_dev = devlink_priv(devlink); u64 *cnt; if (nsim_dev->fail_trap_policer_counter_get) return -EINVAL; cnt = &nsim_dev->trap_data->trap_policers_cnt_arr[policer->id - 1]; *p_drops = (*cnt)++; return 0; } #define NSIM_LINK_SPEED_MAX 5000 /* Mbps */ #define NSIM_LINK_SPEED_UNIT 125000 /* 1 Mbps given in bytes/sec to avoid * u64 overflow during conversion from * bytes to bits. */ static int nsim_rate_bytes_to_units(char *name, u64 *rate, struct netlink_ext_ack *extack) { u64 val; u32 rem; val = div_u64_rem(*rate, NSIM_LINK_SPEED_UNIT, &rem); if (rem) { pr_err("%s rate value %lluBps not in link speed units of 1Mbps.\n", name, *rate); NL_SET_ERR_MSG_MOD(extack, "TX rate value not in link speed units of 1Mbps."); return -EINVAL; } if (val > NSIM_LINK_SPEED_MAX) { pr_err("%s rate value %lluMbps exceed link maximum speed 5000Mbps.\n", name, val); NL_SET_ERR_MSG_MOD(extack, "TX rate value exceed link maximum speed 5000Mbps."); return -EINVAL; } *rate = val; return 0; } static int nsim_leaf_tx_share_set(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack) { struct nsim_dev_port *nsim_dev_port = priv; struct nsim_dev *nsim_dev = nsim_dev_port->ns->nsim_dev; int vf_id = nsim_dev_port_index_to_vf_index(nsim_dev_port->port_index); int err; err = nsim_rate_bytes_to_units("tx_share", &tx_share, extack); if (err) return err; nsim_dev->vfconfigs[vf_id].min_tx_rate = tx_share; return 0; } static int nsim_leaf_tx_max_set(struct devlink_rate *devlink_rate, void *priv, u64 tx_max, struct netlink_ext_ack *extack) { struct nsim_dev_port *nsim_dev_port = priv; struct nsim_dev *nsim_dev = nsim_dev_port->ns->nsim_dev; int vf_id = nsim_dev_port_index_to_vf_index(nsim_dev_port->port_index); int err; err = nsim_rate_bytes_to_units("tx_max", &tx_max, extack); if (err) return err; nsim_dev->vfconfigs[vf_id].max_tx_rate = tx_max; return 0; } struct nsim_rate_node { struct dentry *ddir; struct dentry *rate_parent; char *parent_name; u16 tx_share; u16 tx_max; }; static int nsim_node_tx_share_set(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack) { struct nsim_rate_node *nsim_node = priv; int err; err = nsim_rate_bytes_to_units("tx_share", &tx_share, extack); if (err) return err; nsim_node->tx_share = tx_share; return 0; } static int nsim_node_tx_max_set(struct devlink_rate *devlink_rate, void *priv, u64 tx_max, struct netlink_ext_ack *extack) { struct nsim_rate_node *nsim_node = priv; int err; err = nsim_rate_bytes_to_units("tx_max", &tx_max, extack); if (err) return err; nsim_node->tx_max = tx_max; return 0; } static int nsim_rate_node_new(struct devlink_rate *node, void **priv, struct netlink_ext_ack *extack) { struct nsim_dev *nsim_dev = devlink_priv(node->devlink); struct nsim_rate_node *nsim_node; if (!nsim_esw_mode_is_switchdev(nsim_dev)) { NL_SET_ERR_MSG_MOD(extack, "Node creation allowed only in switchdev mode."); return -EOPNOTSUPP; } nsim_node = kzalloc(sizeof(*nsim_node), GFP_KERNEL); if (!nsim_node) return -ENOMEM; nsim_node->ddir = debugfs_create_dir(node->name, nsim_dev->nodes_ddir); debugfs_create_u16("tx_share", 0400, nsim_node->ddir, &nsim_node->tx_share); debugfs_create_u16("tx_max", 0400, nsim_node->ddir, &nsim_node->tx_max); nsim_node->rate_parent = debugfs_create_file("rate_parent", 0400, nsim_node->ddir, &nsim_node->parent_name, &nsim_dev_rate_parent_fops); *priv = nsim_node; return 0; } static int nsim_rate_node_del(struct devlink_rate *node, void *priv, struct netlink_ext_ack *extack) { struct nsim_rate_node *nsim_node = priv; debugfs_remove(nsim_node->rate_parent); debugfs_remove_recursive(nsim_node->ddir); kfree(nsim_node); return 0; } static int nsim_rate_leaf_parent_set(struct devlink_rate *child, struct devlink_rate *parent, void *priv_child, void *priv_parent, struct netlink_ext_ack *extack) { struct nsim_dev_port *nsim_dev_port = priv_child; if (parent) nsim_dev_port->parent_name = parent->name; else nsim_dev_port->parent_name = NULL; return 0; } static int nsim_rate_node_parent_set(struct devlink_rate *child, struct devlink_rate *parent, void *priv_child, void *priv_parent, struct netlink_ext_ack *extack) { struct nsim_rate_node *nsim_node = priv_child; if (parent) nsim_node->parent_name = parent->name; else nsim_node->parent_name = NULL; return 0; } static int nsim_dev_devlink_trap_drop_counter_get(struct devlink *devlink, const struct devlink_trap *trap, u64 *p_drops) { struct nsim_dev *nsim_dev = devlink_priv(devlink); u64 *cnt; if (nsim_dev->fail_trap_drop_counter_get) return -EINVAL; cnt = &nsim_dev->trap_data->trap_pkt_cnt; *p_drops = (*cnt)++; return 0; } static const struct devlink_ops nsim_dev_devlink_ops = { .eswitch_mode_set = nsim_devlink_eswitch_mode_set, .eswitch_mode_get = nsim_devlink_eswitch_mode_get, .supported_flash_update_params = DEVLINK_SUPPORT_FLASH_UPDATE_OVERWRITE_MASK, .reload_actions = BIT(DEVLINK_RELOAD_ACTION_DRIVER_REINIT), .reload_down = nsim_dev_reload_down, .reload_up = nsim_dev_reload_up, .info_get = nsim_dev_info_get, .flash_update = nsim_dev_flash_update, .trap_init = nsim_dev_devlink_trap_init, .trap_action_set = nsim_dev_devlink_trap_action_set, .trap_group_set = nsim_dev_devlink_trap_group_set, .trap_policer_set = nsim_dev_devlink_trap_policer_set, .trap_policer_counter_get = nsim_dev_devlink_trap_policer_counter_get, .rate_leaf_tx_share_set = nsim_leaf_tx_share_set, .rate_leaf_tx_max_set = nsim_leaf_tx_max_set, .rate_node_tx_share_set = nsim_node_tx_share_set, .rate_node_tx_max_set = nsim_node_tx_max_set, .rate_node_new = nsim_rate_node_new, .rate_node_del = nsim_rate_node_del, .rate_leaf_parent_set = nsim_rate_leaf_parent_set, .rate_node_parent_set = nsim_rate_node_parent_set, .trap_drop_counter_get = nsim_dev_devlink_trap_drop_counter_get, }; #define NSIM_DEV_MAX_MACS_DEFAULT 32 #define NSIM_DEV_TEST1_DEFAULT true static int __nsim_dev_port_add(struct nsim_dev *nsim_dev, enum nsim_dev_port_type type, unsigned int port_index) { struct devlink_port_attrs attrs = {}; struct nsim_dev_port *nsim_dev_port; struct devlink_port *devlink_port; int err; if (type == NSIM_DEV_PORT_TYPE_VF && !nsim_dev_get_vfs(nsim_dev)) return -EINVAL; nsim_dev_port = kzalloc(sizeof(*nsim_dev_port), GFP_KERNEL); if (!nsim_dev_port) return -ENOMEM; nsim_dev_port->port_index = nsim_dev_port_index(type, port_index); nsim_dev_port->port_type = type; devlink_port = &nsim_dev_port->devlink_port; if (nsim_dev_port_is_pf(nsim_dev_port)) { attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; attrs.phys.port_number = port_index + 1; } else { attrs.flavour = DEVLINK_PORT_FLAVOUR_PCI_VF; attrs.pci_vf.pf = 0; attrs.pci_vf.vf = port_index; } memcpy(attrs.switch_id.id, nsim_dev->switch_id.id, nsim_dev->switch_id.id_len); attrs.switch_id.id_len = nsim_dev->switch_id.id_len; devlink_port_attrs_set(devlink_port, &attrs); err = devl_port_register(priv_to_devlink(nsim_dev), devlink_port, nsim_dev_port->port_index); if (err) goto err_port_free; err = nsim_dev_port_debugfs_init(nsim_dev, nsim_dev_port); if (err) goto err_dl_port_unregister; nsim_dev_port->ns = nsim_create(nsim_dev, nsim_dev_port); if (IS_ERR(nsim_dev_port->ns)) { err = PTR_ERR(nsim_dev_port->ns); goto err_port_debugfs_exit; } if (nsim_dev_port_is_vf(nsim_dev_port)) { err = devl_rate_leaf_create(&nsim_dev_port->devlink_port, nsim_dev_port, NULL); if (err) goto err_nsim_destroy; } list_add(&nsim_dev_port->list, &nsim_dev->port_list); return 0; err_nsim_destroy: nsim_destroy(nsim_dev_port->ns); err_port_debugfs_exit: nsim_dev_port_debugfs_exit(nsim_dev_port); err_dl_port_unregister: devl_port_unregister(devlink_port); err_port_free: kfree(nsim_dev_port); return err; } static void __nsim_dev_port_del(struct nsim_dev_port *nsim_dev_port) { struct devlink_port *devlink_port = &nsim_dev_port->devlink_port; list_del(&nsim_dev_port->list); if (nsim_dev_port_is_vf(nsim_dev_port)) devl_rate_leaf_destroy(&nsim_dev_port->devlink_port); nsim_destroy(nsim_dev_port->ns); nsim_dev_port_debugfs_exit(nsim_dev_port); devl_port_unregister(devlink_port); kfree(nsim_dev_port); } static void nsim_dev_port_del_all(struct nsim_dev *nsim_dev) { struct nsim_dev_port *nsim_dev_port, *tmp; list_for_each_entry_safe(nsim_dev_port, tmp, &nsim_dev->port_list, list) __nsim_dev_port_del(nsim_dev_port); } static int nsim_dev_port_add_all(struct nsim_dev *nsim_dev, unsigned int port_count) { int i, err; for (i = 0; i < port_count; i++) { err = __nsim_dev_port_add(nsim_dev, NSIM_DEV_PORT_TYPE_PF, i); if (err) goto err_port_del_all; } return 0; err_port_del_all: nsim_dev_port_del_all(nsim_dev); return err; } static int nsim_dev_reload_create(struct nsim_dev *nsim_dev, struct netlink_ext_ack *extack) { struct nsim_bus_dev *nsim_bus_dev = nsim_dev->nsim_bus_dev; struct devlink *devlink; int err; devlink = priv_to_devlink(nsim_dev); nsim_dev = devlink_priv(devlink); INIT_LIST_HEAD(&nsim_dev->port_list); nsim_dev->fw_update_status = true; nsim_dev->fw_update_overwrite_mask = 0; nsim_devlink_param_load_driverinit_values(devlink); err = nsim_dev_dummy_region_init(nsim_dev, devlink); if (err) return err; err = nsim_dev_traps_init(devlink); if (err) goto err_dummy_region_exit; nsim_dev->fib_data = nsim_fib_create(devlink, extack); if (IS_ERR(nsim_dev->fib_data)) { err = PTR_ERR(nsim_dev->fib_data); goto err_traps_exit; } err = nsim_dev_health_init(nsim_dev, devlink); if (err) goto err_fib_destroy; err = nsim_dev_psample_init(nsim_dev); if (err) goto err_health_exit; err = nsim_dev_hwstats_init(nsim_dev); if (err) goto err_psample_exit; err = nsim_dev_port_add_all(nsim_dev, nsim_bus_dev->port_count); if (err) goto err_hwstats_exit; nsim_dev->take_snapshot = debugfs_create_file("take_snapshot", 0200, nsim_dev->ddir, nsim_dev, &nsim_dev_take_snapshot_fops); return 0; err_hwstats_exit: nsim_dev_hwstats_exit(nsim_dev); err_psample_exit: nsim_dev_psample_exit(nsim_dev); err_health_exit: nsim_dev_health_exit(nsim_dev); err_fib_destroy: nsim_fib_destroy(devlink, nsim_dev->fib_data); err_traps_exit: nsim_dev_traps_exit(devlink); err_dummy_region_exit: nsim_dev_dummy_region_exit(nsim_dev); return err; } int nsim_drv_probe(struct nsim_bus_dev *nsim_bus_dev) { struct nsim_dev *nsim_dev; struct devlink *devlink; int err; devlink = devlink_alloc_ns(&nsim_dev_devlink_ops, sizeof(*nsim_dev), nsim_bus_dev->initial_net, &nsim_bus_dev->dev); if (!devlink) return -ENOMEM; devl_lock(devlink); nsim_dev = devlink_priv(devlink); nsim_dev->nsim_bus_dev = nsim_bus_dev; nsim_dev->switch_id.id_len = sizeof(nsim_dev->switch_id.id); get_random_bytes(nsim_dev->switch_id.id, nsim_dev->switch_id.id_len); INIT_LIST_HEAD(&nsim_dev->port_list); nsim_dev->fw_update_status = true; nsim_dev->fw_update_overwrite_mask = 0; nsim_dev->max_macs = NSIM_DEV_MAX_MACS_DEFAULT; nsim_dev->test1 = NSIM_DEV_TEST1_DEFAULT; spin_lock_init(&nsim_dev->fa_cookie_lock); dev_set_drvdata(&nsim_bus_dev->dev, nsim_dev); nsim_dev->vfconfigs = kcalloc(nsim_bus_dev->max_vfs, sizeof(struct nsim_vf_config), GFP_KERNEL | __GFP_NOWARN); if (!nsim_dev->vfconfigs) { err = -ENOMEM; goto err_devlink_unlock; } err = devl_register(devlink); if (err) goto err_vfc_free; err = nsim_dev_resources_register(devlink); if (err) goto err_dl_unregister; err = devl_params_register(devlink, nsim_devlink_params, ARRAY_SIZE(nsim_devlink_params)); if (err) goto err_resource_unregister; nsim_devlink_set_params_init_values(nsim_dev, devlink); err = nsim_dev_dummy_region_init(nsim_dev, devlink); if (err) goto err_params_unregister; err = nsim_dev_traps_init(devlink); if (err) goto err_dummy_region_exit; err = nsim_dev_debugfs_init(nsim_dev); if (err) goto err_traps_exit; nsim_dev->fib_data = nsim_fib_create(devlink, NULL); if (IS_ERR(nsim_dev->fib_data)) { err = PTR_ERR(nsim_dev->fib_data); goto err_debugfs_exit; } err = nsim_dev_health_init(nsim_dev, devlink); if (err) goto err_fib_destroy; err = nsim_bpf_dev_init(nsim_dev); if (err) goto err_health_exit; err = nsim_dev_psample_init(nsim_dev); if (err) goto err_bpf_dev_exit; err = nsim_dev_hwstats_init(nsim_dev); if (err) goto err_psample_exit; err = nsim_dev_port_add_all(nsim_dev, nsim_bus_dev->port_count); if (err) goto err_hwstats_exit; nsim_dev->esw_mode = DEVLINK_ESWITCH_MODE_LEGACY; devl_unlock(devlink); return 0; err_hwstats_exit: nsim_dev_hwstats_exit(nsim_dev); err_psample_exit: nsim_dev_psample_exit(nsim_dev); err_bpf_dev_exit: nsim_bpf_dev_exit(nsim_dev); err_health_exit: nsim_dev_health_exit(nsim_dev); err_fib_destroy: nsim_fib_destroy(devlink, nsim_dev->fib_data); err_debugfs_exit: nsim_dev_debugfs_exit(nsim_dev); err_traps_exit: nsim_dev_traps_exit(devlink); err_dummy_region_exit: nsim_dev_dummy_region_exit(nsim_dev); err_params_unregister: devl_params_unregister(devlink, nsim_devlink_params, ARRAY_SIZE(nsim_devlink_params)); err_resource_unregister: devl_resources_unregister(devlink); err_dl_unregister: devl_unregister(devlink); err_vfc_free: kfree(nsim_dev->vfconfigs); err_devlink_unlock: devl_unlock(devlink); devlink_free(devlink); dev_set_drvdata(&nsim_bus_dev->dev, NULL); return err; } static void nsim_dev_reload_destroy(struct nsim_dev *nsim_dev) { struct devlink *devlink = priv_to_devlink(nsim_dev); if (devlink_is_reload_failed(devlink)) return; debugfs_remove(nsim_dev->take_snapshot); if (nsim_dev_get_vfs(nsim_dev)) { nsim_bus_dev_set_vfs(nsim_dev->nsim_bus_dev, 0); if (nsim_esw_mode_is_switchdev(nsim_dev)) nsim_esw_legacy_enable(nsim_dev, NULL); } nsim_dev_port_del_all(nsim_dev); nsim_dev_hwstats_exit(nsim_dev); nsim_dev_psample_exit(nsim_dev); nsim_dev_health_exit(nsim_dev); nsim_fib_destroy(devlink, nsim_dev->fib_data); nsim_dev_traps_exit(devlink); nsim_dev_dummy_region_exit(nsim_dev); } void nsim_drv_remove(struct nsim_bus_dev *nsim_bus_dev) { struct nsim_dev *nsim_dev = dev_get_drvdata(&nsim_bus_dev->dev); struct devlink *devlink = priv_to_devlink(nsim_dev); devl_lock(devlink); nsim_dev_reload_destroy(nsim_dev); nsim_bpf_dev_exit(nsim_dev); nsim_dev_debugfs_exit(nsim_dev); devl_params_unregister(devlink, nsim_devlink_params, ARRAY_SIZE(nsim_devlink_params)); devl_resources_unregister(devlink); devl_unregister(devlink); kfree(nsim_dev->vfconfigs); kfree(nsim_dev->fa_cookie); devl_unlock(devlink); devlink_free(devlink); dev_set_drvdata(&nsim_bus_dev->dev, NULL); } static struct nsim_dev_port * __nsim_dev_port_lookup(struct nsim_dev *nsim_dev, enum nsim_dev_port_type type, unsigned int port_index) { struct nsim_dev_port *nsim_dev_port; port_index = nsim_dev_port_index(type, port_index); list_for_each_entry(nsim_dev_port, &nsim_dev->port_list, list) if (nsim_dev_port->port_index == port_index) return nsim_dev_port; return NULL; } int nsim_drv_port_add(struct nsim_bus_dev *nsim_bus_dev, enum nsim_dev_port_type type, unsigned int port_index) { struct nsim_dev *nsim_dev = dev_get_drvdata(&nsim_bus_dev->dev); int err; devl_lock(priv_to_devlink(nsim_dev)); if (__nsim_dev_port_lookup(nsim_dev, type, port_index)) err = -EEXIST; else err = __nsim_dev_port_add(nsim_dev, type, port_index); devl_unlock(priv_to_devlink(nsim_dev)); return err; } int nsim_drv_port_del(struct nsim_bus_dev *nsim_bus_dev, enum nsim_dev_port_type type, unsigned int port_index) { struct nsim_dev *nsim_dev = dev_get_drvdata(&nsim_bus_dev->dev); struct nsim_dev_port *nsim_dev_port; int err = 0; devl_lock(priv_to_devlink(nsim_dev)); nsim_dev_port = __nsim_dev_port_lookup(nsim_dev, type, port_index); if (!nsim_dev_port) err = -ENOENT; else __nsim_dev_port_del(nsim_dev_port); devl_unlock(priv_to_devlink(nsim_dev)); return err; } int nsim_drv_configure_vfs(struct nsim_bus_dev *nsim_bus_dev, unsigned int num_vfs) { struct nsim_dev *nsim_dev = dev_get_drvdata(&nsim_bus_dev->dev); struct devlink *devlink = priv_to_devlink(nsim_dev); int ret = 0; devl_lock(devlink); if (nsim_bus_dev->num_vfs == num_vfs) goto exit_unlock; if (nsim_bus_dev->num_vfs && num_vfs) { ret = -EBUSY; goto exit_unlock; } if (nsim_bus_dev->max_vfs < num_vfs) { ret = -ENOMEM; goto exit_unlock; } nsim_bus_dev_set_vfs(nsim_bus_dev, num_vfs); if (nsim_esw_mode_is_switchdev(nsim_dev)) { if (num_vfs) { ret = nsim_esw_switchdev_enable(nsim_dev, NULL); if (ret) { nsim_bus_dev_set_vfs(nsim_bus_dev, 0); goto exit_unlock; } } else { nsim_esw_legacy_enable(nsim_dev, NULL); } } exit_unlock: devl_unlock(devlink); return ret; } int nsim_dev_init(void) { nsim_dev_ddir = debugfs_create_dir(DRV_NAME, NULL); return PTR_ERR_OR_ZERO(nsim_dev_ddir); } void nsim_dev_exit(void) { debugfs_remove_recursive(nsim_dev_ddir); }
3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 // SPDX-License-Identifier: GPL-2.0-only /* * i8042 keyboard and mouse controller driver for Linux * * Copyright (c) 1999-2004 Vojtech Pavlik */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/types.h> #include <linux/delay.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/ioport.h> #include <linux/init.h> #include <linux/serio.h> #include <linux/err.h> #include <linux/rcupdate.h> #include <linux/platform_device.h> #include <linux/i8042.h> #include <linux/slab.h> #include <linux/suspend.h> #include <linux/property.h> #include <asm/io.h> MODULE_AUTHOR("Vojtech Pavlik <vojtech@suse.cz>"); MODULE_DESCRIPTION("i8042 keyboard and mouse controller driver"); MODULE_LICENSE("GPL"); static bool i8042_nokbd; module_param_named(nokbd, i8042_nokbd, bool, 0); MODULE_PARM_DESC(nokbd, "Do not probe or use KBD port."); static bool i8042_noaux; module_param_named(noaux, i8042_noaux, bool, 0); MODULE_PARM_DESC(noaux, "Do not probe or use AUX (mouse) port."); static bool i8042_nomux; module_param_named(nomux, i8042_nomux, bool, 0); MODULE_PARM_DESC(nomux, "Do not check whether an active multiplexing controller is present."); static bool i8042_unlock; module_param_named(unlock, i8042_unlock, bool, 0); MODULE_PARM_DESC(unlock, "Ignore keyboard lock."); static bool i8042_probe_defer; module_param_named(probe_defer, i8042_probe_defer, bool, 0); MODULE_PARM_DESC(probe_defer, "Allow deferred probing."); enum i8042_controller_reset_mode { I8042_RESET_NEVER, I8042_RESET_ALWAYS, I8042_RESET_ON_S2RAM, #define I8042_RESET_DEFAULT I8042_RESET_ON_S2RAM }; static enum i8042_controller_reset_mode i8042_reset = I8042_RESET_DEFAULT; static int i8042_set_reset(const char *val, const struct kernel_param *kp) { enum i8042_controller_reset_mode *arg = kp->arg; int error; bool reset; if (val) { error = kstrtobool(val, &reset); if (error) return error; } else { reset = true; } *arg = reset ? I8042_RESET_ALWAYS : I8042_RESET_NEVER; return 0; } static const struct kernel_param_ops param_ops_reset_param = { .flags = KERNEL_PARAM_OPS_FL_NOARG, .set = i8042_set_reset, }; #define param_check_reset_param(name, p) \ __param_check(name, p, enum i8042_controller_reset_mode) module_param_named(reset, i8042_reset, reset_param, 0); MODULE_PARM_DESC(reset, "Reset controller on resume, cleanup or both"); static bool i8042_direct; module_param_named(direct, i8042_direct, bool, 0); MODULE_PARM_DESC(direct, "Put keyboard port into non-translated mode."); static bool i8042_dumbkbd; module_param_named(dumbkbd, i8042_dumbkbd, bool, 0); MODULE_PARM_DESC(dumbkbd, "Pretend that controller can only read data from keyboard"); static bool i8042_noloop; module_param_named(noloop, i8042_noloop, bool, 0); MODULE_PARM_DESC(noloop, "Disable the AUX Loopback command while probing for the AUX port"); static bool i8042_notimeout; module_param_named(notimeout, i8042_notimeout, bool, 0); MODULE_PARM_DESC(notimeout, "Ignore timeouts signalled by i8042"); static bool i8042_kbdreset; module_param_named(kbdreset, i8042_kbdreset, bool, 0); MODULE_PARM_DESC(kbdreset, "Reset device connected to KBD port"); #ifdef CONFIG_X86 static bool i8042_dritek; module_param_named(dritek, i8042_dritek, bool, 0); MODULE_PARM_DESC(dritek, "Force enable the Dritek keyboard extension"); #endif #ifdef CONFIG_PNP static bool i8042_nopnp; module_param_named(nopnp, i8042_nopnp, bool, 0); MODULE_PARM_DESC(nopnp, "Do not use PNP to detect controller settings"); #endif #define DEBUG #ifdef DEBUG static bool i8042_debug; module_param_named(debug, i8042_debug, bool, 0600); MODULE_PARM_DESC(debug, "Turn i8042 debugging mode on and off"); static bool i8042_unmask_kbd_data; module_param_named(unmask_kbd_data, i8042_unmask_kbd_data, bool, 0600); MODULE_PARM_DESC(unmask_kbd_data, "Unconditional enable (may reveal sensitive data) of normally sanitize-filtered kbd data traffic debug log [pre-condition: i8042.debug=1 enabled]"); #endif static bool i8042_present; static bool i8042_bypass_aux_irq_test; static char i8042_kbd_firmware_id[128]; static char i8042_aux_firmware_id[128]; static struct fwnode_handle *i8042_kbd_fwnode; #include "i8042.h" /* * i8042_lock protects serialization between i8042_command and * the interrupt handler. */ static DEFINE_SPINLOCK(i8042_lock); /* * Writers to AUX and KBD ports as well as users issuing i8042_command * directly should acquire i8042_mutex (by means of calling * i8042_lock_chip() and i8042_unlock_chip() helpers) to ensure that * they do not disturb each other (unfortunately in many i8042 * implementations write to one of the ports will immediately abort * command that is being processed by another port). */ static DEFINE_MUTEX(i8042_mutex); struct i8042_port { struct serio *serio; int irq; bool exists; bool driver_bound; signed char mux; }; #define I8042_KBD_PORT_NO 0 #define I8042_AUX_PORT_NO 1 #define I8042_MUX_PORT_NO 2 #define I8042_NUM_PORTS (I8042_NUM_MUX_PORTS + 2) static struct i8042_port i8042_ports[I8042_NUM_PORTS]; static unsigned char i8042_initial_ctr; static unsigned char i8042_ctr; static bool i8042_mux_present; static bool i8042_kbd_irq_registered; static bool i8042_aux_irq_registered; static unsigned char i8042_suppress_kbd_ack; static struct platform_device *i8042_platform_device; static struct notifier_block i8042_kbd_bind_notifier_block; static irqreturn_t i8042_interrupt(int irq, void *dev_id); static bool (*i8042_platform_filter)(unsigned char data, unsigned char str, struct serio *serio); void i8042_lock_chip(void) { mutex_lock(&i8042_mutex); } EXPORT_SYMBOL(i8042_lock_chip); void i8042_unlock_chip(void) { mutex_unlock(&i8042_mutex); } EXPORT_SYMBOL(i8042_unlock_chip); int i8042_install_filter(bool (*filter)(unsigned char data, unsigned char str, struct serio *serio)) { unsigned long flags; int ret = 0; spin_lock_irqsave(&i8042_lock, flags); if (i8042_platform_filter) { ret = -EBUSY; goto out; } i8042_platform_filter = filter; out: spin_unlock_irqrestore(&i8042_lock, flags); return ret; } EXPORT_SYMBOL(i8042_install_filter); int i8042_remove_filter(bool (*filter)(unsigned char data, unsigned char str, struct serio *port)) { unsigned long flags; int ret = 0; spin_lock_irqsave(&i8042_lock, flags); if (i8042_platform_filter != filter) { ret = -EINVAL; goto out; } i8042_platform_filter = NULL; out: spin_unlock_irqrestore(&i8042_lock, flags); return ret; } EXPORT_SYMBOL(i8042_remove_filter); /* * The i8042_wait_read() and i8042_wait_write functions wait for the i8042 to * be ready for reading values from it / writing values to it. * Called always with i8042_lock held. */ static int i8042_wait_read(void) { int i = 0; while ((~i8042_read_status() & I8042_STR_OBF) && (i < I8042_CTL_TIMEOUT)) { udelay(50); i++; } return -(i == I8042_CTL_TIMEOUT); } static int i8042_wait_write(void) { int i = 0; while ((i8042_read_status() & I8042_STR_IBF) && (i < I8042_CTL_TIMEOUT)) { udelay(50); i++; } return -(i == I8042_CTL_TIMEOUT); } /* * i8042_flush() flushes all data that may be in the keyboard and mouse buffers * of the i8042 down the toilet. */ static int i8042_flush(void) { unsigned long flags; unsigned char data, str; int count = 0; int retval = 0; spin_lock_irqsave(&i8042_lock, flags); while ((str = i8042_read_status()) & I8042_STR_OBF) { if (count++ < I8042_BUFFER_SIZE) { udelay(50); data = i8042_read_data(); dbg("%02x <- i8042 (flush, %s)\n", data, str & I8042_STR_AUXDATA ? "aux" : "kbd"); } else { retval = -EIO; break; } } spin_unlock_irqrestore(&i8042_lock, flags); return retval; } /* * i8042_command() executes a command on the i8042. It also sends the input * parameter(s) of the commands to it, and receives the output value(s). The * parameters are to be stored in the param array, and the output is placed * into the same array. The number of the parameters and output values is * encoded in bits 8-11 of the command number. */ static int __i8042_command(unsigned char *param, int command) { int i, error; if (i8042_noloop && command == I8042_CMD_AUX_LOOP) return -1; error = i8042_wait_write(); if (error) return error; dbg("%02x -> i8042 (command)\n", command & 0xff); i8042_write_command(command & 0xff); for (i = 0; i < ((command >> 12) & 0xf); i++) { error = i8042_wait_write(); if (error) { dbg(" -- i8042 (wait write timeout)\n"); return error; } dbg("%02x -> i8042 (parameter)\n", param[i]); i8042_write_data(param[i]); } for (i = 0; i < ((command >> 8) & 0xf); i++) { error = i8042_wait_read(); if (error) { dbg(" -- i8042 (wait read timeout)\n"); return error; } if (command == I8042_CMD_AUX_LOOP && !(i8042_read_status() & I8042_STR_AUXDATA)) { dbg(" -- i8042 (auxerr)\n"); return -1; } param[i] = i8042_read_data(); dbg("%02x <- i8042 (return)\n", param[i]); } return 0; } int i8042_command(unsigned char *param, int command) { unsigned long flags; int retval; if (!i8042_present) return -1; spin_lock_irqsave(&i8042_lock, flags); retval = __i8042_command(param, command); spin_unlock_irqrestore(&i8042_lock, flags); return retval; } EXPORT_SYMBOL(i8042_command); /* * i8042_kbd_write() sends a byte out through the keyboard interface. */ static int i8042_kbd_write(struct serio *port, unsigned char c) { unsigned long flags; int retval = 0; spin_lock_irqsave(&i8042_lock, flags); if (!(retval = i8042_wait_write())) { dbg("%02x -> i8042 (kbd-data)\n", c); i8042_write_data(c); } spin_unlock_irqrestore(&i8042_lock, flags); return retval; } /* * i8042_aux_write() sends a byte out through the aux interface. */ static int i8042_aux_write(struct serio *serio, unsigned char c) { struct i8042_port *port = serio->port_data; return i8042_command(&c, port->mux == -1 ? I8042_CMD_AUX_SEND : I8042_CMD_MUX_SEND + port->mux); } /* * i8042_port_close attempts to clear AUX or KBD port state by disabling * and then re-enabling it. */ static void i8042_port_close(struct serio *serio) { int irq_bit; int disable_bit; const char *port_name; if (serio == i8042_ports[I8042_AUX_PORT_NO].serio) { irq_bit = I8042_CTR_AUXINT; disable_bit = I8042_CTR_AUXDIS; port_name = "AUX"; } else { irq_bit = I8042_CTR_KBDINT; disable_bit = I8042_CTR_KBDDIS; port_name = "KBD"; } i8042_ctr &= ~irq_bit; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) pr_warn("Can't write CTR while closing %s port\n", port_name); udelay(50); i8042_ctr &= ~disable_bit; i8042_ctr |= irq_bit; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) pr_err("Can't reactivate %s port\n", port_name); /* * See if there is any data appeared while we were messing with * port state. */ i8042_interrupt(0, NULL); } /* * i8042_start() is called by serio core when port is about to finish * registering. It will mark port as existing so i8042_interrupt can * start sending data through it. */ static int i8042_start(struct serio *serio) { struct i8042_port *port = serio->port_data; device_set_wakeup_capable(&serio->dev, true); /* * On platforms using suspend-to-idle, allow the keyboard to * wake up the system from sleep by enabling keyboard wakeups * by default. This is consistent with keyboard wakeup * behavior on many platforms using suspend-to-RAM (ACPI S3) * by default. */ if (pm_suspend_default_s2idle() && serio == i8042_ports[I8042_KBD_PORT_NO].serio) { device_set_wakeup_enable(&serio->dev, true); } spin_lock_irq(&i8042_lock); port->exists = true; spin_unlock_irq(&i8042_lock); return 0; } /* * i8042_stop() marks serio port as non-existing so i8042_interrupt * will not try to send data to the port that is about to go away. * The function is called by serio core as part of unregister procedure. */ static void i8042_stop(struct serio *serio) { struct i8042_port *port = serio->port_data; spin_lock_irq(&i8042_lock); port->exists = false; port->serio = NULL; spin_unlock_irq(&i8042_lock); /* * We need to make sure that interrupt handler finishes using * our serio port before we return from this function. * We synchronize with both AUX and KBD IRQs because there is * a (very unlikely) chance that AUX IRQ is raised for KBD port * and vice versa. */ synchronize_irq(I8042_AUX_IRQ); synchronize_irq(I8042_KBD_IRQ); } /* * i8042_filter() filters out unwanted bytes from the input data stream. * It is called from i8042_interrupt and thus is running with interrupts * off and i8042_lock held. */ static bool i8042_filter(unsigned char data, unsigned char str, struct serio *serio) { if (unlikely(i8042_suppress_kbd_ack)) { if ((~str & I8042_STR_AUXDATA) && (data == 0xfa || data == 0xfe)) { i8042_suppress_kbd_ack--; dbg("Extra keyboard ACK - filtered out\n"); return true; } } if (i8042_platform_filter && i8042_platform_filter(data, str, serio)) { dbg("Filtered out by platform filter\n"); return true; } return false; } /* * i8042_interrupt() is the most important function in this driver - * it handles the interrupts from the i8042, and sends incoming bytes * to the upper layers. */ static irqreturn_t i8042_interrupt(int irq, void *dev_id) { struct i8042_port *port; struct serio *serio; unsigned long flags; unsigned char str, data; unsigned int dfl; unsigned int port_no; bool filtered; int ret = 1; spin_lock_irqsave(&i8042_lock, flags); str = i8042_read_status(); if (unlikely(~str & I8042_STR_OBF)) { spin_unlock_irqrestore(&i8042_lock, flags); if (irq) dbg("Interrupt %d, without any data\n", irq); ret = 0; goto out; } data = i8042_read_data(); if (i8042_mux_present && (str & I8042_STR_AUXDATA)) { static unsigned long last_transmit; static unsigned char last_str; dfl = 0; if (str & I8042_STR_MUXERR) { dbg("MUX error, status is %02x, data is %02x\n", str, data); /* * When MUXERR condition is signalled the data register can only contain * 0xfd, 0xfe or 0xff if implementation follows the spec. Unfortunately * it is not always the case. Some KBCs also report 0xfc when there is * nothing connected to the port while others sometimes get confused which * port the data came from and signal error leaving the data intact. They * _do not_ revert to legacy mode (actually I've never seen KBC reverting * to legacy mode yet, when we see one we'll add proper handling). * Anyway, we process 0xfc, 0xfd, 0xfe and 0xff as timeouts, and for the * rest assume that the data came from the same serio last byte * was transmitted (if transmission happened not too long ago). */ switch (data) { default: if (time_before(jiffies, last_transmit + HZ/10)) { str = last_str; break; } fallthrough; /* report timeout */ case 0xfc: case 0xfd: case 0xfe: dfl = SERIO_TIMEOUT; data = 0xfe; break; case 0xff: dfl = SERIO_PARITY; data = 0xfe; break; } } port_no = I8042_MUX_PORT_NO + ((str >> 6) & 3); last_str = str; last_transmit = jiffies; } else { dfl = ((str & I8042_STR_PARITY) ? SERIO_PARITY : 0) | ((str & I8042_STR_TIMEOUT && !i8042_notimeout) ? SERIO_TIMEOUT : 0); port_no = (str & I8042_STR_AUXDATA) ? I8042_AUX_PORT_NO : I8042_KBD_PORT_NO; } port = &i8042_ports[port_no]; serio = port->exists ? port->serio : NULL; filter_dbg(port->driver_bound, data, "<- i8042 (interrupt, %d, %d%s%s)\n", port_no, irq, dfl & SERIO_PARITY ? ", bad parity" : "", dfl & SERIO_TIMEOUT ? ", timeout" : ""); filtered = i8042_filter(data, str, serio); spin_unlock_irqrestore(&i8042_lock, flags); if (likely(serio && !filtered)) serio_interrupt(serio, data, dfl); out: return IRQ_RETVAL(ret); } /* * i8042_enable_kbd_port enables keyboard port on chip */ static int i8042_enable_kbd_port(void) { i8042_ctr &= ~I8042_CTR_KBDDIS; i8042_ctr |= I8042_CTR_KBDINT; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { i8042_ctr &= ~I8042_CTR_KBDINT; i8042_ctr |= I8042_CTR_KBDDIS; pr_err("Failed to enable KBD port\n"); return -EIO; } return 0; } /* * i8042_enable_aux_port enables AUX (mouse) port on chip */ static int i8042_enable_aux_port(void) { i8042_ctr &= ~I8042_CTR_AUXDIS; i8042_ctr |= I8042_CTR_AUXINT; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { i8042_ctr &= ~I8042_CTR_AUXINT; i8042_ctr |= I8042_CTR_AUXDIS; pr_err("Failed to enable AUX port\n"); return -EIO; } return 0; } /* * i8042_enable_mux_ports enables 4 individual AUX ports after * the controller has been switched into Multiplexed mode */ static int i8042_enable_mux_ports(void) { unsigned char param; int i; for (i = 0; i < I8042_NUM_MUX_PORTS; i++) { i8042_command(&param, I8042_CMD_MUX_PFX + i); i8042_command(&param, I8042_CMD_AUX_ENABLE); } return i8042_enable_aux_port(); } /* * i8042_set_mux_mode checks whether the controller has an * active multiplexor and puts the chip into Multiplexed (true) * or Legacy (false) mode. */ static int i8042_set_mux_mode(bool multiplex, unsigned char *mux_version) { unsigned char param, val; /* * Get rid of bytes in the queue. */ i8042_flush(); /* * Internal loopback test - send three bytes, they should come back from the * mouse interface, the last should be version. */ param = val = 0xf0; if (i8042_command(&param, I8042_CMD_AUX_LOOP) || param != val) return -1; param = val = multiplex ? 0x56 : 0xf6; if (i8042_command(&param, I8042_CMD_AUX_LOOP) || param != val) return -1; param = val = multiplex ? 0xa4 : 0xa5; if (i8042_command(&param, I8042_CMD_AUX_LOOP) || param == val) return -1; /* * Workaround for interference with USB Legacy emulation * that causes a v10.12 MUX to be found. */ if (param == 0xac) return -1; if (mux_version) *mux_version = param; return 0; } /* * i8042_check_mux() checks whether the controller supports the PS/2 Active * Multiplexing specification by Synaptics, Phoenix, Insyde and * LCS/Telegraphics. */ static int i8042_check_mux(void) { unsigned char mux_version; if (i8042_set_mux_mode(true, &mux_version)) return -1; pr_info("Detected active multiplexing controller, rev %d.%d\n", (mux_version >> 4) & 0xf, mux_version & 0xf); /* * Disable all muxed ports by disabling AUX. */ i8042_ctr |= I8042_CTR_AUXDIS; i8042_ctr &= ~I8042_CTR_AUXINT; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { pr_err("Failed to disable AUX port, can't use MUX\n"); return -EIO; } i8042_mux_present = true; return 0; } /* * The following is used to test AUX IRQ delivery. */ static struct completion i8042_aux_irq_delivered; static bool i8042_irq_being_tested; static irqreturn_t i8042_aux_test_irq(int irq, void *dev_id) { unsigned long flags; unsigned char str, data; int ret = 0; spin_lock_irqsave(&i8042_lock, flags); str = i8042_read_status(); if (str & I8042_STR_OBF) { data = i8042_read_data(); dbg("%02x <- i8042 (aux_test_irq, %s)\n", data, str & I8042_STR_AUXDATA ? "aux" : "kbd"); if (i8042_irq_being_tested && data == 0xa5 && (str & I8042_STR_AUXDATA)) complete(&i8042_aux_irq_delivered); ret = 1; } spin_unlock_irqrestore(&i8042_lock, flags); return IRQ_RETVAL(ret); } /* * i8042_toggle_aux - enables or disables AUX port on i8042 via command and * verifies success by readinng CTR. Used when testing for presence of AUX * port. */ static int i8042_toggle_aux(bool on) { unsigned char param; int i; if (i8042_command(&param, on ? I8042_CMD_AUX_ENABLE : I8042_CMD_AUX_DISABLE)) return -1; /* some chips need some time to set the I8042_CTR_AUXDIS bit */ for (i = 0; i < 100; i++) { udelay(50); if (i8042_command(&param, I8042_CMD_CTL_RCTR)) return -1; if (!(param & I8042_CTR_AUXDIS) == on) return 0; } return -1; } /* * i8042_check_aux() applies as much paranoia as it can at detecting * the presence of an AUX interface. */ static int i8042_check_aux(void) { int retval = -1; bool irq_registered = false; bool aux_loop_broken = false; unsigned long flags; unsigned char param; /* * Get rid of bytes in the queue. */ i8042_flush(); /* * Internal loopback test - filters out AT-type i8042's. Unfortunately * SiS screwed up and their 5597 doesn't support the LOOP command even * though it has an AUX port. */ param = 0x5a; retval = i8042_command(&param, I8042_CMD_AUX_LOOP); if (retval || param != 0x5a) { /* * External connection test - filters out AT-soldered PS/2 i8042's * 0x00 - no error, 0x01-0x03 - clock/data stuck, 0xff - general error * 0xfa - no error on some notebooks which ignore the spec * Because it's common for chipsets to return error on perfectly functioning * AUX ports, we test for this only when the LOOP command failed. */ if (i8042_command(&param, I8042_CMD_AUX_TEST) || (param && param != 0xfa && param != 0xff)) return -1; /* * If AUX_LOOP completed without error but returned unexpected data * mark it as broken */ if (!retval) aux_loop_broken = true; } /* * Bit assignment test - filters out PS/2 i8042's in AT mode */ if (i8042_toggle_aux(false)) { pr_warn("Failed to disable AUX port, but continuing anyway... Is this a SiS?\n"); pr_warn("If AUX port is really absent please use the 'i8042.noaux' option\n"); } if (i8042_toggle_aux(true)) return -1; /* * Reset keyboard (needed on some laptops to successfully detect * touchpad, e.g., some Gigabyte laptop models with Elantech * touchpads). */ if (i8042_kbdreset) { pr_warn("Attempting to reset device connected to KBD port\n"); i8042_kbd_write(NULL, (unsigned char) 0xff); } /* * Test AUX IRQ delivery to make sure BIOS did not grab the IRQ and * used it for a PCI card or somethig else. */ if (i8042_noloop || i8042_bypass_aux_irq_test || aux_loop_broken) { /* * Without LOOP command we can't test AUX IRQ delivery. Assume the port * is working and hope we are right. */ retval = 0; goto out; } if (request_irq(I8042_AUX_IRQ, i8042_aux_test_irq, IRQF_SHARED, "i8042", i8042_platform_device)) goto out; irq_registered = true; if (i8042_enable_aux_port()) goto out; spin_lock_irqsave(&i8042_lock, flags); init_completion(&i8042_aux_irq_delivered); i8042_irq_being_tested = true; param = 0xa5; retval = __i8042_command(&param, I8042_CMD_AUX_LOOP & 0xf0ff); spin_unlock_irqrestore(&i8042_lock, flags); if (retval) goto out; if (wait_for_completion_timeout(&i8042_aux_irq_delivered, msecs_to_jiffies(250)) == 0) { /* * AUX IRQ was never delivered so we need to flush the controller to * get rid of the byte we put there; otherwise keyboard may not work. */ dbg(" -- i8042 (aux irq test timeout)\n"); i8042_flush(); retval = -1; } out: /* * Disable the interface. */ i8042_ctr |= I8042_CTR_AUXDIS; i8042_ctr &= ~I8042_CTR_AUXINT; if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) retval = -1; if (irq_registered) free_irq(I8042_AUX_IRQ, i8042_platform_device); return retval; } static int i8042_controller_check(void) { if (i8042_flush()) { pr_info("No controller found\n"); return -ENODEV; } return 0; } static int i8042_controller_selftest(void) { unsigned char param; int i = 0; /* * We try this 5 times; on some really fragile systems this does not * take the first time... */ do { if (i8042_command(&param, I8042_CMD_CTL_TEST)) { pr_err("i8042 controller selftest timeout\n"); return -ENODEV; } if (param == I8042_RET_CTL_TEST) return 0; dbg("i8042 controller selftest: %#x != %#x\n", param, I8042_RET_CTL_TEST); msleep(50); } while (i++ < 5); #ifdef CONFIG_X86 /* * On x86, we don't fail entire i8042 initialization if controller * reset fails in hopes that keyboard port will still be functional * and user will still get a working keyboard. This is especially * important on netbooks. On other arches we trust hardware more. */ pr_info("giving up on controller selftest, continuing anyway...\n"); return 0; #else pr_err("i8042 controller selftest failed\n"); return -EIO; #endif } /* * i8042_controller_init initializes the i8042 controller, and, * most importantly, sets it into non-xlated mode if that's * desired. */ static int i8042_controller_init(void) { unsigned long flags; int n = 0; unsigned char ctr[2]; /* * Save the CTR for restore on unload / reboot. */ do { if (n >= 10) { pr_err("Unable to get stable CTR read\n"); return -EIO; } if (n != 0) udelay(50); if (i8042_command(&ctr[n++ % 2], I8042_CMD_CTL_RCTR)) { pr_err("Can't read CTR while initializing i8042\n"); return i8042_probe_defer ? -EPROBE_DEFER : -EIO; } } while (n < 2 || ctr[0] != ctr[1]); i8042_initial_ctr = i8042_ctr = ctr[0]; /* * Disable the keyboard interface and interrupt. */ i8042_ctr |= I8042_CTR_KBDDIS; i8042_ctr &= ~I8042_CTR_KBDINT; /* * Handle keylock. */ spin_lock_irqsave(&i8042_lock, flags); if (~i8042_read_status() & I8042_STR_KEYLOCK) { if (i8042_unlock) i8042_ctr |= I8042_CTR_IGNKEYLOCK; else pr_warn("Warning: Keylock active\n"); } spin_unlock_irqrestore(&i8042_lock, flags); /* * If the chip is configured into nontranslated mode by the BIOS, don't * bother enabling translating and be happy. */ if (~i8042_ctr & I8042_CTR_XLATE) i8042_direct = true; /* * Set nontranslated mode for the kbd interface if requested by an option. * After this the kbd interface becomes a simple serial in/out, like the aux * interface is. We don't do this by default, since it can confuse notebook * BIOSes. */ if (i8042_direct) i8042_ctr &= ~I8042_CTR_XLATE; /* * Write CTR back. */ if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { pr_err("Can't write CTR while initializing i8042\n"); return -EIO; } /* * Flush whatever accumulated while we were disabling keyboard port. */ i8042_flush(); return 0; } /* * Reset the controller and reset CRT to the original value set by BIOS. */ static void i8042_controller_reset(bool s2r_wants_reset) { i8042_flush(); /* * Disable both KBD and AUX interfaces so they don't get in the way */ i8042_ctr |= I8042_CTR_KBDDIS | I8042_CTR_AUXDIS; i8042_ctr &= ~(I8042_CTR_KBDINT | I8042_CTR_AUXINT); if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) pr_warn("Can't write CTR while resetting\n"); /* * Disable MUX mode if present. */ if (i8042_mux_present) i8042_set_mux_mode(false, NULL); /* * Reset the controller if requested. */ if (i8042_reset == I8042_RESET_ALWAYS || (i8042_reset == I8042_RESET_ON_S2RAM && s2r_wants_reset)) { i8042_controller_selftest(); } /* * Restore the original control register setting. */ if (i8042_command(&i8042_initial_ctr, I8042_CMD_CTL_WCTR)) pr_warn("Can't restore CTR\n"); } /* * i8042_panic_blink() will turn the keyboard LEDs on or off and is called * when kernel panics. Flashing LEDs is useful for users running X who may * not see the console and will help distinguishing panics from "real" * lockups. * * Note that DELAY has a limit of 10ms so we will not get stuck here * waiting for KBC to free up even if KBD interrupt is off */ #define DELAY do { mdelay(1); if (++delay > 10) return delay; } while(0) static long i8042_panic_blink(int state) { long delay = 0; char led; led = (state) ? 0x01 | 0x04 : 0; while (i8042_read_status() & I8042_STR_IBF) DELAY; dbg("%02x -> i8042 (panic blink)\n", 0xed); i8042_suppress_kbd_ack = 2; i8042_write_data(0xed); /* set leds */ DELAY; while (i8042_read_status() & I8042_STR_IBF) DELAY; DELAY; dbg("%02x -> i8042 (panic blink)\n", led); i8042_write_data(led); DELAY; return delay; } #undef DELAY #ifdef CONFIG_X86 static void i8042_dritek_enable(void) { unsigned char param = 0x90; int error; error = i8042_command(&param, 0x1059); if (error) pr_warn("Failed to enable DRITEK extension: %d\n", error); } #endif #ifdef CONFIG_PM /* * Here we try to reset everything back to a state we had * before suspending. */ static int i8042_controller_resume(bool s2r_wants_reset) { int error; error = i8042_controller_check(); if (error) return error; if (i8042_reset == I8042_RESET_ALWAYS || (i8042_reset == I8042_RESET_ON_S2RAM && s2r_wants_reset)) { error = i8042_controller_selftest(); if (error) return error; } /* * Restore original CTR value and disable all ports */ i8042_ctr = i8042_initial_ctr; if (i8042_direct) i8042_ctr &= ~I8042_CTR_XLATE; i8042_ctr |= I8042_CTR_AUXDIS | I8042_CTR_KBDDIS; i8042_ctr &= ~(I8042_CTR_AUXINT | I8042_CTR_KBDINT); if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { pr_warn("Can't write CTR to resume, retrying...\n"); msleep(50); if (i8042_command(&i8042_ctr, I8042_CMD_CTL_WCTR)) { pr_err("CTR write retry failed\n"); return -EIO; } } #ifdef CONFIG_X86 if (i8042_dritek) i8042_dritek_enable(); #endif if (i8042_mux_present) { if (i8042_set_mux_mode(true, NULL) || i8042_enable_mux_ports()) pr_warn("failed to resume active multiplexor, mouse won't work\n"); } else if (i8042_ports[I8042_AUX_PORT_NO].serio) i8042_enable_aux_port(); if (i8042_ports[I8042_KBD_PORT_NO].serio) i8042_enable_kbd_port(); i8042_interrupt(0, NULL); return 0; } /* * Here we try to restore the original BIOS settings to avoid * upsetting it. */ static int i8042_pm_suspend(struct device *dev) { int i; if (pm_suspend_via_firmware()) i8042_controller_reset(true); /* Set up serio interrupts for system wakeup. */ for (i = 0; i < I8042_NUM_PORTS; i++) { struct serio *serio = i8042_ports[i].serio; if (serio && device_may_wakeup(&serio->dev)) enable_irq_wake(i8042_ports[i].irq); } return 0; } static int i8042_pm_resume_noirq(struct device *dev) { if (!pm_resume_via_firmware()) i8042_interrupt(0, NULL); return 0; } static int i8042_pm_resume(struct device *dev) { bool want_reset; int i; for (i = 0; i < I8042_NUM_PORTS; i++) { struct serio *serio = i8042_ports[i].serio; if (serio && device_may_wakeup(&serio->dev)) disable_irq_wake(i8042_ports[i].irq); } /* * If platform firmware was not going to be involved in suspend, we did * not restore the controller state to whatever it had been at boot * time, so we do not need to do anything. */ if (!pm_suspend_via_firmware()) return 0; /* * We only need to reset the controller if we are resuming after handing * off control to the platform firmware, otherwise we can simply restore * the mode. */ want_reset = pm_resume_via_firmware(); return i8042_controller_resume(want_reset); } static int i8042_pm_thaw(struct device *dev) { i8042_interrupt(0, NULL); return 0; } static int i8042_pm_reset(struct device *dev) { i8042_controller_reset(false); return 0; } static int i8042_pm_restore(struct device *dev) { return i8042_controller_resume(false); } static const struct dev_pm_ops i8042_pm_ops = { .suspend = i8042_pm_suspend, .resume_noirq = i8042_pm_resume_noirq, .resume = i8042_pm_resume, .thaw = i8042_pm_thaw, .poweroff = i8042_pm_reset, .restore = i8042_pm_restore, }; #endif /* CONFIG_PM */ /* * We need to reset the 8042 back to original mode on system shutdown, * because otherwise BIOSes will be confused. */ static void i8042_shutdown(struct platform_device *dev) { i8042_controller_reset(false); } static int i8042_create_kbd_port(void) { struct serio *serio; struct i8042_port *port = &i8042_ports[I8042_KBD_PORT_NO]; serio = kzalloc(sizeof(struct serio), GFP_KERNEL); if (!serio) return -ENOMEM; serio->id.type = i8042_direct ? SERIO_8042 : SERIO_8042_XL; serio->write = i8042_dumbkbd ? NULL : i8042_kbd_write; serio->start = i8042_start; serio->stop = i8042_stop; serio->close = i8042_port_close; serio->ps2_cmd_mutex = &i8042_mutex; serio->port_data = port; serio->dev.parent = &i8042_platform_device->dev; strscpy(serio->name, "i8042 KBD port", sizeof(serio->name)); strscpy(serio->phys, I8042_KBD_PHYS_DESC, sizeof(serio->phys)); strscpy(serio->firmware_id, i8042_kbd_firmware_id, sizeof(serio->firmware_id)); set_primary_fwnode(&serio->dev, i8042_kbd_fwnode); port->serio = serio; port->irq = I8042_KBD_IRQ; return 0; } static int i8042_create_aux_port(int idx) { struct serio *serio; int port_no = idx < 0 ? I8042_AUX_PORT_NO : I8042_MUX_PORT_NO + idx; struct i8042_port *port = &i8042_ports[port_no]; serio = kzalloc(sizeof(struct serio), GFP_KERNEL); if (!serio) return -ENOMEM; serio->id.type = SERIO_8042; serio->write = i8042_aux_write; serio->start = i8042_start; serio->stop = i8042_stop; serio->ps2_cmd_mutex = &i8042_mutex; serio->port_data = port; serio->dev.parent = &i8042_platform_device->dev; if (idx < 0) { strscpy(serio->name, "i8042 AUX port", sizeof(serio->name)); strscpy(serio->phys, I8042_AUX_PHYS_DESC, sizeof(serio->phys)); strscpy(serio->firmware_id, i8042_aux_firmware_id, sizeof(serio->firmware_id)); serio->close = i8042_port_close; } else { snprintf(serio->name, sizeof(serio->name), "i8042 AUX%d port", idx); snprintf(serio->phys, sizeof(serio->phys), I8042_MUX_PHYS_DESC, idx + 1); strscpy(serio->firmware_id, i8042_aux_firmware_id, sizeof(serio->firmware_id)); } port->serio = serio; port->mux = idx; port->irq = I8042_AUX_IRQ; return 0; } static void i8042_free_kbd_port(void) { kfree(i8042_ports[I8042_KBD_PORT_NO].serio); i8042_ports[I8042_KBD_PORT_NO].serio = NULL; } static void i8042_free_aux_ports(void) { int i; for (i = I8042_AUX_PORT_NO; i < I8042_NUM_PORTS; i++) { kfree(i8042_ports[i].serio); i8042_ports[i].serio = NULL; } } static void i8042_register_ports(void) { int i; for (i = 0; i < I8042_NUM_PORTS; i++) { struct serio *serio = i8042_ports[i].serio; if (!serio) continue; printk(KERN_INFO "serio: %s at %#lx,%#lx irq %d\n", serio->name, (unsigned long) I8042_DATA_REG, (unsigned long) I8042_COMMAND_REG, i8042_ports[i].irq); serio_register_port(serio); } } static void i8042_unregister_ports(void) { int i; for (i = 0; i < I8042_NUM_PORTS; i++) { if (i8042_ports[i].serio) { serio_unregister_port(i8042_ports[i].serio); i8042_ports[i].serio = NULL; } } } static void i8042_free_irqs(void) { if (i8042_aux_irq_registered) free_irq(I8042_AUX_IRQ, i8042_platform_device); if (i8042_kbd_irq_registered) free_irq(I8042_KBD_IRQ, i8042_platform_device); i8042_aux_irq_registered = i8042_kbd_irq_registered = false; } static int i8042_setup_aux(void) { int (*aux_enable)(void); int error; int i; if (i8042_check_aux()) return -ENODEV; if (i8042_nomux || i8042_check_mux()) { error = i8042_create_aux_port(-1); if (error) goto err_free_ports; aux_enable = i8042_enable_aux_port; } else { for (i = 0; i < I8042_NUM_MUX_PORTS; i++) { error = i8042_create_aux_port(i); if (error) goto err_free_ports; } aux_enable = i8042_enable_mux_ports; } error = request_irq(I8042_AUX_IRQ, i8042_interrupt, IRQF_SHARED, "i8042", i8042_platform_device); if (error) goto err_free_ports; error = aux_enable(); if (error) goto err_free_irq; i8042_aux_irq_registered = true; return 0; err_free_irq: free_irq(I8042_AUX_IRQ, i8042_platform_device); err_free_ports: i8042_free_aux_ports(); return error; } static int i8042_setup_kbd(void) { int error; error = i8042_create_kbd_port(); if (error) return error; error = request_irq(I8042_KBD_IRQ, i8042_interrupt, IRQF_SHARED, "i8042", i8042_platform_device); if (error) goto err_free_port; error = i8042_enable_kbd_port(); if (error) goto err_free_irq; i8042_kbd_irq_registered = true; return 0; err_free_irq: free_irq(I8042_KBD_IRQ, i8042_platform_device); err_free_port: i8042_free_kbd_port(); return error; } static int i8042_kbd_bind_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct device *dev = data; struct serio *serio = to_serio_port(dev); struct i8042_port *port = serio->port_data; if (serio != i8042_ports[I8042_KBD_PORT_NO].serio) return 0; switch (action) { case BUS_NOTIFY_BOUND_DRIVER: port->driver_bound = true; break; case BUS_NOTIFY_UNBIND_DRIVER: port->driver_bound = false; break; } return 0; } static int i8042_probe(struct platform_device *dev) { int error; if (i8042_reset == I8042_RESET_ALWAYS) { error = i8042_controller_selftest(); if (error) return error; } error = i8042_controller_init(); if (error) return error; #ifdef CONFIG_X86 if (i8042_dritek) i8042_dritek_enable(); #endif if (!i8042_noaux) { error = i8042_setup_aux(); if (error && error != -ENODEV && error != -EBUSY) goto out_fail; } if (!i8042_nokbd) { error = i8042_setup_kbd(); if (error) goto out_fail; } /* * Ok, everything is ready, let's register all serio ports */ i8042_register_ports(); return 0; out_fail: i8042_free_aux_ports(); /* in case KBD failed but AUX not */ i8042_free_irqs(); i8042_controller_reset(false); return error; } static void i8042_remove(struct platform_device *dev) { i8042_unregister_ports(); i8042_free_irqs(); i8042_controller_reset(false); } static struct platform_driver i8042_driver = { .driver = { .name = "i8042", #ifdef CONFIG_PM .pm = &i8042_pm_ops, #endif }, .probe = i8042_probe, .remove_new = i8042_remove, .shutdown = i8042_shutdown, }; static struct notifier_block i8042_kbd_bind_notifier_block = { .notifier_call = i8042_kbd_bind_notifier, }; static int __init i8042_init(void) { int err; dbg_init(); err = i8042_platform_init(); if (err) return (err == -ENODEV) ? 0 : err; err = i8042_controller_check(); if (err) goto err_platform_exit; /* Set this before creating the dev to allow i8042_command to work right away */ i8042_present = true; err = platform_driver_register(&i8042_driver); if (err) goto err_platform_exit; i8042_platform_device = platform_device_alloc("i8042", -1); if (!i8042_platform_device) { err = -ENOMEM; goto err_unregister_driver; } err = platform_device_add(i8042_platform_device); if (err) goto err_free_device; bus_register_notifier(&serio_bus, &i8042_kbd_bind_notifier_block); panic_blink = i8042_panic_blink; return 0; err_free_device: platform_device_put(i8042_platform_device); err_unregister_driver: platform_driver_unregister(&i8042_driver); err_platform_exit: i8042_platform_exit(); return err; } static void __exit i8042_exit(void) { if (!i8042_present) return; platform_device_unregister(i8042_platform_device); platform_driver_unregister(&i8042_driver); i8042_platform_exit(); bus_unregister_notifier(&serio_bus, &i8042_kbd_bind_notifier_block); panic_blink = NULL; } module_init(i8042_init); module_exit(i8042_exit);
345 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 /* SPDX-License-Identifier: GPL-2.0-only */ /* * fs/kernfs/kernfs-internal.h - kernfs internal header file * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de> */ #ifndef __KERNFS_INTERNAL_H #define __KERNFS_INTERNAL_H #include <linux/lockdep.h> #include <linux/fs.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/xattr.h> #include <linux/kernfs.h> #include <linux/fs_context.h> struct kernfs_iattrs { kuid_t ia_uid; kgid_t ia_gid; struct timespec64 ia_atime; struct timespec64 ia_mtime; struct timespec64 ia_ctime; struct simple_xattrs xattrs; atomic_t nr_user_xattrs; atomic_t user_xattr_size; }; struct kernfs_root { /* published fields */ struct kernfs_node *kn; unsigned int flags; /* KERNFS_ROOT_* flags */ /* private fields, do not use outside kernfs proper */ struct idr ino_idr; u32 last_id_lowbits; u32 id_highbits; struct kernfs_syscall_ops *syscall_ops; /* list of kernfs_super_info of this root, protected by kernfs_rwsem */ struct list_head supers; wait_queue_head_t deactivate_waitq; struct rw_semaphore kernfs_rwsem; struct rw_semaphore kernfs_iattr_rwsem; struct rw_semaphore kernfs_supers_rwsem; }; /* +1 to avoid triggering overflow warning when negating it */ #define KN_DEACTIVATED_BIAS (INT_MIN + 1) /* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ /** * kernfs_root - find out the kernfs_root a kernfs_node belongs to * @kn: kernfs_node of interest * * Return: the kernfs_root @kn belongs to. */ static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) { /* if parent exists, it's always a dir; otherwise, @sd is a dir */ if (kn->parent) kn = kn->parent; return kn->dir.root; } /* * mount.c */ struct kernfs_super_info { struct super_block *sb; /* * The root associated with this super_block. Each super_block is * identified by the root and ns it's associated with. */ struct kernfs_root *root; /* * Each sb is associated with one namespace tag, currently the * network namespace of the task which mounted this kernfs * instance. If multiple tags become necessary, make the following * an array and compare kernfs_node tag against every entry. */ const void *ns; /* anchored at kernfs_root->supers, protected by kernfs_rwsem */ struct list_head node; }; #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry) { if (d_really_is_negative(dentry)) return NULL; return d_inode(dentry)->i_private; } static inline void kernfs_set_rev(struct kernfs_node *parent, struct dentry *dentry) { dentry->d_time = parent->dir.rev; } static inline void kernfs_inc_rev(struct kernfs_node *parent) { parent->dir.rev++; } static inline bool kernfs_dir_changed(struct kernfs_node *parent, struct dentry *dentry) { if (parent->dir.rev != dentry->d_time) return true; return false; } extern const struct super_operations kernfs_sops; extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; /* * inode.c */ extern const struct xattr_handler * const kernfs_xattr_handlers[]; void kernfs_evict_inode(struct inode *inode); int kernfs_iop_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr); int kernfs_iop_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); /* * dir.c */ extern const struct dentry_operations kernfs_dops; extern const struct file_operations kernfs_dir_fops; extern const struct inode_operations kernfs_dir_iops; struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); void kernfs_put_active(struct kernfs_node *kn); int kernfs_add_one(struct kernfs_node *kn); struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags); /* * file.c */ extern const struct file_operations kernfs_file_fops; bool kernfs_should_drain_open_files(struct kernfs_node *kn); void kernfs_drain_open_files(struct kernfs_node *kn); /* * symlink.c */ extern const struct inode_operations kernfs_symlink_iops; /* * kernfs locks */ extern struct kernfs_global_locks *kernfs_locks; #endif /* __KERNFS_INTERNAL_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _FS_CEPH_MDS_CLIENT_H #define _FS_CEPH_MDS_CLIENT_H #include <linux/completion.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/rbtree.h> #include <linux/spinlock.h> #include <linux/refcount.h> #include <linux/utsname.h> #include <linux/ktime.h> #include <linux/ceph/types.h> #include <linux/ceph/messenger.h> #include <linux/ceph/auth.h> #include "mdsmap.h" #include "metric.h" #include "super.h" /* The first 8 bits are reserved for old ceph releases */ enum ceph_feature_type { CEPHFS_FEATURE_MIMIC = 8, CEPHFS_FEATURE_REPLY_ENCODING, CEPHFS_FEATURE_RECLAIM_CLIENT, CEPHFS_FEATURE_LAZY_CAP_WANTED, CEPHFS_FEATURE_MULTI_RECONNECT, CEPHFS_FEATURE_DELEG_INO, CEPHFS_FEATURE_METRIC_COLLECT, CEPHFS_FEATURE_ALTERNATE_NAME, CEPHFS_FEATURE_NOTIFY_SESSION_STATE, CEPHFS_FEATURE_OP_GETVXATTR, CEPHFS_FEATURE_32BITS_RETRY_FWD, CEPHFS_FEATURE_NEW_SNAPREALM_INFO, CEPHFS_FEATURE_HAS_OWNER_UIDGID, CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_HAS_OWNER_UIDGID, }; #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ 0, 1, 2, 3, 4, 5, 6, 7, \ CEPHFS_FEATURE_MIMIC, \ CEPHFS_FEATURE_REPLY_ENCODING, \ CEPHFS_FEATURE_LAZY_CAP_WANTED, \ CEPHFS_FEATURE_MULTI_RECONNECT, \ CEPHFS_FEATURE_DELEG_INO, \ CEPHFS_FEATURE_METRIC_COLLECT, \ CEPHFS_FEATURE_ALTERNATE_NAME, \ CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ CEPHFS_FEATURE_OP_GETVXATTR, \ CEPHFS_FEATURE_32BITS_RETRY_FWD, \ CEPHFS_FEATURE_HAS_OWNER_UIDGID, \ } /* * Some lock dependencies: * * session->s_mutex * mdsc->mutex * * mdsc->snap_rwsem * * ci->i_ceph_lock * mdsc->snap_flush_lock * mdsc->cap_delay_lock * */ struct ceph_fs_client; struct ceph_cap; /* * parsed info about a single inode. pointers are into the encoded * on-wire structures within the mds reply message payload. */ struct ceph_mds_reply_info_in { struct ceph_mds_reply_inode *in; struct ceph_dir_layout dir_layout; u32 symlink_len; char *symlink; u32 xattr_len; char *xattr_data; u64 inline_version; u32 inline_len; char *inline_data; u32 pool_ns_len; char *pool_ns_data; u64 max_bytes; u64 max_files; s32 dir_pin; struct ceph_timespec btime; struct ceph_timespec snap_btime; u8 *fscrypt_auth; u8 *fscrypt_file; u32 fscrypt_auth_len; u32 fscrypt_file_len; u64 rsnaps; u64 change_attr; }; struct ceph_mds_reply_dir_entry { bool is_nokey; char *name; u32 name_len; u32 raw_hash; struct ceph_mds_reply_lease *lease; struct ceph_mds_reply_info_in inode; loff_t offset; }; struct ceph_mds_reply_xattr { char *xattr_value; size_t xattr_value_len; }; /* * parsed info about an mds reply, including information about * either: 1) the target inode and/or its parent directory and dentry, * and directory contents (for readdir results), or * 2) the file range lock info (for fcntl F_GETLK results). */ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_head *head; /* trace */ struct ceph_mds_reply_info_in diri, targeti; struct ceph_mds_reply_dirfrag *dirfrag; char *dname; u8 *altname; u32 dname_len; u32 altname_len; struct ceph_mds_reply_lease *dlease; struct ceph_mds_reply_xattr xattr_info; /* extra */ union { /* for fcntl F_GETLK results */ struct ceph_filelock *filelock_reply; /* for readdir results */ struct { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; bool dir_end; bool dir_complete; bool hash_order; bool offset_hash; struct ceph_mds_reply_dir_entry *dir_entries; }; /* for create results */ struct { bool has_create_ino; u64 ino; }; }; /* encoded blob describing snapshot contexts for certain operations (e.g., open) */ void *snapblob; int snapblob_len; }; /* * cap releases are batched and sent to the MDS en masse. * * Account for per-message overhead of mds_cap_release header * and __le32 for osd epoch barrier trailing field. */ #define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - sizeof(u32) - \ sizeof(struct ceph_mds_cap_release)) / \ sizeof(struct ceph_mds_cap_item)) /* * state associated with each MDS<->client session */ enum { CEPH_MDS_SESSION_NEW = 1, CEPH_MDS_SESSION_OPENING = 2, CEPH_MDS_SESSION_OPEN = 3, CEPH_MDS_SESSION_HUNG = 4, CEPH_MDS_SESSION_RESTARTING = 5, CEPH_MDS_SESSION_RECONNECTING = 6, CEPH_MDS_SESSION_CLOSING = 7, CEPH_MDS_SESSION_CLOSED = 8, CEPH_MDS_SESSION_REJECTED = 9, }; struct ceph_mds_session { struct ceph_mds_client *s_mdsc; int s_mds; int s_state; unsigned long s_ttl; /* time until mds kills us */ unsigned long s_features; u64 s_seq; /* incoming msg seq # */ struct mutex s_mutex; /* serialize session messages */ struct ceph_connection s_con; struct ceph_auth_handshake s_auth; atomic_t s_cap_gen; /* inc each time we get mds stale msg */ unsigned long s_cap_ttl; /* when session caps expire. protected by s_mutex */ /* protected by s_cap_lock */ spinlock_t s_cap_lock; refcount_t s_ref; struct list_head s_caps; /* all caps issued by this session */ struct ceph_cap *s_cap_iterator; int s_nr_caps; int s_num_cap_releases; int s_cap_reconnect; int s_readonly; struct list_head s_cap_releases; /* waiting cap_release messages */ struct work_struct s_cap_release_work; /* See ceph_inode_info->i_dirty_item. */ struct list_head s_cap_dirty; /* inodes w/ dirty caps */ /* See ceph_inode_info->i_flushing_item. */ struct list_head s_cap_flushing; /* inodes w/ flushing caps */ unsigned long s_renew_requested; /* last time we sent a renew req */ u64 s_renew_seq; struct list_head s_waiting; /* waiting requests */ struct list_head s_unsafe; /* unsafe requests */ struct xarray s_delegated_inos; }; /* * modes of choosing which MDS to send a request to */ enum { USE_ANY_MDS, USE_RANDOM_MDS, USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */ }; struct ceph_mds_request; struct ceph_mds_client; /* * request completion callback */ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, struct ceph_mds_request *req); /* * wait for request completion callback */ typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc, struct ceph_mds_request *req); /* * an in-flight mds request */ struct ceph_mds_request { u64 r_tid; /* transaction id */ struct rb_node r_node; struct ceph_mds_client *r_mdsc; struct kref r_kref; int r_op; /* mds op code */ /* operation on what? */ struct inode *r_inode; /* arg1 */ struct dentry *r_dentry; /* arg1 */ struct dentry *r_old_dentry; /* arg2: rename from or link from */ struct inode *r_old_dentry_dir; /* arg2: old dentry's parent dir */ char *r_path1, *r_path2; struct ceph_vino r_ino1, r_ino2; struct inode *r_parent; /* parent dir inode */ struct inode *r_target_inode; /* resulting inode */ struct inode *r_new_inode; /* new inode (for creates) */ #define CEPH_MDS_R_DIRECT_IS_HASH (1) /* r_direct_hash is valid */ #define CEPH_MDS_R_ABORTED (2) /* call was aborted */ #define CEPH_MDS_R_GOT_UNSAFE (3) /* got an unsafe reply */ #define CEPH_MDS_R_GOT_SAFE (4) /* got a safe reply */ #define CEPH_MDS_R_GOT_RESULT (5) /* got a result */ #define CEPH_MDS_R_DID_PREPOPULATE (6) /* prepopulated readdir */ #define CEPH_MDS_R_PARENT_LOCKED (7) /* is r_parent->i_rwsem wlocked? */ #define CEPH_MDS_R_ASYNC (8) /* async request */ #define CEPH_MDS_R_FSCRYPT_FILE (9) /* must marshal fscrypt_file field */ unsigned long r_req_flags; struct mutex r_fill_mutex; union ceph_mds_request_args r_args; struct ceph_fscrypt_auth *r_fscrypt_auth; u64 r_fscrypt_file; u8 *r_altname; /* fscrypt binary crypttext for long filenames */ u32 r_altname_len; /* length of r_altname */ int r_fmode; /* file mode, if expecting cap */ int r_request_release_offset; const struct cred *r_cred; struct mnt_idmap *r_mnt_idmap; struct timespec64 r_stamp; /* for choosing which mds to send this request to */ int r_direct_mode; u32 r_direct_hash; /* choose dir frag based on this dentry hash */ /* data payload is used for xattr ops */ struct ceph_pagelist *r_pagelist; /* what caps shall we drop? */ int r_inode_drop, r_inode_unless; int r_dentry_drop, r_dentry_unless; int r_old_dentry_drop, r_old_dentry_unless; struct inode *r_old_inode; int r_old_inode_drop, r_old_inode_unless; struct ceph_msg *r_request; /* original request */ struct ceph_msg *r_reply; struct ceph_mds_reply_info_parsed r_reply_info; int r_err; u32 r_readdir_offset; struct page *r_locked_page; int r_dir_caps; int r_num_caps; unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ unsigned long r_started; /* start time to measure timeout against */ unsigned long r_start_latency; /* start time to measure latency */ unsigned long r_end_latency; /* finish time to measure latency */ unsigned long r_request_started; /* start time for mds request only, used to measure lease durations */ /* link unsafe requests to parent directory, for fsync */ struct inode *r_unsafe_dir; struct list_head r_unsafe_dir_item; /* unsafe requests that modify the target inode */ struct list_head r_unsafe_target_item; struct ceph_mds_session *r_session; int r_attempts; /* resend attempts */ int r_num_fwd; /* number of forward attempts */ int r_resend_mds; /* mds to resend to next, if any*/ u32 r_sent_on_mseq; /* cap mseq request was sent at*/ u64 r_deleg_ino; struct list_head r_wait; struct completion r_completion; struct completion r_safe_completion; ceph_mds_request_callback_t r_callback; struct list_head r_unsafe_item; /* per-session unsafe list item */ long long r_dir_release_cnt; long long r_dir_ordered_cnt; int r_readdir_cache_idx; int r_feature_needed; struct ceph_cap_reservation r_caps_reservation; }; struct ceph_pool_perm { struct rb_node node; int perm; s64 pool; size_t pool_ns_len; char pool_ns[]; }; struct ceph_snapid_map { struct rb_node node; struct list_head lru; atomic_t ref; dev_t dev; u64 snap; unsigned long last_used; }; /* * node for list of quotarealm inodes that are not visible from the filesystem * mountpoint, but required to handle, e.g. quotas. */ struct ceph_quotarealm_inode { struct rb_node node; u64 ino; unsigned long timeout; /* last time a lookup failed for this inode */ struct mutex mutex; struct inode *inode; }; struct cap_wait { struct list_head list; u64 ino; pid_t tgid; int need; int want; }; enum { CEPH_MDSC_STOPPING_BEGIN = 1, CEPH_MDSC_STOPPING_FLUSHING = 2, CEPH_MDSC_STOPPING_FLUSHED = 3, }; /* * mds client state */ struct ceph_mds_client { struct ceph_fs_client *fsc; struct mutex mutex; /* all nested structures */ struct ceph_mdsmap *mdsmap; struct completion safe_umount_waiters; wait_queue_head_t session_close_wq; struct list_head waiting_for_map; int mdsmap_err; struct ceph_mds_session **sessions; /* NULL for mds if no session */ atomic_t num_sessions; int max_sessions; /* len of sessions array */ spinlock_t stopping_lock; /* protect snap_empty */ int stopping; /* the stage of shutting down */ atomic_t stopping_blockers; struct completion stopping_waiter; atomic64_t quotarealms_count; /* # realms with quota */ /* * We keep a list of inodes we don't see in the mountpoint but that we * need to track quota realms. */ struct rb_root quotarealms_inodes; struct mutex quotarealms_inodes_mutex; /* * snap_rwsem will cover cap linkage into snaprealms, and * realm snap contexts. (later, we can do per-realm snap * contexts locks..) the empty list contains realms with no * references (implying they contain no inodes with caps) that * should be destroyed. */ u64 last_snap_seq; struct rw_semaphore snap_rwsem; struct rb_root snap_realms; struct list_head snap_empty; int num_snap_realms; spinlock_t snap_empty_lock; /* protect snap_empty */ u64 last_tid; /* most recent mds request */ u64 oldest_tid; /* oldest incomplete mds request, excluding setfilelock requests */ struct rb_root request_tree; /* pending mds requests */ struct delayed_work delayed_work; /* delayed work */ unsigned long last_renew_caps; /* last time we renewed our caps */ struct list_head cap_delay_list; /* caps with delayed release */ spinlock_t cap_delay_lock; /* protects cap_delay_list */ struct list_head snap_flush_list; /* cap_snaps ready to flush */ spinlock_t snap_flush_lock; u64 last_cap_flush_tid; struct list_head cap_flush_list; struct list_head cap_dirty_migrating; /* ...that are migration... */ int num_cap_flushing; /* # caps we are flushing */ spinlock_t cap_dirty_lock; /* protects above items */ wait_queue_head_t cap_flushing_wq; struct work_struct cap_reclaim_work; atomic_t cap_reclaim_pending; /* * Cap reservations * * Maintain a global pool of preallocated struct ceph_caps, referenced * by struct ceph_caps_reservations. This ensures that we preallocate * memory needed to successfully process an MDS response. (If an MDS * sends us cap information and we fail to process it, we will have * problems due to the client and MDS being out of sync.) * * Reservations are 'owned' by a ceph_cap_reservation context. */ spinlock_t caps_list_lock; struct list_head caps_list; /* unused (reserved or unreserved) */ struct list_head cap_wait_list; int caps_total_count; /* total caps allocated */ int caps_use_count; /* in use */ int caps_use_max; /* max used caps */ int caps_reserve_count; /* unused, reserved */ int caps_avail_count; /* unused, unreserved */ int caps_min_count; /* keep at least this many (unreserved) */ spinlock_t dentry_list_lock; struct list_head dentry_leases; /* fifo list */ struct list_head dentry_dir_leases; /* lru list */ struct ceph_client_metric metric; spinlock_t snapid_map_lock; struct rb_root snapid_map_tree; struct list_head snapid_map_lru; struct rw_semaphore pool_perm_rwsem; struct rb_root pool_perm_tree; char nodename[__NEW_UTS_LEN + 1]; }; extern const char *ceph_mds_op_name(int op); extern bool check_session_state(struct ceph_mds_session *s); void inc_session_sequence(struct ceph_mds_session *s); extern struct ceph_mds_session * __ceph_lookup_mds_session(struct ceph_mds_client *, int mds); extern const char *ceph_session_state_name(int s); extern struct ceph_mds_session * ceph_get_mds_session(struct ceph_mds_session *s); extern void ceph_put_mds_session(struct ceph_mds_session *s); extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, struct ceph_msg *msg, int mds); extern int ceph_mdsc_init(struct ceph_fs_client *fsc); extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc); extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc); extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); extern void ceph_invalidate_dir_request(struct ceph_mds_request *req); extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct inode *dir); extern struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, ceph_mds_request_wait_callback_t wait_func); extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, struct inode *dir, struct ceph_mds_request *req); extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req); extern void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req); static inline void ceph_mdsc_get_request(struct ceph_mds_request *req) { kref_get(&req->r_kref); } extern void ceph_mdsc_release_request(struct kref *kref); static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) { kref_put(&req->r_kref, ceph_mdsc_release_request); } extern void send_flush_mdlog(struct ceph_mds_session *s); extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc, void (*cb)(struct ceph_mds_session *), bool check_state); extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq); extern void __ceph_queue_cap_release(struct ceph_mds_session *session, struct ceph_cap *cap); extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc); extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr); extern int ceph_iterate_session_caps(struct ceph_mds_session *session, int (*cb)(struct inode *, int mds, void *), void *arg); extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); static inline void ceph_mdsc_free_path(char *path, int len) { if (!IS_ERR_OR_NULL(path)) __putname(path - (PATH_MAX - 1 - len)); } extern char *ceph_mdsc_build_path(struct ceph_mds_client *mdsc, struct dentry *dentry, int *plen, u64 *base, int for_wire); extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry); extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, struct dentry *dentry, char action, u32 seq); extern void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg); extern void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg); extern struct ceph_mds_session * ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target); extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern int ceph_trim_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, int max_caps); static inline int ceph_wait_on_async_create(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); return wait_on_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT, TASK_KILLABLE); } extern int ceph_wait_on_conflict_unlink(struct dentry *dentry); extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session); extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino); extern bool enable_unsafe_idmap; #endif
1 1 1 93 17 28 18 92 86 2 42 114 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 // SPDX-License-Identifier: GPL-2.0-only /* * misc.c * * PURPOSE * Miscellaneous routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT * (C) 1998 Dave Boynton * (C) 1998-2004 Ben Fennema * (C) 1999-2000 Stelias Computing Inc * * HISTORY * * 04/19/99 blf partial support for reading/writing specific EA's */ #include "udfdecl.h" #include <linux/fs.h> #include <linux/string.h> #include <linux/crc-itu-t.h> #include "udf_i.h" #include "udf_sb.h" struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, uint32_t type, uint8_t loc) { uint8_t *ea = NULL, *ad = NULL; int offset; uint16_t crclen; struct udf_inode_info *iinfo = UDF_I(inode); ea = iinfo->i_data; if (iinfo->i_lenEAttr) { ad = iinfo->i_data + iinfo->i_lenEAttr; } else { ad = ea; size += sizeof(struct extendedAttrHeaderDesc); } offset = inode->i_sb->s_blocksize - udf_file_entry_alloc_offset(inode) - iinfo->i_lenAlloc; /* TODO - Check for FreeEASpace */ if (loc & 0x01 && offset >= size) { struct extendedAttrHeaderDesc *eahd; eahd = (struct extendedAttrHeaderDesc *)ea; if (iinfo->i_lenAlloc) memmove(&ad[size], ad, iinfo->i_lenAlloc); if (iinfo->i_lenEAttr) { /* check checksum/crc */ if (eahd->descTag.tagIdent != cpu_to_le16(TAG_IDENT_EAHD) || le32_to_cpu(eahd->descTag.tagLocation) != iinfo->i_location.logicalBlockNum) return NULL; } else { struct udf_sb_info *sbi = UDF_SB(inode->i_sb); size -= sizeof(struct extendedAttrHeaderDesc); iinfo->i_lenEAttr += sizeof(struct extendedAttrHeaderDesc); eahd->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EAHD); if (sbi->s_udfrev >= 0x0200) eahd->descTag.descVersion = cpu_to_le16(3); else eahd->descTag.descVersion = cpu_to_le16(2); eahd->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number); eahd->descTag.tagLocation = cpu_to_le32( iinfo->i_location.logicalBlockNum); eahd->impAttrLocation = cpu_to_le32(0xFFFFFFFF); eahd->appAttrLocation = cpu_to_le32(0xFFFFFFFF); } offset = iinfo->i_lenEAttr; if (type < 2048) { if (le32_to_cpu(eahd->appAttrLocation) < iinfo->i_lenEAttr) { uint32_t aal = le32_to_cpu(eahd->appAttrLocation); memmove(&ea[offset - aal + size], &ea[aal], offset - aal); offset -= aal; eahd->appAttrLocation = cpu_to_le32(aal + size); } if (le32_to_cpu(eahd->impAttrLocation) < iinfo->i_lenEAttr) { uint32_t ial = le32_to_cpu(eahd->impAttrLocation); memmove(&ea[offset - ial + size], &ea[ial], offset - ial); offset -= ial; eahd->impAttrLocation = cpu_to_le32(ial + size); } } else if (type < 65536) { if (le32_to_cpu(eahd->appAttrLocation) < iinfo->i_lenEAttr) { uint32_t aal = le32_to_cpu(eahd->appAttrLocation); memmove(&ea[offset - aal + size], &ea[aal], offset - aal); offset -= aal; eahd->appAttrLocation = cpu_to_le32(aal + size); } } /* rewrite CRC + checksum of eahd */ crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(struct tag); eahd->descTag.descCRCLength = cpu_to_le16(crclen); eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd + sizeof(struct tag), crclen)); eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag); iinfo->i_lenEAttr += size; return (struct genericFormat *)&ea[offset]; } return NULL; } struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type, uint8_t subtype) { struct genericFormat *gaf; uint8_t *ea = NULL; uint32_t offset; struct udf_inode_info *iinfo = UDF_I(inode); ea = iinfo->i_data; if (iinfo->i_lenEAttr) { struct extendedAttrHeaderDesc *eahd; eahd = (struct extendedAttrHeaderDesc *)ea; /* check checksum/crc */ if (eahd->descTag.tagIdent != cpu_to_le16(TAG_IDENT_EAHD) || le32_to_cpu(eahd->descTag.tagLocation) != iinfo->i_location.logicalBlockNum) return NULL; if (type < 2048) offset = sizeof(struct extendedAttrHeaderDesc); else if (type < 65536) offset = le32_to_cpu(eahd->impAttrLocation); else offset = le32_to_cpu(eahd->appAttrLocation); while (offset + sizeof(*gaf) < iinfo->i_lenEAttr) { uint32_t attrLength; gaf = (struct genericFormat *)&ea[offset]; attrLength = le32_to_cpu(gaf->attrLength); /* Detect undersized elements and buffer overflows */ if ((attrLength < sizeof(*gaf)) || (attrLength > (iinfo->i_lenEAttr - offset))) break; if (le32_to_cpu(gaf->attrType) == type && gaf->attrSubtype == subtype) return gaf; else offset += attrLength; } } return NULL; } /* * udf_read_tagged * * PURPOSE * Read the first block of a tagged descriptor. * * HISTORY * July 1, 1997 - Andrew E. Mileski * Written, tested, and released. */ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, uint32_t location, uint16_t *ident) { struct tag *tag_p; struct buffer_head *bh = NULL; u8 checksum; /* Read the block */ if (block == 0xFFFFFFFF) return NULL; bh = sb_bread(sb, block); if (!bh) { udf_err(sb, "read failed, block=%u, location=%u\n", block, location); return NULL; } tag_p = (struct tag *)(bh->b_data); *ident = le16_to_cpu(tag_p->tagIdent); if (location != le32_to_cpu(tag_p->tagLocation)) { udf_debug("location mismatch block %u, tag %u != %u\n", block, le32_to_cpu(tag_p->tagLocation), location); goto error_out; } /* Verify the tag checksum */ checksum = udf_tag_checksum(tag_p); if (checksum != tag_p->tagChecksum) { udf_err(sb, "tag checksum failed, block %u: 0x%02x != 0x%02x\n", block, checksum, tag_p->tagChecksum); goto error_out; } /* Verify the tag version */ if (tag_p->descVersion != cpu_to_le16(0x0002U) && tag_p->descVersion != cpu_to_le16(0x0003U)) { udf_err(sb, "tag version 0x%04x != 0x0002 || 0x0003, block %u\n", le16_to_cpu(tag_p->descVersion), block); goto error_out; } /* Verify the descriptor CRC */ if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize || le16_to_cpu(tag_p->descCRC) == crc_itu_t(0, bh->b_data + sizeof(struct tag), le16_to_cpu(tag_p->descCRCLength))) return bh; udf_debug("Crc failure block %u: crc = %u, crclen = %u\n", block, le16_to_cpu(tag_p->descCRC), le16_to_cpu(tag_p->descCRCLength)); error_out: brelse(bh); return NULL; } struct buffer_head *udf_read_ptagged(struct super_block *sb, struct kernel_lb_addr *loc, uint32_t offset, uint16_t *ident) { return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset), loc->logicalBlockNum + offset, ident); } void udf_update_tag(char *data, int length) { struct tag *tptr = (struct tag *)data; length -= sizeof(struct tag); tptr->descCRCLength = cpu_to_le16(length); tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(struct tag), length)); tptr->tagChecksum = udf_tag_checksum(tptr); } void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum, uint32_t loc, int length) { struct tag *tptr = (struct tag *)data; tptr->tagIdent = cpu_to_le16(ident); tptr->descVersion = cpu_to_le16(version); tptr->tagSerialNum = cpu_to_le16(snum); tptr->tagLocation = cpu_to_le32(loc); udf_update_tag(data, length); } u8 udf_tag_checksum(const struct tag *t) { u8 *data = (u8 *)t; u8 checksum = 0; int i; for (i = 0; i < sizeof(struct tag); ++i) if (i != 4) /* position of checksum */ checksum += data[i]; return checksum; }
6 6 6 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 /* SPDX-License-Identifier: GPL-2.0 */ /* * devres.c - managed gpio resources * This file is based on kernel/irq/devres.c * * Copyright (c) 2011 John Crispin <john@phrozen.org> */ #include <linux/module.h> #include <linux/err.h> #include <linux/gpio.h> #include <linux/gpio/consumer.h> #include <linux/device.h> #include <linux/gfp.h> #include "gpiolib.h" static void devm_gpiod_release(struct device *dev, void *res) { struct gpio_desc **desc = res; gpiod_put(*desc); } static int devm_gpiod_match(struct device *dev, void *res, void *data) { struct gpio_desc **this = res, **gpio = data; return *this == *gpio; } static void devm_gpiod_release_array(struct device *dev, void *res) { struct gpio_descs **descs = res; gpiod_put_array(*descs); } static int devm_gpiod_match_array(struct device *dev, void *res, void *data) { struct gpio_descs **this = res, **gpios = data; return *this == *gpios; } /** * devm_gpiod_get - Resource-managed gpiod_get() * @dev: GPIO consumer * @con_id: function within the GPIO consumer * @flags: optional GPIO initialization flags * * Managed gpiod_get(). GPIO descriptors returned from this function are * automatically disposed on driver detach. See gpiod_get() for detailed * information about behavior and return values. */ struct gpio_desc *__must_check devm_gpiod_get(struct device *dev, const char *con_id, enum gpiod_flags flags) { return devm_gpiod_get_index(dev, con_id, 0, flags); } EXPORT_SYMBOL_GPL(devm_gpiod_get); /** * devm_gpiod_get_optional - Resource-managed gpiod_get_optional() * @dev: GPIO consumer * @con_id: function within the GPIO consumer * @flags: optional GPIO initialization flags * * Managed gpiod_get_optional(). GPIO descriptors returned from this function * are automatically disposed on driver detach. See gpiod_get_optional() for * detailed information about behavior and return values. */ struct gpio_desc *__must_check devm_gpiod_get_optional(struct device *dev, const char *con_id, enum gpiod_flags flags) { return devm_gpiod_get_index_optional(dev, con_id, 0, flags); } EXPORT_SYMBOL_GPL(devm_gpiod_get_optional); /** * devm_gpiod_get_index - Resource-managed gpiod_get_index() * @dev: GPIO consumer * @con_id: function within the GPIO consumer * @idx: index of the GPIO to obtain in the consumer * @flags: optional GPIO initialization flags * * Managed gpiod_get_index(). GPIO descriptors returned from this function are * automatically disposed on driver detach. See gpiod_get_index() for detailed * information about behavior and return values. */ struct gpio_desc *__must_check devm_gpiod_get_index(struct device *dev, const char *con_id, unsigned int idx, enum gpiod_flags flags) { struct gpio_desc **dr; struct gpio_desc *desc; desc = gpiod_get_index(dev, con_id, idx, flags); if (IS_ERR(desc)) return desc; /* * For non-exclusive GPIO descriptors, check if this descriptor is * already under resource management by this device. */ if (flags & GPIOD_FLAGS_BIT_NONEXCLUSIVE) { struct devres *dres; dres = devres_find(dev, devm_gpiod_release, devm_gpiod_match, &desc); if (dres) return desc; } dr = devres_alloc(devm_gpiod_release, sizeof(struct gpio_desc *), GFP_KERNEL); if (!dr) { gpiod_put(desc); return ERR_PTR(-ENOMEM); } *dr = desc; devres_add(dev, dr); return desc; } EXPORT_SYMBOL_GPL(devm_gpiod_get_index); /** * devm_fwnode_gpiod_get_index - get a GPIO descriptor from a given node * @dev: GPIO consumer * @fwnode: firmware node containing GPIO reference * @con_id: function within the GPIO consumer * @index: index of the GPIO to obtain in the consumer * @flags: GPIO initialization flags * @label: label to attach to the requested GPIO * * GPIO descriptors returned from this function are automatically disposed on * driver detach. * * On successful request the GPIO pin is configured in accordance with * provided @flags. */ struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev, struct fwnode_handle *fwnode, const char *con_id, int index, enum gpiod_flags flags, const char *label) { struct gpio_desc **dr; struct gpio_desc *desc; dr = devres_alloc(devm_gpiod_release, sizeof(struct gpio_desc *), GFP_KERNEL); if (!dr) return ERR_PTR(-ENOMEM); desc = fwnode_gpiod_get_index(fwnode, con_id, index, flags, label); if (IS_ERR(desc)) { devres_free(dr); return desc; } *dr = desc; devres_add(dev, dr); return desc; } EXPORT_SYMBOL_GPL(devm_fwnode_gpiod_get_index); /** * devm_gpiod_get_index_optional - Resource-managed gpiod_get_index_optional() * @dev: GPIO consumer * @con_id: function within the GPIO consumer * @index: index of the GPIO to obtain in the consumer * @flags: optional GPIO initialization flags * * Managed gpiod_get_index_optional(). GPIO descriptors returned from this * function are automatically disposed on driver detach. See * gpiod_get_index_optional() for detailed information about behavior and * return values. */ struct gpio_desc *__must_check devm_gpiod_get_index_optional(struct device *dev, const char *con_id, unsigned int index, enum gpiod_flags flags) { struct gpio_desc *desc; desc = devm_gpiod_get_index(dev, con_id, index, flags); if (gpiod_not_found(desc)) return NULL; return desc; } EXPORT_SYMBOL_GPL(devm_gpiod_get_index_optional); /** * devm_gpiod_get_array - Resource-managed gpiod_get_array() * @dev: GPIO consumer * @con_id: function within the GPIO consumer * @flags: optional GPIO initialization flags * * Managed gpiod_get_array(). GPIO descriptors returned from this function are * automatically disposed on driver detach. See gpiod_get_array() for detailed * information about behavior and return values. */ struct gpio_descs *__must_check devm_gpiod_get_array(struct device *dev, const char *con_id, enum gpiod_flags flags) { struct gpio_descs **dr; struct gpio_descs *descs; dr = devres_alloc(devm_gpiod_release_array, sizeof(struct gpio_descs *), GFP_KERNEL); if (!dr) return ERR_PTR(-ENOMEM); descs = gpiod_get_array(dev, con_id, flags); if (IS_ERR(descs)) { devres_free(dr); return descs; } *dr = descs; devres_add(dev, dr); return descs; } EXPORT_SYMBOL_GPL(devm_gpiod_get_array); /** * devm_gpiod_get_array_optional - Resource-managed gpiod_get_array_optional() * @dev: GPIO consumer * @con_id: function within the GPIO consumer * @flags: optional GPIO initialization flags * * Managed gpiod_get_array_optional(). GPIO descriptors returned from this * function are automatically disposed on driver detach. * See gpiod_get_array_optional() for detailed information about behavior and * return values. */ struct gpio_descs *__must_check devm_gpiod_get_array_optional(struct device *dev, const char *con_id, enum gpiod_flags flags) { struct gpio_descs *descs; descs = devm_gpiod_get_array(dev, con_id, flags); if (gpiod_not_found(descs)) return NULL; return descs; } EXPORT_SYMBOL_GPL(devm_gpiod_get_array_optional); /** * devm_gpiod_put - Resource-managed gpiod_put() * @dev: GPIO consumer * @desc: GPIO descriptor to dispose of * * Dispose of a GPIO descriptor obtained with devm_gpiod_get() or * devm_gpiod_get_index(). Normally this function will not be called as the GPIO * will be disposed of by the resource management code. */ void devm_gpiod_put(struct device *dev, struct gpio_desc *desc) { WARN_ON(devres_release(dev, devm_gpiod_release, devm_gpiod_match, &desc)); } EXPORT_SYMBOL_GPL(devm_gpiod_put); /** * devm_gpiod_unhinge - Remove resource management from a gpio descriptor * @dev: GPIO consumer * @desc: GPIO descriptor to remove resource management from * * Remove resource management from a GPIO descriptor. This is needed when * you want to hand over lifecycle management of a descriptor to another * mechanism. */ void devm_gpiod_unhinge(struct device *dev, struct gpio_desc *desc) { int ret; if (IS_ERR_OR_NULL(desc)) return; ret = devres_destroy(dev, devm_gpiod_release, devm_gpiod_match, &desc); /* * If the GPIO descriptor is requested as nonexclusive, we * may call this function several times on the same descriptor * so it is OK if devres_destroy() returns -ENOENT. */ if (ret == -ENOENT) return; /* Anything else we should warn about */ WARN_ON(ret); } EXPORT_SYMBOL_GPL(devm_gpiod_unhinge); /** * devm_gpiod_put_array - Resource-managed gpiod_put_array() * @dev: GPIO consumer * @descs: GPIO descriptor array to dispose of * * Dispose of an array of GPIO descriptors obtained with devm_gpiod_get_array(). * Normally this function will not be called as the GPIOs will be disposed of * by the resource management code. */ void devm_gpiod_put_array(struct device *dev, struct gpio_descs *descs) { WARN_ON(devres_release(dev, devm_gpiod_release_array, devm_gpiod_match_array, &descs)); } EXPORT_SYMBOL_GPL(devm_gpiod_put_array); static void devm_gpio_release(struct device *dev, void *res) { unsigned *gpio = res; gpio_free(*gpio); } /** * devm_gpio_request - request a GPIO for a managed device * @dev: device to request the GPIO for * @gpio: GPIO to allocate * @label: the name of the requested GPIO * * Except for the extra @dev argument, this function takes the * same arguments and performs the same function as * gpio_request(). GPIOs requested with this function will be * automatically freed on driver detach. */ int devm_gpio_request(struct device *dev, unsigned gpio, const char *label) { unsigned *dr; int rc; dr = devres_alloc(devm_gpio_release, sizeof(unsigned), GFP_KERNEL); if (!dr) return -ENOMEM; rc = gpio_request(gpio, label); if (rc) { devres_free(dr); return rc; } *dr = gpio; devres_add(dev, dr); return 0; } EXPORT_SYMBOL_GPL(devm_gpio_request); /** * devm_gpio_request_one - request a single GPIO with initial setup * @dev: device to request for * @gpio: the GPIO number * @flags: GPIO configuration as specified by GPIOF_* * @label: a literal description string of this GPIO */ int devm_gpio_request_one(struct device *dev, unsigned gpio, unsigned long flags, const char *label) { unsigned *dr; int rc; dr = devres_alloc(devm_gpio_release, sizeof(unsigned), GFP_KERNEL); if (!dr) return -ENOMEM; rc = gpio_request_one(gpio, flags, label); if (rc) { devres_free(dr); return rc; } *dr = gpio; devres_add(dev, dr); return 0; } EXPORT_SYMBOL_GPL(devm_gpio_request_one); static void devm_gpio_chip_release(void *data) { struct gpio_chip *gc = data; gpiochip_remove(gc); } /** * devm_gpiochip_add_data_with_key() - Resource managed gpiochip_add_data_with_key() * @dev: pointer to the device that gpio_chip belongs to. * @gc: the GPIO chip to register * @data: driver-private data associated with this chip * @lock_key: lockdep class for IRQ lock * @request_key: lockdep class for IRQ request * * Context: potentially before irqs will work * * The gpio chip automatically be released when the device is unbound. * * Returns: * A negative errno if the chip can't be registered, such as because the * gc->base is invalid or already associated with a different chip. * Otherwise it returns zero as a success code. */ int devm_gpiochip_add_data_with_key(struct device *dev, struct gpio_chip *gc, void *data, struct lock_class_key *lock_key, struct lock_class_key *request_key) { int ret; ret = gpiochip_add_data_with_key(gc, data, lock_key, request_key); if (ret < 0) return ret; return devm_add_action_or_reset(dev, devm_gpio_chip_release, gc); } EXPORT_SYMBOL_GPL(devm_gpiochip_add_data_with_key);
5 2 4 5 7 5 5 6 6 32 32 6 4 14 32 32 1 3 7 8 1 1 1 1 6 5 3 1 1 1 23 23 1 2 4 87 2 66 31 77 63 9 1 71 20 55 17 1 1 1 1 1 1 3 1 3 1 1 1 2 1 7 1 2 1 2 1 16 16 12 1 11 3 2 1 7 1 6 2 8 9 1 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 // SPDX-License-Identifier: GPL-2.0-or-later /* AF_RXRPC implementation * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/net.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/random.h> #include <linux/poll.h> #include <linux/proc_fs.h> #include <linux/key-type.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/af_rxrpc.h> #define CREATE_TRACE_POINTS #include "ar-internal.h" MODULE_DESCRIPTION("RxRPC network protocol"); MODULE_AUTHOR("Red Hat, Inc."); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_RXRPC); unsigned int rxrpc_debug; // = RXRPC_DEBUG_KPROTO; module_param_named(debug, rxrpc_debug, uint, 0644); MODULE_PARM_DESC(debug, "RxRPC debugging mask"); static struct proto rxrpc_proto; static const struct proto_ops rxrpc_rpc_ops; /* current debugging ID */ atomic_t rxrpc_debug_id; EXPORT_SYMBOL(rxrpc_debug_id); /* count of skbs currently in use */ atomic_t rxrpc_n_rx_skbs; struct workqueue_struct *rxrpc_workqueue; static void rxrpc_sock_destructor(struct sock *); /* * see if an RxRPC socket is currently writable */ static inline int rxrpc_writable(struct sock *sk) { return refcount_read(&sk->sk_wmem_alloc) < (size_t) sk->sk_sndbuf; } /* * wait for write bufferage to become available */ static void rxrpc_write_space(struct sock *sk) { _enter("%p", sk); rcu_read_lock(); if (rxrpc_writable(sk)) { struct socket_wq *wq = rcu_dereference(sk->sk_wq); if (skwq_has_sleeper(wq)) wake_up_interruptible(&wq->wait); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } rcu_read_unlock(); } /* * validate an RxRPC address */ static int rxrpc_validate_address(struct rxrpc_sock *rx, struct sockaddr_rxrpc *srx, int len) { unsigned int tail; if (len < sizeof(struct sockaddr_rxrpc)) return -EINVAL; if (srx->srx_family != AF_RXRPC) return -EAFNOSUPPORT; if (srx->transport_type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; len -= offsetof(struct sockaddr_rxrpc, transport); if (srx->transport_len < sizeof(sa_family_t) || srx->transport_len > len) return -EINVAL; switch (srx->transport.family) { case AF_INET: if (rx->family != AF_INET && rx->family != AF_INET6) return -EAFNOSUPPORT; if (srx->transport_len < sizeof(struct sockaddr_in)) return -EINVAL; tail = offsetof(struct sockaddr_rxrpc, transport.sin.__pad); break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: if (rx->family != AF_INET6) return -EAFNOSUPPORT; if (srx->transport_len < sizeof(struct sockaddr_in6)) return -EINVAL; tail = offsetof(struct sockaddr_rxrpc, transport) + sizeof(struct sockaddr_in6); break; #endif default: return -EAFNOSUPPORT; } if (tail < len) memset((void *)srx + tail, 0, len - tail); _debug("INET: %pISp", &srx->transport); return 0; } /* * bind a local address to an RxRPC socket */ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) { struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr; struct rxrpc_local *local; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); u16 service_id; int ret; _enter("%p,%p,%d", rx, saddr, len); ret = rxrpc_validate_address(rx, srx, len); if (ret < 0) goto error; service_id = srx->srx_service; lock_sock(&rx->sk); switch (rx->sk.sk_state) { case RXRPC_UNBOUND: rx->srx = *srx; local = rxrpc_lookup_local(sock_net(&rx->sk), &rx->srx); if (IS_ERR(local)) { ret = PTR_ERR(local); goto error_unlock; } if (service_id) { write_lock(&local->services_lock); if (local->service) goto service_in_use; rx->local = local; local->service = rx; write_unlock(&local->services_lock); rx->sk.sk_state = RXRPC_SERVER_BOUND; } else { rx->local = local; rx->sk.sk_state = RXRPC_CLIENT_BOUND; } break; case RXRPC_SERVER_BOUND: ret = -EINVAL; if (service_id == 0) goto error_unlock; ret = -EADDRINUSE; if (service_id == rx->srx.srx_service) goto error_unlock; ret = -EINVAL; srx->srx_service = rx->srx.srx_service; if (memcmp(srx, &rx->srx, sizeof(*srx)) != 0) goto error_unlock; rx->second_service = service_id; rx->sk.sk_state = RXRPC_SERVER_BOUND2; break; default: ret = -EINVAL; goto error_unlock; } release_sock(&rx->sk); _leave(" = 0"); return 0; service_in_use: write_unlock(&local->services_lock); rxrpc_unuse_local(local, rxrpc_local_unuse_bind); rxrpc_put_local(local, rxrpc_local_put_bind); ret = -EADDRINUSE; error_unlock: release_sock(&rx->sk); error: _leave(" = %d", ret); return ret; } /* * set the number of pending calls permitted on a listening socket */ static int rxrpc_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; struct rxrpc_sock *rx = rxrpc_sk(sk); unsigned int max, old; int ret; _enter("%p,%d", rx, backlog); lock_sock(&rx->sk); switch (rx->sk.sk_state) { case RXRPC_UNBOUND: ret = -EADDRNOTAVAIL; break; case RXRPC_SERVER_BOUND: case RXRPC_SERVER_BOUND2: ASSERT(rx->local != NULL); max = READ_ONCE(rxrpc_max_backlog); ret = -EINVAL; if (backlog == INT_MAX) backlog = max; else if (backlog < 0 || backlog > max) break; old = sk->sk_max_ack_backlog; sk->sk_max_ack_backlog = backlog; ret = rxrpc_service_prealloc(rx, GFP_KERNEL); if (ret == 0) rx->sk.sk_state = RXRPC_SERVER_LISTENING; else sk->sk_max_ack_backlog = old; break; case RXRPC_SERVER_LISTENING: if (backlog == 0) { rx->sk.sk_state = RXRPC_SERVER_LISTEN_DISABLED; sk->sk_max_ack_backlog = 0; rxrpc_discard_prealloc(rx); ret = 0; break; } fallthrough; default: ret = -EBUSY; break; } release_sock(&rx->sk); _leave(" = %d", ret); return ret; } /** * rxrpc_kernel_lookup_peer - Obtain remote transport endpoint for an address * @sock: The socket through which it will be accessed * @srx: The network address * @gfp: Allocation flags * * Lookup or create a remote transport endpoint record for the specified * address and return it with a ref held. */ struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock, struct sockaddr_rxrpc *srx, gfp_t gfp) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); int ret; ret = rxrpc_validate_address(rx, srx, sizeof(*srx)); if (ret < 0) return ERR_PTR(ret); return rxrpc_lookup_peer(rx->local, srx, gfp); } EXPORT_SYMBOL(rxrpc_kernel_lookup_peer); /** * rxrpc_kernel_get_peer - Get a reference on a peer * @peer: The peer to get a reference on. * * Get a record for the remote peer in a call. */ struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer) { return peer ? rxrpc_get_peer(peer, rxrpc_peer_get_application) : NULL; } EXPORT_SYMBOL(rxrpc_kernel_get_peer); /** * rxrpc_kernel_put_peer - Allow a kernel app to drop a peer reference * @peer: The peer to drop a ref on */ void rxrpc_kernel_put_peer(struct rxrpc_peer *peer) { rxrpc_put_peer(peer, rxrpc_peer_put_application); } EXPORT_SYMBOL(rxrpc_kernel_put_peer); /** * rxrpc_kernel_begin_call - Allow a kernel service to begin a call * @sock: The socket on which to make the call * @peer: The peer to contact * @key: The security context to use (defaults to socket setting) * @user_call_ID: The ID to use * @tx_total_len: Total length of data to transmit during the call (or -1) * @hard_timeout: The maximum lifespan of the call in sec * @gfp: The allocation constraints * @notify_rx: Where to send notifications instead of socket queue * @service_id: The ID of the service to contact * @upgrade: Request service upgrade for call * @interruptibility: The call is interruptible, or can be canceled. * @debug_id: The debug ID for tracing to be assigned to the call * * Allow a kernel service to begin a call on the nominated socket. This just * sets up all the internal tracking structures and allocates connection and * call IDs as appropriate. The call to be used is returned. * * The default socket destination address and security may be overridden by * supplying @srx and @key. */ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, struct rxrpc_peer *peer, struct key *key, unsigned long user_call_ID, s64 tx_total_len, u32 hard_timeout, gfp_t gfp, rxrpc_notify_rx_t notify_rx, u16 service_id, bool upgrade, enum rxrpc_interruptibility interruptibility, unsigned int debug_id) { struct rxrpc_conn_parameters cp; struct rxrpc_call_params p; struct rxrpc_call *call; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); _enter(",,%x,%lx", key_serial(key), user_call_ID); if (WARN_ON_ONCE(peer->local != rx->local)) return ERR_PTR(-EIO); lock_sock(&rx->sk); if (!key) key = rx->key; if (key && !key->payload.data[0]) key = NULL; /* a no-security key */ memset(&p, 0, sizeof(p)); p.user_call_ID = user_call_ID; p.tx_total_len = tx_total_len; p.interruptibility = interruptibility; p.kernel = true; p.timeouts.hard = hard_timeout; memset(&cp, 0, sizeof(cp)); cp.local = rx->local; cp.peer = peer; cp.key = key; cp.security_level = rx->min_sec_level; cp.exclusive = false; cp.upgrade = upgrade; cp.service_id = service_id; call = rxrpc_new_client_call(rx, &cp, &p, gfp, debug_id); /* The socket has been unlocked. */ if (!IS_ERR(call)) { call->notify_rx = notify_rx; mutex_unlock(&call->user_mutex); } _leave(" = %p", call); return call; } EXPORT_SYMBOL(rxrpc_kernel_begin_call); /* * Dummy function used to stop the notifier talking to recvmsg(). */ static void rxrpc_dummy_notify_rx(struct sock *sk, struct rxrpc_call *rxcall, unsigned long call_user_ID) { } /** * rxrpc_kernel_shutdown_call - Allow a kernel service to shut down a call it was using * @sock: The socket the call is on * @call: The call to end * * Allow a kernel service to shut down a call it was using. The call must be * complete before this is called (the call should be aborted if necessary). */ void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call) { _enter("%d{%d}", call->debug_id, refcount_read(&call->ref)); mutex_lock(&call->user_mutex); if (!test_bit(RXRPC_CALL_RELEASED, &call->flags)) { rxrpc_release_call(rxrpc_sk(sock->sk), call); /* Make sure we're not going to call back into a kernel service */ if (call->notify_rx) { spin_lock(&call->notify_lock); call->notify_rx = rxrpc_dummy_notify_rx; spin_unlock(&call->notify_lock); } } mutex_unlock(&call->user_mutex); } EXPORT_SYMBOL(rxrpc_kernel_shutdown_call); /** * rxrpc_kernel_put_call - Release a reference to a call * @sock: The socket the call is on * @call: The call to put * * Drop the application's ref on an rxrpc call. */ void rxrpc_kernel_put_call(struct socket *sock, struct rxrpc_call *call) { rxrpc_put_call(call, rxrpc_call_put_kernel); } EXPORT_SYMBOL(rxrpc_kernel_put_call); /** * rxrpc_kernel_check_life - Check to see whether a call is still alive * @sock: The socket the call is on * @call: The call to check * * Allow a kernel service to find out whether a call is still alive - whether * it has completed successfully and all received data has been consumed. */ bool rxrpc_kernel_check_life(const struct socket *sock, const struct rxrpc_call *call) { if (!rxrpc_call_is_complete(call)) return true; if (call->completion != RXRPC_CALL_SUCCEEDED) return false; return !skb_queue_empty(&call->recvmsg_queue); } EXPORT_SYMBOL(rxrpc_kernel_check_life); /** * rxrpc_kernel_get_epoch - Retrieve the epoch value from a call. * @sock: The socket the call is on * @call: The call to query * * Allow a kernel service to retrieve the epoch value from a service call to * see if the client at the other end rebooted. */ u32 rxrpc_kernel_get_epoch(struct socket *sock, struct rxrpc_call *call) { return call->conn->proto.epoch; } EXPORT_SYMBOL(rxrpc_kernel_get_epoch); /** * rxrpc_kernel_new_call_notification - Get notifications of new calls * @sock: The socket to intercept received messages on * @notify_new_call: Function to be called when new calls appear * @discard_new_call: Function to discard preallocated calls * * Allow a kernel service to be given notifications about new calls. */ void rxrpc_kernel_new_call_notification( struct socket *sock, rxrpc_notify_new_call_t notify_new_call, rxrpc_discard_new_call_t discard_new_call) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); rx->notify_new_call = notify_new_call; rx->discard_new_call = discard_new_call; } EXPORT_SYMBOL(rxrpc_kernel_new_call_notification); /** * rxrpc_kernel_set_max_life - Set maximum lifespan on a call * @sock: The socket the call is on * @call: The call to configure * @hard_timeout: The maximum lifespan of the call in jiffies * * Set the maximum lifespan of a call. The call will end with ETIME or * ETIMEDOUT if it takes longer than this. */ void rxrpc_kernel_set_max_life(struct socket *sock, struct rxrpc_call *call, unsigned long hard_timeout) { unsigned long now; mutex_lock(&call->user_mutex); now = jiffies; hard_timeout += now; WRITE_ONCE(call->expect_term_by, hard_timeout); rxrpc_reduce_call_timer(call, hard_timeout, now, rxrpc_timer_set_for_hard); mutex_unlock(&call->user_mutex); } EXPORT_SYMBOL(rxrpc_kernel_set_max_life); /* * connect an RxRPC socket * - this just targets it at a specific destination; no actual connection * negotiation takes place */ static int rxrpc_connect(struct socket *sock, struct sockaddr *addr, int addr_len, int flags) { struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)addr; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); int ret; _enter("%p,%p,%d,%d", rx, addr, addr_len, flags); ret = rxrpc_validate_address(rx, srx, addr_len); if (ret < 0) { _leave(" = %d [bad addr]", ret); return ret; } lock_sock(&rx->sk); ret = -EISCONN; if (test_bit(RXRPC_SOCK_CONNECTED, &rx->flags)) goto error; switch (rx->sk.sk_state) { case RXRPC_UNBOUND: rx->sk.sk_state = RXRPC_CLIENT_UNBOUND; break; case RXRPC_CLIENT_UNBOUND: case RXRPC_CLIENT_BOUND: break; default: ret = -EBUSY; goto error; } rx->connect_srx = *srx; set_bit(RXRPC_SOCK_CONNECTED, &rx->flags); ret = 0; error: release_sock(&rx->sk); return ret; } /* * send a message through an RxRPC socket * - in a client this does a number of things: * - finds/sets up a connection for the security specified (if any) * - initiates a call (ID in control data) * - ends the request phase of a call (if MSG_MORE is not set) * - sends a call data packet * - may send an abort (abort code in control data) */ static int rxrpc_sendmsg(struct socket *sock, struct msghdr *m, size_t len) { struct rxrpc_local *local; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); int ret; _enter(",{%d},,%zu", rx->sk.sk_state, len); if (m->msg_flags & MSG_OOB) return -EOPNOTSUPP; if (m->msg_name) { ret = rxrpc_validate_address(rx, m->msg_name, m->msg_namelen); if (ret < 0) { _leave(" = %d [bad addr]", ret); return ret; } } lock_sock(&rx->sk); switch (rx->sk.sk_state) { case RXRPC_UNBOUND: case RXRPC_CLIENT_UNBOUND: rx->srx.srx_family = AF_RXRPC; rx->srx.srx_service = 0; rx->srx.transport_type = SOCK_DGRAM; rx->srx.transport.family = rx->family; switch (rx->family) { case AF_INET: rx->srx.transport_len = sizeof(struct sockaddr_in); break; #ifdef CONFIG_AF_RXRPC_IPV6 case AF_INET6: rx->srx.transport_len = sizeof(struct sockaddr_in6); break; #endif default: ret = -EAFNOSUPPORT; goto error_unlock; } local = rxrpc_lookup_local(sock_net(sock->sk), &rx->srx); if (IS_ERR(local)) { ret = PTR_ERR(local); goto error_unlock; } rx->local = local; rx->sk.sk_state = RXRPC_CLIENT_BOUND; fallthrough; case RXRPC_CLIENT_BOUND: if (!m->msg_name && test_bit(RXRPC_SOCK_CONNECTED, &rx->flags)) { m->msg_name = &rx->connect_srx; m->msg_namelen = sizeof(rx->connect_srx); } fallthrough; case RXRPC_SERVER_BOUND: case RXRPC_SERVER_LISTENING: ret = rxrpc_do_sendmsg(rx, m, len); /* The socket has been unlocked */ goto out; default: ret = -EINVAL; goto error_unlock; } error_unlock: release_sock(&rx->sk); out: _leave(" = %d", ret); return ret; } int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val) { if (sk->sk_state != RXRPC_UNBOUND) return -EISCONN; if (val > RXRPC_SECURITY_MAX) return -EINVAL; lock_sock(sk); rxrpc_sk(sk)->min_sec_level = val; release_sock(sk); return 0; } EXPORT_SYMBOL(rxrpc_sock_set_min_security_level); /* * set RxRPC socket options */ static int rxrpc_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct rxrpc_sock *rx = rxrpc_sk(sock->sk); unsigned int min_sec_level; u16 service_upgrade[2]; int ret; _enter(",%d,%d,,%d", level, optname, optlen); lock_sock(&rx->sk); ret = -EOPNOTSUPP; if (level == SOL_RXRPC) { switch (optname) { case RXRPC_EXCLUSIVE_CONNECTION: ret = -EINVAL; if (optlen != 0) goto error; ret = -EISCONN; if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; rx->exclusive = true; goto success; case RXRPC_SECURITY_KEY: ret = -EINVAL; if (rx->key) goto error; ret = -EISCONN; if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; ret = rxrpc_request_key(rx, optval, optlen); goto error; case RXRPC_SECURITY_KEYRING: ret = -EINVAL; if (rx->key) goto error; ret = -EISCONN; if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; ret = rxrpc_server_keyring(rx, optval, optlen); goto error; case RXRPC_MIN_SECURITY_LEVEL: ret = -EINVAL; if (optlen != sizeof(unsigned int)) goto error; ret = -EISCONN; if (rx->sk.sk_state != RXRPC_UNBOUND) goto error; ret = copy_from_sockptr(&min_sec_level, optval, sizeof(unsigned int)); if (ret < 0) goto error; ret = -EINVAL; if (min_sec_level > RXRPC_SECURITY_MAX) goto error; rx->min_sec_level = min_sec_level; goto success; case RXRPC_UPGRADEABLE_SERVICE: ret = -EINVAL; if (optlen != sizeof(service_upgrade) || rx->service_upgrade.from != 0) goto error; ret = -EISCONN; if (rx->sk.sk_state != RXRPC_SERVER_BOUND2) goto error; ret = -EFAULT; if (copy_from_sockptr(service_upgrade, optval, sizeof(service_upgrade)) != 0) goto error; ret = -EINVAL; if ((service_upgrade[0] != rx->srx.srx_service || service_upgrade[1] != rx->second_service) && (service_upgrade[0] != rx->second_service || service_upgrade[1] != rx->srx.srx_service)) goto error; rx->service_upgrade.from = service_upgrade[0]; rx->service_upgrade.to = service_upgrade[1]; goto success; default: break; } } success: ret = 0; error: release_sock(&rx->sk); return ret; } /* * Get socket options. */ static int rxrpc_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *_optlen) { int optlen; if (level != SOL_RXRPC) return -EOPNOTSUPP; if (get_user(optlen, _optlen)) return -EFAULT; switch (optname) { case RXRPC_SUPPORTED_CMSG: if (optlen < sizeof(int)) return -ETOOSMALL; if (put_user(RXRPC__SUPPORTED - 1, (int __user *)optval) || put_user(sizeof(int), _optlen)) return -EFAULT; return 0; default: return -EOPNOTSUPP; } } /* * permit an RxRPC socket to be polled */ static __poll_t rxrpc_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct rxrpc_sock *rx = rxrpc_sk(sk); __poll_t mask; sock_poll_wait(file, sock, wait); mask = 0; /* the socket is readable if there are any messages waiting on the Rx * queue */ if (!list_empty(&rx->recvmsg_q)) mask |= EPOLLIN | EPOLLRDNORM; /* the socket is writable if there is space to add new data to the * socket; there is no guarantee that any particular call in progress * on the socket may have space in the Tx ACK window */ if (rxrpc_writable(sk)) mask |= EPOLLOUT | EPOLLWRNORM; return mask; } /* * create an RxRPC socket */ static int rxrpc_create(struct net *net, struct socket *sock, int protocol, int kern) { struct rxrpc_net *rxnet; struct rxrpc_sock *rx; struct sock *sk; _enter("%p,%d", sock, protocol); /* we support transport protocol UDP/UDP6 only */ if (protocol != PF_INET && IS_ENABLED(CONFIG_AF_RXRPC_IPV6) && protocol != PF_INET6) return -EPROTONOSUPPORT; if (sock->type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; sock->ops = &rxrpc_rpc_ops; sock->state = SS_UNCONNECTED; sk = sk_alloc(net, PF_RXRPC, GFP_KERNEL, &rxrpc_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sock_set_flag(sk, SOCK_RCU_FREE); sk->sk_state = RXRPC_UNBOUND; sk->sk_write_space = rxrpc_write_space; sk->sk_max_ack_backlog = 0; sk->sk_destruct = rxrpc_sock_destructor; rx = rxrpc_sk(sk); rx->family = protocol; rx->calls = RB_ROOT; spin_lock_init(&rx->incoming_lock); INIT_LIST_HEAD(&rx->sock_calls); INIT_LIST_HEAD(&rx->to_be_accepted); INIT_LIST_HEAD(&rx->recvmsg_q); spin_lock_init(&rx->recvmsg_lock); rwlock_init(&rx->call_lock); memset(&rx->srx, 0, sizeof(rx->srx)); rxnet = rxrpc_net(sock_net(&rx->sk)); timer_reduce(&rxnet->peer_keepalive_timer, jiffies + 1); _leave(" = 0 [%p]", rx); return 0; } /* * Kill all the calls on a socket and shut it down. */ static int rxrpc_shutdown(struct socket *sock, int flags) { struct sock *sk = sock->sk; struct rxrpc_sock *rx = rxrpc_sk(sk); int ret = 0; _enter("%p,%d", sk, flags); if (flags != SHUT_RDWR) return -EOPNOTSUPP; if (sk->sk_state == RXRPC_CLOSE) return -ESHUTDOWN; lock_sock(sk); if (sk->sk_state < RXRPC_CLOSE) { sk->sk_state = RXRPC_CLOSE; sk->sk_shutdown = SHUTDOWN_MASK; } else { ret = -ESHUTDOWN; } rxrpc_discard_prealloc(rx); release_sock(sk); return ret; } /* * RxRPC socket destructor */ static void rxrpc_sock_destructor(struct sock *sk) { _enter("%p", sk); rxrpc_purge_queue(&sk->sk_receive_queue); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); WARN_ON(!sk_unhashed(sk)); WARN_ON(sk->sk_socket); if (!sock_flag(sk, SOCK_DEAD)) { printk("Attempt to release alive rxrpc socket: %p\n", sk); return; } } /* * release an RxRPC socket */ static int rxrpc_release_sock(struct sock *sk) { struct rxrpc_sock *rx = rxrpc_sk(sk); _enter("%p{%d,%d}", sk, sk->sk_state, refcount_read(&sk->sk_refcnt)); /* declare the socket closed for business */ sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; /* We want to kill off all connections from a service socket * as fast as possible because we can't share these; client * sockets, on the other hand, can share an endpoint. */ switch (sk->sk_state) { case RXRPC_SERVER_BOUND: case RXRPC_SERVER_BOUND2: case RXRPC_SERVER_LISTENING: case RXRPC_SERVER_LISTEN_DISABLED: rx->local->service_closed = true; break; } sk->sk_state = RXRPC_CLOSE; if (rx->local && rx->local->service == rx) { write_lock(&rx->local->services_lock); rx->local->service = NULL; write_unlock(&rx->local->services_lock); } /* try to flush out this socket */ rxrpc_discard_prealloc(rx); rxrpc_release_calls_on_socket(rx); flush_workqueue(rxrpc_workqueue); rxrpc_purge_queue(&sk->sk_receive_queue); rxrpc_unuse_local(rx->local, rxrpc_local_unuse_release_sock); rxrpc_put_local(rx->local, rxrpc_local_put_release_sock); rx->local = NULL; key_put(rx->key); rx->key = NULL; key_put(rx->securities); rx->securities = NULL; sock_put(sk); _leave(" = 0"); return 0; } /* * release an RxRPC BSD socket on close() or equivalent */ static int rxrpc_release(struct socket *sock) { struct sock *sk = sock->sk; _enter("%p{%p}", sock, sk); if (!sk) return 0; sock->sk = NULL; return rxrpc_release_sock(sk); } /* * RxRPC network protocol */ static const struct proto_ops rxrpc_rpc_ops = { .family = PF_RXRPC, .owner = THIS_MODULE, .release = rxrpc_release, .bind = rxrpc_bind, .connect = rxrpc_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = rxrpc_poll, .ioctl = sock_no_ioctl, .listen = rxrpc_listen, .shutdown = rxrpc_shutdown, .setsockopt = rxrpc_setsockopt, .getsockopt = rxrpc_getsockopt, .sendmsg = rxrpc_sendmsg, .recvmsg = rxrpc_recvmsg, .mmap = sock_no_mmap, }; static struct proto rxrpc_proto = { .name = "RXRPC", .owner = THIS_MODULE, .obj_size = sizeof(struct rxrpc_sock), .max_header = sizeof(struct rxrpc_wire_header), }; static const struct net_proto_family rxrpc_family_ops = { .family = PF_RXRPC, .create = rxrpc_create, .owner = THIS_MODULE, }; /* * initialise and register the RxRPC protocol */ static int __init af_rxrpc_init(void) { int ret = -1; BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > sizeof_field(struct sk_buff, cb)); ret = -ENOMEM; rxrpc_gen_version_string(); rxrpc_call_jar = kmem_cache_create( "rxrpc_call_jar", sizeof(struct rxrpc_call), 0, SLAB_HWCACHE_ALIGN, NULL); if (!rxrpc_call_jar) { pr_notice("Failed to allocate call jar\n"); goto error_call_jar; } rxrpc_workqueue = alloc_ordered_workqueue("krxrpcd", WQ_HIGHPRI | WQ_MEM_RECLAIM); if (!rxrpc_workqueue) { pr_notice("Failed to allocate work queue\n"); goto error_work_queue; } ret = rxrpc_init_security(); if (ret < 0) { pr_crit("Cannot initialise security\n"); goto error_security; } ret = register_pernet_device(&rxrpc_net_ops); if (ret) goto error_pernet; ret = proto_register(&rxrpc_proto, 1); if (ret < 0) { pr_crit("Cannot register protocol\n"); goto error_proto; } ret = sock_register(&rxrpc_family_ops); if (ret < 0) { pr_crit("Cannot register socket family\n"); goto error_sock; } ret = register_key_type(&key_type_rxrpc); if (ret < 0) { pr_crit("Cannot register client key type\n"); goto error_key_type; } ret = register_key_type(&key_type_rxrpc_s); if (ret < 0) { pr_crit("Cannot register server key type\n"); goto error_key_type_s; } ret = rxrpc_sysctl_init(); if (ret < 0) { pr_crit("Cannot register sysctls\n"); goto error_sysctls; } return 0; error_sysctls: unregister_key_type(&key_type_rxrpc_s); error_key_type_s: unregister_key_type(&key_type_rxrpc); error_key_type: sock_unregister(PF_RXRPC); error_sock: proto_unregister(&rxrpc_proto); error_proto: unregister_pernet_device(&rxrpc_net_ops); error_pernet: rxrpc_exit_security(); error_security: destroy_workqueue(rxrpc_workqueue); error_work_queue: kmem_cache_destroy(rxrpc_call_jar); error_call_jar: return ret; } /* * unregister the RxRPC protocol */ static void __exit af_rxrpc_exit(void) { _enter(""); rxrpc_sysctl_exit(); unregister_key_type(&key_type_rxrpc_s); unregister_key_type(&key_type_rxrpc); sock_unregister(PF_RXRPC); proto_unregister(&rxrpc_proto); unregister_pernet_device(&rxrpc_net_ops); ASSERTCMP(atomic_read(&rxrpc_n_rx_skbs), ==, 0); /* Make sure the local and peer records pinned by any dying connections * are released. */ rcu_barrier(); destroy_workqueue(rxrpc_workqueue); rxrpc_exit_security(); kmem_cache_destroy(rxrpc_call_jar); _leave(""); } module_init(af_rxrpc_init); module_exit(af_rxrpc_exit);
1761 753 4101 3815 3820 1 205 204 77 2 188 282 648 2 2 233 231 1 2240 826 88 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM writeback #if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_WRITEBACK_H #include <linux/tracepoint.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #define show_inode_state(state) \ __print_flags(state, "|", \ {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \ {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \ {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \ {I_NEW, "I_NEW"}, \ {I_WILL_FREE, "I_WILL_FREE"}, \ {I_FREEING, "I_FREEING"}, \ {I_CLEAR, "I_CLEAR"}, \ {I_SYNC, "I_SYNC"}, \ {I_DIRTY_TIME, "I_DIRTY_TIME"}, \ {I_REFERENCED, "I_REFERENCED"} \ ) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a,b) TRACE_DEFINE_ENUM(a); #define EMe(a,b) TRACE_DEFINE_ENUM(a); #define WB_WORK_REASON \ EM( WB_REASON_BACKGROUND, "background") \ EM( WB_REASON_VMSCAN, "vmscan") \ EM( WB_REASON_SYNC, "sync") \ EM( WB_REASON_PERIODIC, "periodic") \ EM( WB_REASON_LAPTOP_TIMER, "laptop_timer") \ EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \ EM( WB_REASON_FORKER_THREAD, "forker_thread") \ EMe(WB_REASON_FOREIGN_FLUSH, "foreign_flush") WB_WORK_REASON /* * Now redefine the EM() and EMe() macros to map the enums to the strings * that will be printed in the output. */ #undef EM #undef EMe #define EM(a,b) { a, b }, #define EMe(a,b) { a, b } struct wb_writeback_work; DECLARE_EVENT_CLASS(writeback_folio_template, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(pgoff_t, index) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(mapping ? inode_to_bdi(mapping->host) : NULL), 32); __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0; __entry->index = folio->index; ), TP_printk("bdi %s: ino=%lu index=%lu", __entry->name, (unsigned long)__entry->ino, __entry->index ) ); DEFINE_EVENT(writeback_folio_template, writeback_dirty_folio, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping) ); DEFINE_EVENT(writeback_folio_template, folio_wait_writeback, TP_PROTO(struct folio *folio, struct address_space *mapping), TP_ARGS(folio, mapping) ); DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, flags) ), TP_fast_assign( struct backing_dev_info *bdi = inode_to_bdi(inode); /* may be called for files on pseudo FSes w/ unregistered bdi */ strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->flags = flags; ), TP_printk("bdi %s: ino=%lu state=%s flags=%s", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), show_inode_state(__entry->flags) ) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); #ifdef CREATE_TRACE_POINTS #ifdef CONFIG_CGROUP_WRITEBACK static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return cgroup_ino(wb->memcg_css->cgroup); } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { if (wbc->wb) return __trace_wb_assign_cgroup(wbc->wb); else return 1; } #else /* CONFIG_CGROUP_WRITEBACK */ static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return 1; } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { return 1; } #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* CREATE_TRACE_POINTS */ #ifdef CONFIG_CGROUP_WRITEBACK TRACE_EVENT(inode_foreign_history, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned int history), TP_ARGS(inode, wbc, history), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(ino_t, cgroup_ino) __field(unsigned int, history) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); __entry->history = history; ), TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x", __entry->name, (unsigned long)__entry->ino, (unsigned long)__entry->cgroup_ino, __entry->history ) ); TRACE_EVENT(inode_switch_wbs, TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb, struct bdi_writeback *new_wb), TP_ARGS(inode, old_wb, new_wb), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(ino_t, old_cgroup_ino) __field(ino_t, new_cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32); __entry->ino = inode->i_ino; __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb); __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); ), TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, (unsigned long)__entry->old_cgroup_ino, (unsigned long)__entry->new_cgroup_ino ) ); TRACE_EVENT(track_foreign_dirty, TP_PROTO(struct folio *folio, struct bdi_writeback *wb), TP_ARGS(folio, wb), TP_STRUCT__entry( __array(char, name, 32) __field(u64, bdi_id) __field(ino_t, ino) __field(unsigned int, memcg_id) __field(ino_t, cgroup_ino) __field(ino_t, page_cgroup_ino) ), TP_fast_assign( struct address_space *mapping = folio_mapping(folio); struct inode *inode = mapping ? mapping->host : NULL; strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->bdi_id = wb->bdi->id; __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->page_cgroup_ino = cgroup_ino(folio_memcg(folio)->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", __entry->name, __entry->bdi_id, (unsigned long)__entry->ino, __entry->memcg_id, (unsigned long)__entry->cgroup_ino, (unsigned long)__entry->page_cgroup_ino ) ); TRACE_EVENT(flush_foreign, TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id, unsigned int frn_memcg_id), TP_ARGS(wb, frn_bdi_id, frn_memcg_id), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, cgroup_ino) __field(unsigned int, frn_bdi_id) __field(unsigned int, frn_memcg_id) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->frn_bdi_id = frn_bdi_id; __entry->frn_memcg_id = frn_memcg_id; ), TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u", __entry->name, (unsigned long)__entry->cgroup_ino, __entry->frn_bdi_id, __entry->frn_memcg_id ) ); #endif DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(int, sync_mode) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, __entry->sync_mode, (unsigned long)__entry->cgroup_ino ) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DECLARE_EVENT_CLASS(writeback_work_class, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), TP_ARGS(wb, work), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_pages) __field(dev_t, sb_dev) __field(int, sync_mode) __field(int, for_kupdate) __field(int, range_cyclic) __field(int, for_background) __field(int, reason) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; __entry->for_kupdate = work->for_kupdate; __entry->range_cyclic = work->range_cyclic; __entry->for_background = work->for_background; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, __entry->sync_mode, __entry->for_kupdate, __entry->range_cyclic, __entry->for_background, __print_symbolic(__entry->reason, WB_WORK_REASON), (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ DEFINE_EVENT(writeback_work_class, name, \ TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \ TP_ARGS(wb, work)) DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); DEFINE_WRITEBACK_WORK_EVENT(writeback_start); DEFINE_WRITEBACK_WORK_EVENT(writeback_written); DEFINE_WRITEBACK_WORK_EVENT(writeback_wait); TRACE_EVENT(writeback_pages_written, TP_PROTO(long pages_written), TP_ARGS(pages_written), TP_STRUCT__entry( __field(long, pages) ), TP_fast_assign( __entry->pages = pages_written; ), TP_printk("%ld", __entry->pages) ); DECLARE_EVENT_CLASS(writeback_class, TP_PROTO(struct bdi_writeback *wb), TP_ARGS(wb), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: cgroup_ino=%lu", __entry->name, (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_EVENT(name) \ DEFINE_EVENT(writeback_class, name, \ TP_PROTO(struct bdi_writeback *wb), \ TP_ARGS(wb)) DEFINE_WRITEBACK_EVENT(writeback_wake_background); TRACE_EVENT(writeback_bdi_register, TP_PROTO(struct backing_dev_info *bdi), TP_ARGS(bdi), TP_STRUCT__entry( __array(char, name, 32) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); ), TP_printk("bdi %s", __entry->name ) ); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), TP_ARGS(wbc, bdi), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_to_write) __field(long, pages_skipped) __field(int, sync_mode) __field(int, for_kupdate) __field(int, for_background) __field(int, for_reclaim) __field(int, range_cyclic) __field(long, range_start) __field(long, range_end) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->sync_mode = wbc->sync_mode; __entry->for_kupdate = wbc->for_kupdate; __entry->for_background = wbc->for_background; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; __entry->range_start = (long)wbc->range_start; __entry->range_end = (long)wbc->range_end; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " "bgrd=%d reclm=%d cyclic=%d " "start=0x%lx end=0x%lx cgroup_ino=%lu", __entry->name, __entry->nr_to_write, __entry->pages_skipped, __entry->sync_mode, __entry->for_kupdate, __entry->for_background, __entry->for_reclaim, __entry->range_cyclic, __entry->range_start, __entry->range_end, (unsigned long)__entry->cgroup_ino ) ) #define DEFINE_WBC_EVENT(name) \ DEFINE_EVENT(wbc_class, name, \ TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ TP_ARGS(wbc, bdi)) DEFINE_WBC_EVENT(wbc_writepage); TRACE_EVENT(writeback_queue_io, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work, unsigned long dirtied_before, int moved), TP_ARGS(wb, work, dirtied_before, moved), TP_STRUCT__entry( __array(char, name, 32) __field(unsigned long, older) __field(long, age) __field(int, moved) __field(int, reason) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->older = dirtied_before; __entry->age = (jiffies - dirtied_before) * 1000 / HZ; __entry->moved = moved; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu", __entry->name, __entry->older, /* dirtied_before in jiffies */ __entry->age, /* dirtied_before in relative milliseconds */ __entry->moved, __print_symbolic(__entry->reason, WB_WORK_REASON), (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(global_dirty_state, TP_PROTO(unsigned long background_thresh, unsigned long dirty_thresh ), TP_ARGS(background_thresh, dirty_thresh ), TP_STRUCT__entry( __field(unsigned long, nr_dirty) __field(unsigned long, nr_writeback) __field(unsigned long, background_thresh) __field(unsigned long, dirty_thresh) __field(unsigned long, dirty_limit) __field(unsigned long, nr_dirtied) __field(unsigned long, nr_written) ), TP_fast_assign( __entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY); __entry->nr_writeback = global_node_page_state(NR_WRITEBACK); __entry->nr_dirtied = global_node_page_state(NR_DIRTIED); __entry->nr_written = global_node_page_state(NR_WRITTEN); __entry->background_thresh = background_thresh; __entry->dirty_thresh = dirty_thresh; __entry->dirty_limit = global_wb_domain.dirty_limit; ), TP_printk("dirty=%lu writeback=%lu " "bg_thresh=%lu thresh=%lu limit=%lu " "dirtied=%lu written=%lu", __entry->nr_dirty, __entry->nr_writeback, __entry->background_thresh, __entry->dirty_thresh, __entry->dirty_limit, __entry->nr_dirtied, __entry->nr_written ) ); #define KBps(x) ((x) << (PAGE_SHIFT - 10)) TRACE_EVENT(bdi_dirty_ratelimit, TP_PROTO(struct bdi_writeback *wb, unsigned long dirty_rate, unsigned long task_ratelimit), TP_ARGS(wb, dirty_rate, task_ratelimit), TP_STRUCT__entry( __array(char, bdi, 32) __field(unsigned long, write_bw) __field(unsigned long, avg_write_bw) __field(unsigned long, dirty_rate) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned long, balanced_dirty_ratelimit) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->write_bw = KBps(wb->write_bandwidth); __entry->avg_write_bw = KBps(wb->avg_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->balanced_dirty_ratelimit = KBps(wb->balanced_dirty_ratelimit); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "balanced_dirty_ratelimit=%lu cgroup_ino=%lu", __entry->bdi, __entry->write_bw, /* write bandwidth */ __entry->avg_write_bw, /* avg write bandwidth */ __entry->dirty_rate, /* bdi dirty rate */ __entry->dirty_ratelimit, /* base ratelimit */ __entry->task_ratelimit, /* ratelimit with position control */ __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(balance_dirty_pages, TP_PROTO(struct bdi_writeback *wb, unsigned long thresh, unsigned long bg_thresh, unsigned long dirty, unsigned long bdi_thresh, unsigned long bdi_dirty, unsigned long dirty_ratelimit, unsigned long task_ratelimit, unsigned long dirtied, unsigned long period, long pause, unsigned long start_time), TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, dirty_ratelimit, task_ratelimit, dirtied, period, pause, start_time), TP_STRUCT__entry( __array( char, bdi, 32) __field(unsigned long, limit) __field(unsigned long, setpoint) __field(unsigned long, dirty) __field(unsigned long, bdi_setpoint) __field(unsigned long, bdi_dirty) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned int, dirtied) __field(unsigned int, dirtied_pause) __field(unsigned long, paused) __field( long, pause) __field(unsigned long, period) __field( long, think) __field(ino_t, cgroup_ino) ), TP_fast_assign( unsigned long freerun = (thresh + bg_thresh) / 2; strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->limit = global_wb_domain.dirty_limit; __entry->setpoint = (global_wb_domain.dirty_limit + freerun) / 2; __entry->dirty = dirty; __entry->bdi_setpoint = __entry->setpoint * bdi_thresh / (thresh + 1); __entry->bdi_dirty = bdi_dirty; __entry->dirty_ratelimit = KBps(dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; __entry->dirtied_pause = current->nr_dirtied_pause; __entry->think = current->dirty_paused_when == 0 ? 0 : (long)(jiffies - current->dirty_paused_when) * 1000/HZ; __entry->period = period * 1000 / HZ; __entry->pause = pause * 1000 / HZ; __entry->paused = (jiffies - start_time) * 1000 / HZ; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "limit=%lu setpoint=%lu dirty=%lu " "bdi_setpoint=%lu bdi_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu", __entry->bdi, __entry->limit, __entry->setpoint, __entry->dirty, __entry->bdi_setpoint, __entry->bdi_dirty, __entry->dirty_ratelimit, __entry->task_ratelimit, __entry->dirtied, __entry->dirtied_pause, __entry->paused, /* ms */ __entry->pause, /* ms */ __entry->period, /* ms */ __entry->think, /* ms */ (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(writeback_sb_inodes_requeue, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; __entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode)); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, (unsigned long)__entry->cgroup_ino ) ); DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write ), TP_ARGS(inode, wbc, nr_to_write), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(unsigned long, writeback_index) __field(long, nr_to_write) __field(unsigned long, wrote) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->nr_to_write = nr_to_write; __entry->wrote = nr_to_write - wbc->nr_to_write; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, __entry->writeback_index, __entry->nr_to_write, __entry->wrote, (unsigned long)__entry->cgroup_ino ) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DECLARE_EVENT_CLASS(writeback_inode_template, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field(unsigned long, state ) __field( __u16, mode ) __field(unsigned long, dirtied_when ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->mode = inode->i_mode; __entry->dirtied_when = inode->dirtied_when; ), TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long)__entry->ino, __entry->dirtied_when, show_inode_state(__entry->state), __entry->mode) ); DEFINE_EVENT(writeback_inode_template, writeback_lazytime, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, writeback_lazytime_iput, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, writeback_dirty_inode_enqueue, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); /* * Inode writeback list tracking. */ DEFINE_EVENT(writeback_inode_template, sb_mark_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
38138 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_KERNEL_FPU_INTERNAL_H #define __X86_KERNEL_FPU_INTERNAL_H extern struct fpstate init_fpstate; /* CPU feature check wrappers */ static __always_inline __pure bool use_xsave(void) { return cpu_feature_enabled(X86_FEATURE_XSAVE); } static __always_inline __pure bool use_fxsr(void) { return cpu_feature_enabled(X86_FEATURE_FXSR); } #ifdef CONFIG_X86_DEBUG_FPU # define WARN_ON_FPU(x) WARN_ON_ONCE(x) #else # define WARN_ON_FPU(x) ({ (void)(x); 0; }) #endif /* Used in init.c */ extern void fpstate_init_user(struct fpstate *fpstate); extern void fpstate_reset(struct fpu *fpu); #endif
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 // SPDX-License-Identifier: GPL-2.0 /* MPTCP socket monitoring support * * Copyright (c) 2019 Red Hat * * Author: Davide Caratti <dcaratti@redhat.com> */ #include <linux/kernel.h> #include <linux/net.h> #include <linux/inet_diag.h> #include <net/netlink.h> #include <uapi/linux/mptcp.h> #include "protocol.h" static int subflow_get_info(const struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *sf; struct nlattr *start; u32 flags = 0; int err; start = nla_nest_start_noflag(skb, INET_ULP_INFO_MPTCP); if (!start) return -EMSGSIZE; rcu_read_lock(); sf = rcu_dereference(inet_csk(sk)->icsk_ulp_data); if (!sf) { err = 0; goto nla_failure; } if (sf->mp_capable) flags |= MPTCP_SUBFLOW_FLAG_MCAP_REM; if (sf->request_mptcp) flags |= MPTCP_SUBFLOW_FLAG_MCAP_LOC; if (sf->mp_join) flags |= MPTCP_SUBFLOW_FLAG_JOIN_REM; if (sf->request_join) flags |= MPTCP_SUBFLOW_FLAG_JOIN_LOC; if (sf->backup) flags |= MPTCP_SUBFLOW_FLAG_BKUP_REM; if (sf->request_bkup) flags |= MPTCP_SUBFLOW_FLAG_BKUP_LOC; if (sf->fully_established) flags |= MPTCP_SUBFLOW_FLAG_FULLY_ESTABLISHED; if (sf->conn_finished) flags |= MPTCP_SUBFLOW_FLAG_CONNECTED; if (sf->map_valid) flags |= MPTCP_SUBFLOW_FLAG_MAPVALID; if (nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_REM, sf->remote_token) || nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_TOKEN_LOC, sf->token) || nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ, sf->rel_write_seq) || nla_put_u64_64bit(skb, MPTCP_SUBFLOW_ATTR_MAP_SEQ, sf->map_seq, MPTCP_SUBFLOW_ATTR_PAD) || nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_MAP_SFSEQ, sf->map_subflow_seq) || nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_SSN_OFFSET, sf->ssn_offset) || nla_put_u16(skb, MPTCP_SUBFLOW_ATTR_MAP_DATALEN, sf->map_data_len) || nla_put_u32(skb, MPTCP_SUBFLOW_ATTR_FLAGS, flags) || nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_REM, sf->remote_id) || nla_put_u8(skb, MPTCP_SUBFLOW_ATTR_ID_LOC, sf->local_id)) { err = -EMSGSIZE; goto nla_failure; } rcu_read_unlock(); nla_nest_end(skb, start); return 0; nla_failure: rcu_read_unlock(); nla_nest_cancel(skb, start); return err; } static size_t subflow_get_info_size(const struct sock *sk) { size_t size = 0; size += nla_total_size(0) + /* INET_ULP_INFO_MPTCP */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_REM */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_TOKEN_LOC */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_RELWRITE_SEQ */ nla_total_size_64bit(8) + /* MPTCP_SUBFLOW_ATTR_MAP_SEQ */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_MAP_SFSEQ */ nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_SSN_OFFSET */ nla_total_size(2) + /* MPTCP_SUBFLOW_ATTR_MAP_DATALEN */ nla_total_size(4) + /* MPTCP_SUBFLOW_ATTR_FLAGS */ nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_REM */ nla_total_size(1) + /* MPTCP_SUBFLOW_ATTR_ID_LOC */ 0; return size; } void mptcp_diag_subflow_init(struct tcp_ulp_ops *ops) { ops->get_info = subflow_get_info; ops->get_info_size = subflow_get_info_size; }
2345 2345 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/pm_qos.h> static inline void device_pm_init_common(struct device *dev) { if (!dev->power.early_init) { spin_lock_init(&dev->power.lock); dev->power.qos = NULL; dev->power.early_init = true; } } #ifdef CONFIG_PM static inline void pm_runtime_early_init(struct device *dev) { dev->power.disable_depth = 1; device_pm_init_common(dev); } extern void pm_runtime_init(struct device *dev); extern void pm_runtime_reinit(struct device *dev); extern void pm_runtime_remove(struct device *dev); extern u64 pm_runtime_active_time(struct device *dev); #define WAKE_IRQ_DEDICATED_ALLOCATED BIT(0) #define WAKE_IRQ_DEDICATED_MANAGED BIT(1) #define WAKE_IRQ_DEDICATED_REVERSE BIT(2) #define WAKE_IRQ_DEDICATED_MASK (WAKE_IRQ_DEDICATED_ALLOCATED | \ WAKE_IRQ_DEDICATED_MANAGED | \ WAKE_IRQ_DEDICATED_REVERSE) #define WAKE_IRQ_DEDICATED_ENABLED BIT(3) struct wake_irq { struct device *dev; unsigned int status; int irq; const char *name; }; extern void dev_pm_arm_wake_irq(struct wake_irq *wirq); extern void dev_pm_disarm_wake_irq(struct wake_irq *wirq); extern void dev_pm_enable_wake_irq_check(struct device *dev, bool can_change_status); extern void dev_pm_disable_wake_irq_check(struct device *dev, bool cond_disable); extern void dev_pm_enable_wake_irq_complete(struct device *dev); #ifdef CONFIG_PM_SLEEP extern void device_wakeup_attach_irq(struct device *dev, struct wake_irq *wakeirq); extern void device_wakeup_detach_irq(struct device *dev); extern void device_wakeup_arm_wake_irqs(void); extern void device_wakeup_disarm_wake_irqs(void); #else static inline void device_wakeup_attach_irq(struct device *dev, struct wake_irq *wakeirq) {} static inline void device_wakeup_detach_irq(struct device *dev) { } #endif /* CONFIG_PM_SLEEP */ /* * sysfs.c */ extern int dpm_sysfs_add(struct device *dev); extern void dpm_sysfs_remove(struct device *dev); extern void rpm_sysfs_remove(struct device *dev); extern int wakeup_sysfs_add(struct device *dev); extern void wakeup_sysfs_remove(struct device *dev); extern int pm_qos_sysfs_add_resume_latency(struct device *dev); extern void pm_qos_sysfs_remove_resume_latency(struct device *dev); extern int pm_qos_sysfs_add_flags(struct device *dev); extern void pm_qos_sysfs_remove_flags(struct device *dev); extern int pm_qos_sysfs_add_latency_tolerance(struct device *dev); extern void pm_qos_sysfs_remove_latency_tolerance(struct device *dev); extern int dpm_sysfs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid); #else /* CONFIG_PM */ static inline void pm_runtime_early_init(struct device *dev) { device_pm_init_common(dev); } static inline void pm_runtime_init(struct device *dev) {} static inline void pm_runtime_reinit(struct device *dev) {} static inline void pm_runtime_remove(struct device *dev) {} static inline int dpm_sysfs_add(struct device *dev) { return 0; } static inline void dpm_sysfs_remove(struct device *dev) {} static inline int dpm_sysfs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid) { return 0; } #endif #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ extern int pm_async_enabled; /* drivers/base/power/main.c */ extern struct list_head dpm_list; /* The active device list */ static inline struct device *to_device(struct list_head *entry) { return container_of(entry, struct device, power.entry); } extern void device_pm_sleep_init(struct device *dev); extern void device_pm_add(struct device *); extern void device_pm_remove(struct device *); extern void device_pm_move_before(struct device *, struct device *); extern void device_pm_move_after(struct device *, struct device *); extern void device_pm_move_last(struct device *); extern void device_pm_check_callbacks(struct device *dev); static inline bool device_pm_initialized(struct device *dev) { return dev->power.in_dpm_list; } /* drivers/base/power/wakeup_stats.c */ extern int wakeup_source_sysfs_add(struct device *parent, struct wakeup_source *ws); extern void wakeup_source_sysfs_remove(struct wakeup_source *ws); extern int pm_wakeup_source_sysfs_add(struct device *parent); #else /* !CONFIG_PM_SLEEP */ static inline void device_pm_sleep_init(struct device *dev) {} static inline void device_pm_add(struct device *dev) {} static inline void device_pm_remove(struct device *dev) { pm_runtime_remove(dev); } static inline void device_pm_move_before(struct device *deva, struct device *devb) {} static inline void device_pm_move_after(struct device *deva, struct device *devb) {} static inline void device_pm_move_last(struct device *dev) {} static inline void device_pm_check_callbacks(struct device *dev) {} static inline bool device_pm_initialized(struct device *dev) { return device_is_registered(dev); } static inline int pm_wakeup_source_sysfs_add(struct device *parent) { return 0; } #endif /* !CONFIG_PM_SLEEP */ static inline void device_pm_init(struct device *dev) { device_pm_init_common(dev); device_pm_sleep_init(dev); pm_runtime_init(dev); }
6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/wait.h> #include <linux/writeback.h> #include <linux/iversion.h> #include <linux/filelock.h> #include "super.h" #include "mds_client.h" #include "cache.h" #include "crypto.h" #include <linux/ceph/decode.h> #include <linux/ceph/messenger.h> /* * Capability management * * The Ceph metadata servers control client access to inode metadata * and file data by issuing capabilities, granting clients permission * to read and/or write both inode field and file data to OSDs * (storage nodes). Each capability consists of a set of bits * indicating which operations are allowed. * * If the client holds a *_SHARED cap, the client has a coherent value * that can be safely read from the cached inode. * * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the * client is allowed to change inode attributes (e.g., file size, * mtime), note its dirty state in the ceph_cap, and asynchronously * flush that metadata change to the MDS. * * In the event of a conflicting operation (perhaps by another * client), the MDS will revoke the conflicting client capabilities. * * In order for a client to cache an inode, it must hold a capability * with at least one MDS server. When inodes are released, release * notifications are batched and periodically sent en masse to the MDS * cluster to release server state. */ static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc); static void __kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_inode_info *ci, u64 oldest_flush_tid); /* * Generate readable cap strings for debugging output. */ #define MAX_CAP_STR 20 static char cap_str[MAX_CAP_STR][40]; static DEFINE_SPINLOCK(cap_str_lock); static int last_cap_str; static char *gcap_string(char *s, int c) { if (c & CEPH_CAP_GSHARED) *s++ = 's'; if (c & CEPH_CAP_GEXCL) *s++ = 'x'; if (c & CEPH_CAP_GCACHE) *s++ = 'c'; if (c & CEPH_CAP_GRD) *s++ = 'r'; if (c & CEPH_CAP_GWR) *s++ = 'w'; if (c & CEPH_CAP_GBUFFER) *s++ = 'b'; if (c & CEPH_CAP_GWREXTEND) *s++ = 'a'; if (c & CEPH_CAP_GLAZYIO) *s++ = 'l'; return s; } const char *ceph_cap_string(int caps) { int i; char *s; int c; spin_lock(&cap_str_lock); i = last_cap_str++; if (last_cap_str == MAX_CAP_STR) last_cap_str = 0; spin_unlock(&cap_str_lock); s = cap_str[i]; if (caps & CEPH_CAP_PIN) *s++ = 'p'; c = (caps >> CEPH_CAP_SAUTH) & 3; if (c) { *s++ = 'A'; s = gcap_string(s, c); } c = (caps >> CEPH_CAP_SLINK) & 3; if (c) { *s++ = 'L'; s = gcap_string(s, c); } c = (caps >> CEPH_CAP_SXATTR) & 3; if (c) { *s++ = 'X'; s = gcap_string(s, c); } c = caps >> CEPH_CAP_SFILE; if (c) { *s++ = 'F'; s = gcap_string(s, c); } if (s == cap_str[i]) *s++ = '-'; *s = 0; return cap_str[i]; } void ceph_caps_init(struct ceph_mds_client *mdsc) { INIT_LIST_HEAD(&mdsc->caps_list); spin_lock_init(&mdsc->caps_list_lock); } void ceph_caps_finalize(struct ceph_mds_client *mdsc) { struct ceph_cap *cap; spin_lock(&mdsc->caps_list_lock); while (!list_empty(&mdsc->caps_list)) { cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); list_del(&cap->caps_item); kmem_cache_free(ceph_cap_cachep, cap); } mdsc->caps_total_count = 0; mdsc->caps_avail_count = 0; mdsc->caps_use_count = 0; mdsc->caps_reserve_count = 0; mdsc->caps_min_count = 0; spin_unlock(&mdsc->caps_list_lock); } void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, struct ceph_mount_options *fsopt) { spin_lock(&mdsc->caps_list_lock); mdsc->caps_min_count = fsopt->max_readdir; if (mdsc->caps_min_count < 1024) mdsc->caps_min_count = 1024; mdsc->caps_use_max = fsopt->caps_max; if (mdsc->caps_use_max > 0 && mdsc->caps_use_max < mdsc->caps_min_count) mdsc->caps_use_max = mdsc->caps_min_count; spin_unlock(&mdsc->caps_list_lock); } static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps) { struct ceph_cap *cap; int i; if (nr_caps) { BUG_ON(mdsc->caps_reserve_count < nr_caps); mdsc->caps_reserve_count -= nr_caps; if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + mdsc->caps_min_count) { mdsc->caps_total_count -= nr_caps; for (i = 0; i < nr_caps; i++) { cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); list_del(&cap->caps_item); kmem_cache_free(ceph_cap_cachep, cap); } } else { mdsc->caps_avail_count += nr_caps; } doutc(mdsc->fsc->client, "caps %d = %d used + %d resv + %d avail\n", mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); } } /* * Called under mdsc->mutex. */ int ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need) { struct ceph_client *cl = mdsc->fsc->client; int i, j; struct ceph_cap *cap; int have; int alloc = 0; int max_caps; int err = 0; bool trimmed = false; struct ceph_mds_session *s; LIST_HEAD(newcaps); doutc(cl, "ctx=%p need=%d\n", ctx, need); /* first reserve any caps that are already allocated */ spin_lock(&mdsc->caps_list_lock); if (mdsc->caps_avail_count >= need) have = need; else have = mdsc->caps_avail_count; mdsc->caps_avail_count -= have; mdsc->caps_reserve_count += have; BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); spin_unlock(&mdsc->caps_list_lock); for (i = have; i < need; ) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); if (cap) { list_add(&cap->caps_item, &newcaps); alloc++; i++; continue; } if (!trimmed) { for (j = 0; j < mdsc->max_sessions; j++) { s = __ceph_lookup_mds_session(mdsc, j); if (!s) continue; mutex_unlock(&mdsc->mutex); mutex_lock(&s->s_mutex); max_caps = s->s_nr_caps - (need - i); ceph_trim_caps(mdsc, s, max_caps); mutex_unlock(&s->s_mutex); ceph_put_mds_session(s); mutex_lock(&mdsc->mutex); } trimmed = true; spin_lock(&mdsc->caps_list_lock); if (mdsc->caps_avail_count) { int more_have; if (mdsc->caps_avail_count >= need - i) more_have = need - i; else more_have = mdsc->caps_avail_count; i += more_have; have += more_have; mdsc->caps_avail_count -= more_have; mdsc->caps_reserve_count += more_have; } spin_unlock(&mdsc->caps_list_lock); continue; } pr_warn_client(cl, "ctx=%p ENOMEM need=%d got=%d\n", ctx, need, have + alloc); err = -ENOMEM; break; } if (!err) { BUG_ON(have + alloc != need); ctx->count = need; ctx->used = 0; } spin_lock(&mdsc->caps_list_lock); mdsc->caps_total_count += alloc; mdsc->caps_reserve_count += alloc; list_splice(&newcaps, &mdsc->caps_list); BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); if (err) __ceph_unreserve_caps(mdsc, have + alloc); spin_unlock(&mdsc->caps_list_lock); doutc(cl, "ctx=%p %d = %d used + %d resv + %d avail\n", ctx, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); return err; } void ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx) { struct ceph_client *cl = mdsc->fsc->client; bool reclaim = false; if (!ctx->count) return; doutc(cl, "ctx=%p count=%d\n", ctx, ctx->count); spin_lock(&mdsc->caps_list_lock); __ceph_unreserve_caps(mdsc, ctx->count); ctx->count = 0; if (mdsc->caps_use_max > 0 && mdsc->caps_use_count > mdsc->caps_use_max) reclaim = true; spin_unlock(&mdsc->caps_list_lock); if (reclaim) ceph_reclaim_caps_nr(mdsc, ctx->used); } struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap *cap = NULL; /* temporary, until we do something about cap import/export */ if (!ctx) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); if (cap) { spin_lock(&mdsc->caps_list_lock); mdsc->caps_use_count++; mdsc->caps_total_count++; spin_unlock(&mdsc->caps_list_lock); } else { spin_lock(&mdsc->caps_list_lock); if (mdsc->caps_avail_count) { BUG_ON(list_empty(&mdsc->caps_list)); mdsc->caps_avail_count--; mdsc->caps_use_count++; cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); list_del(&cap->caps_item); BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); } spin_unlock(&mdsc->caps_list_lock); } return cap; } spin_lock(&mdsc->caps_list_lock); doutc(cl, "ctx=%p (%d) %d = %d used + %d resv + %d avail\n", ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); BUG_ON(!ctx->count); BUG_ON(ctx->count > mdsc->caps_reserve_count); BUG_ON(list_empty(&mdsc->caps_list)); ctx->count--; ctx->used++; mdsc->caps_reserve_count--; mdsc->caps_use_count++; cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); list_del(&cap->caps_item); BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); spin_unlock(&mdsc->caps_list_lock); return cap; } void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) { struct ceph_client *cl = mdsc->fsc->client; spin_lock(&mdsc->caps_list_lock); doutc(cl, "%p %d = %d used + %d resv + %d avail\n", cap, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); mdsc->caps_use_count--; /* * Keep some preallocated caps around (ceph_min_count), to * avoid lots of free/alloc churn. */ if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + mdsc->caps_min_count) { mdsc->caps_total_count--; kmem_cache_free(ceph_cap_cachep, cap); } else { mdsc->caps_avail_count++; list_add(&cap->caps_item, &mdsc->caps_list); } BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); spin_unlock(&mdsc->caps_list_lock); } void ceph_reservation_status(struct ceph_fs_client *fsc, int *total, int *avail, int *used, int *reserved, int *min) { struct ceph_mds_client *mdsc = fsc->mdsc; spin_lock(&mdsc->caps_list_lock); if (total) *total = mdsc->caps_total_count; if (avail) *avail = mdsc->caps_avail_count; if (used) *used = mdsc->caps_use_count; if (reserved) *reserved = mdsc->caps_reserve_count; if (min) *min = mdsc->caps_min_count; spin_unlock(&mdsc->caps_list_lock); } /* * Find ceph_cap for given mds, if any. * * Called with i_ceph_lock held. */ struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) { struct ceph_cap *cap; struct rb_node *n = ci->i_caps.rb_node; while (n) { cap = rb_entry(n, struct ceph_cap, ci_node); if (mds < cap->mds) n = n->rb_left; else if (mds > cap->mds) n = n->rb_right; else return cap; } return NULL; } struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) { struct ceph_cap *cap; spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); spin_unlock(&ci->i_ceph_lock); return cap; } /* * Called under i_ceph_lock. */ static void __insert_cap_node(struct ceph_inode_info *ci, struct ceph_cap *new) { struct rb_node **p = &ci->i_caps.rb_node; struct rb_node *parent = NULL; struct ceph_cap *cap = NULL; while (*p) { parent = *p; cap = rb_entry(parent, struct ceph_cap, ci_node); if (new->mds < cap->mds) p = &(*p)->rb_left; else if (new->mds > cap->mds) p = &(*p)->rb_right; else BUG(); } rb_link_node(&new->ci_node, parent, p); rb_insert_color(&new->ci_node, &ci->i_caps); } /* * (re)set cap hold timeouts, which control the delayed release * of unused caps back to the MDS. Should be called on cap use. */ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; struct ceph_mount_options *opt = mdsc->fsc->mount_options; ci->i_hold_caps_max = round_jiffies(jiffies + opt->caps_wanted_delay_max * HZ); doutc(mdsc->fsc->client, "%p %llx.%llx %lu\n", inode, ceph_vinop(inode), ci->i_hold_caps_max - jiffies); } /* * (Re)queue cap at the end of the delayed cap release list. * * If I_FLUSH is set, leave the inode at the front of the list. * * Caller holds i_ceph_lock * -> we take mdsc->cap_delay_lock */ static void __cap_delay_requeue(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; doutc(mdsc->fsc->client, "%p %llx.%llx flags 0x%lx at %lu\n", inode, ceph_vinop(inode), ci->i_ceph_flags, ci->i_hold_caps_max); if (!mdsc->stopping) { spin_lock(&mdsc->cap_delay_lock); if (!list_empty(&ci->i_cap_delay_list)) { if (ci->i_ceph_flags & CEPH_I_FLUSH) goto no_change; list_del_init(&ci->i_cap_delay_list); } __cap_set_timeouts(mdsc, ci); list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); no_change: spin_unlock(&mdsc->cap_delay_lock); } } /* * Queue an inode for immediate writeback. Mark inode with I_FLUSH, * indicating we should send a cap message to flush dirty metadata * asap, and move to the front of the delayed cap list. */ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode)); spin_lock(&mdsc->cap_delay_lock); ci->i_ceph_flags |= CEPH_I_FLUSH; if (!list_empty(&ci->i_cap_delay_list)) list_del_init(&ci->i_cap_delay_list); list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list); spin_unlock(&mdsc->cap_delay_lock); } /* * Cancel delayed work on cap. * * Caller must hold i_ceph_lock. */ static void __cap_delay_cancel(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode)); if (list_empty(&ci->i_cap_delay_list)) return; spin_lock(&mdsc->cap_delay_lock); list_del_init(&ci->i_cap_delay_list); spin_unlock(&mdsc->cap_delay_lock); } /* Common issue checks for add_cap, handle_cap_grant. */ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, unsigned issued) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); unsigned had = __ceph_caps_issued(ci, NULL); lockdep_assert_held(&ci->i_ceph_lock); /* * Each time we receive FILE_CACHE anew, we increment * i_rdcache_gen. */ if (S_ISREG(ci->netfs.inode.i_mode) && (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { ci->i_rdcache_gen++; } /* * If FILE_SHARED is newly issued, mark dir not complete. We don't * know what happened to this directory while we didn't have the cap. * If FILE_SHARED is being revoked, also mark dir not complete. It * stops on-going cached readdir. */ if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) { if (issued & CEPH_CAP_FILE_SHARED) atomic_inc(&ci->i_shared_gen); if (S_ISDIR(ci->netfs.inode.i_mode)) { doutc(cl, " marking %p NOT complete\n", inode); __ceph_dir_clear_complete(ci); } } /* Wipe saved layout if we're losing DIR_CREATE caps */ if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && !(issued & CEPH_CAP_DIR_CREATE)) { ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); } } /** * change_auth_cap_ses - move inode to appropriate lists when auth caps change * @ci: inode to be moved * @session: new auth caps session */ void change_auth_cap_ses(struct ceph_inode_info *ci, struct ceph_mds_session *session) { lockdep_assert_held(&ci->i_ceph_lock); if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item)) return; spin_lock(&session->s_mdsc->cap_dirty_lock); if (!list_empty(&ci->i_dirty_item)) list_move(&ci->i_dirty_item, &session->s_cap_dirty); if (!list_empty(&ci->i_flushing_item)) list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); spin_unlock(&session->s_mdsc->cap_dirty_lock); } /* * Add a capability under the given MDS session. * * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock * * @fmode is the open file mode, if we are opening a file, otherwise * it is < 0. (This is so we can atomically add the cap and add an * open file reference to it.) */ void ceph_add_cap(struct inode *inode, struct ceph_mds_session *session, u64 cap_id, unsigned issued, unsigned wanted, unsigned seq, unsigned mseq, u64 realmino, int flags, struct ceph_cap **new_cap) { struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *cap; int mds = session->s_mds; int actual_wanted; u32 gen; lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "%p %llx.%llx mds%d cap %llx %s seq %d\n", inode, ceph_vinop(inode), session->s_mds, cap_id, ceph_cap_string(issued), seq); gen = atomic_read(&session->s_cap_gen); cap = __get_cap_for_mds(ci, mds); if (!cap) { cap = *new_cap; *new_cap = NULL; cap->issued = 0; cap->implemented = 0; cap->mds = mds; cap->mds_wanted = 0; cap->mseq = 0; cap->ci = ci; __insert_cap_node(ci, cap); /* add to session cap list */ cap->session = session; spin_lock(&session->s_cap_lock); list_add_tail(&cap->session_caps, &session->s_caps); session->s_nr_caps++; atomic64_inc(&mdsc->metric.total_caps); spin_unlock(&session->s_cap_lock); } else { spin_lock(&session->s_cap_lock); list_move_tail(&cap->session_caps, &session->s_caps); spin_unlock(&session->s_cap_lock); if (cap->cap_gen < gen) cap->issued = cap->implemented = CEPH_CAP_PIN; /* * auth mds of the inode changed. we received the cap export * message, but still haven't received the cap import message. * handle_cap_export() updated the new auth MDS' cap. * * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing * a message that was send before the cap import message. So * don't remove caps. */ if (ceph_seq_cmp(seq, cap->seq) <= 0) { WARN_ON(cap != ci->i_auth_cap); WARN_ON(cap->cap_id != cap_id); seq = cap->seq; mseq = cap->mseq; issued |= cap->issued; flags |= CEPH_CAP_FLAG_AUTH; } } if (!ci->i_snap_realm || ((flags & CEPH_CAP_FLAG_AUTH) && realmino != (u64)-1 && ci->i_snap_realm->ino != realmino)) { /* * add this inode to the appropriate snap realm */ struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, realmino); if (realm) ceph_change_snap_realm(inode, realm); else WARN(1, "%s: couldn't find snap realm 0x%llx (ino 0x%llx oldrealm 0x%llx)\n", __func__, realmino, ci->i_vino.ino, ci->i_snap_realm ? ci->i_snap_realm->ino : 0); } __check_cap_issue(ci, cap, issued); /* * If we are issued caps we don't want, or the mds' wanted * value appears to be off, queue a check so we'll release * later and/or update the mds wanted value. */ actual_wanted = __ceph_caps_wanted(ci); if ((wanted & ~actual_wanted) || (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) { doutc(cl, "issued %s, mds wanted %s, actual %s, queueing\n", ceph_cap_string(issued), ceph_cap_string(wanted), ceph_cap_string(actual_wanted)); __cap_delay_requeue(mdsc, ci); } if (flags & CEPH_CAP_FLAG_AUTH) { if (!ci->i_auth_cap || ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { if (ci->i_auth_cap && ci->i_auth_cap->session != cap->session) change_auth_cap_ses(ci, cap->session); ci->i_auth_cap = cap; cap->mds_wanted = wanted; } } else { WARN_ON(ci->i_auth_cap == cap); } doutc(cl, "inode %p %llx.%llx cap %p %s now %s seq %d mds%d\n", inode, ceph_vinop(inode), cap, ceph_cap_string(issued), ceph_cap_string(issued|cap->issued), seq, mds); cap->cap_id = cap_id; cap->issued = issued; cap->implemented |= issued; if (ceph_seq_cmp(mseq, cap->mseq) > 0) cap->mds_wanted = wanted; else cap->mds_wanted |= wanted; cap->seq = seq; cap->issue_seq = seq; cap->mseq = mseq; cap->cap_gen = gen; wake_up_all(&ci->i_cap_wq); } /* * Return true if cap has not timed out and belongs to the current * generation of the MDS session (i.e. has not gone 'stale' due to * us losing touch with the mds). */ static int __cap_is_valid(struct ceph_cap *cap) { struct inode *inode = &cap->ci->netfs.inode; struct ceph_client *cl = cap->session->s_mdsc->fsc->client; unsigned long ttl; u32 gen; gen = atomic_read(&cap->session->s_cap_gen); ttl = cap->session->s_cap_ttl; if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { doutc(cl, "%p %llx.%llx cap %p issued %s but STALE (gen %u vs %u)\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued), cap->cap_gen, gen); return 0; } return 1; } /* * Return set of valid cap bits issued to us. Note that caps time * out, and may be invalidated in bulk if the client session times out * and session->s_cap_gen is bumped. */ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); int have = ci->i_snap_caps; struct ceph_cap *cap; struct rb_node *p; if (implemented) *implemented = 0; for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; doutc(cl, "%p %llx.%llx cap %p issued %s\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued)); have |= cap->issued; if (implemented) *implemented |= cap->implemented; } /* * exclude caps issued by non-auth MDS, but are been revoking * by the auth MDS. The non-auth MDS should be revoking/exporting * these caps, but the message is delayed. */ if (ci->i_auth_cap) { cap = ci->i_auth_cap; have &= ~cap->implemented | cap->issued; } return have; } /* * Get cap bits issued by caps other than @ocap */ int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap) { int have = ci->i_snap_caps; struct ceph_cap *cap; struct rb_node *p; for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (cap == ocap) continue; if (!__cap_is_valid(cap)) continue; have |= cap->issued; } return have; } /* * Move a cap to the end of the LRU (oldest caps at list head, newest * at list tail). */ static void __touch_cap(struct ceph_cap *cap) { struct inode *inode = &cap->ci->netfs.inode; struct ceph_mds_session *s = cap->session; struct ceph_client *cl = s->s_mdsc->fsc->client; spin_lock(&s->s_cap_lock); if (!s->s_cap_iterator) { doutc(cl, "%p %llx.%llx cap %p mds%d\n", inode, ceph_vinop(inode), cap, s->s_mds); list_move_tail(&cap->session_caps, &s->s_caps); } else { doutc(cl, "%p %llx.%llx cap %p mds%d NOP, iterating over caps\n", inode, ceph_vinop(inode), cap, s->s_mds); } spin_unlock(&s->s_cap_lock); } /* * Check if we hold the given mask. If so, move the cap(s) to the * front of their respective LRUs. (This is the preferred way for * callers to check for caps they want.) */ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap *cap; struct rb_node *p; int have = ci->i_snap_caps; if ((have & mask) == mask) { doutc(cl, "mask %p %llx.%llx snap issued %s (mask %s)\n", inode, ceph_vinop(inode), ceph_cap_string(have), ceph_cap_string(mask)); return 1; } for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; if ((cap->issued & mask) == mask) { doutc(cl, "mask %p %llx.%llx cap %p issued %s (mask %s)\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) __touch_cap(cap); return 1; } /* does a combination of caps satisfy mask? */ have |= cap->issued; if ((have & mask) == mask) { doutc(cl, "mask %p %llx.%llx combo issued %s (mask %s)\n", inode, ceph_vinop(inode), ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) { struct rb_node *q; /* touch this + preceding caps */ __touch_cap(cap); for (q = rb_first(&ci->i_caps); q != p; q = rb_next(q)) { cap = rb_entry(q, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; if (cap->issued & mask) __touch_cap(cap); } } return 1; } } return 0; } int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, int touch) { struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb); int r; r = __ceph_caps_issued_mask(ci, mask, touch); if (r) ceph_update_cap_hit(&fsc->mdsc->metric); else ceph_update_cap_mis(&fsc->mdsc->metric); return r; } /* * Return true if mask caps are currently being revoked by an MDS. */ int __ceph_caps_revoking_other(struct ceph_inode_info *ci, struct ceph_cap *ocap, int mask) { struct ceph_cap *cap; struct rb_node *p; for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (cap != ocap && (cap->implemented & ~cap->issued & mask)) return 1; } return 0; } int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); int ret; spin_lock(&ci->i_ceph_lock); ret = __ceph_caps_revoking_other(ci, NULL, mask); spin_unlock(&ci->i_ceph_lock); doutc(cl, "%p %llx.%llx %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(mask), ret); return ret; } int __ceph_caps_used(struct ceph_inode_info *ci) { int used = 0; if (ci->i_pin_ref) used |= CEPH_CAP_PIN; if (ci->i_rd_ref) used |= CEPH_CAP_FILE_RD; if (ci->i_rdcache_ref || (S_ISREG(ci->netfs.inode.i_mode) && ci->netfs.inode.i_data.nrpages)) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; if (ci->i_wb_ref || ci->i_wrbuffer_ref) used |= CEPH_CAP_FILE_BUFFER; if (ci->i_fx_ref) used |= CEPH_CAP_FILE_EXCL; return used; } #define FMODE_WAIT_BIAS 1000 /* * wanted, by virtue of open file modes */ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) { const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN); const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD); const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR); const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY); struct ceph_mount_options *opt = ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options; unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ; unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ; if (S_ISDIR(ci->netfs.inode.i_mode)) { int want = 0; /* use used_cutoff here, to keep dir's wanted caps longer */ if (ci->i_nr_by_mode[RD_SHIFT] > 0 || time_after(ci->i_last_rd, used_cutoff)) want |= CEPH_CAP_ANY_SHARED; if (ci->i_nr_by_mode[WR_SHIFT] > 0 || time_after(ci->i_last_wr, used_cutoff)) { want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) want |= CEPH_CAP_ANY_DIR_OPS; } if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0) want |= CEPH_CAP_PIN; return want; } else { int bits = 0; if (ci->i_nr_by_mode[RD_SHIFT] > 0) { if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS || time_after(ci->i_last_rd, used_cutoff)) bits |= 1 << RD_SHIFT; } else if (time_after(ci->i_last_rd, idle_cutoff)) { bits |= 1 << RD_SHIFT; } if (ci->i_nr_by_mode[WR_SHIFT] > 0) { if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS || time_after(ci->i_last_wr, used_cutoff)) bits |= 1 << WR_SHIFT; } else if (time_after(ci->i_last_wr, idle_cutoff)) { bits |= 1 << WR_SHIFT; } /* check lazyio only when read/write is wanted */ if ((bits & (CEPH_FILE_MODE_RDWR << 1)) && ci->i_nr_by_mode[LAZY_SHIFT] > 0) bits |= 1 << LAZY_SHIFT; return bits ? ceph_caps_for_mode(bits >> 1) : 0; } } /* * wanted, by virtue of open file modes AND cap refs (buffered/cached data) */ int __ceph_caps_wanted(struct ceph_inode_info *ci) { int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); if (S_ISDIR(ci->netfs.inode.i_mode)) { /* we want EXCL if holding caps of dir ops */ if (w & CEPH_CAP_ANY_DIR_OPS) w |= CEPH_CAP_FILE_EXCL; } else { /* we want EXCL if dirty data */ if (w & CEPH_CAP_FILE_BUFFER) w |= CEPH_CAP_FILE_EXCL; } return w; } /* * Return caps we have registered with the MDS(s) as 'wanted'. */ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check) { struct ceph_cap *cap; struct rb_node *p; int mds_wanted = 0; for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (check && !__cap_is_valid(cap)) continue; if (cap == ci->i_auth_cap) mds_wanted |= cap->mds_wanted; else mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR); } return mds_wanted; } int ceph_is_any_caps(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); int ret; spin_lock(&ci->i_ceph_lock); ret = __ceph_is_any_real_caps(ci); spin_unlock(&ci->i_ceph_lock); return ret; } /* * Remove a cap. Take steps to deal with a racing iterate_session_caps. * * caller should hold i_ceph_lock. * caller will not hold session s_mutex if called from destroy_inode. */ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) { struct ceph_mds_session *session = cap->session; struct ceph_client *cl = session->s_mdsc->fsc->client; struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc; int removed = 0; /* 'ci' being NULL means the remove have already occurred */ if (!ci) { doutc(cl, "inode is NULL\n"); return; } lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "%p from %p %llx.%llx\n", cap, inode, ceph_vinop(inode)); mdsc = ceph_inode_to_fs_client(&ci->netfs.inode)->mdsc; /* remove from inode's cap rbtree, and clear auth cap */ rb_erase(&cap->ci_node, &ci->i_caps); if (ci->i_auth_cap == cap) ci->i_auth_cap = NULL; /* remove from session list */ spin_lock(&session->s_cap_lock); if (session->s_cap_iterator == cap) { /* not yet, we are iterating over this very cap */ doutc(cl, "delaying %p removal from session %p\n", cap, cap->session); } else { list_del_init(&cap->session_caps); session->s_nr_caps--; atomic64_dec(&mdsc->metric.total_caps); cap->session = NULL; removed = 1; } /* protect backpointer with s_cap_lock: see iterate_session_caps */ cap->ci = NULL; /* * s_cap_reconnect is protected by s_cap_lock. no one changes * s_cap_gen while session is in the reconnect state. */ if (queue_release && (!session->s_cap_reconnect || cap->cap_gen == atomic_read(&session->s_cap_gen))) { cap->queue_release = 1; if (removed) { __ceph_queue_cap_release(session, cap); removed = 0; } } else { cap->queue_release = 0; } cap->cap_ino = ci->i_vino.ino; spin_unlock(&session->s_cap_lock); if (removed) ceph_put_cap(mdsc, cap); if (!__ceph_is_any_real_caps(ci)) { /* when reconnect denied, we remove session caps forcibly, * i_wr_ref can be non-zero. If there are ongoing write, * keep i_snap_realm. */ if (ci->i_wr_ref == 0 && ci->i_snap_realm) ceph_change_snap_realm(&ci->netfs.inode, NULL); __cap_delay_cancel(mdsc, ci); } } void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, bool queue_release) { struct ceph_inode_info *ci = cap->ci; struct ceph_fs_client *fsc; /* 'ci' being NULL means the remove have already occurred */ if (!ci) { doutc(mdsc->fsc->client, "inode is NULL\n"); return; } lockdep_assert_held(&ci->i_ceph_lock); fsc = ceph_inode_to_fs_client(&ci->netfs.inode); WARN_ON_ONCE(ci->i_auth_cap == cap && !list_empty(&ci->i_dirty_item) && !fsc->blocklisted && !ceph_inode_is_shutdown(&ci->netfs.inode)); __ceph_remove_cap(cap, queue_release); } struct cap_msg_args { struct ceph_mds_session *session; u64 ino, cid, follows; u64 flush_tid, oldest_flush_tid, size, max_size; u64 xattr_version; u64 change_attr; struct ceph_buffer *xattr_buf; struct ceph_buffer *old_xattr_buf; struct timespec64 atime, mtime, ctime, btime; int op, caps, wanted, dirty; u32 seq, issue_seq, mseq, time_warp_seq; u32 flags; kuid_t uid; kgid_t gid; umode_t mode; bool inline_data; bool wake; bool encrypted; u32 fscrypt_auth_len; u8 fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context }; /* Marshal up the cap msg to the MDS */ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) { struct ceph_mds_caps *fc; void *p; struct ceph_mds_client *mdsc = arg->session->s_mdsc; struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; doutc(mdsc->fsc->client, "%s %llx %llx caps %s wanted %s dirty %s seq %u/%u" " tid %llu/%llu mseq %u follows %lld size %llu/%llu" " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op), arg->cid, arg->ino, ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows, arg->size, arg->max_size, arg->xattr_version, arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0); msg->hdr.version = cpu_to_le16(12); msg->hdr.tid = cpu_to_le64(arg->flush_tid); fc = msg->front.iov_base; memset(fc, 0, sizeof(*fc)); fc->cap_id = cpu_to_le64(arg->cid); fc->op = cpu_to_le32(arg->op); fc->seq = cpu_to_le32(arg->seq); fc->issue_seq = cpu_to_le32(arg->issue_seq); fc->migrate_seq = cpu_to_le32(arg->mseq); fc->caps = cpu_to_le32(arg->caps); fc->wanted = cpu_to_le32(arg->wanted); fc->dirty = cpu_to_le32(arg->dirty); fc->ino = cpu_to_le64(arg->ino); fc->snap_follows = cpu_to_le64(arg->follows); #if IS_ENABLED(CONFIG_FS_ENCRYPTION) if (arg->encrypted) fc->size = cpu_to_le64(round_up(arg->size, CEPH_FSCRYPT_BLOCK_SIZE)); else #endif fc->size = cpu_to_le64(arg->size); fc->max_size = cpu_to_le64(arg->max_size); ceph_encode_timespec64(&fc->mtime, &arg->mtime); ceph_encode_timespec64(&fc->atime, &arg->atime); ceph_encode_timespec64(&fc->ctime, &arg->ctime); fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq); fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid)); fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid)); fc->mode = cpu_to_le32(arg->mode); fc->xattr_version = cpu_to_le64(arg->xattr_version); if (arg->xattr_buf) { msg->middle = ceph_buffer_get(arg->xattr_buf); fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len); msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len); } p = fc + 1; /* flock buffer size (version 2) */ ceph_encode_32(&p, 0); /* inline version (version 4) */ ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE); /* inline data size */ ceph_encode_32(&p, 0); /* * osd_epoch_barrier (version 5) * The epoch_barrier is protected osdc->lock, so READ_ONCE here in * case it was recently changed */ ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier)); /* oldest_flush_tid (version 6) */ ceph_encode_64(&p, arg->oldest_flush_tid); /* * caller_uid/caller_gid (version 7) * * Currently, we don't properly track which caller dirtied the caps * last, and force a flush of them when there is a conflict. For now, * just set this to 0:0, to emulate how the MDS has worked up to now. */ ceph_encode_32(&p, 0); ceph_encode_32(&p, 0); /* pool namespace (version 8) (mds always ignores this) */ ceph_encode_32(&p, 0); /* btime and change_attr (version 9) */ ceph_encode_timespec64(p, &arg->btime); p += sizeof(struct ceph_timespec); ceph_encode_64(&p, arg->change_attr); /* Advisory flags (version 10) */ ceph_encode_32(&p, arg->flags); /* dirstats (version 11) - these are r/o on the client */ ceph_encode_64(&p, 0); ceph_encode_64(&p, 0); #if IS_ENABLED(CONFIG_FS_ENCRYPTION) /* * fscrypt_auth and fscrypt_file (version 12) * * fscrypt_auth holds the crypto context (if any). fscrypt_file * tracks the real i_size as an __le64 field (and we use a rounded-up * i_size in the traditional size field). */ ceph_encode_32(&p, arg->fscrypt_auth_len); ceph_encode_copy(&p, arg->fscrypt_auth, arg->fscrypt_auth_len); ceph_encode_32(&p, sizeof(__le64)); ceph_encode_64(&p, arg->size); #else /* CONFIG_FS_ENCRYPTION */ ceph_encode_32(&p, 0); ceph_encode_32(&p, 0); #endif /* CONFIG_FS_ENCRYPTION */ } /* * Queue cap releases when an inode is dropped from our cache. */ void __ceph_remove_caps(struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct rb_node *p; /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU) * may call __ceph_caps_issued_mask() on a freeing inode. */ spin_lock(&ci->i_ceph_lock); p = rb_first(&ci->i_caps); while (p) { struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); p = rb_next(p); ceph_remove_cap(mdsc, cap, true); } spin_unlock(&ci->i_ceph_lock); } /* * Prepare to send a cap message to an MDS. Update the cap state, and populate * the arg struct with the parameters that will need to be sent. This should * be done under the i_ceph_lock to guard against changes to cap state. * * Make note of max_size reported/requested from mds, revoked caps * that have now been implemented. */ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, int op, int flags, int used, int want, int retain, int flushing, u64 flush_tid, u64 oldest_flush_tid) { struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); int held, revoking; lockdep_assert_held(&ci->i_ceph_lock); held = cap->issued | cap->implemented; revoking = cap->implemented & ~cap->issued; retain &= ~revoking; doutc(cl, "%p %llx.%llx cap %p session %p %s -> %s (revoking %s)\n", inode, ceph_vinop(inode), cap, cap->session, ceph_cap_string(held), ceph_cap_string(held & retain), ceph_cap_string(revoking)); BUG_ON((retain & CEPH_CAP_PIN) == 0); ci->i_ceph_flags &= ~CEPH_I_FLUSH; cap->issued &= retain; /* drop bits we don't want */ /* * Wake up any waiters on wanted -> needed transition. This is due to * the weird transition from buffered to sync IO... we need to flush * dirty pages _before_ allowing sync writes to avoid reordering. */ arg->wake = cap->implemented & ~cap->issued; cap->implemented &= cap->issued | used; cap->mds_wanted = want; arg->session = cap->session; arg->ino = ceph_vino(inode).ino; arg->cid = cap->cap_id; arg->follows = flushing ? ci->i_head_snapc->seq : 0; arg->flush_tid = flush_tid; arg->oldest_flush_tid = oldest_flush_tid; arg->size = i_size_read(inode); ci->i_reported_size = arg->size; arg->max_size = ci->i_wanted_max_size; if (cap == ci->i_auth_cap) { if (want & CEPH_CAP_ANY_FILE_WR) ci->i_requested_max_size = arg->max_size; else ci->i_requested_max_size = 0; } if (flushing & CEPH_CAP_XATTR_EXCL) { arg->old_xattr_buf = __ceph_build_xattrs_blob(ci); arg->xattr_version = ci->i_xattrs.version; arg->xattr_buf = ci->i_xattrs.blob; } else { arg->xattr_buf = NULL; arg->old_xattr_buf = NULL; } arg->mtime = inode_get_mtime(inode); arg->atime = inode_get_atime(inode); arg->ctime = inode_get_ctime(inode); arg->btime = ci->i_btime; arg->change_attr = inode_peek_iversion_raw(inode); arg->op = op; arg->caps = cap->implemented; arg->wanted = want; arg->dirty = flushing; arg->seq = cap->seq; arg->issue_seq = cap->issue_seq; arg->mseq = cap->mseq; arg->time_warp_seq = ci->i_time_warp_seq; arg->uid = inode->i_uid; arg->gid = inode->i_gid; arg->mode = inode->i_mode; arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) && !list_empty(&ci->i_cap_snaps)) { struct ceph_cap_snap *capsnap; list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) { if (capsnap->cap_flush.tid) break; if (capsnap->need_flush) { flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP; break; } } } arg->flags = flags; arg->encrypted = IS_ENCRYPTED(inode); #if IS_ENABLED(CONFIG_FS_ENCRYPTION) if (ci->fscrypt_auth_len && WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) { /* Don't set this if it's too big */ arg->fscrypt_auth_len = 0; } else { arg->fscrypt_auth_len = ci->fscrypt_auth_len; memcpy(arg->fscrypt_auth, ci->fscrypt_auth, min_t(size_t, ci->fscrypt_auth_len, sizeof(arg->fscrypt_auth))); } #endif /* CONFIG_FS_ENCRYPTION */ } #if IS_ENABLED(CONFIG_FS_ENCRYPTION) #define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4 + 8) static inline int cap_msg_size(struct cap_msg_args *arg) { return CAP_MSG_FIXED_FIELDS + arg->fscrypt_auth_len; } #else #define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4) static inline int cap_msg_size(struct cap_msg_args *arg) { return CAP_MSG_FIXED_FIELDS; } #endif /* CONFIG_FS_ENCRYPTION */ /* * Send a cap msg on the given inode. * * Caller should hold snap_rwsem (read), s_mutex. */ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) { struct ceph_msg *msg; struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS, false); if (!msg) { pr_err_client(cl, "error allocating cap msg: ino (%llx.%llx)" " flushing %s tid %llu, requeuing cap.\n", ceph_vinop(inode), ceph_cap_string(arg->dirty), arg->flush_tid); spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(arg->session->s_mdsc, ci); spin_unlock(&ci->i_ceph_lock); return; } encode_cap_msg(msg, arg); ceph_con_send(&arg->session->s_con, msg); ceph_buffer_put(arg->old_xattr_buf); if (arg->wake) wake_up_all(&ci->i_cap_wq); } static inline int __send_flush_snap(struct inode *inode, struct ceph_mds_session *session, struct ceph_cap_snap *capsnap, u32 mseq, u64 oldest_flush_tid) { struct cap_msg_args arg; struct ceph_msg *msg; arg.session = session; arg.ino = ceph_vino(inode).ino; arg.cid = 0; arg.follows = capsnap->follows; arg.flush_tid = capsnap->cap_flush.tid; arg.oldest_flush_tid = oldest_flush_tid; arg.size = capsnap->size; arg.max_size = 0; arg.xattr_version = capsnap->xattr_version; arg.xattr_buf = capsnap->xattr_blob; arg.old_xattr_buf = NULL; arg.atime = capsnap->atime; arg.mtime = capsnap->mtime; arg.ctime = capsnap->ctime; arg.btime = capsnap->btime; arg.change_attr = capsnap->change_attr; arg.op = CEPH_CAP_OP_FLUSHSNAP; arg.caps = capsnap->issued; arg.wanted = 0; arg.dirty = capsnap->dirty; arg.seq = 0; arg.issue_seq = 0; arg.mseq = mseq; arg.time_warp_seq = capsnap->time_warp_seq; arg.uid = capsnap->uid; arg.gid = capsnap->gid; arg.mode = capsnap->mode; arg.inline_data = capsnap->inline_data; arg.flags = 0; arg.wake = false; arg.encrypted = IS_ENCRYPTED(inode); /* No fscrypt_auth changes from a capsnap.*/ arg.fscrypt_auth_len = 0; msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(&arg), GFP_NOFS, false); if (!msg) return -ENOMEM; encode_cap_msg(msg, &arg); ceph_con_send(&arg.session->s_con, msg); return 0; } /* * When a snapshot is taken, clients accumulate dirty metadata on * inodes with capabilities in ceph_cap_snaps to describe the file * state at the time the snapshot was taken. This must be flushed * asynchronously back to the MDS once sync writes complete and dirty * data is written out. * * Called under i_ceph_lock. */ static void __ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session *session) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap_snap *capsnap; u64 oldest_flush_tid = 0; u64 first_tid = 1, last_tid = 0; doutc(cl, "%p %llx.%llx session %p\n", inode, ceph_vinop(inode), session); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { /* * we need to wait for sync writes to complete and for dirty * pages to be written out. */ if (capsnap->dirty_pages || capsnap->writing) break; /* should be removed by ceph_try_drop_cap_snap() */ BUG_ON(!capsnap->need_flush); /* only flush each capsnap once */ if (capsnap->cap_flush.tid > 0) { doutc(cl, "already flushed %p, skipping\n", capsnap); continue; } spin_lock(&mdsc->cap_dirty_lock); capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid; list_add_tail(&capsnap->cap_flush.g_list, &mdsc->cap_flush_list); if (oldest_flush_tid == 0) oldest_flush_tid = __get_oldest_flush_tid(mdsc); if (list_empty(&ci->i_flushing_item)) { list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); } spin_unlock(&mdsc->cap_dirty_lock); list_add_tail(&capsnap->cap_flush.i_list, &ci->i_cap_flush_list); if (first_tid == 1) first_tid = capsnap->cap_flush.tid; last_tid = capsnap->cap_flush.tid; } ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS; while (first_tid <= last_tid) { struct ceph_cap *cap = ci->i_auth_cap; struct ceph_cap_flush *cf = NULL, *iter; int ret; if (!(cap && cap->session == session)) { doutc(cl, "%p %llx.%llx auth cap %p not mds%d, stop\n", inode, ceph_vinop(inode), cap, session->s_mds); break; } ret = -ENOENT; list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) { if (iter->tid >= first_tid) { cf = iter; ret = 0; break; } } if (ret < 0) break; first_tid = cf->tid + 1; capsnap = container_of(cf, struct ceph_cap_snap, cap_flush); refcount_inc(&capsnap->nref); spin_unlock(&ci->i_ceph_lock); doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n", inode, ceph_vinop(inode), capsnap, cf->tid, ceph_cap_string(capsnap->dirty)); ret = __send_flush_snap(inode, session, capsnap, cap->mseq, oldest_flush_tid); if (ret < 0) { pr_err_client(cl, "error sending cap flushsnap, " "ino (%llx.%llx) tid %llu follows %llu\n", ceph_vinop(inode), cf->tid, capsnap->follows); } ceph_put_cap_snap(capsnap); spin_lock(&ci->i_ceph_lock); } } void ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_mds_session *session = NULL; bool need_put = false; int mds; doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); if (psession) session = *psession; retry: spin_lock(&ci->i_ceph_lock); if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) { doutc(cl, " no capsnap needs flush, doing nothing\n"); goto out; } if (!ci->i_auth_cap) { doutc(cl, " no auth cap (migrating?), doing nothing\n"); goto out; } mds = ci->i_auth_cap->session->s_mds; if (session && session->s_mds != mds) { doutc(cl, " oops, wrong session %p mutex\n", session); ceph_put_mds_session(session); session = NULL; } if (!session) { spin_unlock(&ci->i_ceph_lock); mutex_lock(&mdsc->mutex); session = __ceph_lookup_mds_session(mdsc, mds); mutex_unlock(&mdsc->mutex); goto retry; } // make sure flushsnap messages are sent in proper order. if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) __kick_flushing_caps(mdsc, session, ci, 0); __ceph_flush_snaps(ci, session); out: spin_unlock(&ci->i_ceph_lock); if (psession) *psession = session; else ceph_put_mds_session(session); /* we flushed them all; remove this inode from the queue */ spin_lock(&mdsc->snap_flush_lock); if (!list_empty(&ci->i_snap_flush_item)) need_put = true; list_del_init(&ci->i_snap_flush_item); spin_unlock(&mdsc->snap_flush_lock); if (need_put) iput(inode); } /* * Mark caps dirty. If inode is newly dirty, return the dirty flags. * Caller is then responsible for calling __mark_inode_dirty with the * returned flags value. */ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, struct ceph_cap_flush **pcf) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb)->mdsc; struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); int was = ci->i_dirty_caps; int dirty = 0; lockdep_assert_held(&ci->i_ceph_lock); if (!ci->i_auth_cap) { pr_warn_client(cl, "%p %llx.%llx mask %s, " "but no auth cap (session was closed?)\n", inode, ceph_vinop(inode), ceph_cap_string(mask)); return 0; } doutc(cl, "%p %llx.%llx %s dirty %s -> %s\n", inode, ceph_vinop(inode), ceph_cap_string(mask), ceph_cap_string(was), ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; if (was == 0) { struct ceph_mds_session *session = ci->i_auth_cap->session; WARN_ON_ONCE(ci->i_prealloc_cap_flush); swap(ci->i_prealloc_cap_flush, *pcf); if (!ci->i_head_snapc) { WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem)); ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); } doutc(cl, "%p %llx.%llx now dirty snapc %p auth cap %p\n", inode, ceph_vinop(inode), ci->i_head_snapc, ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); list_add(&ci->i_dirty_item, &session->s_cap_dirty); spin_unlock(&mdsc->cap_dirty_lock); if (ci->i_flushing_caps == 0) { ihold(inode); dirty |= I_DIRTY_SYNC; } } else { WARN_ON_ONCE(!ci->i_prealloc_cap_flush); } BUG_ON(list_empty(&ci->i_dirty_item)); if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && (mask & CEPH_CAP_FILE_BUFFER)) dirty |= I_DIRTY_DATASYNC; __cap_delay_requeue(mdsc, ci); return dirty; } struct ceph_cap_flush *ceph_alloc_cap_flush(void) { struct ceph_cap_flush *cf; cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); if (!cf) return NULL; cf->is_capsnap = false; return cf; } void ceph_free_cap_flush(struct ceph_cap_flush *cf) { if (cf) kmem_cache_free(ceph_cap_flush_cachep, cf); } static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc) { if (!list_empty(&mdsc->cap_flush_list)) { struct ceph_cap_flush *cf = list_first_entry(&mdsc->cap_flush_list, struct ceph_cap_flush, g_list); return cf->tid; } return 0; } /* * Remove cap_flush from the mdsc's or inode's flushing cap list. * Return true if caller needs to wake up flush waiters. */ static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc, struct ceph_cap_flush *cf) { struct ceph_cap_flush *prev; bool wake = cf->wake; if (wake && cf->g_list.prev != &mdsc->cap_flush_list) { prev = list_prev_entry(cf, g_list); prev->wake = true; wake = false; } list_del_init(&cf->g_list); return wake; } static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci, struct ceph_cap_flush *cf) { struct ceph_cap_flush *prev; bool wake = cf->wake; if (wake && cf->i_list.prev != &ci->i_cap_flush_list) { prev = list_prev_entry(cf, i_list); prev->wake = true; wake = false; } list_del_init(&cf->i_list); return wake; } /* * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. * * Called under i_ceph_lock. Returns the flush tid. */ static u64 __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session, bool wake, u64 *oldest_flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap_flush *cf = NULL; int flushing; lockdep_assert_held(&ci->i_ceph_lock); BUG_ON(ci->i_dirty_caps == 0); BUG_ON(list_empty(&ci->i_dirty_item)); BUG_ON(!ci->i_prealloc_cap_flush); flushing = ci->i_dirty_caps; doutc(cl, "flushing %s, flushing_caps %s -> %s\n", ceph_cap_string(flushing), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(ci->i_flushing_caps | flushing)); ci->i_flushing_caps |= flushing; ci->i_dirty_caps = 0; doutc(cl, "%p %llx.%llx now !dirty\n", inode, ceph_vinop(inode)); swap(cf, ci->i_prealloc_cap_flush); cf->caps = flushing; cf->wake = wake; spin_lock(&mdsc->cap_dirty_lock); list_del_init(&ci->i_dirty_item); cf->tid = ++mdsc->last_cap_flush_tid; list_add_tail(&cf->g_list, &mdsc->cap_flush_list); *oldest_flush_tid = __get_oldest_flush_tid(mdsc); if (list_empty(&ci->i_flushing_item)) { list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); mdsc->num_cap_flushing++; } spin_unlock(&mdsc->cap_dirty_lock); list_add_tail(&cf->i_list, &ci->i_cap_flush_list); return cf->tid; } /* * try to invalidate mapping pages without blocking. */ static int try_nonblocking_invalidate(struct inode *inode) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u32 invalidating_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); ceph_fscache_invalidate(inode, false); invalidate_mapping_pages(&inode->i_data, 0, -1); spin_lock(&ci->i_ceph_lock); if (inode->i_data.nrpages == 0 && invalidating_gen == ci->i_rdcache_gen) { /* success. */ doutc(cl, "%p %llx.%llx success\n", inode, ceph_vinop(inode)); /* save any racing async invalidate some trouble */ ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; return 0; } doutc(cl, "%p %llx.%llx failed\n", inode, ceph_vinop(inode)); return -1; } bool __ceph_should_report_size(struct ceph_inode_info *ci) { loff_t size = i_size_read(&ci->netfs.inode); /* mds will adjust max size according to the reported size */ if (ci->i_flushing_caps & CEPH_CAP_FILE_WR) return false; if (size >= ci->i_max_size) return true; /* half of previous max_size increment has been used */ if (ci->i_max_size > ci->i_reported_size && (size << 1) >= ci->i_max_size + ci->i_reported_size) return true; return false; } /* * Swiss army knife function to examine currently used and wanted * versus held caps. Release, flush, ack revoked caps to mds as * appropriate. * * CHECK_CAPS_AUTHONLY - we should only check the auth cap * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without * further delay. */ void ceph_check_caps(struct ceph_inode_info *ci, int flags) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap *cap; u64 flush_tid, oldest_flush_tid; int file_wanted, used, cap_used; int issued, implemented, want, retain, revoking, flushing = 0; int mds = -1; /* keep track of how far we've gone through i_caps list to avoid an infinite loop on retry */ struct rb_node *p; bool queue_invalidate = false; bool tried_invalidate = false; bool queue_writeback = false; struct ceph_mds_session *session = NULL; spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { ci->i_ceph_flags |= CEPH_I_ASYNC_CHECK_CAPS; /* Don't send messages until we get async create reply */ spin_unlock(&ci->i_ceph_lock); return; } if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; retry: /* Caps wanted by virtue of active open files. */ file_wanted = __ceph_caps_file_wanted(ci); /* Caps which have active references against them */ used = __ceph_caps_used(ci); /* * "issued" represents the current caps that the MDS wants us to have. * "implemented" is the set that we have been granted, and includes the * ones that have not yet been returned to the MDS (the "revoking" set, * usually because they have outstanding references). */ issued = __ceph_caps_issued(ci, &implemented); revoking = implemented & ~issued; want = file_wanted; /* The ones we currently want to retain (may be adjusted below) */ retain = file_wanted | used | CEPH_CAP_PIN; if (!mdsc->stopping && inode->i_nlink > 0) { if (file_wanted) { retain |= CEPH_CAP_ANY; /* be greedy */ } else if (S_ISDIR(inode->i_mode) && (issued & CEPH_CAP_FILE_SHARED) && __ceph_dir_is_complete(ci)) { /* * If a directory is complete, we want to keep * the exclusive cap. So that MDS does not end up * revoking the shared cap on every create/unlink * operation. */ if (IS_RDONLY(inode)) { want = CEPH_CAP_ANY_SHARED; } else { want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; } retain |= want; } else { retain |= CEPH_CAP_ANY_SHARED; /* * keep RD only if we didn't have the file open RW, * because then the mds would revoke it anyway to * journal max_size=0. */ if (ci->i_max_size == 0) retain |= CEPH_CAP_ANY_RD; } } doutc(cl, "%p %llx.%llx file_want %s used %s dirty %s " "flushing %s issued %s revoking %s retain %s %s%s%s\n", inode, ceph_vinop(inode), ceph_cap_string(file_wanted), ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(issued), ceph_cap_string(revoking), ceph_cap_string(retain), (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "", (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : ""); /* * If we no longer need to hold onto old our caps, and we may * have cached pages, but don't want them, then try to invalidate. * If we fail, it's because pages are locked.... try again later. */ if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) && S_ISREG(inode->i_mode) && !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */ inode->i_data.nrpages && /* have cached pages */ (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ !tried_invalidate) { doutc(cl, "trying to invalidate on %p %llx.%llx\n", inode, ceph_vinop(inode)); if (try_nonblocking_invalidate(inode) < 0) { doutc(cl, "queuing invalidate\n"); queue_invalidate = true; ci->i_rdcache_revoking = ci->i_rdcache_gen; } tried_invalidate = true; goto retry; } for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { int mflags = 0; struct cap_msg_args arg; cap = rb_entry(p, struct ceph_cap, ci_node); /* avoid looping forever */ if (mds >= cap->mds || ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap)) continue; /* * If we have an auth cap, we don't need to consider any * overlapping caps as used. */ cap_used = used; if (ci->i_auth_cap && cap != ci->i_auth_cap) cap_used &= ~ci->i_auth_cap->issued; revoking = cap->implemented & ~cap->issued; doutc(cl, " mds%d cap %p used %s issued %s implemented %s revoking %s\n", cap->mds, cap, ceph_cap_string(cap_used), ceph_cap_string(cap->issued), ceph_cap_string(cap->implemented), ceph_cap_string(revoking)); if (cap == ci->i_auth_cap && (cap->issued & CEPH_CAP_FILE_WR)) { /* request larger max_size from MDS? */ if (ci->i_wanted_max_size > ci->i_max_size && ci->i_wanted_max_size > ci->i_requested_max_size) { doutc(cl, "requesting new max_size\n"); goto ack; } /* approaching file_max? */ if (__ceph_should_report_size(ci)) { doutc(cl, "i_size approaching max_size\n"); goto ack; } } /* flush anything dirty? */ if (cap == ci->i_auth_cap) { if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) { doutc(cl, "flushing dirty caps\n"); goto ack; } if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) { doutc(cl, "flushing snap caps\n"); goto ack; } } /* completed revocation? going down and there are no caps? */ if (revoking) { if ((revoking & cap_used) == 0) { doutc(cl, "completed revocation of %s\n", ceph_cap_string(cap->implemented & ~cap->issued)); goto ack; } /* * If the "i_wrbuffer_ref" was increased by mmap or generic * cache write just before the ceph_check_caps() is called, * the Fb capability revoking will fail this time. Then we * must wait for the BDI's delayed work to flush the dirty * pages and to release the "i_wrbuffer_ref", which will cost * at most 5 seconds. That means the MDS needs to wait at * most 5 seconds to finished the Fb capability's revocation. * * Let's queue a writeback for it. */ if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref && (revoking & CEPH_CAP_FILE_BUFFER)) queue_writeback = true; } /* want more caps from mds? */ if (want & ~cap->mds_wanted) { if (want & ~(cap->mds_wanted | cap->issued)) goto ack; if (!__cap_is_valid(cap)) goto ack; } /* things we might delay */ if ((cap->issued & ~retain) == 0) continue; /* nope, all good */ ack: ceph_put_mds_session(session); session = ceph_get_mds_session(cap->session); /* kick flushing and flush snaps before sending normal * cap message */ if (cap == ci->i_auth_cap && (ci->i_ceph_flags & (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) __kick_flushing_caps(mdsc, session, ci, 0); if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) __ceph_flush_snaps(ci, session); goto retry; } if (cap == ci->i_auth_cap && ci->i_dirty_caps) { flushing = ci->i_dirty_caps; flush_tid = __mark_caps_flushing(inode, session, false, &oldest_flush_tid); if (flags & CHECK_CAPS_FLUSH && list_empty(&session->s_cap_dirty)) mflags |= CEPH_CLIENT_CAPS_SYNC; } else { flushing = 0; flush_tid = 0; spin_lock(&mdsc->cap_dirty_lock); oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); } mds = cap->mds; /* remember mds, so we don't repeat */ __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used, want, retain, flushing, flush_tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); __send_cap(&arg, ci); spin_lock(&ci->i_ceph_lock); goto retry; /* retake i_ceph_lock and restart our cap scan. */ } /* periodically re-calculate caps wanted by open files */ if (__ceph_is_any_real_caps(ci) && list_empty(&ci->i_cap_delay_list) && (file_wanted & ~CEPH_CAP_PIN) && !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { __cap_delay_requeue(mdsc, ci); } spin_unlock(&ci->i_ceph_lock); ceph_put_mds_session(session); if (queue_writeback) ceph_queue_writeback(inode); if (queue_invalidate) ceph_queue_invalidate(inode); } /* * Try to flush dirty caps back to the auth mds. */ static int try_flush_caps(struct inode *inode, u64 *ptid) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int flushing = 0; u64 flush_tid = 0, oldest_flush_tid = 0; spin_lock(&ci->i_ceph_lock); retry_locked: if (ci->i_dirty_caps && ci->i_auth_cap) { struct ceph_cap *cap = ci->i_auth_cap; struct cap_msg_args arg; struct ceph_mds_session *session = cap->session; if (session->s_state < CEPH_MDS_SESSION_OPEN) { spin_unlock(&ci->i_ceph_lock); goto out; } if (ci->i_ceph_flags & (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) { if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) __kick_flushing_caps(mdsc, session, ci, 0); if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) __ceph_flush_snaps(ci, session); goto retry_locked; } flushing = ci->i_dirty_caps; flush_tid = __mark_caps_flushing(inode, session, true, &oldest_flush_tid); __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC, __ceph_caps_used(ci), __ceph_caps_wanted(ci), (cap->issued | cap->implemented), flushing, flush_tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); __send_cap(&arg, ci); } else { if (!list_empty(&ci->i_cap_flush_list)) { struct ceph_cap_flush *cf = list_last_entry(&ci->i_cap_flush_list, struct ceph_cap_flush, i_list); cf->wake = true; flush_tid = cf->tid; } flushing = ci->i_flushing_caps; spin_unlock(&ci->i_ceph_lock); } out: *ptid = flush_tid; return flushing; } /* * Return true if we've flushed caps through the given flush_tid. */ static int caps_are_flushed(struct inode *inode, u64 flush_tid) { struct ceph_inode_info *ci = ceph_inode(inode); int ret = 1; spin_lock(&ci->i_ceph_lock); if (!list_empty(&ci->i_cap_flush_list)) { struct ceph_cap_flush * cf = list_first_entry(&ci->i_cap_flush_list, struct ceph_cap_flush, i_list); if (cf->tid <= flush_tid) ret = 0; } spin_unlock(&ci->i_ceph_lock); return ret; } /* * flush the mdlog and wait for any unsafe requests to complete. */ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req1 = NULL, *req2 = NULL; int ret, err = 0; spin_lock(&ci->i_unsafe_lock); if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) { req1 = list_last_entry(&ci->i_unsafe_dirops, struct ceph_mds_request, r_unsafe_dir_item); ceph_mdsc_get_request(req1); } if (!list_empty(&ci->i_unsafe_iops)) { req2 = list_last_entry(&ci->i_unsafe_iops, struct ceph_mds_request, r_unsafe_target_item); ceph_mdsc_get_request(req2); } spin_unlock(&ci->i_unsafe_lock); /* * Trigger to flush the journal logs in all the relevant MDSes * manually, or in the worst case we must wait at most 5 seconds * to wait the journal logs to be flushed by the MDSes periodically. */ if (req1 || req2) { struct ceph_mds_request *req; struct ceph_mds_session **sessions; struct ceph_mds_session *s; unsigned int max_sessions; int i; mutex_lock(&mdsc->mutex); max_sessions = mdsc->max_sessions; sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL); if (!sessions) { mutex_unlock(&mdsc->mutex); err = -ENOMEM; goto out; } spin_lock(&ci->i_unsafe_lock); if (req1) { list_for_each_entry(req, &ci->i_unsafe_dirops, r_unsafe_dir_item) { s = req->r_session; if (!s) continue; if (!sessions[s->s_mds]) { s = ceph_get_mds_session(s); sessions[s->s_mds] = s; } } } if (req2) { list_for_each_entry(req, &ci->i_unsafe_iops, r_unsafe_target_item) { s = req->r_session; if (!s) continue; if (!sessions[s->s_mds]) { s = ceph_get_mds_session(s); sessions[s->s_mds] = s; } } } spin_unlock(&ci->i_unsafe_lock); /* the auth MDS */ spin_lock(&ci->i_ceph_lock); if (ci->i_auth_cap) { s = ci->i_auth_cap->session; if (!sessions[s->s_mds]) sessions[s->s_mds] = ceph_get_mds_session(s); } spin_unlock(&ci->i_ceph_lock); mutex_unlock(&mdsc->mutex); /* send flush mdlog request to MDSes */ for (i = 0; i < max_sessions; i++) { s = sessions[i]; if (s) { send_flush_mdlog(s); ceph_put_mds_session(s); } } kfree(sessions); } doutc(cl, "%p %llx.%llx wait on tid %llu %llu\n", inode, ceph_vinop(inode), req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL); if (req1) { ret = !wait_for_completion_timeout(&req1->r_safe_completion, ceph_timeout_jiffies(req1->r_timeout)); if (ret) err = -EIO; } if (req2) { ret = !wait_for_completion_timeout(&req2->r_safe_completion, ceph_timeout_jiffies(req2->r_timeout)); if (ret) err = -EIO; } out: if (req1) ceph_mdsc_put_request(req1); if (req2) ceph_mdsc_put_request(req2); return err; } int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); u64 flush_tid; int ret, err; int dirty; doutc(cl, "%p %llx.%llx%s\n", inode, ceph_vinop(inode), datasync ? " datasync" : ""); ret = file_write_and_wait_range(file, start, end); if (datasync) goto out; ret = ceph_wait_on_async_create(inode); if (ret) goto out; dirty = try_flush_caps(inode, &flush_tid); doutc(cl, "dirty caps are %s\n", ceph_cap_string(dirty)); err = flush_mdlog_and_wait_inode_unsafe_requests(inode); /* * only wait on non-file metadata writeback (the mds * can recover size and mtime, so we don't need to * wait for that) */ if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } if (err < 0) ret = err; err = file_check_and_advance_wb_err(file); if (err < 0) ret = err; out: doutc(cl, "%p %llx.%llx%s result=%d\n", inode, ceph_vinop(inode), datasync ? " datasync" : "", ret); return ret; } /* * Flush any dirty caps back to the mds. If we aren't asked to wait, * queue inode for flush but don't do so immediately, because we can * get by with fewer MDS messages if we wait for data writeback to * complete first. */ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); u64 flush_tid; int err = 0; int dirty; int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync); doutc(cl, "%p %llx.%llx wait=%d\n", inode, ceph_vinop(inode), wait); ceph_fscache_unpin_writeback(inode, wbc); if (wait) { err = ceph_wait_on_async_create(inode); if (err) return err; dirty = try_flush_caps(inode, &flush_tid); if (dirty) err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } else { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; spin_lock(&ci->i_ceph_lock); if (__ceph_caps_dirty(ci)) __cap_delay_requeue_front(mdsc, ci); spin_unlock(&ci->i_ceph_lock); } return err; } static void __kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_inode_info *ci, u64 oldest_flush_tid) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap *cap; struct ceph_cap_flush *cf; int ret; u64 first_tid = 0; u64 last_snap_flush = 0; /* Don't do anything until create reply comes in */ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) return; ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) { if (cf->is_capsnap) { last_snap_flush = cf->tid; break; } } list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { if (cf->tid < first_tid) continue; cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err_client(cl, "%p auth cap %p not mds%d ???\n", inode, cap, session->s_mds); break; } first_tid = cf->tid + 1; if (!cf->is_capsnap) { struct cap_msg_args arg; doutc(cl, "%p %llx.%llx cap %p tid %llu %s\n", inode, ceph_vinop(inode), cap, cf->tid, ceph_cap_string(cf->caps)); __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, (cf->tid < last_snap_flush ? CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0), __ceph_caps_used(ci), __ceph_caps_wanted(ci), (cap->issued | cap->implemented), cf->caps, cf->tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); __send_cap(&arg, ci); } else { struct ceph_cap_snap *capsnap = container_of(cf, struct ceph_cap_snap, cap_flush); doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n", inode, ceph_vinop(inode), capsnap, cf->tid, ceph_cap_string(capsnap->dirty)); refcount_inc(&capsnap->nref); spin_unlock(&ci->i_ceph_lock); ret = __send_flush_snap(inode, session, capsnap, cap->mseq, oldest_flush_tid); if (ret < 0) { pr_err_client(cl, "error sending cap flushsnap," " %p %llx.%llx tid %llu follows %llu\n", inode, ceph_vinop(inode), cf->tid, capsnap->follows); } ceph_put_cap_snap(capsnap); } spin_lock(&ci->i_ceph_lock); } } void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct ceph_cap *cap; u64 oldest_flush_tid; doutc(cl, "mds%d\n", session->s_mds); spin_lock(&mdsc->cap_dirty_lock); oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { struct inode *inode = &ci->netfs.inode; spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n", inode, ceph_vinop(inode), cap, session->s_mds); spin_unlock(&ci->i_ceph_lock); continue; } /* * if flushing caps were revoked, we re-send the cap flush * in client reconnect stage. This guarantees MDS * processes * the cap flush message before issuing the flushing caps to * other client. */ if ((cap->issued & ci->i_flushing_caps) != ci->i_flushing_caps) { /* encode_caps_cb() also will reset these sequence * numbers. make sure sequence numbers in cap flush * message match later reconnect message */ cap->seq = 0; cap->issue_seq = 0; cap->mseq = 0; __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); } else { ci->i_ceph_flags |= CEPH_I_KICK_FLUSH; } spin_unlock(&ci->i_ceph_lock); } } void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct ceph_cap *cap; u64 oldest_flush_tid; lockdep_assert_held(&session->s_mutex); doutc(cl, "mds%d\n", session->s_mds); spin_lock(&mdsc->cap_dirty_lock); oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { struct inode *inode = &ci->netfs.inode; spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n", inode, ceph_vinop(inode), cap, session->s_mds); spin_unlock(&ci->i_ceph_lock); continue; } if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); } spin_unlock(&ci->i_ceph_lock); } } void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, struct ceph_inode_info *ci) { struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_cap *cap = ci->i_auth_cap; struct inode *inode = &ci->netfs.inode; lockdep_assert_held(&ci->i_ceph_lock); doutc(mdsc->fsc->client, "%p %llx.%llx flushing %s\n", inode, ceph_vinop(inode), ceph_cap_string(ci->i_flushing_caps)); if (!list_empty(&ci->i_cap_flush_list)) { u64 oldest_flush_tid; spin_lock(&mdsc->cap_dirty_lock); list_move_tail(&ci->i_flushing_item, &cap->session->s_cap_flushing); oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); } } /* * Take references to capabilities we hold, so that we don't release * them to the MDS prematurely. */ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, bool snap_rwsem_locked) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); lockdep_assert_held(&ci->i_ceph_lock); if (got & CEPH_CAP_PIN) ci->i_pin_ref++; if (got & CEPH_CAP_FILE_RD) ci->i_rd_ref++; if (got & CEPH_CAP_FILE_CACHE) ci->i_rdcache_ref++; if (got & CEPH_CAP_FILE_EXCL) ci->i_fx_ref++; if (got & CEPH_CAP_FILE_WR) { if (ci->i_wr_ref == 0 && !ci->i_head_snapc) { BUG_ON(!snap_rwsem_locked); ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); } ci->i_wr_ref++; } if (got & CEPH_CAP_FILE_BUFFER) { if (ci->i_wb_ref == 0) ihold(inode); ci->i_wb_ref++; doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode, ceph_vinop(inode), ci->i_wb_ref-1, ci->i_wb_ref); } } /* * Try to grab cap references. Specify those refs we @want, and the * minimal set we @need. Also include the larger offset we are writing * to (when applicable), and check against max_size here as well. * Note that caller is responsible for ensuring max_size increases are * requested from the MDS. * * Returns 0 if caps were not able to be acquired (yet), 1 if succeed, * or a negative error code. There are 3 speical error codes: * -EAGAIN: need to sleep but non-blocking is specified * -EFBIG: ask caller to call check_max_size() and try again. * -EUCLEAN: ask caller to call ceph_renew_caps() and try again. */ enum { /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */ NON_BLOCKING = (1 << 8), CHECK_FILELOCK = (1 << 9), }; static int try_get_cap_refs(struct inode *inode, int need, int want, loff_t endoff, int flags, int *got) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); int ret = 0; int have, implemented; bool snap_rwsem_locked = false; doutc(cl, "%p %llx.%llx need %s want %s\n", inode, ceph_vinop(inode), ceph_cap_string(need), ceph_cap_string(want)); again: spin_lock(&ci->i_ceph_lock); if ((flags & CHECK_FILELOCK) && (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) { doutc(cl, "%p %llx.%llx error filelock\n", inode, ceph_vinop(inode)); ret = -EIO; goto out_unlock; } /* finish pending truncate */ while (ci->i_truncate_pending) { spin_unlock(&ci->i_ceph_lock); if (snap_rwsem_locked) { up_read(&mdsc->snap_rwsem); snap_rwsem_locked = false; } __ceph_do_pending_vmtruncate(inode); spin_lock(&ci->i_ceph_lock); } have = __ceph_caps_issued(ci, &implemented); if (have & need & CEPH_CAP_FILE_WR) { if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { doutc(cl, "%p %llx.%llx endoff %llu > maxsize %llu\n", inode, ceph_vinop(inode), endoff, ci->i_max_size); if (endoff > ci->i_requested_max_size) ret = ci->i_auth_cap ? -EFBIG : -EUCLEAN; goto out_unlock; } /* * If a sync write is in progress, we must wait, so that we * can get a final snapshot value for size+mtime. */ if (__ceph_have_pending_cap_snap(ci)) { doutc(cl, "%p %llx.%llx cap_snap_pending\n", inode, ceph_vinop(inode)); goto out_unlock; } } if ((have & need) == need) { /* * Look at (implemented & ~have & not) so that we keep waiting * on transition from wanted -> needed caps. This is needed * for WRBUFFER|WR -> WR to avoid a new WR sync write from * going before a prior buffered writeback happens. * * For RDCACHE|RD -> RD, there is not need to wait and we can * just exclude the revoking caps and force to sync read. */ int not = want & ~(have & need); int revoking = implemented & ~have; int exclude = revoking & not; doutc(cl, "%p %llx.%llx have %s but not %s (revoking %s)\n", inode, ceph_vinop(inode), ceph_cap_string(have), ceph_cap_string(not), ceph_cap_string(revoking)); if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) { if (!snap_rwsem_locked && !ci->i_head_snapc && (need & CEPH_CAP_FILE_WR)) { if (!down_read_trylock(&mdsc->snap_rwsem)) { /* * we can not call down_read() when * task isn't in TASK_RUNNING state */ if (flags & NON_BLOCKING) { ret = -EAGAIN; goto out_unlock; } spin_unlock(&ci->i_ceph_lock); down_read(&mdsc->snap_rwsem); snap_rwsem_locked = true; goto again; } snap_rwsem_locked = true; } if ((have & want) == want) *got = need | (want & ~exclude); else *got = need; ceph_take_cap_refs(ci, *got, true); ret = 1; } } else { int session_readonly = false; int mds_wanted; if (ci->i_auth_cap && (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) { struct ceph_mds_session *s = ci->i_auth_cap->session; spin_lock(&s->s_cap_lock); session_readonly = s->s_readonly; spin_unlock(&s->s_cap_lock); } if (session_readonly) { doutc(cl, "%p %llx.%llx need %s but mds%d readonly\n", inode, ceph_vinop(inode), ceph_cap_string(need), ci->i_auth_cap->mds); ret = -EROFS; goto out_unlock; } if (ceph_inode_is_shutdown(inode)) { doutc(cl, "%p %llx.%llx inode is shutdown\n", inode, ceph_vinop(inode)); ret = -ESTALE; goto out_unlock; } mds_wanted = __ceph_caps_mds_wanted(ci, false); if (need & ~mds_wanted) { doutc(cl, "%p %llx.%llx need %s > mds_wanted %s\n", inode, ceph_vinop(inode), ceph_cap_string(need), ceph_cap_string(mds_wanted)); ret = -EUCLEAN; goto out_unlock; } doutc(cl, "%p %llx.%llx have %s need %s\n", inode, ceph_vinop(inode), ceph_cap_string(have), ceph_cap_string(need)); } out_unlock: __ceph_touch_fmode(ci, mdsc, flags); spin_unlock(&ci->i_ceph_lock); if (snap_rwsem_locked) up_read(&mdsc->snap_rwsem); if (!ret) ceph_update_cap_mis(&mdsc->metric); else if (ret == 1) ceph_update_cap_hit(&mdsc->metric); doutc(cl, "%p %llx.%llx ret %d got %s\n", inode, ceph_vinop(inode), ret, ceph_cap_string(*got)); return ret; } /* * Check the offset we are writing up to against our current * max_size. If necessary, tell the MDS we want to write to * a larger offset. */ static void check_max_size(struct inode *inode, loff_t endoff) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); int check = 0; /* do we need to explicitly request a larger max_size? */ spin_lock(&ci->i_ceph_lock); if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) { doutc(cl, "write %p %llx.%llx at large endoff %llu, req max_size\n", inode, ceph_vinop(inode), endoff); ci->i_wanted_max_size = endoff; } /* duplicate ceph_check_caps()'s logic */ if (ci->i_auth_cap && (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) && ci->i_wanted_max_size > ci->i_max_size && ci->i_wanted_max_size > ci->i_requested_max_size) check = 1; spin_unlock(&ci->i_ceph_lock); if (check) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY); } static inline int get_used_fmode(int caps) { int fmode = 0; if (caps & CEPH_CAP_FILE_RD) fmode |= CEPH_FILE_MODE_RD; if (caps & CEPH_CAP_FILE_WR) fmode |= CEPH_FILE_MODE_WR; return fmode; } int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got) { int ret, flags; BUG_ON(need & ~CEPH_CAP_FILE_RD); BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO | CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | CEPH_CAP_ANY_DIR_OPS)); if (need) { ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; } flags = get_used_fmode(need | want); if (nonblock) flags |= NON_BLOCKING; ret = try_get_cap_refs(inode, need, want, 0, flags, got); /* three special error codes */ if (ret == -EAGAIN || ret == -EFBIG || ret == -EUCLEAN) ret = 0; return ret; } /* * Wait for caps, and take cap references. If we can't get a WR cap * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. */ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, int want, loff_t endoff, int *got) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); int ret, _got, flags; ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; if (fi && (fi->fmode & CEPH_FILE_MODE_WR) && fi->filp_gen != READ_ONCE(fsc->filp_gen)) return -EBADF; flags = get_used_fmode(need | want); while (true) { flags &= CEPH_FILE_MODE_MASK; if (vfs_inode_has_locks(inode)) flags |= CHECK_FILELOCK; _got = 0; ret = try_get_cap_refs(inode, need, want, endoff, flags, &_got); WARN_ON_ONCE(ret == -EAGAIN); if (!ret) { struct ceph_mds_client *mdsc = fsc->mdsc; struct cap_wait cw; DEFINE_WAIT_FUNC(wait, woken_wake_function); cw.ino = ceph_ino(inode); cw.tgid = current->tgid; cw.need = need; cw.want = want; spin_lock(&mdsc->caps_list_lock); list_add(&cw.list, &mdsc->cap_wait_list); spin_unlock(&mdsc->caps_list_lock); /* make sure used fmode not timeout */ ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS); add_wait_queue(&ci->i_cap_wq, &wait); flags |= NON_BLOCKING; while (!(ret = try_get_cap_refs(inode, need, want, endoff, flags, &_got))) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; } wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(&ci->i_cap_wq, &wait); ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS); spin_lock(&mdsc->caps_list_lock); list_del(&cw.list); spin_unlock(&mdsc->caps_list_lock); if (ret == -EAGAIN) continue; } if (fi && (fi->fmode & CEPH_FILE_MODE_WR) && fi->filp_gen != READ_ONCE(fsc->filp_gen)) { if (ret >= 0 && _got) ceph_put_cap_refs(ci, _got); return -EBADF; } if (ret < 0) { if (ret == -EFBIG || ret == -EUCLEAN) { int ret2 = ceph_wait_on_async_create(inode); if (ret2 < 0) return ret2; } if (ret == -EFBIG) { check_max_size(inode, endoff); continue; } if (ret == -EUCLEAN) { /* session was killed, try renew caps */ ret = ceph_renew_caps(inode, flags); if (ret == 0) continue; } return ret; } if (S_ISREG(ci->netfs.inode.i_mode) && ceph_has_inline_data(ci) && (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && i_size_read(inode) > 0) { struct page *page = find_get_page(inode->i_mapping, 0); if (page) { bool uptodate = PageUptodate(page); put_page(page); if (uptodate) break; } /* * drop cap refs first because getattr while * holding * caps refs can cause deadlock. */ ceph_put_cap_refs(ci, _got); _got = 0; /* * getattr request will bring inline data into * page cache */ ret = __ceph_do_getattr(inode, NULL, CEPH_STAT_CAP_INLINE_DATA, true); if (ret < 0) return ret; continue; } break; } *got = _got; return 0; } int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got) { struct ceph_file_info *fi = filp->private_data; struct inode *inode = file_inode(filp); return __ceph_get_caps(inode, fi, need, want, endoff, got); } /* * Take cap refs. Caller must already know we hold at least one ref * on the caps in question or we don't know this is safe. */ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) { spin_lock(&ci->i_ceph_lock); ceph_take_cap_refs(ci, caps, false); spin_unlock(&ci->i_ceph_lock); } /* * drop cap_snap that is not associated with any snapshot. * we don't need to send FLUSHSNAP message for it. */ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); if (!capsnap->need_flush && !capsnap->writing && !capsnap->dirty_pages) { doutc(cl, "%p follows %llu\n", capsnap, capsnap->follows); BUG_ON(capsnap->cap_flush.tid > 0); ceph_put_snap_context(capsnap->context); if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps)) ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; list_del(&capsnap->ci_item); ceph_put_cap_snap(capsnap); return 1; } return 0; } enum put_cap_refs_mode { PUT_CAP_REFS_SYNC = 0, PUT_CAP_REFS_NO_CHECK, PUT_CAP_REFS_ASYNC, }; /* * Release cap refs. * * If we released the last ref on any given cap, call ceph_check_caps * to release (or schedule a release). * * If we are releasing a WR cap (from a sync write), finalize any affected * cap_snap, and wake up any waiters. */ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, enum put_cap_refs_mode mode) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); int last = 0, put = 0, flushsnaps = 0, wake = 0; bool check_flushsnaps = false; spin_lock(&ci->i_ceph_lock); if (had & CEPH_CAP_PIN) --ci->i_pin_ref; if (had & CEPH_CAP_FILE_RD) if (--ci->i_rd_ref == 0) last++; if (had & CEPH_CAP_FILE_CACHE) if (--ci->i_rdcache_ref == 0) last++; if (had & CEPH_CAP_FILE_EXCL) if (--ci->i_fx_ref == 0) last++; if (had & CEPH_CAP_FILE_BUFFER) { if (--ci->i_wb_ref == 0) { last++; /* put the ref held by ceph_take_cap_refs() */ put++; check_flushsnaps = true; } doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode, ceph_vinop(inode), ci->i_wb_ref+1, ci->i_wb_ref); } if (had & CEPH_CAP_FILE_WR) { if (--ci->i_wr_ref == 0) { /* * The Fb caps will always be took and released * together with the Fw caps. */ WARN_ON_ONCE(ci->i_wb_ref); last++; check_flushsnaps = true; if (ci->i_wrbuffer_ref_head == 0 && ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } /* see comment in __ceph_remove_cap() */ if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) ceph_change_snap_realm(inode, NULL); } } if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) { struct ceph_cap_snap *capsnap = list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap, ci_item); capsnap->writing = 0; if (ceph_try_drop_cap_snap(ci, capsnap)) /* put the ref held by ceph_queue_cap_snap() */ put++; else if (__ceph_finish_cap_snap(ci, capsnap)) flushsnaps = 1; wake = 1; } spin_unlock(&ci->i_ceph_lock); doutc(cl, "%p %llx.%llx had %s%s%s\n", inode, ceph_vinop(inode), ceph_cap_string(had), last ? " last" : "", put ? " put" : ""); switch (mode) { case PUT_CAP_REFS_SYNC: if (last) ceph_check_caps(ci, 0); else if (flushsnaps) ceph_flush_snaps(ci, NULL); break; case PUT_CAP_REFS_ASYNC: if (last) ceph_queue_check_caps(inode); else if (flushsnaps) ceph_queue_flush_snaps(inode); break; default: break; } if (wake) wake_up_all(&ci->i_cap_wq); while (put-- > 0) iput(inode); } void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) { __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_SYNC); } void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had) { __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC); } void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had) { __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_NO_CHECK); } /* * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap * context. Adjust per-snap dirty page accounting as appropriate. * Once all dirty data for a cap_snap is flushed, flush snapped file * metadata back to the MDS. If we dropped the last ref, call * ceph_check_caps. */ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap_snap *capsnap = NULL, *iter; int put = 0; bool last = false; bool flush_snaps = false; bool complete_capsnap = false; spin_lock(&ci->i_ceph_lock); ci->i_wrbuffer_ref -= nr; if (ci->i_wrbuffer_ref == 0) { last = true; put++; } if (ci->i_head_snapc == snapc) { ci->i_wrbuffer_ref_head -= nr; if (ci->i_wrbuffer_ref_head == 0 && ci->i_wr_ref == 0 && ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } doutc(cl, "on %p %llx.%llx head %d/%d -> %d/%d %s\n", inode, ceph_vinop(inode), ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr, ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, last ? " LAST" : ""); } else { list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { if (iter->context == snapc) { capsnap = iter; break; } } if (!capsnap) { /* * The capsnap should already be removed when removing * auth cap in the case of a forced unmount. */ WARN_ON_ONCE(ci->i_auth_cap); goto unlock; } capsnap->dirty_pages -= nr; if (capsnap->dirty_pages == 0) { complete_capsnap = true; if (!capsnap->writing) { if (ceph_try_drop_cap_snap(ci, capsnap)) { put++; } else { ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; flush_snaps = true; } } } doutc(cl, "%p %llx.%llx cap_snap %p snap %lld %d/%d -> %d/%d %s%s\n", inode, ceph_vinop(inode), capsnap, capsnap->context->seq, ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, ci->i_wrbuffer_ref, capsnap->dirty_pages, last ? " (wrbuffer last)" : "", complete_capsnap ? " (complete capsnap)" : ""); } unlock: spin_unlock(&ci->i_ceph_lock); if (last) { ceph_check_caps(ci, 0); } else if (flush_snaps) { ceph_flush_snaps(ci, NULL); } if (complete_capsnap) wake_up_all(&ci->i_cap_wq); while (put-- > 0) { iput(inode); } } /* * Invalidate unlinked inode's aliases, so we can drop the inode ASAP. */ static void invalidate_aliases(struct inode *inode) { struct ceph_client *cl = ceph_inode_to_client(inode); struct dentry *dn, *prev = NULL; doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); d_prune_aliases(inode); /* * For non-directory inode, d_find_alias() only returns * hashed dentry. After calling d_invalidate(), the * dentry becomes unhashed. * * For directory inode, d_find_alias() can return * unhashed dentry. But directory inode should have * one alias at most. */ while ((dn = d_find_alias(inode))) { if (dn == prev) { dput(dn); break; } d_invalidate(dn); if (prev) dput(prev); prev = dn; } if (prev) dput(prev); } struct cap_extra_info { struct ceph_string *pool_ns; /* inline data */ u64 inline_version; void *inline_data; u32 inline_len; /* dirstat */ bool dirstat_valid; u64 nfiles; u64 nsubdirs; u64 change_attr; /* currently issued */ int issued; struct timespec64 btime; u8 *fscrypt_auth; u32 fscrypt_auth_len; u64 fscrypt_file_size; }; /* * Handle a cap GRANT message from the MDS. (Note that a GRANT may * actually be a revocation if it specifies a smaller cap set.) * * caller holds s_mutex and i_ceph_lock, we drop both. */ static void handle_cap_grant(struct inode *inode, struct ceph_mds_session *session, struct ceph_cap *cap, struct ceph_mds_caps *grant, struct ceph_buffer *xattr_buf, struct cap_extra_info *extra_info) __releases(ci->i_ceph_lock) __releases(session->s_mdsc->snap_rwsem) { struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); int seq = le32_to_cpu(grant->seq); int newcaps = le32_to_cpu(grant->caps); int used, wanted, dirty; u64 size = le64_to_cpu(grant->size); u64 max_size = le64_to_cpu(grant->max_size); unsigned char check_caps = 0; bool was_stale = cap->cap_gen < atomic_read(&session->s_cap_gen); bool wake = false; bool writeback = false; bool queue_trunc = false; bool queue_invalidate = false; bool deleted_inode = false; bool fill_inline = false; /* * If there is at least one crypto block then we'll trust * fscrypt_file_size. If the real length of the file is 0, then * ignore it (it has probably been truncated down to 0 by the MDS). */ if (IS_ENCRYPTED(inode) && size) size = extra_info->fscrypt_file_size; doutc(cl, "%p %llx.%llx cap %p mds%d seq %d %s\n", inode, ceph_vinop(inode), cap, session->s_mds, seq, ceph_cap_string(newcaps)); doutc(cl, " size %llu max_size %llu, i_size %llu\n", size, max_size, i_size_read(inode)); /* * If CACHE is being revoked, and we have no dirty buffers, * try to invalidate (once). (If there are dirty buffers, we * will invalidate _after_ writeback.) */ if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */ ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { if (try_nonblocking_invalidate(inode)) { /* there were locked pages.. invalidate later in a separate thread. */ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { queue_invalidate = true; ci->i_rdcache_revoking = ci->i_rdcache_gen; } } } if (was_stale) cap->issued = cap->implemented = CEPH_CAP_PIN; /* * auth mds of the inode changed. we received the cap export message, * but still haven't received the cap import message. handle_cap_export * updated the new auth MDS' cap. * * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message * that was sent before the cap import message. So don't remove caps. */ if (ceph_seq_cmp(seq, cap->seq) <= 0) { WARN_ON(cap != ci->i_auth_cap); WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id)); seq = cap->seq; newcaps |= cap->issued; } /* side effects now are allowed */ cap->cap_gen = atomic_read(&session->s_cap_gen); cap->seq = seq; __check_cap_issue(ci, cap, newcaps); inode_set_max_iversion_raw(inode, extra_info->change_attr); if ((newcaps & CEPH_CAP_AUTH_SHARED) && (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) { umode_t mode = le32_to_cpu(grant->mode); if (inode_wrong_type(inode, mode)) pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n", ceph_vinop(inode), inode->i_mode, mode); else inode->i_mode = mode; inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); ci->i_btime = extra_info->btime; doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode, ceph_vinop(inode), inode->i_mode, from_kuid(&init_user_ns, inode->i_uid), from_kgid(&init_user_ns, inode->i_gid)); #if IS_ENABLED(CONFIG_FS_ENCRYPTION) if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len || memcmp(ci->fscrypt_auth, extra_info->fscrypt_auth, ci->fscrypt_auth_len)) pr_warn_ratelimited_client(cl, "cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n", ci->fscrypt_auth_len, extra_info->fscrypt_auth_len); #endif } if ((newcaps & CEPH_CAP_LINK_SHARED) && (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) { set_nlink(inode, le32_to_cpu(grant->nlink)); if (inode->i_nlink == 0) deleted_inode = true; } if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { int len = le32_to_cpu(grant->xattr_len); u64 version = le64_to_cpu(grant->xattr_version); if (version > ci->i_xattrs.version) { doutc(cl, " got new xattrs v%llu on %p %llx.%llx len %d\n", version, inode, ceph_vinop(inode), len); if (ci->i_xattrs.blob) ceph_buffer_put(ci->i_xattrs.blob); ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); ci->i_xattrs.version = version; ceph_forget_all_cached_acls(inode); ceph_security_invalidate_secctx(inode); } } if (newcaps & CEPH_CAP_ANY_RD) { struct timespec64 mtime, atime, ctime; /* ctime/mtime/atime? */ ceph_decode_timespec64(&mtime, &grant->mtime); ceph_decode_timespec64(&atime, &grant->atime); ceph_decode_timespec64(&ctime, &grant->ctime); ceph_fill_file_time(inode, extra_info->issued, le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, &atime); } if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) { ci->i_files = extra_info->nfiles; ci->i_subdirs = extra_info->nsubdirs; } if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { /* file layout may have changed */ s64 old_pool = ci->i_layout.pool_id; struct ceph_string *old_ns; ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout); old_ns = rcu_dereference_protected(ci->i_layout.pool_ns, lockdep_is_held(&ci->i_ceph_lock)); rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns); if (ci->i_layout.pool_id != old_pool || extra_info->pool_ns != old_ns) ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; extra_info->pool_ns = old_ns; /* size/truncate_seq? */ queue_trunc = ceph_fill_file_size(inode, extra_info->issued, le32_to_cpu(grant->truncate_seq), le64_to_cpu(grant->truncate_size), size); } if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) { if (max_size != ci->i_max_size) { doutc(cl, "max_size %lld -> %llu\n", ci->i_max_size, max_size); ci->i_max_size = max_size; if (max_size >= ci->i_wanted_max_size) { ci->i_wanted_max_size = 0; /* reset */ ci->i_requested_max_size = 0; } wake = true; } } /* check cap bits */ wanted = __ceph_caps_wanted(ci); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); doutc(cl, " my wanted = %s, used = %s, dirty %s\n", ceph_cap_string(wanted), ceph_cap_string(used), ceph_cap_string(dirty)); if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) && (wanted & ~(cap->mds_wanted | newcaps))) { /* * If mds is importing cap, prior cap messages that update * 'wanted' may get dropped by mds (migrate seq mismatch). * * We don't send cap message to update 'wanted' if what we * want are already issued. If mds revokes caps, cap message * that releases caps also tells mds what we want. But if * caps got revoked by mds forcedly (session stale). We may * haven't told mds what we want. */ check_caps = 1; } /* revocation, grant, or no-op? */ if (cap->issued & ~newcaps) { int revoking = cap->issued & ~newcaps; doutc(cl, "revocation: %s -> %s (revoking %s)\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps), ceph_cap_string(revoking)); if (S_ISREG(inode->i_mode) && (revoking & used & CEPH_CAP_FILE_BUFFER)) writeback = true; /* initiate writeback; will delay ack */ else if (queue_invalidate && revoking == CEPH_CAP_FILE_CACHE && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0) ; /* do nothing yet, invalidation will be queued */ else if (cap == ci->i_auth_cap) check_caps = 1; /* check auth cap only */ else check_caps = 2; /* check all caps */ /* If there is new caps, try to wake up the waiters */ if (~cap->issued & newcaps) wake = true; cap->issued = newcaps; cap->implemented |= newcaps; } else if (cap->issued == newcaps) { doutc(cl, "caps unchanged: %s -> %s\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); } else { doutc(cl, "grant: %s -> %s\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); /* non-auth MDS is revoking the newly grant caps ? */ if (cap == ci->i_auth_cap && __ceph_caps_revoking_other(ci, cap, newcaps)) check_caps = 2; cap->issued = newcaps; cap->implemented |= newcaps; /* add bits only, to * avoid stepping on a * pending revocation */ wake = true; } BUG_ON(cap->issued & ~cap->implemented); /* don't let check_caps skip sending a response to MDS for revoke msgs */ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) { cap->mds_wanted = 0; if (cap == ci->i_auth_cap) check_caps = 1; /* check auth cap only */ else check_caps = 2; /* check all caps */ } if (extra_info->inline_version > 0 && extra_info->inline_version >= ci->i_inline_version) { ci->i_inline_version = extra_info->inline_version; if (ci->i_inline_version != CEPH_INLINE_NONE && (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO))) fill_inline = true; } if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { if (ci->i_auth_cap == cap) { if (newcaps & ~extra_info->issued) wake = true; if (ci->i_requested_max_size > max_size || !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { /* re-request max_size if necessary */ ci->i_requested_max_size = 0; wake = true; } ceph_kick_flushing_inode_caps(session, ci); } up_read(&session->s_mdsc->snap_rwsem); } spin_unlock(&ci->i_ceph_lock); if (fill_inline) ceph_fill_inline_data(inode, NULL, extra_info->inline_data, extra_info->inline_len); if (queue_trunc) ceph_queue_vmtruncate(inode); if (writeback) /* * queue inode for writeback: we can't actually call * filemap_write_and_wait, etc. from message handler * context. */ ceph_queue_writeback(inode); if (queue_invalidate) ceph_queue_invalidate(inode); if (deleted_inode) invalidate_aliases(inode); if (wake) wake_up_all(&ci->i_cap_wq); mutex_unlock(&session->s_mutex); if (check_caps == 1) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL); else if (check_caps == 2) ceph_check_caps(ci, CHECK_CAPS_NOINVAL); } /* * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the * MDS has been safely committed. */ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_caps *m, struct ceph_mds_session *session, struct ceph_cap *cap) __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap_flush *cf, *tmp_cf; LIST_HEAD(to_remove); unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; bool drop = false; bool wake_ci = false; bool wake_mdsc = false; list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { /* Is this the one that was flushed? */ if (cf->tid == flush_tid) cleaned = cf->caps; /* Is this a capsnap? */ if (cf->is_capsnap) continue; if (cf->tid <= flush_tid) { /* * An earlier or current tid. The FLUSH_ACK should * represent a superset of this flush's caps. */ wake_ci |= __detach_cap_flush_from_ci(ci, cf); list_add_tail(&cf->i_list, &to_remove); } else { /* * This is a later one. Any caps in it are still dirty * so don't count them as cleaned. */ cleaned &= ~cf->caps; if (!cleaned) break; } } doutc(cl, "%p %llx.%llx mds%d seq %d on %s cleaned %s, flushing %s -> %s\n", inode, ceph_vinop(inode), session->s_mds, seq, ceph_cap_string(dirty), ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(ci->i_flushing_caps & ~cleaned)); if (list_empty(&to_remove) && !cleaned) goto out; ci->i_flushing_caps &= ~cleaned; spin_lock(&mdsc->cap_dirty_lock); list_for_each_entry(cf, &to_remove, i_list) wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf); if (ci->i_flushing_caps == 0) { if (list_empty(&ci->i_cap_flush_list)) { list_del_init(&ci->i_flushing_item); if (!list_empty(&session->s_cap_flushing)) { struct inode *inode = &list_first_entry(&session->s_cap_flushing, struct ceph_inode_info, i_flushing_item)->netfs.inode; doutc(cl, " mds%d still flushing cap on %p %llx.%llx\n", session->s_mds, inode, ceph_vinop(inode)); } } mdsc->num_cap_flushing--; doutc(cl, " %p %llx.%llx now !flushing\n", inode, ceph_vinop(inode)); if (ci->i_dirty_caps == 0) { doutc(cl, " %p %llx.%llx now clean\n", inode, ceph_vinop(inode)); BUG_ON(!list_empty(&ci->i_dirty_item)); drop = true; if (ci->i_wr_ref == 0 && ci->i_wrbuffer_ref_head == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } } else { BUG_ON(list_empty(&ci->i_dirty_item)); } } spin_unlock(&mdsc->cap_dirty_lock); out: spin_unlock(&ci->i_ceph_lock); while (!list_empty(&to_remove)) { cf = list_first_entry(&to_remove, struct ceph_cap_flush, i_list); list_del_init(&cf->i_list); if (!cf->is_capsnap) ceph_free_cap_flush(cf); } if (wake_ci) wake_up_all(&ci->i_cap_wq); if (wake_mdsc) wake_up_all(&mdsc->cap_flushing_wq); if (drop) iput(inode); } void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, bool *wake_ci, bool *wake_mdsc) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = mdsc->fsc->client; bool ret; lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "removing capsnap %p, %p %llx.%llx ci %p\n", capsnap, inode, ceph_vinop(inode), ci); list_del_init(&capsnap->ci_item); ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush); if (wake_ci) *wake_ci = ret; spin_lock(&mdsc->cap_dirty_lock); if (list_empty(&ci->i_cap_flush_list)) list_del_init(&ci->i_flushing_item); ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush); if (wake_mdsc) *wake_mdsc = ret; spin_unlock(&mdsc->cap_dirty_lock); } void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, bool *wake_ci, bool *wake_mdsc) { struct ceph_inode_info *ci = ceph_inode(inode); lockdep_assert_held(&ci->i_ceph_lock); WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing); __ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc); } /* * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can * throw away our cap_snap. * * Caller hold s_mutex. */ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_caps *m, struct ceph_mds_session *session) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = mdsc->fsc->client; u64 follows = le64_to_cpu(m->snap_follows); struct ceph_cap_snap *capsnap = NULL, *iter; bool wake_ci = false; bool wake_mdsc = false; doutc(cl, "%p %llx.%llx ci %p mds%d follows %lld\n", inode, ceph_vinop(inode), ci, session->s_mds, follows); spin_lock(&ci->i_ceph_lock); list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { if (iter->follows == follows) { if (iter->cap_flush.tid != flush_tid) { doutc(cl, " cap_snap %p follows %lld " "tid %lld != %lld\n", iter, follows, flush_tid, iter->cap_flush.tid); break; } capsnap = iter; break; } else { doutc(cl, " skipping cap_snap %p follows %lld\n", iter, iter->follows); } } if (capsnap) ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc); spin_unlock(&ci->i_ceph_lock); if (capsnap) { ceph_put_snap_context(capsnap->context); ceph_put_cap_snap(capsnap); if (wake_ci) wake_up_all(&ci->i_cap_wq); if (wake_mdsc) wake_up_all(&mdsc->cap_flushing_wq); iput(inode); } } /* * Handle TRUNC from MDS, indicating file truncation. * * caller hold s_mutex. */ static bool handle_cap_trunc(struct inode *inode, struct ceph_mds_caps *trunc, struct ceph_mds_session *session, struct cap_extra_info *extra_info) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); int mds = session->s_mds; int seq = le32_to_cpu(trunc->seq); u32 truncate_seq = le32_to_cpu(trunc->truncate_seq); u64 truncate_size = le64_to_cpu(trunc->truncate_size); u64 size = le64_to_cpu(trunc->size); int implemented = 0; int dirty = __ceph_caps_dirty(ci); int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); bool queue_trunc = false; lockdep_assert_held(&ci->i_ceph_lock); issued |= implemented | dirty; /* * If there is at least one crypto block then we'll trust * fscrypt_file_size. If the real length of the file is 0, then * ignore it (it has probably been truncated down to 0 by the MDS). */ if (IS_ENCRYPTED(inode) && size) size = extra_info->fscrypt_file_size; doutc(cl, "%p %llx.%llx mds%d seq %d to %lld truncate seq %d\n", inode, ceph_vinop(inode), mds, seq, truncate_size, truncate_seq); queue_trunc = ceph_fill_file_size(inode, issued, truncate_seq, truncate_size, size); return queue_trunc; } /* * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a * different one. If we are the most recent migration we've seen (as * indicated by mseq), make note of the migrating cap bits for the * duration (until we see the corresponding IMPORT). * * caller holds s_mutex */ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session) { struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_session *tsession = NULL; struct ceph_cap *cap, *tcap, *new_cap = NULL; struct ceph_inode_info *ci = ceph_inode(inode); u64 t_cap_id; unsigned mseq = le32_to_cpu(ex->migrate_seq); unsigned t_seq, t_mseq; int target, issued; int mds = session->s_mds; if (ph) { t_cap_id = le64_to_cpu(ph->cap_id); t_seq = le32_to_cpu(ph->seq); t_mseq = le32_to_cpu(ph->mseq); target = le32_to_cpu(ph->mds); } else { t_cap_id = t_seq = t_mseq = 0; target = -1; } doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d target %d\n", inode, ceph_vinop(inode), ci, mds, mseq, target); retry: down_read(&mdsc->snap_rwsem); spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id)) goto out_unlock; if (target < 0) { ceph_remove_cap(mdsc, cap, false); goto out_unlock; } /* * now we know we haven't received the cap import message yet * because the exported cap still exist. */ issued = cap->issued; if (issued != cap->implemented) pr_err_ratelimited_client(cl, "issued != implemented: " "%p %llx.%llx mds%d seq %d mseq %d" " issued %s implemented %s\n", inode, ceph_vinop(inode), mds, cap->seq, cap->mseq, ceph_cap_string(issued), ceph_cap_string(cap->implemented)); tcap = __get_cap_for_mds(ci, target); if (tcap) { /* already have caps from the target */ if (tcap->cap_id == t_cap_id && ceph_seq_cmp(tcap->seq, t_seq) < 0) { doutc(cl, " updating import cap %p mds%d\n", tcap, target); tcap->cap_id = t_cap_id; tcap->seq = t_seq - 1; tcap->issue_seq = t_seq - 1; tcap->issued |= issued; tcap->implemented |= issued; if (cap == ci->i_auth_cap) { ci->i_auth_cap = tcap; change_auth_cap_ses(ci, tcap->session); } } ceph_remove_cap(mdsc, cap, false); goto out_unlock; } else if (tsession) { /* add placeholder for the export tagert */ int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; tcap = new_cap; ceph_add_cap(inode, tsession, t_cap_id, issued, 0, t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); if (!list_empty(&ci->i_cap_flush_list) && ci->i_auth_cap == tcap) { spin_lock(&mdsc->cap_dirty_lock); list_move_tail(&ci->i_flushing_item, &tcap->session->s_cap_flushing); spin_unlock(&mdsc->cap_dirty_lock); } ceph_remove_cap(mdsc, cap, false); goto out_unlock; } spin_unlock(&ci->i_ceph_lock); up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); /* open target session */ tsession = ceph_mdsc_open_export_target_session(mdsc, target); if (!IS_ERR(tsession)) { if (mds > target) { mutex_lock(&session->s_mutex); mutex_lock_nested(&tsession->s_mutex, SINGLE_DEPTH_NESTING); } else { mutex_lock(&tsession->s_mutex); mutex_lock_nested(&session->s_mutex, SINGLE_DEPTH_NESTING); } new_cap = ceph_get_cap(mdsc, NULL); } else { WARN_ON(1); tsession = NULL; target = -1; mutex_lock(&session->s_mutex); } goto retry; out_unlock: spin_unlock(&ci->i_ceph_lock); up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); if (tsession) { mutex_unlock(&tsession->s_mutex); ceph_put_mds_session(tsession); } if (new_cap) ceph_put_cap(mdsc, new_cap); } /* * Handle cap IMPORT. * * caller holds s_mutex. acquires i_ceph_lock */ static void handle_cap_import(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *im, struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session, struct ceph_cap **target_cap, int *old_issued) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap *cap, *ocap, *new_cap = NULL; int mds = session->s_mds; int issued; unsigned caps = le32_to_cpu(im->caps); unsigned wanted = le32_to_cpu(im->wanted); unsigned seq = le32_to_cpu(im->seq); unsigned mseq = le32_to_cpu(im->migrate_seq); u64 realmino = le64_to_cpu(im->realm); u64 cap_id = le64_to_cpu(im->cap_id); u64 p_cap_id; int peer; if (ph) { p_cap_id = le64_to_cpu(ph->cap_id); peer = le32_to_cpu(ph->mds); } else { p_cap_id = 0; peer = -1; } doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d peer %d\n", inode, ceph_vinop(inode), ci, mds, mseq, peer); retry: cap = __get_cap_for_mds(ci, mds); if (!cap) { if (!new_cap) { spin_unlock(&ci->i_ceph_lock); new_cap = ceph_get_cap(mdsc, NULL); spin_lock(&ci->i_ceph_lock); goto retry; } cap = new_cap; } else { if (new_cap) { ceph_put_cap(mdsc, new_cap); new_cap = NULL; } } __ceph_caps_issued(ci, &issued); issued |= __ceph_caps_dirty(ci); ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, &new_cap); ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; if (ocap && ocap->cap_id == p_cap_id) { doutc(cl, " remove export cap %p mds%d flags %d\n", ocap, peer, ph->flags); if ((ph->flags & CEPH_CAP_FLAG_AUTH) && (ocap->seq != le32_to_cpu(ph->seq) || ocap->mseq != le32_to_cpu(ph->mseq))) { pr_err_ratelimited_client(cl, "mismatched seq/mseq: " "%p %llx.%llx mds%d seq %d mseq %d" " importer mds%d has peer seq %d mseq %d\n", inode, ceph_vinop(inode), peer, ocap->seq, ocap->mseq, mds, le32_to_cpu(ph->seq), le32_to_cpu(ph->mseq)); } ceph_remove_cap(mdsc, ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); } *old_issued = issued; *target_cap = cap; } #ifdef CONFIG_FS_ENCRYPTION static int parse_fscrypt_fields(void **p, void *end, struct cap_extra_info *extra) { u32 len; ceph_decode_32_safe(p, end, extra->fscrypt_auth_len, bad); if (extra->fscrypt_auth_len) { ceph_decode_need(p, end, extra->fscrypt_auth_len, bad); extra->fscrypt_auth = kmalloc(extra->fscrypt_auth_len, GFP_KERNEL); if (!extra->fscrypt_auth) return -ENOMEM; ceph_decode_copy_safe(p, end, extra->fscrypt_auth, extra->fscrypt_auth_len, bad); } ceph_decode_32_safe(p, end, len, bad); if (len >= sizeof(u64)) { ceph_decode_64_safe(p, end, extra->fscrypt_file_size, bad); len -= sizeof(u64); } ceph_decode_skip_n(p, end, len, bad); return 0; bad: return -EIO; } #else static int parse_fscrypt_fields(void **p, void *end, struct cap_extra_info *extra) { u32 len; /* Don't care about these fields unless we're encryption-capable */ ceph_decode_32_safe(p, end, len, bad); if (len) ceph_decode_skip_n(p, end, len, bad); ceph_decode_32_safe(p, end, len, bad); if (len) ceph_decode_skip_n(p, end, len, bad); return 0; bad: return -EIO; } #endif /* * Handle a caps message from the MDS. * * Identify the appropriate session, inode, and call the right handler * based on the cap op. */ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; struct inode *inode; struct ceph_inode_info *ci; struct ceph_cap *cap; struct ceph_mds_caps *h; struct ceph_mds_cap_peer *peer = NULL; struct ceph_snap_realm *realm = NULL; int op; int msg_version = le16_to_cpu(msg->hdr.version); u32 seq, mseq; struct ceph_vino vino; void *snaptrace; size_t snaptrace_len; void *p, *end; struct cap_extra_info extra_info = {}; bool queue_trunc; bool close_sessions = false; bool do_cap_release = false; doutc(cl, "from mds%d\n", session->s_mds); if (!ceph_inc_mds_stopping_blocker(mdsc, session)) return; /* decode */ end = msg->front.iov_base + msg->front.iov_len; if (msg->front.iov_len < sizeof(*h)) goto bad; h = msg->front.iov_base; op = le32_to_cpu(h->op); vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; seq = le32_to_cpu(h->seq); mseq = le32_to_cpu(h->migrate_seq); snaptrace = h + 1; snaptrace_len = le32_to_cpu(h->snap_trace_len); p = snaptrace + snaptrace_len; if (msg_version >= 2) { u32 flock_len; ceph_decode_32_safe(&p, end, flock_len, bad); if (p + flock_len > end) goto bad; p += flock_len; } if (msg_version >= 3) { if (op == CEPH_CAP_OP_IMPORT) { if (p + sizeof(*peer) > end) goto bad; peer = p; p += sizeof(*peer); } else if (op == CEPH_CAP_OP_EXPORT) { /* recorded in unused fields */ peer = (void *)&h->size; } } if (msg_version >= 4) { ceph_decode_64_safe(&p, end, extra_info.inline_version, bad); ceph_decode_32_safe(&p, end, extra_info.inline_len, bad); if (p + extra_info.inline_len > end) goto bad; extra_info.inline_data = p; p += extra_info.inline_len; } if (msg_version >= 5) { struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; u32 epoch_barrier; ceph_decode_32_safe(&p, end, epoch_barrier, bad); ceph_osdc_update_epoch_barrier(osdc, epoch_barrier); } if (msg_version >= 8) { u32 pool_ns_len; /* version >= 6 */ ceph_decode_skip_64(&p, end, bad); // flush_tid /* version >= 7 */ ceph_decode_skip_32(&p, end, bad); // caller_uid ceph_decode_skip_32(&p, end, bad); // caller_gid /* version >= 8 */ ceph_decode_32_safe(&p, end, pool_ns_len, bad); if (pool_ns_len > 0) { ceph_decode_need(&p, end, pool_ns_len, bad); extra_info.pool_ns = ceph_find_or_create_string(p, pool_ns_len); p += pool_ns_len; } } if (msg_version >= 9) { struct ceph_timespec *btime; if (p + sizeof(*btime) > end) goto bad; btime = p; ceph_decode_timespec64(&extra_info.btime, btime); p += sizeof(*btime); ceph_decode_64_safe(&p, end, extra_info.change_attr, bad); } if (msg_version >= 11) { /* version >= 10 */ ceph_decode_skip_32(&p, end, bad); // flags /* version >= 11 */ extra_info.dirstat_valid = true; ceph_decode_64_safe(&p, end, extra_info.nfiles, bad); ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad); } if (msg_version >= 12) { if (parse_fscrypt_fields(&p, end, &extra_info)) goto bad; } /* lookup ino */ inode = ceph_find_inode(mdsc->fsc->sb, vino); doutc(cl, " op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, vino.snap, inode); mutex_lock(&session->s_mutex); doutc(cl, " mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); if (!inode) { doutc(cl, " i don't have ino %llx\n", vino.ino); switch (op) { case CEPH_CAP_OP_IMPORT: case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: do_cap_release = true; break; default: break; } goto flush_cap_releases; } ci = ceph_inode(inode); /* these will work even if we don't have a cap yet */ switch (op) { case CEPH_CAP_OP_FLUSHSNAP_ACK: handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid), h, session); goto done; case CEPH_CAP_OP_EXPORT: handle_cap_export(inode, h, peer, session); goto done_unlocked; case CEPH_CAP_OP_IMPORT: realm = NULL; if (snaptrace_len) { down_write(&mdsc->snap_rwsem); if (ceph_update_snap_trace(mdsc, snaptrace, snaptrace + snaptrace_len, false, &realm)) { up_write(&mdsc->snap_rwsem); close_sessions = true; goto done; } downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); } spin_lock(&ci->i_ceph_lock); handle_cap_import(mdsc, inode, h, peer, session, &cap, &extra_info.issued); handle_cap_grant(inode, session, cap, h, msg->middle, &extra_info); if (realm) ceph_put_snap_realm(mdsc, realm); goto done_unlocked; } /* the rest require a cap */ spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds); if (!cap) { doutc(cl, " no cap on %p ino %llx.%llx from mds%d\n", inode, ceph_ino(inode), ceph_snap(inode), session->s_mds); spin_unlock(&ci->i_ceph_lock); switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: do_cap_release = true; break; default: break; } goto flush_cap_releases; } /* note that each of these drops i_ceph_lock for us */ switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: __ceph_caps_issued(ci, &extra_info.issued); extra_info.issued |= __ceph_caps_dirty(ci); handle_cap_grant(inode, session, cap, h, msg->middle, &extra_info); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid), h, session, cap); break; case CEPH_CAP_OP_TRUNC: queue_trunc = handle_cap_trunc(inode, h, session, &extra_info); spin_unlock(&ci->i_ceph_lock); if (queue_trunc) ceph_queue_vmtruncate(inode); break; default: spin_unlock(&ci->i_ceph_lock); pr_err_client(cl, "unknown cap op %d %s\n", op, ceph_cap_op_name(op)); } done: mutex_unlock(&session->s_mutex); done_unlocked: iput(inode); out: ceph_dec_mds_stopping_blocker(mdsc); ceph_put_string(extra_info.pool_ns); /* Defer closing the sessions after s_mutex lock being released */ if (close_sessions) ceph_mdsc_close_sessions(mdsc); kfree(extra_info.fscrypt_auth); return; flush_cap_releases: /* * send any cap release message to try to move things * along for the mds (who clearly thinks we still have this * cap). */ if (do_cap_release) { cap = ceph_get_cap(mdsc, NULL); cap->cap_ino = vino.ino; cap->queue_release = 1; cap->cap_id = le64_to_cpu(h->cap_id); cap->mseq = mseq; cap->seq = seq; cap->issue_seq = seq; spin_lock(&session->s_cap_lock); __ceph_queue_cap_release(session, cap); spin_unlock(&session->s_cap_lock); } ceph_flush_cap_releases(mdsc, session); goto done; bad: pr_err_client(cl, "corrupt message\n"); ceph_msg_dump(msg); goto out; } /* * Delayed work handler to process end of delayed cap release LRU list. * * If new caps are added to the list while processing it, these won't get * processed in this run. In this case, the ci->i_hold_caps_max will be * returned so that the work can be scheduled accordingly. */ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) { struct ceph_client *cl = mdsc->fsc->client; struct inode *inode; struct ceph_inode_info *ci; struct ceph_mount_options *opt = mdsc->fsc->mount_options; unsigned long delay_max = opt->caps_wanted_delay_max * HZ; unsigned long loop_start = jiffies; unsigned long delay = 0; doutc(cl, "begin\n"); spin_lock(&mdsc->cap_delay_lock); while (!list_empty(&mdsc->cap_delay_list)) { ci = list_first_entry(&mdsc->cap_delay_list, struct ceph_inode_info, i_cap_delay_list); if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) { doutc(cl, "caps added recently. Exiting loop"); delay = ci->i_hold_caps_max; break; } if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 && time_before(jiffies, ci->i_hold_caps_max)) break; list_del_init(&ci->i_cap_delay_list); inode = igrab(&ci->netfs.inode); if (inode) { spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "on %p %llx.%llx\n", inode, ceph_vinop(inode)); ceph_check_caps(ci, 0); iput(inode); spin_lock(&mdsc->cap_delay_lock); } } spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "done\n"); return delay; } /* * Flush all dirty caps to the mds */ static void flush_dirty_session_caps(struct ceph_mds_session *s) { struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct inode *inode; doutc(cl, "begin\n"); spin_lock(&mdsc->cap_dirty_lock); while (!list_empty(&s->s_cap_dirty)) { ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info, i_dirty_item); inode = &ci->netfs.inode; ihold(inode); doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); spin_unlock(&mdsc->cap_dirty_lock); ceph_wait_on_async_create(inode); ceph_check_caps(ci, CHECK_CAPS_FLUSH); iput(inode); spin_lock(&mdsc->cap_dirty_lock); } spin_unlock(&mdsc->cap_dirty_lock); doutc(cl, "done\n"); } void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) { ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true); } void __ceph_touch_fmode(struct ceph_inode_info *ci, struct ceph_mds_client *mdsc, int fmode) { unsigned long now = jiffies; if (fmode & CEPH_FILE_MODE_RD) ci->i_last_rd = now; if (fmode & CEPH_FILE_MODE_WR) ci->i_last_wr = now; /* queue periodic check */ if (fmode && __ceph_is_any_real_caps(ci) && list_empty(&ci->i_cap_delay_list)) __cap_delay_requeue(mdsc, ci); } void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); int bits = (fmode << 1) | 1; bool already_opened = false; int i; if (count == 1) atomic64_inc(&mdsc->metric.opened_files); spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { /* * If any of the mode ref is larger than 0, * that means it has been already opened by * others. Just skip checking the PIN ref. */ if (i && ci->i_nr_by_mode[i]) already_opened = true; if (bits & (1 << i)) ci->i_nr_by_mode[i] += count; } if (!already_opened) percpu_counter_inc(&mdsc->metric.opened_inodes); spin_unlock(&ci->i_ceph_lock); } /* * Drop open file reference. If we were the last open file, * we may need to release capabilities to the MDS (or schedule * their delayed release). */ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); int bits = (fmode << 1) | 1; bool is_closed = true; int i; if (count == 1) atomic64_dec(&mdsc->metric.opened_files); spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { if (bits & (1 << i)) { BUG_ON(ci->i_nr_by_mode[i] < count); ci->i_nr_by_mode[i] -= count; } /* * If any of the mode ref is not 0 after * decreased, that means it is still opened * by others. Just skip checking the PIN ref. */ if (i && ci->i_nr_by_mode[i]) is_closed = false; } if (is_closed) percpu_counter_dec(&mdsc->metric.opened_inodes); spin_unlock(&ci->i_ceph_lock); } /* * For a soon-to-be unlinked file, drop the LINK caps. If it * looks like the link count will hit 0, drop any other caps (other * than PIN) we don't specifically want (due to the file still being * open). */ int ceph_drop_caps_for_unlink(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; spin_lock(&ci->i_ceph_lock); if (inode->i_nlink == 1) { drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); if (__ceph_caps_dirty(ci)) { struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; __cap_delay_requeue_front(mdsc, ci); } } spin_unlock(&ci->i_ceph_lock); return drop; } /* * Helpers for embedding cap and dentry lease releases into mds * requests. * * @force is used by dentry_release (below) to force inclusion of a * record for the directory inode, even when there aren't any caps to * drop. */ int ceph_encode_inode_release(void **p, struct inode *inode, int mds, int drop, int unless, int force) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap *cap; struct ceph_mds_request_release *rel = *p; int used, dirty; int ret = 0; spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); doutc(cl, "%p %llx.%llx mds%d used|dirty %s drop %s unless %s\n", inode, ceph_vinop(inode), mds, ceph_cap_string(used|dirty), ceph_cap_string(drop), ceph_cap_string(unless)); /* only drop unused, clean caps */ drop &= ~(used | dirty); cap = __get_cap_for_mds(ci, mds); if (cap && __cap_is_valid(cap)) { unless &= cap->issued; if (unless) { if (unless & CEPH_CAP_AUTH_EXCL) drop &= ~CEPH_CAP_AUTH_SHARED; if (unless & CEPH_CAP_LINK_EXCL) drop &= ~CEPH_CAP_LINK_SHARED; if (unless & CEPH_CAP_XATTR_EXCL) drop &= ~CEPH_CAP_XATTR_SHARED; if (unless & CEPH_CAP_FILE_EXCL) drop &= ~CEPH_CAP_FILE_SHARED; } if (force || (cap->issued & drop)) { if (cap->issued & drop) { int wanted = __ceph_caps_wanted(ci); doutc(cl, "%p %llx.%llx cap %p %s -> %s, " "wanted %s -> %s\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued), ceph_cap_string(cap->issued & ~drop), ceph_cap_string(cap->mds_wanted), ceph_cap_string(wanted)); cap->issued &= ~drop; cap->implemented &= ~drop; cap->mds_wanted = wanted; if (cap == ci->i_auth_cap && !(wanted & CEPH_CAP_ANY_FILE_WR)) ci->i_requested_max_size = 0; } else { doutc(cl, "%p %llx.%llx cap %p %s (force)\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued)); } rel->ino = cpu_to_le64(ceph_ino(inode)); rel->cap_id = cpu_to_le64(cap->cap_id); rel->seq = cpu_to_le32(cap->seq); rel->issue_seq = cpu_to_le32(cap->issue_seq); rel->mseq = cpu_to_le32(cap->mseq); rel->caps = cpu_to_le32(cap->implemented); rel->wanted = cpu_to_le32(cap->mds_wanted); rel->dname_len = 0; rel->dname_seq = 0; *p += sizeof(*rel); ret = 1; } else { doutc(cl, "%p %llx.%llx cap %p %s (noop)\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued)); } } spin_unlock(&ci->i_ceph_lock); return ret; } /** * ceph_encode_dentry_release - encode a dentry release into an outgoing request * @p: outgoing request buffer * @dentry: dentry to release * @dir: dir to release it from * @mds: mds that we're speaking to * @drop: caps being dropped * @unless: unless we have these caps * * Encode a dentry release into an outgoing request buffer. Returns 1 if the * thing was released, or a negative error code otherwise. */ int ceph_encode_dentry_release(void **p, struct dentry *dentry, struct inode *dir, int mds, int drop, int unless) { struct ceph_mds_request_release *rel = *p; struct ceph_dentry_info *di = ceph_dentry(dentry); struct ceph_client *cl; int force = 0; int ret; /* This shouldn't happen */ BUG_ON(!dir); /* * force an record for the directory caps if we have a dentry lease. * this is racy (can't take i_ceph_lock and d_lock together), but it * doesn't have to be perfect; the mds will revoke anything we don't * release. */ spin_lock(&dentry->d_lock); if (di->lease_session && di->lease_session->s_mds == mds) force = 1; spin_unlock(&dentry->d_lock); ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); cl = ceph_inode_to_client(dir); spin_lock(&dentry->d_lock); if (ret && di->lease_session && di->lease_session->s_mds == mds) { doutc(cl, "%p mds%d seq %d\n", dentry, mds, (int)di->lease_seq); rel->dname_seq = cpu_to_le32(di->lease_seq); __ceph_mdsc_drop_dentry_lease(dentry); spin_unlock(&dentry->d_lock); if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(dir)) { int ret2 = ceph_encode_encrypted_fname(dir, dentry, *p); if (ret2 < 0) return ret2; rel->dname_len = cpu_to_le32(ret2); *p += ret2; } else { rel->dname_len = cpu_to_le32(dentry->d_name.len); memcpy(*p, dentry->d_name.name, dentry->d_name.len); *p += dentry->d_name.len; } } else { spin_unlock(&dentry->d_lock); } return ret; } static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap_snap *capsnap; int capsnap_release = 0; lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "removing capsnaps, ci is %p, %p %llx.%llx\n", ci, inode, ceph_vinop(inode)); while (!list_empty(&ci->i_cap_snaps)) { capsnap = list_first_entry(&ci->i_cap_snaps, struct ceph_cap_snap, ci_item); __ceph_remove_capsnap(inode, capsnap, NULL, NULL); ceph_put_snap_context(capsnap->context); ceph_put_cap_snap(capsnap); capsnap_release++; } wake_up_all(&ci->i_cap_wq); wake_up_all(&mdsc->cap_flushing_wq); return capsnap_release; } int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate) { struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_client *cl = fsc->client; struct ceph_inode_info *ci = ceph_inode(inode); bool is_auth; bool dirty_dropped = false; int iputs = 0; lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "removing cap %p, ci is %p, %p %llx.%llx\n", cap, ci, inode, ceph_vinop(inode)); is_auth = (cap == ci->i_auth_cap); __ceph_remove_cap(cap, false); if (is_auth) { struct ceph_cap_flush *cf; if (ceph_inode_is_shutdown(inode)) { if (inode->i_data.nrpages > 0) *invalidate = true; if (ci->i_wrbuffer_ref > 0) mapping_set_error(&inode->i_data, -EIO); } spin_lock(&mdsc->cap_dirty_lock); /* trash all of the cap flushes for this inode */ while (!list_empty(&ci->i_cap_flush_list)) { cf = list_first_entry(&ci->i_cap_flush_list, struct ceph_cap_flush, i_list); list_del_init(&cf->g_list); list_del_init(&cf->i_list); if (!cf->is_capsnap) ceph_free_cap_flush(cf); } if (!list_empty(&ci->i_dirty_item)) { pr_warn_ratelimited_client(cl, " dropping dirty %s state for %p %llx.%llx\n", ceph_cap_string(ci->i_dirty_caps), inode, ceph_vinop(inode)); ci->i_dirty_caps = 0; list_del_init(&ci->i_dirty_item); dirty_dropped = true; } if (!list_empty(&ci->i_flushing_item)) { pr_warn_ratelimited_client(cl, " dropping dirty+flushing %s state for %p %llx.%llx\n", ceph_cap_string(ci->i_flushing_caps), inode, ceph_vinop(inode)); ci->i_flushing_caps = 0; list_del_init(&ci->i_flushing_item); mdsc->num_cap_flushing--; dirty_dropped = true; } spin_unlock(&mdsc->cap_dirty_lock); if (dirty_dropped) { mapping_set_error(inode->i_mapping, -EIO); if (ci->i_wrbuffer_ref_head == 0 && ci->i_wr_ref == 0 && ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } } if (atomic_read(&ci->i_filelock_ref) > 0) { /* make further file lock syscall return -EIO */ ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; pr_warn_ratelimited_client(cl, " dropping file locks for %p %llx.%llx\n", inode, ceph_vinop(inode)); } if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { cf = ci->i_prealloc_cap_flush; ci->i_prealloc_cap_flush = NULL; if (!cf->is_capsnap) ceph_free_cap_flush(cf); } if (!list_empty(&ci->i_cap_snaps)) iputs = remove_capsnaps(mdsc, inode); } if (dirty_dropped) ++iputs; return iputs; }
9 9 9 22 6 2 18 174 175 166 4 176 165 174 171 149 172 170 149 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include <net/genetlink.h> #define CREATE_TRACE_POINTS #include <trace/events/devlink.h> #include "devl_internal.h" EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report); DEFINE_XARRAY_FLAGS(devlinks, XA_FLAGS_ALLOC); static struct devlink *devlinks_xa_get(unsigned long index) { struct devlink *devlink; rcu_read_lock(); devlink = xa_find(&devlinks, &index, index, DEVLINK_REGISTERED); if (!devlink || !devlink_try_get(devlink)) devlink = NULL; rcu_read_unlock(); return devlink; } /* devlink_rels xarray contains 1:1 relationships between * devlink object and related nested devlink instance. * The xarray index is used to get the nested object from * the nested-in object code. */ static DEFINE_XARRAY_FLAGS(devlink_rels, XA_FLAGS_ALLOC1); #define DEVLINK_REL_IN_USE XA_MARK_0 struct devlink_rel { u32 index; refcount_t refcount; u32 devlink_index; struct { u32 devlink_index; u32 obj_index; devlink_rel_notify_cb_t *notify_cb; devlink_rel_cleanup_cb_t *cleanup_cb; struct work_struct notify_work; } nested_in; }; static void devlink_rel_free(struct devlink_rel *rel) { xa_erase(&devlink_rels, rel->index); kfree(rel); } static void __devlink_rel_get(struct devlink_rel *rel) { refcount_inc(&rel->refcount); } static void __devlink_rel_put(struct devlink_rel *rel) { if (refcount_dec_and_test(&rel->refcount)) devlink_rel_free(rel); } static void devlink_rel_nested_in_notify_work(struct work_struct *work) { struct devlink_rel *rel = container_of(work, struct devlink_rel, nested_in.notify_work); struct devlink *devlink; devlink = devlinks_xa_get(rel->nested_in.devlink_index); if (!devlink) goto rel_put; if (!devl_trylock(devlink)) { devlink_put(devlink); goto reschedule_work; } if (!devl_is_registered(devlink)) { devl_unlock(devlink); devlink_put(devlink); goto rel_put; } if (!xa_get_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE)) rel->nested_in.cleanup_cb(devlink, rel->nested_in.obj_index, rel->index); rel->nested_in.notify_cb(devlink, rel->nested_in.obj_index); devl_unlock(devlink); devlink_put(devlink); rel_put: __devlink_rel_put(rel); return; reschedule_work: schedule_work(&rel->nested_in.notify_work); } static void devlink_rel_nested_in_notify_work_schedule(struct devlink_rel *rel) { __devlink_rel_get(rel); schedule_work(&rel->nested_in.notify_work); } static struct devlink_rel *devlink_rel_alloc(void) { struct devlink_rel *rel; static u32 next; int err; rel = kzalloc(sizeof(*rel), GFP_KERNEL); if (!rel) return ERR_PTR(-ENOMEM); err = xa_alloc_cyclic(&devlink_rels, &rel->index, rel, xa_limit_32b, &next, GFP_KERNEL); if (err) { kfree(rel); return ERR_PTR(err); } refcount_set(&rel->refcount, 1); INIT_WORK(&rel->nested_in.notify_work, &devlink_rel_nested_in_notify_work); return rel; } static void devlink_rel_put(struct devlink *devlink) { struct devlink_rel *rel = devlink->rel; if (!rel) return; xa_clear_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE); devlink_rel_nested_in_notify_work_schedule(rel); __devlink_rel_put(rel); devlink->rel = NULL; } void devlink_rel_nested_in_clear(u32 rel_index) { xa_clear_mark(&devlink_rels, rel_index, DEVLINK_REL_IN_USE); } int devlink_rel_nested_in_add(u32 *rel_index, u32 devlink_index, u32 obj_index, devlink_rel_notify_cb_t *notify_cb, devlink_rel_cleanup_cb_t *cleanup_cb, struct devlink *devlink) { struct devlink_rel *rel = devlink_rel_alloc(); ASSERT_DEVLINK_NOT_REGISTERED(devlink); if (IS_ERR(rel)) return PTR_ERR(rel); rel->devlink_index = devlink->index; rel->nested_in.devlink_index = devlink_index; rel->nested_in.obj_index = obj_index; rel->nested_in.notify_cb = notify_cb; rel->nested_in.cleanup_cb = cleanup_cb; *rel_index = rel->index; xa_set_mark(&devlink_rels, rel->index, DEVLINK_REL_IN_USE); devlink->rel = rel; return 0; } /** * devlink_rel_nested_in_notify - Notify the object this devlink * instance is nested in. * @devlink: devlink * * This is called upon network namespace change of devlink instance. * In case this devlink instance is nested in another devlink object, * a notification of a change of this object should be sent * over netlink. The parent devlink instance lock needs to be * taken during the notification preparation. * However, since the devlink lock of nested instance is held here, * we would end with wrong devlink instance lock ordering and * deadlock. Therefore the work is utilized to avoid that. */ void devlink_rel_nested_in_notify(struct devlink *devlink) { struct devlink_rel *rel = devlink->rel; if (!rel) return; devlink_rel_nested_in_notify_work_schedule(rel); } static struct devlink_rel *devlink_rel_find(unsigned long rel_index) { return xa_find(&devlink_rels, &rel_index, rel_index, DEVLINK_REL_IN_USE); } static struct devlink *devlink_rel_devlink_get(u32 rel_index) { struct devlink_rel *rel; u32 devlink_index; if (!rel_index) return NULL; xa_lock(&devlink_rels); rel = devlink_rel_find(rel_index); if (rel) devlink_index = rel->devlink_index; xa_unlock(&devlink_rels); if (!rel) return NULL; return devlinks_xa_get(devlink_index); } int devlink_rel_devlink_handle_put(struct sk_buff *msg, struct devlink *devlink, u32 rel_index, int attrtype, bool *msg_updated) { struct net *net = devlink_net(devlink); struct devlink *rel_devlink; int err; rel_devlink = devlink_rel_devlink_get(rel_index); if (!rel_devlink) return 0; err = devlink_nl_put_nested_handle(msg, net, rel_devlink, attrtype); devlink_put(rel_devlink); if (!err && msg_updated) *msg_updated = true; return err; } void *devlink_priv(struct devlink *devlink) { return &devlink->priv; } EXPORT_SYMBOL_GPL(devlink_priv); struct devlink *priv_to_devlink(void *priv) { return container_of(priv, struct devlink, priv); } EXPORT_SYMBOL_GPL(priv_to_devlink); struct device *devlink_to_dev(const struct devlink *devlink) { return devlink->dev; } EXPORT_SYMBOL_GPL(devlink_to_dev); struct net *devlink_net(const struct devlink *devlink) { return read_pnet(&devlink->_net); } EXPORT_SYMBOL_GPL(devlink_net); void devl_assert_locked(struct devlink *devlink) { lockdep_assert_held(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_assert_locked); #ifdef CONFIG_LOCKDEP /* For use in conjunction with LOCKDEP only e.g. rcu_dereference_protected() */ bool devl_lock_is_held(struct devlink *devlink) { return lockdep_is_held(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_lock_is_held); #endif void devl_lock(struct devlink *devlink) { mutex_lock(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_lock); int devl_trylock(struct devlink *devlink) { return mutex_trylock(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_trylock); void devl_unlock(struct devlink *devlink) { mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devl_unlock); /** * devlink_try_get() - try to obtain a reference on a devlink instance * @devlink: instance to reference * * Obtain a reference on a devlink instance. A reference on a devlink instance * only implies that it's safe to take the instance lock. It does not imply * that the instance is registered, use devl_is_registered() after taking * the instance lock to check registration status. */ struct devlink *__must_check devlink_try_get(struct devlink *devlink) { if (refcount_inc_not_zero(&devlink->refcount)) return devlink; return NULL; } static void devlink_release(struct work_struct *work) { struct devlink *devlink; devlink = container_of(to_rcu_work(work), struct devlink, rwork); mutex_destroy(&devlink->lock); lockdep_unregister_key(&devlink->lock_key); put_device(devlink->dev); kfree(devlink); } void devlink_put(struct devlink *devlink) { if (refcount_dec_and_test(&devlink->refcount)) queue_rcu_work(system_wq, &devlink->rwork); } struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp) { struct devlink *devlink = NULL; rcu_read_lock(); retry: devlink = xa_find(&devlinks, indexp, ULONG_MAX, DEVLINK_REGISTERED); if (!devlink) goto unlock; if (!devlink_try_get(devlink)) goto next; if (!net_eq(devlink_net(devlink), net)) { devlink_put(devlink); goto next; } unlock: rcu_read_unlock(); return devlink; next: (*indexp)++; goto retry; } /** * devl_register - Register devlink instance * @devlink: devlink */ int devl_register(struct devlink *devlink) { ASSERT_DEVLINK_NOT_REGISTERED(devlink); devl_assert_locked(devlink); xa_set_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); devlink_notify_register(devlink); devlink_rel_nested_in_notify(devlink); return 0; } EXPORT_SYMBOL_GPL(devl_register); void devlink_register(struct devlink *devlink) { devl_lock(devlink); devl_register(devlink); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_register); /** * devl_unregister - Unregister devlink instance * @devlink: devlink */ void devl_unregister(struct devlink *devlink) { ASSERT_DEVLINK_REGISTERED(devlink); devl_assert_locked(devlink); devlink_notify_unregister(devlink); xa_clear_mark(&devlinks, devlink->index, DEVLINK_REGISTERED); devlink_rel_put(devlink); } EXPORT_SYMBOL_GPL(devl_unregister); void devlink_unregister(struct devlink *devlink) { devl_lock(devlink); devl_unregister(devlink); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_unregister); /** * devlink_alloc_ns - Allocate new devlink instance resources * in specific namespace * * @ops: ops * @priv_size: size of user private data * @net: net namespace * @dev: parent device * * Allocate new devlink instance resources, including devlink index * and name. */ struct devlink *devlink_alloc_ns(const struct devlink_ops *ops, size_t priv_size, struct net *net, struct device *dev) { struct devlink *devlink; static u32 last_id; int ret; WARN_ON(!ops || !dev); if (!devlink_reload_actions_valid(ops)) return NULL; devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL); if (!devlink) return NULL; ret = xa_alloc_cyclic(&devlinks, &devlink->index, devlink, xa_limit_31b, &last_id, GFP_KERNEL); if (ret < 0) goto err_xa_alloc; devlink->dev = get_device(dev); devlink->ops = ops; xa_init_flags(&devlink->ports, XA_FLAGS_ALLOC); xa_init_flags(&devlink->params, XA_FLAGS_ALLOC); xa_init_flags(&devlink->snapshot_ids, XA_FLAGS_ALLOC); xa_init_flags(&devlink->nested_rels, XA_FLAGS_ALLOC); write_pnet(&devlink->_net, net); INIT_LIST_HEAD(&devlink->rate_list); INIT_LIST_HEAD(&devlink->linecard_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); INIT_LIST_HEAD(&devlink->resource_list); INIT_LIST_HEAD(&devlink->region_list); INIT_LIST_HEAD(&devlink->reporter_list); INIT_LIST_HEAD(&devlink->trap_list); INIT_LIST_HEAD(&devlink->trap_group_list); INIT_LIST_HEAD(&devlink->trap_policer_list); INIT_RCU_WORK(&devlink->rwork, devlink_release); lockdep_register_key(&devlink->lock_key); mutex_init(&devlink->lock); lockdep_set_class(&devlink->lock, &devlink->lock_key); refcount_set(&devlink->refcount, 1); return devlink; err_xa_alloc: kfree(devlink); return NULL; } EXPORT_SYMBOL_GPL(devlink_alloc_ns); /** * devlink_free - Free devlink instance resources * * @devlink: devlink */ void devlink_free(struct devlink *devlink) { ASSERT_DEVLINK_NOT_REGISTERED(devlink); WARN_ON(!list_empty(&devlink->trap_policer_list)); WARN_ON(!list_empty(&devlink->trap_group_list)); WARN_ON(!list_empty(&devlink->trap_list)); WARN_ON(!list_empty(&devlink->reporter_list)); WARN_ON(!list_empty(&devlink->region_list)); WARN_ON(!list_empty(&devlink->resource_list)); WARN_ON(!list_empty(&devlink->dpipe_table_list)); WARN_ON(!list_empty(&devlink->sb_list)); WARN_ON(!list_empty(&devlink->rate_list)); WARN_ON(!list_empty(&devlink->linecard_list)); WARN_ON(!xa_empty(&devlink->ports)); xa_destroy(&devlink->nested_rels); xa_destroy(&devlink->snapshot_ids); xa_destroy(&devlink->params); xa_destroy(&devlink->ports); xa_erase(&devlinks, devlink->index); devlink_put(devlink); } EXPORT_SYMBOL_GPL(devlink_free); static void __net_exit devlink_pernet_pre_exit(struct net *net) { struct devlink *devlink; u32 actions_performed; unsigned long index; int err; /* In case network namespace is getting destroyed, reload * all devlink instances from this namespace into init_net. */ devlinks_xa_for_each_registered_get(net, index, devlink) { devl_dev_lock(devlink, true); err = 0; if (devl_is_registered(devlink)) err = devlink_reload(devlink, &init_net, DEVLINK_RELOAD_ACTION_DRIVER_REINIT, DEVLINK_RELOAD_LIMIT_UNSPEC, &actions_performed, NULL); devl_dev_unlock(devlink, true); devlink_put(devlink); if (err && err != -EOPNOTSUPP) pr_warn("Failed to reload devlink instance into init_net\n"); } } static struct pernet_operations devlink_pernet_ops __net_initdata = { .pre_exit = devlink_pernet_pre_exit, }; static struct notifier_block devlink_port_netdevice_nb = { .notifier_call = devlink_port_netdevice_event, }; static int __init devlink_init(void) { int err; err = genl_register_family(&devlink_nl_family); if (err) goto out; err = register_pernet_subsys(&devlink_pernet_ops); if (err) goto out; err = register_netdevice_notifier(&devlink_port_netdevice_nb); out: WARN_ON(err); return err; } subsys_initcall(devlink_init);
1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 // SPDX-License-Identifier: GPL-2.0-or-later /* * CMAC: Cipher Block Mode for Authentication * * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi> * * Based on work by: * Copyright © 2013 Tom St Denis <tstdenis@elliptictech.com> * Based on crypto/xcbc.c: * Copyright © 2006 USAGI/WIDE Project, * Author: Kazunori Miyazawa <miyazawa@linux-ipv6.org> */ #include <crypto/internal/cipher.h> #include <crypto/internal/hash.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> /* * +------------------------ * | <parent tfm> * +------------------------ * | cmac_tfm_ctx * +------------------------ * | consts (block size * 2) * +------------------------ */ struct cmac_tfm_ctx { struct crypto_cipher *child; __be64 consts[]; }; /* * +------------------------ * | <shash desc> * +------------------------ * | cmac_desc_ctx * +------------------------ * | odds (block size) * +------------------------ * | prev (block size) * +------------------------ */ struct cmac_desc_ctx { unsigned int len; u8 odds[]; }; static int crypto_cmac_digest_setkey(struct crypto_shash *parent, const u8 *inkey, unsigned int keylen) { struct cmac_tfm_ctx *ctx = crypto_shash_ctx(parent); unsigned int bs = crypto_shash_blocksize(parent); __be64 *consts = ctx->consts; u64 _const[2]; int i, err = 0; u8 msb_mask, gfmask; err = crypto_cipher_setkey(ctx->child, inkey, keylen); if (err) return err; /* encrypt the zero block */ memset(consts, 0, bs); crypto_cipher_encrypt_one(ctx->child, (u8 *)consts, (u8 *)consts); switch (bs) { case 16: gfmask = 0x87; _const[0] = be64_to_cpu(consts[1]); _const[1] = be64_to_cpu(consts[0]); /* gf(2^128) multiply zero-ciphertext with u and u^2 */ for (i = 0; i < 4; i += 2) { msb_mask = ((s64)_const[1] >> 63) & gfmask; _const[1] = (_const[1] << 1) | (_const[0] >> 63); _const[0] = (_const[0] << 1) ^ msb_mask; consts[i + 0] = cpu_to_be64(_const[1]); consts[i + 1] = cpu_to_be64(_const[0]); } break; case 8: gfmask = 0x1B; _const[0] = be64_to_cpu(consts[0]); /* gf(2^64) multiply zero-ciphertext with u and u^2 */ for (i = 0; i < 2; i++) { msb_mask = ((s64)_const[0] >> 63) & gfmask; _const[0] = (_const[0] << 1) ^ msb_mask; consts[i] = cpu_to_be64(_const[0]); } break; } return 0; } static int crypto_cmac_digest_init(struct shash_desc *pdesc) { struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc); int bs = crypto_shash_blocksize(pdesc->tfm); u8 *prev = &ctx->odds[bs]; ctx->len = 0; memset(prev, 0, bs); return 0; } static int crypto_cmac_digest_update(struct shash_desc *pdesc, const u8 *p, unsigned int len) { struct crypto_shash *parent = pdesc->tfm; struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent); struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc); struct crypto_cipher *tfm = tctx->child; int bs = crypto_shash_blocksize(parent); u8 *odds = ctx->odds; u8 *prev = odds + bs; /* checking the data can fill the block */ if ((ctx->len + len) <= bs) { memcpy(odds + ctx->len, p, len); ctx->len += len; return 0; } /* filling odds with new data and encrypting it */ memcpy(odds + ctx->len, p, bs - ctx->len); len -= bs - ctx->len; p += bs - ctx->len; crypto_xor(prev, odds, bs); crypto_cipher_encrypt_one(tfm, prev, prev); /* clearing the length */ ctx->len = 0; /* encrypting the rest of data */ while (len > bs) { crypto_xor(prev, p, bs); crypto_cipher_encrypt_one(tfm, prev, prev); p += bs; len -= bs; } /* keeping the surplus of blocksize */ if (len) { memcpy(odds, p, len); ctx->len = len; } return 0; } static int crypto_cmac_digest_final(struct shash_desc *pdesc, u8 *out) { struct crypto_shash *parent = pdesc->tfm; struct cmac_tfm_ctx *tctx = crypto_shash_ctx(parent); struct cmac_desc_ctx *ctx = shash_desc_ctx(pdesc); struct crypto_cipher *tfm = tctx->child; int bs = crypto_shash_blocksize(parent); u8 *odds = ctx->odds; u8 *prev = odds + bs; unsigned int offset = 0; if (ctx->len != bs) { unsigned int rlen; u8 *p = odds + ctx->len; *p = 0x80; p++; rlen = bs - ctx->len - 1; if (rlen) memset(p, 0, rlen); offset += bs; } crypto_xor(prev, odds, bs); crypto_xor(prev, (const u8 *)tctx->consts + offset, bs); crypto_cipher_encrypt_one(tfm, out, prev); return 0; } static int cmac_init_tfm(struct crypto_shash *tfm) { struct shash_instance *inst = shash_alg_instance(tfm); struct cmac_tfm_ctx *ctx = crypto_shash_ctx(tfm); struct crypto_cipher_spawn *spawn; struct crypto_cipher *cipher; spawn = shash_instance_ctx(inst); cipher = crypto_spawn_cipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; return 0; } static int cmac_clone_tfm(struct crypto_shash *tfm, struct crypto_shash *otfm) { struct cmac_tfm_ctx *octx = crypto_shash_ctx(otfm); struct cmac_tfm_ctx *ctx = crypto_shash_ctx(tfm); struct crypto_cipher *cipher; cipher = crypto_clone_cipher(octx->child); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; return 0; } static void cmac_exit_tfm(struct crypto_shash *tfm) { struct cmac_tfm_ctx *ctx = crypto_shash_ctx(tfm); crypto_free_cipher(ctx->child); } static int cmac_create(struct crypto_template *tmpl, struct rtattr **tb) { struct shash_instance *inst; struct crypto_cipher_spawn *spawn; struct crypto_alg *alg; u32 mask; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return -ENOMEM; spawn = shash_instance_ctx(inst); err = crypto_grab_cipher(spawn, shash_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_cipher_alg(spawn); switch (alg->cra_blocksize) { case 16: case 8: break; default: err = -EINVAL; goto err_free_inst; } err = crypto_inst_setname(shash_crypto_instance(inst), tmpl->name, alg); if (err) goto err_free_inst; inst->alg.base.cra_priority = alg->cra_priority; inst->alg.base.cra_blocksize = alg->cra_blocksize; inst->alg.base.cra_ctxsize = sizeof(struct cmac_tfm_ctx) + alg->cra_blocksize * 2; inst->alg.digestsize = alg->cra_blocksize; inst->alg.descsize = sizeof(struct cmac_desc_ctx) + alg->cra_blocksize * 2; inst->alg.init = crypto_cmac_digest_init; inst->alg.update = crypto_cmac_digest_update; inst->alg.final = crypto_cmac_digest_final; inst->alg.setkey = crypto_cmac_digest_setkey; inst->alg.init_tfm = cmac_init_tfm; inst->alg.clone_tfm = cmac_clone_tfm; inst->alg.exit_tfm = cmac_exit_tfm; inst->free = shash_free_singlespawn_instance; err = shash_register_instance(tmpl, inst); if (err) { err_free_inst: shash_free_singlespawn_instance(inst); } return err; } static struct crypto_template crypto_cmac_tmpl = { .name = "cmac", .create = cmac_create, .module = THIS_MODULE, }; static int __init crypto_cmac_module_init(void) { return crypto_register_template(&crypto_cmac_tmpl); } static void __exit crypto_cmac_module_exit(void) { crypto_unregister_template(&crypto_cmac_tmpl); } subsys_initcall(crypto_cmac_module_init); module_exit(crypto_cmac_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("CMAC keyed hash algorithm"); MODULE_ALIAS_CRYPTO("cmac"); MODULE_IMPORT_NS(CRYPTO_INTERNAL);
6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/core/netclassid_cgroup.c Classid Cgroupfs Handling * * Authors: Thomas Graf <tgraf@suug.ch> */ #include <linux/slab.h> #include <linux/cgroup.h> #include <linux/fdtable.h> #include <linux/sched/task.h> #include <net/cls_cgroup.h> #include <net/sock.h> static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css) { return css ? container_of(css, struct cgroup_cls_state, css) : NULL; } struct cgroup_cls_state *task_cls_state(struct task_struct *p) { return css_cls_state(task_css_check(p, net_cls_cgrp_id, rcu_read_lock_bh_held())); } EXPORT_SYMBOL_GPL(task_cls_state); static struct cgroup_subsys_state * cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_cls_state *cs; cs = kzalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); return &cs->css; } static int cgrp_css_online(struct cgroup_subsys_state *css) { struct cgroup_cls_state *cs = css_cls_state(css); struct cgroup_cls_state *parent = css_cls_state(css->parent); if (parent) cs->classid = parent->classid; return 0; } static void cgrp_css_free(struct cgroup_subsys_state *css) { kfree(css_cls_state(css)); } /* * To avoid freezing of sockets creation for tasks with big number of threads * and opened sockets lets release file_lock every 1000 iterated descriptors. * New sockets will already have been created with new classid. */ struct update_classid_context { u32 classid; unsigned int batch; }; #define UPDATE_CLASSID_BATCH 1000 static int update_classid_sock(const void *v, struct file *file, unsigned int n) { struct update_classid_context *ctx = (void *)v; struct socket *sock = sock_from_file(file); if (sock) sock_cgroup_set_classid(&sock->sk->sk_cgrp_data, ctx->classid); if (--ctx->batch == 0) { ctx->batch = UPDATE_CLASSID_BATCH; return n + 1; } return 0; } static void update_classid_task(struct task_struct *p, u32 classid) { struct update_classid_context ctx = { .classid = classid, .batch = UPDATE_CLASSID_BATCH }; unsigned int fd = 0; /* Only update the leader task, when many threads in this task, * so it can avoid the useless traversal. */ if (p != p->group_leader) return; do { task_lock(p); fd = iterate_fd(p->files, fd, update_classid_sock, &ctx); task_unlock(p); cond_resched(); } while (fd); } static void cgrp_attach(struct cgroup_taskset *tset) { struct cgroup_subsys_state *css; struct task_struct *p; cgroup_taskset_for_each(p, css, tset) { update_classid_task(p, css_cls_state(css)->classid); } } static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) { return css_cls_state(css)->classid; } static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, u64 value) { struct cgroup_cls_state *cs = css_cls_state(css); struct css_task_iter it; struct task_struct *p; cs->classid = (u32)value; css_task_iter_start(css, 0, &it); while ((p = css_task_iter_next(&it))) update_classid_task(p, cs->classid); css_task_iter_end(&it); return 0; } static struct cftype ss_files[] = { { .name = "classid", .read_u64 = read_classid, .write_u64 = write_classid, }, { } /* terminate */ }; struct cgroup_subsys net_cls_cgrp_subsys = { .css_alloc = cgrp_css_alloc, .css_online = cgrp_css_online, .css_free = cgrp_css_free, .attach = cgrp_attach, .legacy_cftypes = ss_files, };
21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 /* SPDX-License-Identifier: GPL-2.0 */ /* * Generic RTC interface. * This version contains the part of the user interface to the Real Time Clock * service. It is used with both the legacy mc146818 and also EFI * Struct rtc_time and first 12 ioctl by Paul Gortmaker, 1996 - separated out * from <linux/mc146818rtc.h> to this file for 2.4 kernels. * * Copyright (C) 1999 Hewlett-Packard Co. * Copyright (C) 1999 Stephane Eranian <eranian@hpl.hp.com> */ #ifndef _LINUX_RTC_H_ #define _LINUX_RTC_H_ #include <linux/types.h> #include <linux/interrupt.h> #include <linux/nvmem-provider.h> #include <uapi/linux/rtc.h> extern int rtc_month_days(unsigned int month, unsigned int year); extern int rtc_year_days(unsigned int day, unsigned int month, unsigned int year); extern int rtc_valid_tm(struct rtc_time *tm); extern time64_t rtc_tm_to_time64(struct rtc_time *tm); extern void rtc_time64_to_tm(time64_t time, struct rtc_time *tm); ktime_t rtc_tm_to_ktime(struct rtc_time tm); struct rtc_time rtc_ktime_to_tm(ktime_t kt); /* * rtc_tm_sub - Return the difference in seconds. */ static inline time64_t rtc_tm_sub(struct rtc_time *lhs, struct rtc_time *rhs) { return rtc_tm_to_time64(lhs) - rtc_tm_to_time64(rhs); } #include <linux/device.h> #include <linux/seq_file.h> #include <linux/cdev.h> #include <linux/poll.h> #include <linux/mutex.h> #include <linux/timerqueue.h> #include <linux/workqueue.h> extern struct class *rtc_class; /* * For these RTC methods the device parameter is the physical device * on whatever bus holds the hardware (I2C, Platform, SPI, etc), which * was passed to rtc_device_register(). Its driver_data normally holds * device state, including the rtc_device pointer for the RTC. * * Most of these methods are called with rtc_device.ops_lock held, * through the rtc_*(struct rtc_device *, ...) calls. * * The (current) exceptions are mostly filesystem hooks: * - the proc() hook for procfs */ struct rtc_class_ops { int (*ioctl)(struct device *, unsigned int, unsigned long); int (*read_time)(struct device *, struct rtc_time *); int (*set_time)(struct device *, struct rtc_time *); int (*read_alarm)(struct device *, struct rtc_wkalrm *); int (*set_alarm)(struct device *, struct rtc_wkalrm *); int (*proc)(struct device *, struct seq_file *); int (*alarm_irq_enable)(struct device *, unsigned int enabled); int (*read_offset)(struct device *, long *offset); int (*set_offset)(struct device *, long offset); int (*param_get)(struct device *, struct rtc_param *param); int (*param_set)(struct device *, struct rtc_param *param); }; struct rtc_device; struct rtc_timer { struct timerqueue_node node; ktime_t period; void (*func)(struct rtc_device *rtc); struct rtc_device *rtc; int enabled; }; /* flags */ #define RTC_DEV_BUSY 0 #define RTC_NO_CDEV 1 struct rtc_device { struct device dev; struct module *owner; int id; const struct rtc_class_ops *ops; struct mutex ops_lock; struct cdev char_dev; unsigned long flags; unsigned long irq_data; spinlock_t irq_lock; wait_queue_head_t irq_queue; struct fasync_struct *async_queue; int irq_freq; int max_user_freq; struct timerqueue_head timerqueue; struct rtc_timer aie_timer; struct rtc_timer uie_rtctimer; struct hrtimer pie_timer; /* sub second exp, so needs hrtimer */ int pie_enabled; struct work_struct irqwork; /* * This offset specifies the update timing of the RTC. * * tsched t1 write(t2.tv_sec - 1sec)) t2 RTC increments seconds * * The offset defines how tsched is computed so that the write to * the RTC (t2.tv_sec - 1sec) is correct versus the time required * for the transport of the write and the time which the RTC needs * to increment seconds the first time after the write (t2). * * For direct accessible RTCs tsched ~= t1 because the write time * is negligible. For RTCs behind slow busses the transport time is * significant and has to be taken into account. * * The time between the write (t1) and the first increment after * the write (t2) is RTC specific. For a MC146818 RTC it's 500ms, * for many others it's exactly 1 second. Consult the datasheet. * * The value of this offset is also used to calculate the to be * written value (t2.tv_sec - 1sec) at tsched. * * The default value for this is NSEC_PER_SEC + 10 msec default * transport time. The offset can be adjusted by drivers so the * calculation for the to be written value at tsched becomes * correct: * * newval = tsched + set_offset_nsec - NSEC_PER_SEC * and (tsched + set_offset_nsec) % NSEC_PER_SEC == 0 */ unsigned long set_offset_nsec; unsigned long features[BITS_TO_LONGS(RTC_FEATURE_CNT)]; time64_t range_min; timeu64_t range_max; timeu64_t alarm_offset_max; time64_t start_secs; time64_t offset_secs; bool set_start_time; #ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL struct work_struct uie_task; struct timer_list uie_timer; /* Those fields are protected by rtc->irq_lock */ unsigned int oldsecs; unsigned int uie_irq_active:1; unsigned int stop_uie_polling:1; unsigned int uie_task_active:1; unsigned int uie_timer_active:1; #endif }; #define to_rtc_device(d) container_of(d, struct rtc_device, dev) #define rtc_lock(d) mutex_lock(&d->ops_lock) #define rtc_unlock(d) mutex_unlock(&d->ops_lock) /* useful timestamps */ #define RTC_TIMESTAMP_BEGIN_0000 -62167219200ULL /* 0000-01-01 00:00:00 */ #define RTC_TIMESTAMP_BEGIN_1900 -2208988800LL /* 1900-01-01 00:00:00 */ #define RTC_TIMESTAMP_BEGIN_2000 946684800LL /* 2000-01-01 00:00:00 */ #define RTC_TIMESTAMP_END_2063 2966371199LL /* 2063-12-31 23:59:59 */ #define RTC_TIMESTAMP_END_2079 3471292799LL /* 2079-12-31 23:59:59 */ #define RTC_TIMESTAMP_END_2099 4102444799LL /* 2099-12-31 23:59:59 */ #define RTC_TIMESTAMP_END_2199 7258118399LL /* 2199-12-31 23:59:59 */ #define RTC_TIMESTAMP_END_9999 253402300799LL /* 9999-12-31 23:59:59 */ extern struct rtc_device *devm_rtc_device_register(struct device *dev, const char *name, const struct rtc_class_ops *ops, struct module *owner); struct rtc_device *devm_rtc_allocate_device(struct device *dev); int __devm_rtc_register_device(struct module *owner, struct rtc_device *rtc); extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm); int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm); extern int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alrm); extern int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alrm); extern int rtc_initialize_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alrm); extern void rtc_update_irq(struct rtc_device *rtc, unsigned long num, unsigned long events); extern struct rtc_device *rtc_class_open(const char *name); extern void rtc_class_close(struct rtc_device *rtc); extern int rtc_irq_set_state(struct rtc_device *rtc, int enabled); extern int rtc_irq_set_freq(struct rtc_device *rtc, int freq); extern int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled); extern int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled); extern int rtc_dev_update_irq_enable_emul(struct rtc_device *rtc, unsigned int enabled); void rtc_handle_legacy_irq(struct rtc_device *rtc, int num, int mode); void rtc_aie_update_irq(struct rtc_device *rtc); void rtc_uie_update_irq(struct rtc_device *rtc); enum hrtimer_restart rtc_pie_update_irq(struct hrtimer *timer); void rtc_timer_init(struct rtc_timer *timer, void (*f)(struct rtc_device *r), struct rtc_device *rtc); int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer *timer, ktime_t expires, ktime_t period); void rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer *timer); int rtc_read_offset(struct rtc_device *rtc, long *offset); int rtc_set_offset(struct rtc_device *rtc, long offset); void rtc_timer_do_work(struct work_struct *work); static inline bool is_leap_year(unsigned int year) { return (!(year % 4) && (year % 100)) || !(year % 400); } /** * rtc_bound_alarmtime() - Return alarm time bound by rtc limit * @rtc: Pointer to rtc device structure * @requested: Requested alarm timeout * * Return: Alarm timeout bound by maximum alarm time supported by rtc. */ static inline ktime_t rtc_bound_alarmtime(struct rtc_device *rtc, ktime_t requested) { if (rtc->alarm_offset_max && rtc->alarm_offset_max * MSEC_PER_SEC < ktime_to_ms(requested)) return ms_to_ktime(rtc->alarm_offset_max * MSEC_PER_SEC); return requested; } #define devm_rtc_register_device(device) \ __devm_rtc_register_device(THIS_MODULE, device) #ifdef CONFIG_RTC_HCTOSYS_DEVICE extern int rtc_hctosys_ret; #else #define rtc_hctosys_ret -ENODEV #endif #ifdef CONFIG_RTC_NVMEM int devm_rtc_nvmem_register(struct rtc_device *rtc, struct nvmem_config *nvmem_config); #else static inline int devm_rtc_nvmem_register(struct rtc_device *rtc, struct nvmem_config *nvmem_config) { return 0; } #endif #ifdef CONFIG_RTC_INTF_SYSFS int rtc_add_group(struct rtc_device *rtc, const struct attribute_group *grp); int rtc_add_groups(struct rtc_device *rtc, const struct attribute_group **grps); #else static inline int rtc_add_group(struct rtc_device *rtc, const struct attribute_group *grp) { return 0; } static inline int rtc_add_groups(struct rtc_device *rtc, const struct attribute_group **grps) { return 0; } #endif #endif /* _LINUX_RTC_H_ */
125 125 11 11 11 2834 2838 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 // SPDX-License-Identifier: GPL-2.0 /* * Block rq-qos policy for assigning an I/O priority class to requests. * * Using an rq-qos policy for assigning I/O priority class has two advantages * over using the ioprio_set() system call: * * - This policy is cgroup based so it has all the advantages of cgroups. * - While ioprio_set() does not affect page cache writeback I/O, this rq-qos * controller affects page cache writeback I/O for filesystems that support * assiociating a cgroup with writeback I/O. See also * Documentation/admin-guide/cgroup-v2.rst. */ #include <linux/blk-mq.h> #include <linux/blk_types.h> #include <linux/kernel.h> #include <linux/module.h> #include "blk-cgroup.h" #include "blk-ioprio.h" #include "blk-rq-qos.h" /** * enum prio_policy - I/O priority class policy. * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class. * @POLICY_PROMOTE_TO_RT: modify no-IOPRIO_CLASS_RT to IOPRIO_CLASS_RT. * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into * IOPRIO_CLASS_BE. * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE. * @POLICY_NONE_TO_RT: an alias for POLICY_PROMOTE_TO_RT. * * See also <linux/ioprio.h>. */ enum prio_policy { POLICY_NO_CHANGE = 0, POLICY_PROMOTE_TO_RT = 1, POLICY_RESTRICT_TO_BE = 2, POLICY_ALL_TO_IDLE = 3, POLICY_NONE_TO_RT = 4, }; static const char *policy_name[] = { [POLICY_NO_CHANGE] = "no-change", [POLICY_PROMOTE_TO_RT] = "promote-to-rt", [POLICY_RESTRICT_TO_BE] = "restrict-to-be", [POLICY_ALL_TO_IDLE] = "idle", [POLICY_NONE_TO_RT] = "none-to-rt", }; static struct blkcg_policy ioprio_policy; /** * struct ioprio_blkg - Per (cgroup, request queue) data. * @pd: blkg_policy_data structure. */ struct ioprio_blkg { struct blkg_policy_data pd; }; /** * struct ioprio_blkcg - Per cgroup data. * @cpd: blkcg_policy_data structure. * @prio_policy: One of the IOPRIO_CLASS_* values. See also <linux/ioprio.h>. */ struct ioprio_blkcg { struct blkcg_policy_data cpd; enum prio_policy prio_policy; }; static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd) { return pd ? container_of(pd, struct ioprio_blkg, pd) : NULL; } static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg) { return container_of(blkcg_to_cpd(blkcg, &ioprio_policy), struct ioprio_blkcg, cpd); } static struct ioprio_blkcg * ioprio_blkcg_from_css(struct cgroup_subsys_state *css) { return blkcg_to_ioprio_blkcg(css_to_blkcg(css)); } static struct ioprio_blkcg *ioprio_blkcg_from_bio(struct bio *bio) { struct blkg_policy_data *pd = blkg_to_pd(bio->bi_blkg, &ioprio_policy); if (!pd) return NULL; return blkcg_to_ioprio_blkcg(pd->blkg->blkcg); } static int ioprio_show_prio_policy(struct seq_file *sf, void *v) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf)); seq_printf(sf, "%s\n", policy_name[blkcg->prio_policy]); return 0; } static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(of_css(of)); int ret; if (off != 0) return -EIO; /* kernfs_fop_write_iter() terminates 'buf' with '\0'. */ ret = sysfs_match_string(policy_name, buf); if (ret < 0) return ret; blkcg->prio_policy = ret; return nbytes; } static struct blkg_policy_data * ioprio_alloc_pd(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) { struct ioprio_blkg *ioprio_blkg; ioprio_blkg = kzalloc(sizeof(*ioprio_blkg), gfp); if (!ioprio_blkg) return NULL; return &ioprio_blkg->pd; } static void ioprio_free_pd(struct blkg_policy_data *pd) { struct ioprio_blkg *ioprio_blkg = pd_to_ioprio(pd); kfree(ioprio_blkg); } static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp) { struct ioprio_blkcg *blkcg; blkcg = kzalloc(sizeof(*blkcg), gfp); if (!blkcg) return NULL; blkcg->prio_policy = POLICY_NO_CHANGE; return &blkcg->cpd; } static void ioprio_free_cpd(struct blkcg_policy_data *cpd) { struct ioprio_blkcg *blkcg = container_of(cpd, typeof(*blkcg), cpd); kfree(blkcg); } #define IOPRIO_ATTRS \ { \ .name = "prio.class", \ .seq_show = ioprio_show_prio_policy, \ .write = ioprio_set_prio_policy, \ }, \ { } /* sentinel */ /* cgroup v2 attributes */ static struct cftype ioprio_files[] = { IOPRIO_ATTRS }; /* cgroup v1 attributes */ static struct cftype ioprio_legacy_files[] = { IOPRIO_ATTRS }; static struct blkcg_policy ioprio_policy = { .dfl_cftypes = ioprio_files, .legacy_cftypes = ioprio_legacy_files, .cpd_alloc_fn = ioprio_alloc_cpd, .cpd_free_fn = ioprio_free_cpd, .pd_alloc_fn = ioprio_alloc_pd, .pd_free_fn = ioprio_free_pd, }; void blkcg_set_ioprio(struct bio *bio) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio); u16 prio; if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE) return; if (blkcg->prio_policy == POLICY_PROMOTE_TO_RT || blkcg->prio_policy == POLICY_NONE_TO_RT) { /* * For RT threads, the default priority level is 4 because * task_nice is 0. By promoting non-RT io-priority to RT-class * and default level 4, those requests that are already * RT-class but need a higher io-priority can use ioprio_set() * to achieve this. */ if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) != IOPRIO_CLASS_RT) bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 4); return; } /* * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers * correspond to a lower priority. Hence, the max_t() below selects * the lower priority of bi_ioprio and the cgroup I/O priority class. * If the bio I/O priority equals IOPRIO_CLASS_NONE, the cgroup I/O * priority is assigned to the bio. */ prio = max_t(u16, bio->bi_ioprio, IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); if (prio > bio->bi_ioprio) bio->bi_ioprio = prio; } void blk_ioprio_exit(struct gendisk *disk) { blkcg_deactivate_policy(disk, &ioprio_policy); } int blk_ioprio_init(struct gendisk *disk) { return blkcg_activate_policy(disk, &ioprio_policy); } static int __init ioprio_init(void) { return blkcg_policy_register(&ioprio_policy); } static void __exit ioprio_exit(void) { blkcg_policy_unregister(&ioprio_policy); } module_init(ioprio_init); module_exit(ioprio_exit);
28 28 28 1504 1503 1475 28 28 29 28 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/types.h> #include <linux/ip.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <net/netns/generic.h> #include <net/route.h> #include <net/ip.h> #include <linux/netfilter_bridge.h> #include <linux/netfilter_ipv4.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif #include <net/netfilter/nf_conntrack_zones.h> static DEFINE_MUTEX(defrag4_mutex); static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb, u_int32_t user) { int err; local_bh_disable(); err = ip_defrag(net, skb, user); local_bh_enable(); if (!err) skb->ignore_df = 1; return err; } static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, struct sk_buff *skb) { u16 zone_id = NF_CT_DEFAULT_ZONE_ID; #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (skb_nfct(skb)) { enum ip_conntrack_info ctinfo; const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo)); } #endif if (nf_bridge_in_prerouting(skb)) return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id; if (hooknum == NF_INET_PRE_ROUTING) return IP_DEFRAG_CONNTRACK_IN + zone_id; else return IP_DEFRAG_CONNTRACK_OUT + zone_id; } static unsigned int ipv4_conntrack_defrag(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct sock *sk = skb->sk; if (sk && sk_fullsock(sk) && (sk->sk_family == PF_INET) && inet_test_bit(NODEFRAG, sk)) return NF_ACCEPT; #if IS_ENABLED(CONFIG_NF_CONNTRACK) #if !IS_ENABLED(CONFIG_NF_NAT) /* Previously seen (loopback)? Ignore. Do this before fragment check. */ if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb))) return NF_ACCEPT; #endif if (skb->_nfct == IP_CT_UNTRACKED) return NF_ACCEPT; #endif /* Gather fragments. */ if (ip_is_fragment(ip_hdr(skb))) { enum ip_defrag_users user = nf_ct_defrag_user(state->hook, skb); if (nf_ct_ipv4_gather_frags(state->net, skb, user)) return NF_STOLEN; } return NF_ACCEPT; } static const struct nf_hook_ops ipv4_defrag_ops[] = { { .hook = ipv4_conntrack_defrag, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, }, { .hook = ipv4_conntrack_defrag, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_CONNTRACK_DEFRAG, }, }; static void __net_exit defrag4_net_exit(struct net *net) { if (net->nf.defrag_ipv4_users) { nf_unregister_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); net->nf.defrag_ipv4_users = 0; } } static const struct nf_defrag_hook defrag_hook = { .owner = THIS_MODULE, .enable = nf_defrag_ipv4_enable, .disable = nf_defrag_ipv4_disable, }; static struct pernet_operations defrag4_net_ops = { .exit = defrag4_net_exit, }; static int __init nf_defrag_init(void) { int err; err = register_pernet_subsys(&defrag4_net_ops); if (err) return err; rcu_assign_pointer(nf_defrag_v4_hook, &defrag_hook); return err; } static void __exit nf_defrag_fini(void) { rcu_assign_pointer(nf_defrag_v4_hook, NULL); unregister_pernet_subsys(&defrag4_net_ops); } int nf_defrag_ipv4_enable(struct net *net) { int err = 0; mutex_lock(&defrag4_mutex); if (net->nf.defrag_ipv4_users == UINT_MAX) { err = -EOVERFLOW; goto out_unlock; } if (net->nf.defrag_ipv4_users) { net->nf.defrag_ipv4_users++; goto out_unlock; } err = nf_register_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); if (err == 0) net->nf.defrag_ipv4_users = 1; out_unlock: mutex_unlock(&defrag4_mutex); return err; } EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable); void nf_defrag_ipv4_disable(struct net *net) { mutex_lock(&defrag4_mutex); if (net->nf.defrag_ipv4_users) { net->nf.defrag_ipv4_users--; if (net->nf.defrag_ipv4_users == 0) nf_unregister_net_hooks(net, ipv4_defrag_ops, ARRAY_SIZE(ipv4_defrag_ops)); } mutex_unlock(&defrag4_mutex); } EXPORT_SYMBOL_GPL(nf_defrag_ipv4_disable); module_init(nf_defrag_init); module_exit(nf_defrag_fini); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IPv4 defragmentation support");
208 206 23 11 11 205 317 318 48 10 16 201 99 104 32 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_queue.h> #include <net/ip6_checksum.h> #ifdef CONFIG_INET __sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u8 protocol) { const struct iphdr *iph = ip_hdr(skb); __sum16 csum = 0; switch (skb->ip_summed) { case CHECKSUM_COMPLETE: if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) break; if ((protocol != IPPROTO_TCP && protocol != IPPROTO_UDP && !csum_fold(skb->csum)) || !csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len - dataoff, protocol, skb->csum)) { skb->ip_summed = CHECKSUM_UNNECESSARY; break; } fallthrough; case CHECKSUM_NONE: if (protocol != IPPROTO_TCP && protocol != IPPROTO_UDP) skb->csum = 0; else skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, skb->len - dataoff, protocol, 0); csum = __skb_checksum_complete(skb); } return csum; } EXPORT_SYMBOL(nf_ip_checksum); #endif static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, unsigned int len, u8 protocol) { const struct iphdr *iph = ip_hdr(skb); __sum16 csum = 0; switch (skb->ip_summed) { case CHECKSUM_COMPLETE: if (len == skb->len - dataoff) return nf_ip_checksum(skb, hook, dataoff, protocol); fallthrough; case CHECKSUM_NONE: skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol, skb->len - dataoff, 0); skb->ip_summed = CHECKSUM_NONE; return __skb_checksum_complete_head(skb, dataoff + len); } return csum; } __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u8 protocol) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); __sum16 csum = 0; switch (skb->ip_summed) { case CHECKSUM_COMPLETE: if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN) break; if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, skb->len - dataoff, protocol, csum_sub(skb->csum, skb_checksum(skb, 0, dataoff, 0)))) { skb->ip_summed = CHECKSUM_UNNECESSARY; break; } fallthrough; case CHECKSUM_NONE: skb->csum = ~csum_unfold( csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, skb->len - dataoff, protocol, csum_sub(0, skb_checksum(skb, 0, dataoff, 0)))); csum = __skb_checksum_complete(skb); } return csum; } EXPORT_SYMBOL(nf_ip6_checksum); static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, unsigned int len, u8 protocol) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); __wsum hsum; __sum16 csum = 0; switch (skb->ip_summed) { case CHECKSUM_COMPLETE: if (len == skb->len - dataoff) return nf_ip6_checksum(skb, hook, dataoff, protocol); fallthrough; case CHECKSUM_NONE: hsum = skb_checksum(skb, 0, dataoff, 0); skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, skb->len - dataoff, protocol, csum_sub(0, hsum))); skb->ip_summed = CHECKSUM_NONE; return __skb_checksum_complete_head(skb, dataoff + len); } return csum; }; __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u8 protocol, unsigned short family) { __sum16 csum = 0; switch (family) { case AF_INET: csum = nf_ip_checksum(skb, hook, dataoff, protocol); break; case AF_INET6: csum = nf_ip6_checksum(skb, hook, dataoff, protocol); break; } return csum; } EXPORT_SYMBOL_GPL(nf_checksum); __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, unsigned int len, u8 protocol, unsigned short family) { __sum16 csum = 0; switch (family) { case AF_INET: csum = nf_ip_checksum_partial(skb, hook, dataoff, len, protocol); break; case AF_INET6: csum = nf_ip6_checksum_partial(skb, hook, dataoff, len, protocol); break; } return csum; } EXPORT_SYMBOL_GPL(nf_checksum_partial); int nf_route(struct net *net, struct dst_entry **dst, struct flowi *fl, bool strict, unsigned short family) { const struct nf_ipv6_ops *v6ops __maybe_unused; int ret = 0; switch (family) { case AF_INET: ret = nf_ip_route(net, dst, fl, strict); break; case AF_INET6: ret = nf_ip6_route(net, dst, fl, strict); break; } return ret; } EXPORT_SYMBOL_GPL(nf_route); static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) { #ifdef CONFIG_INET const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); if (entry->state.hook == NF_INET_LOCAL_OUT) { const struct iphdr *iph = ip_hdr(skb); if (!(iph->tos == rt_info->tos && skb->mark == rt_info->mark && iph->daddr == rt_info->daddr && iph->saddr == rt_info->saddr)) return ip_route_me_harder(entry->state.net, entry->state.sk, skb, RTN_UNSPEC); } #endif return 0; } int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry) { const struct nf_ipv6_ops *v6ops; int ret = 0; switch (entry->state.pf) { case AF_INET: ret = nf_ip_reroute(skb, entry); break; case AF_INET6: v6ops = rcu_dereference(nf_ipv6_ops); if (v6ops) ret = v6ops->reroute(skb, entry); break; } return ret; } /* Only get and check the lengths, not do any hop-by-hop stuff. */ int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen) { int len, off = sizeof(struct ipv6hdr); unsigned char *nh; if (!pskb_may_pull(skb, off + 8)) return -ENOMEM; nh = (unsigned char *)(ipv6_hdr(skb) + 1); len = (nh[1] + 1) << 3; if (!pskb_may_pull(skb, off + len)) return -ENOMEM; nh = skb_network_header(skb); off += 2; len -= 2; while (len > 0) { int optlen; if (nh[off] == IPV6_TLV_PAD1) { off++; len--; continue; } if (len < 2) return -EBADMSG; optlen = nh[off + 1] + 2; if (optlen > len) return -EBADMSG; if (nh[off] == IPV6_TLV_JUMBO) { u32 pkt_len; if (nh[off + 1] != 4 || (off & 3) != 2) return -EBADMSG; pkt_len = ntohl(*(__be32 *)(nh + off + 2)); if (pkt_len <= IPV6_MAXPLEN || ipv6_hdr(skb)->payload_len) return -EBADMSG; if (pkt_len > skb->len - sizeof(struct ipv6hdr)) return -EBADMSG; *plen = pkt_len; } off += optlen; len -= optlen; } return len ? -EBADMSG : 0; } EXPORT_SYMBOL_GPL(nf_ip6_check_hbh_len);
28 470 470 42 1 42 1 42 42 41 1 42 42 42 41 46 46 72 72 38 46 38 50 28 9 23 24 16 8 8 24 17 1 7 20 4 23 23 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 // SPDX-License-Identifier: GPL-2.0-or-later /* * inet fragments management * * Authors: Pavel Emelyanov <xemul@openvz.org> * Started as consolidation of ipv4/ip_fragment.c, * ipv6/reassembly. and ipv6 nf conntrack reassembly */ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/module.h> #include <linux/timer.h> #include <linux/mm.h> #include <linux/random.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/rhashtable.h> #include <net/sock.h> #include <net/inet_frag.h> #include <net/inet_ecn.h> #include <net/ip.h> #include <net/ipv6.h> /* Use skb->cb to track consecutive/adjacent fragments coming at * the end of the queue. Nodes in the rb-tree queue will * contain "runs" of one or more adjacent fragments. * * Invariants: * - next_frag is NULL at the tail of a "run"; * - the head of a "run" has the sum of all fragment lengths in frag_run_len. */ struct ipfrag_skb_cb { union { struct inet_skb_parm h4; struct inet6_skb_parm h6; }; struct sk_buff *next_frag; int frag_run_len; }; #define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) static void fragcb_clear(struct sk_buff *skb) { RB_CLEAR_NODE(&skb->rbnode); FRAG_CB(skb)->next_frag = NULL; FRAG_CB(skb)->frag_run_len = skb->len; } /* Append skb to the last "run". */ static void fragrun_append_to_last(struct inet_frag_queue *q, struct sk_buff *skb) { fragcb_clear(skb); FRAG_CB(q->last_run_head)->frag_run_len += skb->len; FRAG_CB(q->fragments_tail)->next_frag = skb; q->fragments_tail = skb; } /* Create a new "run" with the skb. */ static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb) { BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); fragcb_clear(skb); if (q->last_run_head) rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, &q->last_run_head->rbnode.rb_right); else rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); rb_insert_color(&skb->rbnode, &q->rb_fragments); q->fragments_tail = skb; q->last_run_head = skb; } /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field */ const u8 ip_frag_ecn_table[16] = { /* at least one fragment had CE, and others ECT_0 or ECT_1 */ [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = INET_ECN_CE, [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = INET_ECN_CE, [IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = INET_ECN_CE, /* invalid combinations : drop frame */ [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE] = 0xff, [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0] = 0xff, [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_1] = 0xff, [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0] = 0xff, [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_1] = 0xff, [IPFRAG_ECN_NOT_ECT | IPFRAG_ECN_CE | IPFRAG_ECN_ECT_0 | IPFRAG_ECN_ECT_1] = 0xff, }; EXPORT_SYMBOL(ip_frag_ecn_table); int inet_frags_init(struct inet_frags *f) { f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, NULL); if (!f->frags_cachep) return -ENOMEM; refcount_set(&f->refcnt, 1); init_completion(&f->completion); return 0; } EXPORT_SYMBOL(inet_frags_init); void inet_frags_fini(struct inet_frags *f) { if (refcount_dec_and_test(&f->refcnt)) complete(&f->completion); wait_for_completion(&f->completion); kmem_cache_destroy(f->frags_cachep); f->frags_cachep = NULL; } EXPORT_SYMBOL(inet_frags_fini); /* called from rhashtable_free_and_destroy() at netns_frags dismantle */ static void inet_frags_free_cb(void *ptr, void *arg) { struct inet_frag_queue *fq = ptr; int count; count = del_timer_sync(&fq->timer) ? 1 : 0; spin_lock_bh(&fq->lock); fq->flags |= INET_FRAG_DROP; if (!(fq->flags & INET_FRAG_COMPLETE)) { fq->flags |= INET_FRAG_COMPLETE; count++; } else if (fq->flags & INET_FRAG_HASH_DEAD) { count++; } spin_unlock_bh(&fq->lock); if (refcount_sub_and_test(count, &fq->refcnt)) inet_frag_destroy(fq); } static LLIST_HEAD(fqdir_free_list); static void fqdir_free_fn(struct work_struct *work) { struct llist_node *kill_list; struct fqdir *fqdir, *tmp; struct inet_frags *f; /* Atomically snapshot the list of fqdirs to free */ kill_list = llist_del_all(&fqdir_free_list); /* We need to make sure all ongoing call_rcu(..., inet_frag_destroy_rcu) * have completed, since they need to dereference fqdir. * Would it not be nice to have kfree_rcu_barrier() ? :) */ rcu_barrier(); llist_for_each_entry_safe(fqdir, tmp, kill_list, free_list) { f = fqdir->f; if (refcount_dec_and_test(&f->refcnt)) complete(&f->completion); kfree(fqdir); } } static DECLARE_WORK(fqdir_free_work, fqdir_free_fn); static void fqdir_work_fn(struct work_struct *work) { struct fqdir *fqdir = container_of(work, struct fqdir, destroy_work); rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); if (llist_add(&fqdir->free_list, &fqdir_free_list)) queue_work(system_wq, &fqdir_free_work); } int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) { struct fqdir *fqdir = kzalloc(sizeof(*fqdir), GFP_KERNEL); int res; if (!fqdir) return -ENOMEM; fqdir->f = f; fqdir->net = net; res = rhashtable_init(&fqdir->rhashtable, &fqdir->f->rhash_params); if (res < 0) { kfree(fqdir); return res; } refcount_inc(&f->refcnt); *fqdirp = fqdir; return 0; } EXPORT_SYMBOL(fqdir_init); static struct workqueue_struct *inet_frag_wq; static int __init inet_frag_wq_init(void) { inet_frag_wq = create_workqueue("inet_frag_wq"); if (!inet_frag_wq) panic("Could not create inet frag workq"); return 0; } pure_initcall(inet_frag_wq_init); void fqdir_exit(struct fqdir *fqdir) { INIT_WORK(&fqdir->destroy_work, fqdir_work_fn); queue_work(inet_frag_wq, &fqdir->destroy_work); } EXPORT_SYMBOL(fqdir_exit); void inet_frag_kill(struct inet_frag_queue *fq) { if (del_timer(&fq->timer)) refcount_dec(&fq->refcnt); if (!(fq->flags & INET_FRAG_COMPLETE)) { struct fqdir *fqdir = fq->fqdir; fq->flags |= INET_FRAG_COMPLETE; rcu_read_lock(); /* The RCU read lock provides a memory barrier * guaranteeing that if fqdir->dead is false then * the hash table destruction will not start until * after we unlock. Paired with fqdir_pre_exit(). */ if (!READ_ONCE(fqdir->dead)) { rhashtable_remove_fast(&fqdir->rhashtable, &fq->node, fqdir->f->rhash_params); refcount_dec(&fq->refcnt); } else { fq->flags |= INET_FRAG_HASH_DEAD; } rcu_read_unlock(); } } EXPORT_SYMBOL(inet_frag_kill); static void inet_frag_destroy_rcu(struct rcu_head *head) { struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, rcu); struct inet_frags *f = q->fqdir->f; if (f->destructor) f->destructor(q); kmem_cache_free(f->frags_cachep, q); } unsigned int inet_frag_rbtree_purge(struct rb_root *root, enum skb_drop_reason reason) { struct rb_node *p = rb_first(root); unsigned int sum = 0; while (p) { struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); p = rb_next(p); rb_erase(&skb->rbnode, root); while (skb) { struct sk_buff *next = FRAG_CB(skb)->next_frag; sum += skb->truesize; kfree_skb_reason(skb, reason); skb = next; } } return sum; } EXPORT_SYMBOL(inet_frag_rbtree_purge); void inet_frag_destroy(struct inet_frag_queue *q) { unsigned int sum, sum_truesize = 0; enum skb_drop_reason reason; struct inet_frags *f; struct fqdir *fqdir; WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); reason = (q->flags & INET_FRAG_DROP) ? SKB_DROP_REASON_FRAG_REASM_TIMEOUT : SKB_CONSUMED; WARN_ON(del_timer(&q->timer) != 0); /* Release all fragment data. */ fqdir = q->fqdir; f = fqdir->f; sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments, reason); sum = sum_truesize + f->qsize; call_rcu(&q->rcu, inet_frag_destroy_rcu); sub_frag_mem_limit(fqdir, sum); } EXPORT_SYMBOL(inet_frag_destroy); static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir, struct inet_frags *f, void *arg) { struct inet_frag_queue *q; q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); if (!q) return NULL; q->fqdir = fqdir; f->constructor(q, arg); add_frag_mem_limit(fqdir, f->qsize); timer_setup(&q->timer, f->frag_expire, 0); spin_lock_init(&q->lock); refcount_set(&q->refcnt, 3); return q; } static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir, void *arg, struct inet_frag_queue **prev) { struct inet_frags *f = fqdir->f; struct inet_frag_queue *q; q = inet_frag_alloc(fqdir, f, arg); if (!q) { *prev = ERR_PTR(-ENOMEM); return NULL; } mod_timer(&q->timer, jiffies + fqdir->timeout); *prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key, &q->node, f->rhash_params); if (*prev) { q->flags |= INET_FRAG_COMPLETE; inet_frag_kill(q); inet_frag_destroy(q); return NULL; } return q; } /* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key) { /* This pairs with WRITE_ONCE() in fqdir_pre_exit(). */ long high_thresh = READ_ONCE(fqdir->high_thresh); struct inet_frag_queue *fq = NULL, *prev; if (!high_thresh || frag_mem_limit(fqdir) > high_thresh) return NULL; rcu_read_lock(); prev = rhashtable_lookup(&fqdir->rhashtable, key, fqdir->f->rhash_params); if (!prev) fq = inet_frag_create(fqdir, key, &prev); if (!IS_ERR_OR_NULL(prev)) { fq = prev; if (!refcount_inc_not_zero(&fq->refcnt)) fq = NULL; } rcu_read_unlock(); return fq; } EXPORT_SYMBOL(inet_frag_find); int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, int offset, int end) { struct sk_buff *last = q->fragments_tail; /* RFC5722, Section 4, amended by Errata ID : 3089 * When reassembling an IPv6 datagram, if * one or more its constituent fragments is determined to be an * overlapping fragment, the entire datagram (and any constituent * fragments) MUST be silently discarded. * * Duplicates, however, should be ignored (i.e. skb dropped, but the * queue/fragments kept for later reassembly). */ if (!last) fragrun_create(q, skb); /* First fragment. */ else if (last->ip_defrag_offset + last->len < end) { /* This is the common case: skb goes to the end. */ /* Detect and discard overlaps. */ if (offset < last->ip_defrag_offset + last->len) return IPFRAG_OVERLAP; if (offset == last->ip_defrag_offset + last->len) fragrun_append_to_last(q, skb); else fragrun_create(q, skb); } else { /* Binary search. Note that skb can become the first fragment, * but not the last (covered above). */ struct rb_node **rbn, *parent; rbn = &q->rb_fragments.rb_node; do { struct sk_buff *curr; int curr_run_end; parent = *rbn; curr = rb_to_skb(parent); curr_run_end = curr->ip_defrag_offset + FRAG_CB(curr)->frag_run_len; if (end <= curr->ip_defrag_offset) rbn = &parent->rb_left; else if (offset >= curr_run_end) rbn = &parent->rb_right; else if (offset >= curr->ip_defrag_offset && end <= curr_run_end) return IPFRAG_DUP; else return IPFRAG_OVERLAP; } while (*rbn); /* Here we have parent properly set, and rbn pointing to * one of its NULL left/right children. Insert skb. */ fragcb_clear(skb); rb_link_node(&skb->rbnode, parent, rbn); rb_insert_color(&skb->rbnode, &q->rb_fragments); } skb->ip_defrag_offset = offset; return IPFRAG_OK; } EXPORT_SYMBOL(inet_frag_queue_insert); void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, struct sk_buff *parent) { struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments); struct sk_buff **nextp; int delta; if (head != skb) { fp = skb_clone(skb, GFP_ATOMIC); if (!fp) return NULL; FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; if (RB_EMPTY_NODE(&skb->rbnode)) FRAG_CB(parent)->next_frag = fp; else rb_replace_node(&skb->rbnode, &fp->rbnode, &q->rb_fragments); if (q->fragments_tail == skb) q->fragments_tail = fp; skb_morph(skb, head); FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; rb_replace_node(&head->rbnode, &skb->rbnode, &q->rb_fragments); consume_skb(head); head = skb; } WARN_ON(head->ip_defrag_offset != 0); delta = -head->truesize; /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) return NULL; delta += head->truesize; if (delta) add_frag_mem_limit(q->fqdir, delta); /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; clone = alloc_skb(0, GFP_ATOMIC); if (!clone) return NULL; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_frag_list_init(head); for (i = 0; i < skb_shinfo(head)->nr_frags; i++) plen += skb_frag_size(&skb_shinfo(head)->frags[i]); clone->data_len = head->data_len - plen; clone->len = clone->data_len; head->truesize += clone->truesize; clone->csum = 0; clone->ip_summed = head->ip_summed; add_frag_mem_limit(q->fqdir, clone->truesize); skb_shinfo(head)->frag_list = clone; nextp = &clone->next; } else { nextp = &skb_shinfo(head)->frag_list; } return nextp; } EXPORT_SYMBOL(inet_frag_reasm_prepare); void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, void *reasm_data, bool try_coalesce) { struct sk_buff **nextp = reasm_data; struct rb_node *rbn; struct sk_buff *fp; int sum_truesize; skb_push(head, head->data - skb_network_header(head)); /* Traverse the tree in order, to build frag_list. */ fp = FRAG_CB(head)->next_frag; rbn = rb_next(&head->rbnode); rb_erase(&head->rbnode, &q->rb_fragments); sum_truesize = head->truesize; while (rbn || fp) { /* fp points to the next sk_buff in the current run; * rbn points to the next run. */ /* Go through the current run. */ while (fp) { struct sk_buff *next_frag = FRAG_CB(fp)->next_frag; bool stolen; int delta; sum_truesize += fp->truesize; if (head->ip_summed != fp->ip_summed) head->ip_summed = CHECKSUM_NONE; else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); if (try_coalesce && skb_try_coalesce(head, fp, &stolen, &delta)) { kfree_skb_partial(fp, stolen); } else { fp->prev = NULL; memset(&fp->rbnode, 0, sizeof(fp->rbnode)); fp->sk = NULL; head->data_len += fp->len; head->len += fp->len; head->truesize += fp->truesize; *nextp = fp; nextp = &fp->next; } fp = next_frag; } /* Move to the next run. */ if (rbn) { struct rb_node *rbnext = rb_next(rbn); fp = rb_to_skb(rbn); rb_erase(rbn, &q->rb_fragments); rbn = rbnext; } } sub_frag_mem_limit(q->fqdir, sum_truesize); *nextp = NULL; skb_mark_not_on_list(head); head->prev = NULL; head->tstamp = q->stamp; head->mono_delivery_time = q->mono_delivery_time; } EXPORT_SYMBOL(inet_frag_reasm_finish); struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q) { struct sk_buff *head, *skb; head = skb_rb_first(&q->rb_fragments); if (!head) return NULL; skb = FRAG_CB(head)->next_frag; if (skb) rb_replace_node(&head->rbnode, &skb->rbnode, &q->rb_fragments); else rb_erase(&head->rbnode, &q->rb_fragments); memset(&head->rbnode, 0, sizeof(head->rbnode)); barrier(); if (head == q->fragments_tail) q->fragments_tail = NULL; sub_frag_mem_limit(q->fqdir, head->truesize); return head; } EXPORT_SYMBOL(inet_frag_pull_head);
793 519 846 846 848 847 908 169 860 847 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2005,2006,2007,2008 IBM Corporation * * Authors: * Serge Hallyn <serue@us.ibm.com> * Reiner Sailer <sailer@watson.ibm.com> * Mimi Zohar <zohar@us.ibm.com> * * File: ima_queue.c * Implements queues that store template measurements and * maintains aggregate over the stored measurements * in the pre-configured TPM PCR (if available). * The measurement list is append-only. No entry is * ever removed or changed during the boot-cycle. */ #include <linux/rculist.h> #include <linux/slab.h> #include "ima.h" #define AUDIT_CAUSE_LEN_MAX 32 /* pre-allocated array of tpm_digest structures to extend a PCR */ static struct tpm_digest *digests; LIST_HEAD(ima_measurements); /* list of all measurements */ #ifdef CONFIG_IMA_KEXEC static unsigned long binary_runtime_size; #else static unsigned long binary_runtime_size = ULONG_MAX; #endif /* key: inode (before secure-hashing a file) */ struct ima_h_table ima_htable = { .len = ATOMIC_LONG_INIT(0), .violations = ATOMIC_LONG_INIT(0), .queue[0 ... IMA_MEASURE_HTABLE_SIZE - 1] = HLIST_HEAD_INIT }; /* mutex protects atomicity of extending measurement list * and extending the TPM PCR aggregate. Since tpm_extend can take * long (and the tpm driver uses a mutex), we can't use the spinlock. */ static DEFINE_MUTEX(ima_extend_list_mutex); /* lookup up the digest value in the hash table, and return the entry */ static struct ima_queue_entry *ima_lookup_digest_entry(u8 *digest_value, int pcr) { struct ima_queue_entry *qe, *ret = NULL; unsigned int key; int rc; key = ima_hash_key(digest_value); rcu_read_lock(); hlist_for_each_entry_rcu(qe, &ima_htable.queue[key], hnext) { rc = memcmp(qe->entry->digests[ima_hash_algo_idx].digest, digest_value, hash_digest_size[ima_hash_algo]); if ((rc == 0) && (qe->entry->pcr == pcr)) { ret = qe; break; } } rcu_read_unlock(); return ret; } /* * Calculate the memory required for serializing a single * binary_runtime_measurement list entry, which contains a * couple of variable length fields (e.g template name and data). */ static int get_binary_runtime_size(struct ima_template_entry *entry) { int size = 0; size += sizeof(u32); /* pcr */ size += TPM_DIGEST_SIZE; size += sizeof(int); /* template name size field */ size += strlen(entry->template_desc->name); size += sizeof(entry->template_data_len); size += entry->template_data_len; return size; } /* ima_add_template_entry helper function: * - Add template entry to the measurement list and hash table, for * all entries except those carried across kexec. * * (Called with ima_extend_list_mutex held.) */ static int ima_add_digest_entry(struct ima_template_entry *entry, bool update_htable) { struct ima_queue_entry *qe; unsigned int key; qe = kmalloc(sizeof(*qe), GFP_KERNEL); if (qe == NULL) { pr_err("OUT OF MEMORY ERROR creating queue entry\n"); return -ENOMEM; } qe->entry = entry; INIT_LIST_HEAD(&qe->later); list_add_tail_rcu(&qe->later, &ima_measurements); atomic_long_inc(&ima_htable.len); if (update_htable) { key = ima_hash_key(entry->digests[ima_hash_algo_idx].digest); hlist_add_head_rcu(&qe->hnext, &ima_htable.queue[key]); } if (binary_runtime_size != ULONG_MAX) { int size; size = get_binary_runtime_size(entry); binary_runtime_size = (binary_runtime_size < ULONG_MAX - size) ? binary_runtime_size + size : ULONG_MAX; } return 0; } /* * Return the amount of memory required for serializing the * entire binary_runtime_measurement list, including the ima_kexec_hdr * structure. */ unsigned long ima_get_binary_runtime_size(void) { if (binary_runtime_size >= (ULONG_MAX - sizeof(struct ima_kexec_hdr))) return ULONG_MAX; else return binary_runtime_size + sizeof(struct ima_kexec_hdr); } static int ima_pcr_extend(struct tpm_digest *digests_arg, int pcr) { int result = 0; if (!ima_tpm_chip) return result; result = tpm_pcr_extend(ima_tpm_chip, pcr, digests_arg); if (result != 0) pr_err("Error Communicating to TPM chip, result: %d\n", result); return result; } /* * Add template entry to the measurement list and hash table, and * extend the pcr. * * On systems which support carrying the IMA measurement list across * kexec, maintain the total memory size required for serializing the * binary_runtime_measurements. */ int ima_add_template_entry(struct ima_template_entry *entry, int violation, const char *op, struct inode *inode, const unsigned char *filename) { u8 *digest = entry->digests[ima_hash_algo_idx].digest; struct tpm_digest *digests_arg = entry->digests; const char *audit_cause = "hash_added"; char tpm_audit_cause[AUDIT_CAUSE_LEN_MAX]; int audit_info = 1; int result = 0, tpmresult = 0; mutex_lock(&ima_extend_list_mutex); if (!violation && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)) { if (ima_lookup_digest_entry(digest, entry->pcr)) { audit_cause = "hash_exists"; result = -EEXIST; goto out; } } result = ima_add_digest_entry(entry, !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE)); if (result < 0) { audit_cause = "ENOMEM"; audit_info = 0; goto out; } if (violation) /* invalidate pcr */ digests_arg = digests; tpmresult = ima_pcr_extend(digests_arg, entry->pcr); if (tpmresult != 0) { snprintf(tpm_audit_cause, AUDIT_CAUSE_LEN_MAX, "TPM_error(%d)", tpmresult); audit_cause = tpm_audit_cause; audit_info = 0; } out: mutex_unlock(&ima_extend_list_mutex); integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode, filename, op, audit_cause, result, audit_info); return result; } int ima_restore_measurement_entry(struct ima_template_entry *entry) { int result = 0; mutex_lock(&ima_extend_list_mutex); result = ima_add_digest_entry(entry, 0); mutex_unlock(&ima_extend_list_mutex); return result; } int __init ima_init_digests(void) { u16 digest_size; u16 crypto_id; int i; if (!ima_tpm_chip) return 0; digests = kcalloc(ima_tpm_chip->nr_allocated_banks, sizeof(*digests), GFP_NOFS); if (!digests) return -ENOMEM; for (i = 0; i < ima_tpm_chip->nr_allocated_banks; i++) { digests[i].alg_id = ima_tpm_chip->allocated_banks[i].alg_id; digest_size = ima_tpm_chip->allocated_banks[i].digest_size; crypto_id = ima_tpm_chip->allocated_banks[i].crypto_id; /* for unmapped TPM algorithms digest is still a padded SHA1 */ if (crypto_id == HASH_ALGO__LAST) digest_size = SHA1_DIGEST_SIZE; memset(digests[i].digest, 0xff, digest_size); } return 0; }
26 26 26 26 23 26 26 32 1 2 3 23 2 2 2 1 1 1 1 6 3 3 4 2 2 11 11 3 3 1 126 13 8 104 9 100 137 16 21 80 37 37 59 59 15 10 3 6 15 3 7 2 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 #include <linux/bpf.h> #include <linux/btf.h> #include <linux/err.h> #include <linux/irq_work.h> #include <linux/slab.h> #include <linux/filter.h> #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/wait.h> #include <linux/poll.h> #include <linux/kmemleak.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> #define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) /* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */ #define RINGBUF_PGOFF \ (offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT) /* consumer page and producer page */ #define RINGBUF_POS_PAGES 2 #define RINGBUF_NR_META_PAGES (RINGBUF_PGOFF + RINGBUF_POS_PAGES) #define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4) struct bpf_ringbuf { wait_queue_head_t waitq; struct irq_work work; u64 mask; struct page **pages; int nr_pages; spinlock_t spinlock ____cacheline_aligned_in_smp; /* For user-space producer ring buffers, an atomic_t busy bit is used * to synchronize access to the ring buffers in the kernel, rather than * the spinlock that is used for kernel-producer ring buffers. This is * done because the ring buffer must hold a lock across a BPF program's * callback: * * __bpf_user_ringbuf_peek() // lock acquired * -> program callback_fn() * -> __bpf_user_ringbuf_sample_release() // lock released * * It is unsafe and incorrect to hold an IRQ spinlock across what could * be a long execution window, so we instead simply disallow concurrent * access to the ring buffer by kernel consumers, and return -EBUSY from * __bpf_user_ringbuf_peek() if the busy bit is held by another task. */ atomic_t busy ____cacheline_aligned_in_smp; /* Consumer and producer counters are put into separate pages to * allow each position to be mapped with different permissions. * This prevents a user-space application from modifying the * position and ruining in-kernel tracking. The permissions of the * pages depend on who is producing samples: user-space or the * kernel. * * Kernel-producer * --------------- * The producer position and data pages are mapped as r/o in * userspace. For this approach, bits in the header of samples are * used to signal to user-space, and to other producers, whether a * sample is currently being written. * * User-space producer * ------------------- * Only the page containing the consumer position is mapped r/o in * user-space. User-space producers also use bits of the header to * communicate to the kernel, but the kernel must carefully check and * validate each sample to ensure that they're correctly formatted, and * fully contained within the ring buffer. */ unsigned long consumer_pos __aligned(PAGE_SIZE); unsigned long producer_pos __aligned(PAGE_SIZE); char data[] __aligned(PAGE_SIZE); }; struct bpf_ringbuf_map { struct bpf_map map; struct bpf_ringbuf *rb; }; /* 8-byte ring buffer record header structure */ struct bpf_ringbuf_hdr { u32 len; u32 pg_off; }; static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) { const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | __GFP_ZERO; int nr_meta_pages = RINGBUF_NR_META_PAGES; int nr_data_pages = data_sz >> PAGE_SHIFT; int nr_pages = nr_meta_pages + nr_data_pages; struct page **pages, *page; struct bpf_ringbuf *rb; size_t array_size; int i; /* Each data page is mapped twice to allow "virtual" * continuous read of samples wrapping around the end of ring * buffer area: * ------------------------------------------------------ * | meta pages | real data pages | same data pages | * ------------------------------------------------------ * | | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 | * ------------------------------------------------------ * | | TA DA | TA DA | * ------------------------------------------------------ * ^^^^^^^ * | * Here, no need to worry about special handling of wrapped-around * data due to double-mapped data pages. This works both in kernel and * when mmap()'ed in user-space, simplifying both kernel and * user-space implementations significantly. */ array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages); pages = bpf_map_area_alloc(array_size, numa_node); if (!pages) return NULL; for (i = 0; i < nr_pages; i++) { page = alloc_pages_node(numa_node, flags, 0); if (!page) { nr_pages = i; goto err_free_pages; } pages[i] = page; if (i >= nr_meta_pages) pages[nr_data_pages + i] = page; } rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages, VM_MAP | VM_USERMAP, PAGE_KERNEL); if (rb) { kmemleak_not_leak(pages); rb->pages = pages; rb->nr_pages = nr_pages; return rb; } err_free_pages: for (i = 0; i < nr_pages; i++) __free_page(pages[i]); bpf_map_area_free(pages); return NULL; } static void bpf_ringbuf_notify(struct irq_work *work) { struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work); wake_up_all(&rb->waitq); } /* Maximum size of ring buffer area is limited by 32-bit page offset within * record header, counted in pages. Reserve 8 bits for extensibility, and * take into account few extra pages for consumer/producer pages and * non-mmap()'able parts, the current maximum size would be: * * (((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE) * * This gives 64GB limit, which seems plenty for single ring buffer. Now * considering that the maximum value of data_sz is (4GB - 1), there * will be no overflow, so just note the size limit in the comments. */ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) { struct bpf_ringbuf *rb; rb = bpf_ringbuf_area_alloc(data_sz, numa_node); if (!rb) return NULL; spin_lock_init(&rb->spinlock); atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); rb->mask = data_sz - 1; rb->consumer_pos = 0; rb->producer_pos = 0; return rb; } static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) { struct bpf_ringbuf_map *rb_map; if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); if (attr->key_size || attr->value_size || !is_power_of_2(attr->max_entries) || !PAGE_ALIGNED(attr->max_entries)) return ERR_PTR(-EINVAL); rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE); if (!rb_map) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&rb_map->map, attr); rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); if (!rb_map->rb) { bpf_map_area_free(rb_map); return ERR_PTR(-ENOMEM); } return &rb_map->map; } static void bpf_ringbuf_free(struct bpf_ringbuf *rb) { /* copy pages pointer and nr_pages to local variable, as we are going * to unmap rb itself with vunmap() below */ struct page **pages = rb->pages; int i, nr_pages = rb->nr_pages; vunmap(rb); for (i = 0; i < nr_pages; i++) __free_page(pages[i]); bpf_map_area_free(pages); } static void ringbuf_map_free(struct bpf_map *map) { struct bpf_ringbuf_map *rb_map; rb_map = container_of(map, struct bpf_ringbuf_map, map); bpf_ringbuf_free(rb_map->rb); bpf_map_area_free(rb_map); } static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key) { return ERR_PTR(-ENOTSUPP); } static long ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { return -ENOTSUPP; } static long ringbuf_map_delete_elem(struct bpf_map *map, void *key) { return -ENOTSUPP; } static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -ENOTSUPP; } static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_ringbuf_map *rb_map; rb_map = container_of(map, struct bpf_ringbuf_map, map); if (vma->vm_flags & VM_WRITE) { /* allow writable mapping for the consumer_pos only */ if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE) return -EPERM; } else { vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); } static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_ringbuf_map *rb_map; rb_map = container_of(map, struct bpf_ringbuf_map, map); if (vma->vm_flags & VM_WRITE) { if (vma->vm_pgoff == 0) /* Disallow writable mappings to the consumer pointer, * and allow writable mappings to both the producer * position, and the ring buffer data itself. */ return -EPERM; } else { vm_flags_clear(vma, VM_MAYWRITE); } /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); } static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb) { unsigned long cons_pos, prod_pos; cons_pos = smp_load_acquire(&rb->consumer_pos); prod_pos = smp_load_acquire(&rb->producer_pos); return prod_pos - cons_pos; } static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb) { return rb->mask + 1; } static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp, struct poll_table_struct *pts) { struct bpf_ringbuf_map *rb_map; rb_map = container_of(map, struct bpf_ringbuf_map, map); poll_wait(filp, &rb_map->rb->waitq, pts); if (ringbuf_avail_data_sz(rb_map->rb)) return EPOLLIN | EPOLLRDNORM; return 0; } static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp, struct poll_table_struct *pts) { struct bpf_ringbuf_map *rb_map; rb_map = container_of(map, struct bpf_ringbuf_map, map); poll_wait(filp, &rb_map->rb->waitq, pts); if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb)) return EPOLLOUT | EPOLLWRNORM; return 0; } static u64 ringbuf_map_mem_usage(const struct bpf_map *map) { struct bpf_ringbuf *rb; int nr_data_pages; int nr_meta_pages; u64 usage = sizeof(struct bpf_ringbuf_map); rb = container_of(map, struct bpf_ringbuf_map, map)->rb; usage += (u64)rb->nr_pages << PAGE_SHIFT; nr_meta_pages = RINGBUF_NR_META_PAGES; nr_data_pages = map->max_entries >> PAGE_SHIFT; usage += (nr_meta_pages + 2 * nr_data_pages) * sizeof(struct page *); return usage; } BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map) const struct bpf_map_ops ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, .map_mmap = ringbuf_map_mmap_kern, .map_poll = ringbuf_map_poll_kern, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, .map_get_next_key = ringbuf_map_get_next_key, .map_mem_usage = ringbuf_map_mem_usage, .map_btf_id = &ringbuf_map_btf_ids[0], }; BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map) const struct bpf_map_ops user_ringbuf_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc = ringbuf_map_alloc, .map_free = ringbuf_map_free, .map_mmap = ringbuf_map_mmap_user, .map_poll = ringbuf_map_poll_user, .map_lookup_elem = ringbuf_map_lookup_elem, .map_update_elem = ringbuf_map_update_elem, .map_delete_elem = ringbuf_map_delete_elem, .map_get_next_key = ringbuf_map_get_next_key, .map_mem_usage = ringbuf_map_mem_usage, .map_btf_id = &user_ringbuf_map_btf_ids[0], }; /* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself, * calculate offset from record metadata to ring buffer in pages, rounded * down. This page offset is stored as part of record metadata and allows to * restore struct bpf_ringbuf * from record pointer. This page offset is * stored at offset 4 of record metadata header. */ static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb, struct bpf_ringbuf_hdr *hdr) { return ((void *)hdr - (void *)rb) >> PAGE_SHIFT; } /* Given pointer to ring buffer record header, restore pointer to struct * bpf_ringbuf itself by using page offset stored at offset 4 */ static struct bpf_ringbuf * bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr) { unsigned long addr = (unsigned long)(void *)hdr; unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT; return (void*)((addr & PAGE_MASK) - off); } static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) { unsigned long cons_pos, prod_pos, new_prod_pos, flags; u32 len, pg_off; struct bpf_ringbuf_hdr *hdr; if (unlikely(size > RINGBUF_MAX_RECORD_SZ)) return NULL; len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); if (len > ringbuf_total_data_sz(rb)) return NULL; cons_pos = smp_load_acquire(&rb->consumer_pos); if (in_nmi()) { if (!spin_trylock_irqsave(&rb->spinlock, flags)) return NULL; } else { spin_lock_irqsave(&rb->spinlock, flags); } prod_pos = rb->producer_pos; new_prod_pos = prod_pos + len; /* check for out of ringbuf space by ensuring producer position * doesn't advance more than (ringbuf_size - 1) ahead */ if (new_prod_pos - cons_pos > rb->mask) { spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } hdr = (void *)rb->data + (prod_pos & rb->mask); pg_off = bpf_ringbuf_rec_pg_off(rb, hdr); hdr->len = size | BPF_RINGBUF_BUSY_BIT; hdr->pg_off = pg_off; /* pairs with consumer's smp_load_acquire() */ smp_store_release(&rb->producer_pos, new_prod_pos); spin_unlock_irqrestore(&rb->spinlock, flags); return (void *)hdr + BPF_RINGBUF_HDR_SZ; } BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags) { struct bpf_ringbuf_map *rb_map; if (unlikely(flags)) return 0; rb_map = container_of(map, struct bpf_ringbuf_map, map); return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size); } const struct bpf_func_proto bpf_ringbuf_reserve_proto = { .func = bpf_ringbuf_reserve, .ret_type = RET_PTR_TO_RINGBUF_MEM_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, }; static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard) { unsigned long rec_pos, cons_pos; struct bpf_ringbuf_hdr *hdr; struct bpf_ringbuf *rb; u32 new_len; hdr = sample - BPF_RINGBUF_HDR_SZ; rb = bpf_ringbuf_restore_from_rec(hdr); new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT; if (discard) new_len |= BPF_RINGBUF_DISCARD_BIT; /* update record header with correct final size prefix */ xchg(&hdr->len, new_len); /* if consumer caught up and is waiting for our record, notify about * new data availability */ rec_pos = (void *)hdr - (void *)rb->data; cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask; if (flags & BPF_RB_FORCE_WAKEUP) irq_work_queue(&rb->work); else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP)) irq_work_queue(&rb->work); } BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags) { bpf_ringbuf_commit(sample, flags, false /* discard */); return 0; } const struct bpf_func_proto bpf_ringbuf_submit_proto = { .func = bpf_ringbuf_submit, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags) { bpf_ringbuf_commit(sample, flags, true /* discard */); return 0; } const struct bpf_func_proto bpf_ringbuf_discard_proto = { .func = bpf_ringbuf_discard, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_RINGBUF_MEM | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size, u64, flags) { struct bpf_ringbuf_map *rb_map; void *rec; if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP))) return -EINVAL; rb_map = container_of(map, struct bpf_ringbuf_map, map); rec = __bpf_ringbuf_reserve(rb_map->rb, size); if (!rec) return -EAGAIN; memcpy(rec, data, size); bpf_ringbuf_commit(rec, flags, false /* discard */); return 0; } const struct bpf_func_proto bpf_ringbuf_output_proto = { .func = bpf_ringbuf_output, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg3_type = ARG_CONST_SIZE_OR_ZERO, .arg4_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags) { struct bpf_ringbuf *rb; rb = container_of(map, struct bpf_ringbuf_map, map)->rb; switch (flags) { case BPF_RB_AVAIL_DATA: return ringbuf_avail_data_sz(rb); case BPF_RB_RING_SIZE: return ringbuf_total_data_sz(rb); case BPF_RB_CONS_POS: return smp_load_acquire(&rb->consumer_pos); case BPF_RB_PROD_POS: return smp_load_acquire(&rb->producer_pos); default: return 0; } } const struct bpf_func_proto bpf_ringbuf_query_proto = { .func = bpf_ringbuf_query, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, }; BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags, struct bpf_dynptr_kern *, ptr) { struct bpf_ringbuf_map *rb_map; void *sample; int err; if (unlikely(flags)) { bpf_dynptr_set_null(ptr); return -EINVAL; } err = bpf_dynptr_check_size(size); if (err) { bpf_dynptr_set_null(ptr); return err; } rb_map = container_of(map, struct bpf_ringbuf_map, map); sample = __bpf_ringbuf_reserve(rb_map->rb, size); if (!sample) { bpf_dynptr_set_null(ptr); return -EINVAL; } bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size); return 0; } const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = { .func = bpf_ringbuf_reserve_dynptr, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT, }; BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) { if (!ptr->data) return 0; bpf_ringbuf_commit(ptr->data, flags, false /* discard */); bpf_dynptr_set_null(ptr); return 0; } const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = { .func = bpf_ringbuf_submit_dynptr, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) { if (!ptr->data) return 0; bpf_ringbuf_commit(ptr->data, flags, true /* discard */); bpf_dynptr_set_null(ptr); return 0; } const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = { .func = bpf_ringbuf_discard_dynptr, .ret_type = RET_VOID, .arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE, .arg2_type = ARG_ANYTHING, }; static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size) { int err; u32 hdr_len, sample_len, total_len, flags, *hdr; u64 cons_pos, prod_pos; /* Synchronizes with smp_store_release() in user-space producer. */ prod_pos = smp_load_acquire(&rb->producer_pos); if (prod_pos % 8) return -EINVAL; /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */ cons_pos = smp_load_acquire(&rb->consumer_pos); if (cons_pos >= prod_pos) return -ENODATA; hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask)); /* Synchronizes with smp_store_release() in user-space producer. */ hdr_len = smp_load_acquire(hdr); flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT); sample_len = hdr_len & ~flags; total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8); /* The sample must fit within the region advertised by the producer position. */ if (total_len > prod_pos - cons_pos) return -EINVAL; /* The sample must fit within the data region of the ring buffer. */ if (total_len > ringbuf_total_data_sz(rb)) return -E2BIG; /* The sample must fit into a struct bpf_dynptr. */ err = bpf_dynptr_check_size(sample_len); if (err) return -E2BIG; if (flags & BPF_RINGBUF_DISCARD_BIT) { /* If the discard bit is set, the sample should be skipped. * * Update the consumer pos, and return -EAGAIN so the caller * knows to skip this sample and try to read the next one. */ smp_store_release(&rb->consumer_pos, cons_pos + total_len); return -EAGAIN; } if (flags & BPF_RINGBUF_BUSY_BIT) return -ENODATA; *sample = (void *)((uintptr_t)rb->data + (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask)); *size = sample_len; return 0; } static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags) { u64 consumer_pos; u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8); /* Using smp_load_acquire() is unnecessary here, as the busy-bit * prevents another task from writing to consumer_pos after it was read * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek(). */ consumer_pos = rb->consumer_pos; /* Synchronizes with smp_load_acquire() in user-space producer. */ smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size); } BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map, void *, callback_fn, void *, callback_ctx, u64, flags) { struct bpf_ringbuf *rb; long samples, discarded_samples = 0, ret = 0; bpf_callback_t callback = (bpf_callback_t)callback_fn; u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP; int busy = 0; if (unlikely(flags & ~wakeup_flags)) return -EINVAL; rb = container_of(map, struct bpf_ringbuf_map, map)->rb; /* If another consumer is already consuming a sample, wait for them to finish. */ if (!atomic_try_cmpxchg(&rb->busy, &busy, 1)) return -EBUSY; for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) { int err; u32 size; void *sample; struct bpf_dynptr_kern dynptr; err = __bpf_user_ringbuf_peek(rb, &sample, &size); if (err) { if (err == -ENODATA) { break; } else if (err == -EAGAIN) { discarded_samples++; continue; } else { ret = err; goto schedule_work_return; } } bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size); ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0); __bpf_user_ringbuf_sample_release(rb, size, flags); } ret = samples - discarded_samples; schedule_work_return: /* Prevent the clearing of the busy-bit from being reordered before the * storing of any rb consumer or producer positions. */ atomic_set_release(&rb->busy, 0); if (flags & BPF_RB_FORCE_WAKEUP) irq_work_queue(&rb->work); else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0) irq_work_queue(&rb->work); return ret; } const struct bpf_func_proto bpf_user_ringbuf_drain_proto = { .func = bpf_user_ringbuf_drain, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_FUNC, .arg3_type = ARG_PTR_TO_STACK_OR_NULL, .arg4_type = ARG_ANYTHING, };
1026 1024 687 976 975 470 453 453 281 6 279 470 470 2 24 24 9 24 2 27 2 2 2 23 17 9 7 1 6 6 35 2 6 1 8 29 2 3 2 1 1 5 19 19 19 20 28 25 3 2 1 440 397 441 106 4 4 362 33 65 3 106 102 245 245 245 39 245 244 245 245 172 163 163 45 45 45 420 18 433 433 420 6 358 3 349 79 1 80 1 1 2 335 1 11 342 330 333 410 3 2 1 11 11 11 409 3 324 153 409 3 409 348 118 3 1 2 3 3 3 3 3 3 409 6 574 493 493 441 420 14 11 428 6 3 4 1 1 5 434 411 411 411 4 85 1 2 1 1 1 406 4 14 2 80 1841 713 1839 1842 1835 1837 10 1836 1838 30 4 63 54 30 63 63 54 9 2 5 2 61 125 157 16 146 146 146 141 144 124 124 124 3 144 144 144 2 127 2 56 85 36 137 181 156 242 242 242 2 79 207 189 14 144 155 181 22 242 241 243 210 241 240 3 242 181 1025 1027 1028 1027 23 1026 1026 1027 1027 1024 1028 1 16 1022 1021 22 1027 1020 1021 1019 9 1014 999 958 283 283 958 955 207 12 999 999 995 958 958 283 283 283 299 299 8 299 297 298 8 294 470 470 7 7 7 7 3 3 7 3 7 7 7 7 7 7 7 3 3 3 3 7 7 7 5 7 5 7 7 7 5 3 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux INET6 implementation * Forwarding Information Database * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Changes: * Yuji SEKIYA @USAGI: Support default route on router node; * remove ip6_null_entry from the top of * routing table. * Ville Nuorvala: Fixed routing subtrees. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/bpf.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> #include <linux/route.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/lwtunnel.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { struct fib6_walker w; struct net *net; int (*func)(struct fib6_info *, void *arg); int sernum; void *arg; bool skip_notify; }; #ifdef CONFIG_IPV6_SUBTREES #define FWS_INIT FWS_S #else #define FWS_INIT FWS_L #endif static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); /* * A routing update causes an increase of the serial number on the * affected subtree. This allows for cached routes to be asynchronously * tested when modifications are made to the destination cache as a * result of redirects, path MTU changes, etc. */ static void fib6_gc_timer_cb(struct timer_list *t); #define FOR_WALKERS(net, w) \ list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh) static void fib6_walker_link(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_add(&w->lh, &net->ipv6.fib6_walkers); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static void fib6_walker_unlink(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_del(&w->lh); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static int fib6_new_sernum(struct net *net) { int new, old = atomic_read(&net->ipv6.fib6_sernum); do { new = old < INT_MAX ? old + 1 : 1; } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new)); return new; } enum { FIB6_NO_SERNUM_CHANGE = 0, }; void fib6_update_sernum(struct net *net, struct fib6_info *f6i) { struct fib6_node *fn; fn = rcu_dereference_protected(f6i->fib6_node, lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net)); } /* * Auxiliary address test functions for the radix tree. * * These assume a 32bit processor (although it will work on * 64bit processors) */ /* * test bit */ #if defined(__LITTLE_ENDIAN) # define BITOP_BE32_SWIZZLE (0x1F & ~7) #else # define BITOP_BE32_SWIZZLE 0 #endif static __be32 addr_bit_set(const void *token, int fn_bit) { const __be32 *addr = token; /* * Here, * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) * is optimized version of * htonl(1 << ((~fn_bit)&0x1F)) * See include/asm-generic/bitops/le.h. */ return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & addr[fn_bit >> 5]; } struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) { struct fib6_info *f6i; size_t sz = sizeof(*f6i); if (with_fib6_nh) sz += sizeof(struct fib6_nh); f6i = kzalloc(sz, gfp_flags); if (!f6i) return NULL; /* fib6_siblings is a union with nh_list, so this initializes both */ INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); return f6i; } void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); WARN_ON(f6i->fib6_node); if (f6i->nh) nexthop_put(f6i->nh); else fib6_nh_release(f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); if (fn) net->ipv6.rt6_stats->fib_nodes++; return fn; } static void node_free_immediate(struct net *net, struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); net->ipv6.rt6_stats->fib_nodes--; } static void node_free_rcu(struct rcu_head *head) { struct fib6_node *fn = container_of(head, struct fib6_node, rcu); kmem_cache_free(fib6_node_kmem, fn); } static void node_free(struct net *net, struct fib6_node *fn) { call_rcu(&fn->rcu, node_free_rcu); net->ipv6.rt6_stats->fib_nodes--; } static void fib6_free_table(struct fib6_table *table) { inetpeer_invalidate_tree(&table->tb6_peers); kfree(table); } static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; /* * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. */ spin_lock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* * No protection necessary, this is the only list mutatation * operation, tables never disappear once they exist. */ hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) { struct fib6_table *table; table = kzalloc(sizeof(*table), GFP_ATOMIC); if (table) { table->tb6_id = id; rcu_assign_pointer(table->tb6_root.leaf, net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); } return table; } struct fib6_table *fib6_new_table(struct net *net, u32 id) { struct fib6_table *tb; if (id == 0) id = RT6_TABLE_MAIN; tb = fib6_get_table(net, id); if (tb) return tb; tb = fib6_alloc_table(net, id); if (tb) fib6_link_table(net, tb); return tb; } EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { struct fib6_table *tb; struct hlist_head *head; unsigned int h; if (id == 0) id = RT6_TABLE_MAIN; h = id & (FIB6_TABLE_HASHSZ - 1); rcu_read_lock(); head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (tb->tb6_id == id) { rcu_read_unlock(); return tb; } } rcu_read_unlock(); return NULL; } EXPORT_SYMBOL_GPL(fib6_get_table); static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); fib6_link_table(net, net->ipv6.fib6_local_tbl); } #else struct fib6_table *fib6_new_table(struct net *net, u32 id) { return fib6_get_table(net, id); } struct fib6_table *fib6_get_table(struct net *net, u32 id) { return net->ipv6.fib6_main_tbl; } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup) { struct rt6_info *rt; rt = pol_lookup_func(lookup, net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&rt->dst); } return &rt->dst; } /* called with rcu lock held; no reference taken on fib6_info */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, res, flags); } static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); } #endif unsigned int fib6_tables_seq_read(struct net *net) { unsigned int h, fib_seq = 0; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) fib_seq += tb->fib_seq; } rcu_read_unlock(); return fib_seq; } static int call_fib6_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; return call_fib6_notifier(nb, event_type, &info.info); } static int call_fib6_multipath_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; return call_fib6_notifier(nb, event_type, &info.info); } int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_multipath_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) { struct fib6_entry_notifier_info info = { .rt = rt, .nsiblings = rt->fib6_nsiblings, }; rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); } struct fib6_dump_arg { struct net *net; struct notifier_block *nb; struct netlink_ext_ack *extack; }; static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE; int err; if (!rt || rt == arg->net->ipv6.fib6_null_entry) return 0; if (rt->fib6_nsiblings) err = call_fib6_multipath_entry_notifier(arg->nb, fib_event, rt, rt->fib6_nsiblings, arg->extack); else err = call_fib6_entry_notifier(arg->nb, fib_event, rt, arg->extack); return err; } static int fib6_node_dump(struct fib6_walker *w) { int err; err = fib6_rt_dump(w->leaf, w->args); w->leaf = NULL; return err; } static int fib6_table_dump(struct net *net, struct fib6_table *tb, struct fib6_walker *w) { int err; w->root = &tb->tb6_root; spin_lock_bh(&tb->tb6_lock); err = fib6_walk(net, w); spin_unlock_bh(&tb->tb6_lock); return err; } /* Called with rcu_read_lock() */ int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct fib6_dump_arg arg; struct fib6_walker *w; unsigned int h; int err = 0; w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_node_dump; arg.net = net; arg.nb = nb; arg.extack = extack; w->args = &arg; for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { err = fib6_table_dump(net, tb, w); if (err) goto out; } } out: kfree(w); /* The tree traversal function should never return a positive value. */ return err > 0 ? -EINVAL : err; } static int fib6_dump_node(struct fib6_walker *w) { int res; struct fib6_info *rt; for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args, w->skip_in_node); if (res >= 0) { /* Frame is full, suspend walking */ w->leaf = rt; /* We'll restart from this node, so if some routes were * already dumped, skip them next time. */ w->skip_in_node += res; return 1; } w->skip_in_node = 0; /* Multipath routes are dumped in one route with the * RTA_MULTIPATH attribute. Jump 'rt' to point to the * last sibling of this route (no need to dump the * sibling routes again) */ if (rt->fib6_nsiblings) rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); } w->leaf = NULL; return 0; } static void fib6_dump_end(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { cb->args[4] = 0; fib6_walker_unlink(net, w); } cb->args[2] = 0; kfree(w); } cb->done = (void *)cb->args[3]; cb->args[1] = 3; } static int fib6_dump_done(struct netlink_callback *cb) { fib6_dump_end(cb); return cb->done ? cb->done(cb) : 0; } static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct fib6_walker *w; int res; w = (void *)cb->args[2]; w->root = &table->tb6_root; if (cb->args[4] == 0) { w->count = 0; w->skip = 0; w->skip_in_node = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; cb->args[5] = READ_ONCE(w->root->fn_sernum); } } else { int sernum = READ_ONCE(w->root->fn_sernum); if (cb->args[5] != sernum) { /* Begin at the root if the tree changed */ cb->args[5] = sernum; w->state = FWS_INIT; w->node = w->root; w->skip = w->count; w->skip_in_node = 0; } else w->skip = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); spin_unlock_bh(&table->tb6_lock); if (res <= 0) { fib6_walker_unlink(net, w); cb->args[4] = 0; } } return res; } static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, .filter.dump_routes = true }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; struct fib6_walker *w; struct fib6_table *tb; struct hlist_head *head; int res = 0; if (cb->strict_check) { int err; err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb); if (err < 0) return err; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); if (rtm->rtm_flags & RTM_F_PREFIX) arg.filter.flags = RTM_F_PREFIX; } w = (void *)cb->args[2]; if (!w) { /* New dump: * * 1. hook callback destructor. */ cb->args[3] = (long)cb->done; cb->done = fib6_dump_done; /* * 2. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_dump_node; cb->args[2] = (long)w; } arg.skb = skb; arg.cb = cb; arg.net = net; w->args = &arg; if (arg.filter.table_id) { tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET6) goto out; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); return -ENOENT; } if (!cb->args[0]) { res = fib6_dump_table(tb, skb, cb); if (!res) cb->args[0] = 1; } goto out; } s_h = cb->args[0]; s_e = cb->args[1]; rcu_read_lock(); for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; res = fib6_dump_table(tb, skb, cb); if (res != 0) goto out_unlock; next: e++; } } out_unlock: rcu_read_unlock(); cb->args[1] = e; cb->args[0] = h; out: res = res < 0 ? res : skb->len; if (res <= 0) fib6_dump_end(cb); return res; } void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) { if (!f6i) return; if (f6i->fib6_metrics == &dst_default_metrics) { struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC); if (!p) return; refcount_set(&p->refcnt, 1); f6i->fib6_metrics = p; } f6i->fib6_metrics->metrics[metric - 1] = val; } /* * Routing Table * * return the appropriate node for a routing tree "add" operation * by either creating and inserting or by returning an existing * node. */ static struct fib6_node *fib6_add_1(struct net *net, struct fib6_table *table, struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, int replace_required, struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; struct rt6key *key; int bit; __be32 dir = 0; RT6_TRACE("fib6_add_1\n"); /* insert node in tree */ fn = root; do { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { if (!allow_create) { if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } goto insert_above; } /* * Exact match ? */ if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); fib6_info_release(leaf); /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == net->ipv6.fib6_null_entry) { RCU_INIT_POINTER(fn->leaf, NULL); } return fn; } /* * We have more bits to go */ /* Try to walk down on tree. */ dir = addr_bit_set(addr, fn->fn_bit); pn = fn; fn = dir ? rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)) : rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); } while (fn); if (!allow_create) { /* We should not create new node because * NLM_F_REPLACE was specified without NLM_F_CREATE * I assume it is safe to require NLM_F_CREATE when * REPLACE flag is used! Later we may want to remove the * check for replace_required, because according * to netlink specification, NLM_F_CREATE * MUST be specified if new route is created. * That would keep IPv6 consistent with IPv4 */ if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } /* * We walked to the bottom of tree. * Create new leaf node without children. */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); return ln; insert_above: /* * split since we don't have a common prefix anymore or * we have a less significant route. * we've to insert an intermediate node on the list * this new node will point to the one we need to create * and the current */ pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); /* find 1st bit in difference between the 2 addrs. See comment in __ipv6_addr_diff: bit may be an invalid value, but if it is >= plen, the value is ignored in any case. */ bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr)); /* * (intermediate)[in] * / \ * (new leaf node)[ln] (old node)[fn] */ if (plen > bit) { in = node_alloc(net); ln = node_alloc(net); if (!in || !ln) { if (in) node_free_immediate(net, in); if (ln) node_free_immediate(net, ln); return ERR_PTR(-ENOMEM); } /* * new intermediate node. * RTN_RTINFO will * be off since that an address that chooses one of * the branches would not match less specific routes * in the other branch */ in->fn_bit = bit; RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; fib6_info_hold(rcu_dereference_protected(in->leaf, lockdep_is_held(&table->tb6_lock))); /* update parent pointer */ if (dir) rcu_assign_pointer(pn->right, in); else rcu_assign_pointer(pn->left, in); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, in); rcu_assign_pointer(fn->parent, in); if (addr_bit_set(addr, bit)) { rcu_assign_pointer(in->right, ln); rcu_assign_pointer(in->left, fn); } else { rcu_assign_pointer(in->left, ln); rcu_assign_pointer(in->right, fn); } } else { /* plen <= bit */ /* * (new leaf node)[ln] * / \ * (old node)[fn] NULL */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (addr_bit_set(&key->addr, plen)) RCU_INIT_POINTER(ln->right, fn); else RCU_INIT_POINTER(ln->left, fn); rcu_assign_pointer(fn->parent, ln); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); } return ln; } static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, const struct fib6_info *match, const struct fib6_table *table) { int cpu; if (!fib6_nh->rt6i_pcpu) return; /* release the reference to this fib entry from * all of its cached pcpu routes */ for_each_possible_cpu(cpu) { struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; /* only dropping the 'from' reference if the cached route * is using 'match'. The cached pcpu_rt->from only changes * from a fib6_info to NULL (ip6_dst_destroy); it can never * change from one fib6_info reference to another */ if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); fib6_info_release(from); } } } struct fib6_nh_pcpu_arg { struct fib6_info *from; const struct fib6_table *table; }; static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) { struct fib6_nh_pcpu_arg *arg = _arg; __fib6_drop_pcpu_from(nh, arg->from, arg->table); return 0; } static void fib6_drop_pcpu_from(struct fib6_info *f6i, const struct fib6_table *table) { /* Make sure rt6_make_pcpu_route() wont add other percpu routes * while we are cleaning them here. */ f6i->fib6_destroying = 1; mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ if (f6i->nh) { struct fib6_nh_pcpu_arg arg = { .from = f6i, .table = table }; nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, &arg); } else { struct fib6_nh *fib6_nh; fib6_nh = f6i->fib6_nh; __fib6_drop_pcpu_from(fib6_nh, f6i, table); } } static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->fib6_table; /* Flush all cached dst in exception table */ rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt, table); if (rt->nh && !list_empty(&rt->nh_list)) list_del_init(&rt->nh_list); if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, * which must be released in time. So, scan ascendant nodes * and replace dummy references to this route with references * to still alive ones. */ while (fn) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); fib6_info_hold(new_leaf); rcu_assign_pointer(fn->leaf, new_leaf); fib6_info_release(rt); } fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); } } } /* * Insert routing information in a node. */ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); struct fib6_info *iter = NULL; struct fib6_info __rcu **ins; struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); bool notify_sibling_rt = false; u16 nlflags = NLM_F_EXCL; int err; if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) nlflags |= NLM_F_APPEND; ins = &fn->leaf; for (iter = leaf; iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock))) { /* * Search for duplicates */ if (iter->fib6_metric == rt->fib6_metric) { /* * Same priority level */ if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; nlflags &= ~NLM_F_EXCL; if (replace) { if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { found++; break; } fallback_ins = fallback_ins ?: ins; goto next_iter; } if (rt6_duplicate_nexthop(iter, rt)) { if (rt->fib6_nsiblings) rt->fib6_nsiblings = 0; if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->fib6_flags & RTF_EXPIRES)) fib6_clean_expires(iter); else fib6_set_expires(iter, rt->expires); if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } /* If we have the same destination and the same metric, * but not the same gateway, then the route we try to * add is sibling to this route, increment our counter * of siblings, and later we will add our route to the * list. * Only static routes (which don't have flag * RTF_EXPIRES) are used for ECMPv6. * * To avoid long list, we only had siblings if the * route have a gateway. */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) rt->fib6_nsiblings++; } if (iter->fib6_metric > rt->fib6_metric) break; next_iter: ins = &iter->fib6_next; } if (fallback_ins && !found) { /* No matching route with same ecmp-able-ness found, replace * first matching route */ ins = fallback_ins; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); found++; } /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; /* Link this route to others same route. */ if (rt->fib6_nsiblings) { unsigned int fib6_nsiblings; struct fib6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ sibling = leaf; notify_sibling_rt = true; while (sibling) { if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { list_add_tail(&rt->fib6_siblings, &sibling->fib6_siblings); break; } sibling = rcu_dereference_protected(sibling->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); notify_sibling_rt = false; } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, &rt->fib6_siblings, fib6_siblings) { sibling->fib6_nsiblings++; BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); fib6_nsiblings++; } BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); rt6_multipath_rebalance(temp_sibling); } /* * insert node */ if (!replace) { if (!add) pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: nlflags |= NLM_F_CREATE; /* The route should only be notified if it is the first * route in the node or if it is added as a sibling * route to the first route in the node. */ if (!info->skip_notify_kernel && (notify_sibling_rt || ins == &fn->leaf)) { enum fib_event_type fib_event; if (notify_sibling_rt) fib_event = FIB_EVENT_ENTRY_APPEND; else fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_entry_notifiers(info->nl_net, fib_event, rt, extack); if (err) { struct fib6_info *sibling, *next_sibling; /* If the route has siblings, then it first * needs to be unlinked from them. */ if (!rt->fib6_nsiblings) return err; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; list_del_init(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); return err; } } rcu_assign_pointer(rt->fib6_next, iter); fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } } else { int nsiblings; if (!found) { if (add) goto add; pr_warn("NLM_F_REPLACE set, but no existing node found!\n"); return -ENOENT; } if (!info->skip_notify_kernel && ins == &fn->leaf) { err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); if (err) return err; } fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } nsiblings = iter->fib6_nsiblings; iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; fib6_info_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ ins = &rt->fib6_next; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric > rt->fib6_metric) break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->fib6_next; iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; fib6_info_release(iter); nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { ins = &iter->fib6_next; } iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); } WARN_ON(nsiblings != 0); } } return 0; } static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->fib6_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } void fib6_force_start_gc(struct net *net) { if (!timer_pending(&net->ipv6.ip6_fib_timer)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } static void __fib6_update_sernum_upto_root(struct fib6_info *rt, int sernum) { struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&rt->fib6_table->tb6_lock)); /* paired with smp_rmb() in fib6_get_cookie_safe() */ smp_wmb(); while (fn) { WRITE_ONCE(fn->fn_sernum, sernum); fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&rt->fib6_table->tb6_lock)); } } void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) { __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } /* allow ipv4 to update sernum via ipv6_stub */ void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_update_sernum_upto_root(net, f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); } /* * Add routing information to the routing tree. * <destination addr>/<source addr> * with source addr info in sub-trees * Need to own table->tb6_lock */ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; int allow_create = 1; int replace_required = 0; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; if (info->nlh->nlmsg_flags & NLM_F_REPLACE) replace_required = 1; } if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); fn = fib6_add_1(info->nl_net, table, root, &rt->fib6_dst.addr, rt->fib6_dst.plen, offsetof(struct fib6_info, fib6_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; goto out; } pn = fn; #ifdef CONFIG_IPV6_SUBTREES if (rt->fib6_src.plen) { struct fib6_node *sn; if (!rcu_access_pointer(fn->subtree)) { struct fib6_node *sfn; /* * Create subtree. * * fn[main tree] * | * sfn[subtree root] * \ * sn[new leaf node] */ /* Create subtree root node */ sfn = node_alloc(info->nl_net); if (!sfn) goto failure; fib6_info_hold(info->nl_net->ipv6.fib6_null_entry); rcu_assign_pointer(sfn->leaf, info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(info->nl_net, table, sfn, &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated root, and then (in failure) stale node in main tree. */ node_free_immediate(info->nl_net, sfn); err = PTR_ERR(sn); goto failure; } /* Now link new subtree to main tree */ rcu_assign_pointer(sfn->parent, fn); rcu_assign_pointer(fn->subtree, sfn); } else { sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); goto failure; } } if (!rcu_access_pointer(fn->leaf)) { if (fn->fn_flags & RTN_TL_ROOT) { /* put back null_entry for root node */ rcu_assign_pointer(fn->leaf, info->nl_net->ipv6.fib6_null_entry); } else { fib6_info_hold(rt); rcu_assign_pointer(fn->leaf, rt); } } fn = sn; } #endif err = fib6_add_rt2node(fn, rt, info, extack); if (!err) { if (rt->nh) list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); fib6_start_gc(info->nl_net, rt); } out: if (err) { #ifdef CONFIG_IPV6_SUBTREES /* * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ if (pn != fn) { struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); if (pn_leaf == rt) { pn_leaf = NULL; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(rt); } if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { pn_leaf = fib6_find_prefix(info->nl_net, table, pn); if (!pn_leaf) pn_leaf = info->nl_net->ipv6.fib6_null_entry; fib6_info_hold(pn_leaf); rcu_assign_pointer(pn->leaf, pn_leaf); } } #endif goto failure; } else if (fib6_requires_src(rt)) { fib6_routes_require_src_inc(info->nl_net); } return err; failure: /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: * 1. fn is an intermediate node and we failed to add the new * route to it in both subtree creation failure and fib6_add_rt2node() * failure case. * 2. fn is the root node in the table and we fail to add the first * default route to it. */ if (fn && (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) || (fn->fn_flags & RTN_TL_ROOT && !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); return err; } /* * Routing tree lookup * */ struct lookup_args { int offset; /* key offset on fib6_info */ const struct in6_addr *addr; /* search key */ }; static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root, struct lookup_args *args) { struct fib6_node *fn; __be32 dir; if (unlikely(args->offset == 0)) return NULL; /* * Descend on a tree */ fn = root; for (;;) { struct fib6_node *next; dir = addr_bit_set(args->addr, fn->fn_bit); next = dir ? rcu_dereference(fn->right) : rcu_dereference(fn->left); if (next) { fn = next; continue; } break; } while (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree || fn->fn_flags & RTN_RTINFO) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) goto backtrack; key = (struct rt6key *) ((u8 *)leaf + args->offset); if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES if (subtree) { struct fib6_node *sfn; sfn = fib6_node_lookup_1(subtree, args + 1); if (!sfn) goto backtrack; fn = sfn; } #endif if (fn->fn_flags & RTN_RTINFO) return fn; } } backtrack: if (fn->fn_flags & RTN_ROOT) break; fn = rcu_dereference(fn->parent); } return NULL; } /* called with rcu_read_lock() held */ struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct fib6_node *fn; struct lookup_args args[] = { { .offset = offsetof(struct fib6_info, fib6_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { .offset = offsetof(struct fib6_info, fib6_src), .addr = saddr, }, #endif { .offset = 0, /* sentinel */ } }; fn = fib6_node_lookup_1(root, daddr ? args : args + 1); if (!fn || fn->fn_flags & RTN_TL_ROOT) fn = root; return fn; } /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) * exact_match == true means we try to find fn with exact match of * the passed in prefix addr * exact_match == false means we try to find fn with longest prefix * match of the passed in prefix addr. This is useful for finding fn * for cached route as it will be stored in the exception table under * the node with longest prefix length. */ static struct fib6_node *fib6_locate_1(struct fib6_node *root, const struct in6_addr *addr, int plen, int offset, bool exact_match) { struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ if (!leaf) { if (plen <= fn->fn_bit) goto out; else goto next; } key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) goto out; if (plen == fn->fn_bit) return fn; if (fn->fn_flags & RTN_RTINFO) prev = fn; next: /* * We have more bits to go */ if (addr_bit_set(addr, fn->fn_bit)) fn = rcu_dereference(fn->right); else fn = rcu_dereference(fn->left); } out: if (exact_match) return NULL; else return prev; } struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, const struct in6_addr *saddr, int src_len, bool exact_match) { struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, offsetof(struct fib6_info, fib6_dst), exact_match); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { WARN_ON(saddr == NULL); if (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree) { fn = fib6_locate_1(subtree, saddr, src_len, offsetof(struct fib6_info, fib6_src), exact_match); } } } #endif if (fn && fn->fn_flags & RTN_RTINFO) return fn; return NULL; } /* * Deletion * */ static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn) { struct fib6_node *child_left, *child_right; if (fn->fn_flags & RTN_ROOT) return net->ipv6.fib6_null_entry; while (fn) { child_left = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); child_right = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); if (child_left) return rcu_dereference_protected(child_left->leaf, lockdep_is_held(&table->tb6_lock)); if (child_right) return rcu_dereference_protected(child_right->leaf, lockdep_is_held(&table->tb6_lock)); fn = FIB6_SUBTREE(fn); } return NULL; } /* * Called to trim the tree of intermediate nodes when possible. "fn" * is the node we want to try and remove. * Need to own table->tb6_lock */ static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn) { int children; int nstate; struct fib6_node *child; struct fib6_walker *w; int iter = 0; /* Set fn->leaf to null_entry for root node. */ if (fn->fn_flags & RTN_TL_ROOT) { rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry); return fn; } for (;;) { struct fib6_node *fn_r = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *fn_l = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_r = rcu_dereference_protected(pn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_l = rcu_dereference_protected(pn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_fn_leaf; RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); WARN_ON(fn_leaf); children = 0; child = NULL; if (fn_r) { child = fn_r; children |= 1; } if (fn_l) { child = fn_l; children |= 2; } if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES /* Subtree root (i.e. fn) may have one child */ || (children && fn->fn_flags & RTN_ROOT) #endif ) { new_fn_leaf = fib6_find_prefix(net, table, fn); #if RT6_DEBUG >= 2 if (!new_fn_leaf) { WARN_ON(!new_fn_leaf); new_fn_leaf = net->ipv6.fib6_null_entry; } #endif fib6_info_hold(new_fn_leaf); rcu_assign_pointer(fn->leaf, new_fn_leaf); return pn; } #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); RCU_INIT_POINTER(pn->subtree, NULL); nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif if (pn_r == fn) rcu_assign_pointer(pn->right, child); else if (pn_l == fn) rcu_assign_pointer(pn->left, child); #if RT6_DEBUG >= 2 else WARN_ON(1); #endif if (child) rcu_assign_pointer(child->parent, pn); nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } #endif read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (!child) { if (w->node == fn) { RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); w->node = pn; w->state = nstate; } } else { if (w->node == fn) { w->node = child; if (children&2) { RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_R ? FWS_U : FWS_INIT; } else { RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_C ? FWS_U : FWS_INIT; } } } } read_unlock(&net->ipv6.fib6_walker_lock); node_free(net, fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(pn_leaf); fn = pn; } } static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, struct fib6_info __rcu **rtp, struct nl_info *info) { struct fib6_info *leaf, *replace_rt = NULL; struct fib6_walker *w; struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; bool notify_del = false; RT6_TRACE("fib6_del_route\n"); /* If the deleted route is the first in the node and it is not part of * a multipath route, then we need to replace it with the next route * in the node, if exists. */ leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); if (leaf == rt && !rt->fib6_nsiblings) { if (rcu_access_pointer(rt->fib6_next)) replace_rt = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); else notify_del = true; } /* Unlink it */ *rtp = rt->fib6_next; rt->fib6_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; /* Reset round-robin state, if necessary */ if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; /* Remove this entry from other siblings */ if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; /* The route is deleted from a multipath route. If this * multipath route is the first route in the node, then we need * to emit a delete notification. Otherwise, we need to skip * the notification. */ if (rt->fib6_metric == leaf->fib6_metric && rt6_qualify_for_ecmp(leaf)) notify_del = true; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; list_del_init(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } /* Adjust walkers */ read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); w->leaf = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; } } read_unlock(&net->ipv6.fib6_walker_lock); /* If it was last route, call fib6_repair_tree() to: * 1. For root node, put back null_entry as how the table was created. * 2. For other nodes, expunge its radix tree node. */ if (!rcu_access_pointer(fn->leaf)) { if (!(fn->fn_flags & RTN_TL_ROOT)) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; } fn = fib6_repair_tree(net, table, fn); } fib6_purge_rt(rt, fn, net); if (!info->skip_notify_kernel) { if (notify_del) call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); else if (replace_rt) call_fib6_entry_notifiers_replace(net, replace_rt); } if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); fib6_info_release(rt); } /* Need to own table->tb6_lock */ int fib6_del(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_info __rcu **rtp; struct fib6_info __rcu **rtp_next; struct fib6_table *table; struct fib6_node *fn; if (rt == net->ipv6.fib6_null_entry) return -ENOENT; table = rt->fib6_table; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&table->tb6_lock)); if (!fn) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); /* * Walk the leaf entries looking for ourself */ for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { struct fib6_info *cur = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); if (rt == cur) { if (fib6_requires_src(cur)) fib6_routes_require_src_dec(info->nl_net); fib6_del_route(table, fn, rtp, info); return 0; } rtp_next = &cur->fib6_next; } return -ENOENT; } /* * Tree traversal function. * * Certainly, it is not interrupt safe. * However, it is internally reenterable wrt itself and fib6_add/fib6_del. * It means, that we can modify tree during walking * and use this function for garbage collection, clone pruning, * cleaning tree when a device goes down etc. etc. * * It guarantees that every node will be traversed, * and that it will be traversed only once. * * Callback function w->func may return: * 0 -> continue walking. * positive value -> walking is suspended (used by tree dumps, * and probably by gc, if it will be split to several slices) * negative value -> terminate walking. * * The function itself returns: * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. * * This function is called with tb6_lock held. */ static int fib6_walk_continue(struct fib6_walker *w) { struct fib6_node *fn, *pn, *left, *right; /* w->root should always be table->tb6_root */ WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); for (;;) { fn = w->node; if (!fn) return 0; switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: if (FIB6_SUBTREE(fn)) { w->node = FIB6_SUBTREE(fn); continue; } w->state = FWS_L; fallthrough; #endif case FWS_L: left = rcu_dereference_protected(fn->left, 1); if (left) { w->node = left; w->state = FWS_INIT; continue; } w->state = FWS_R; fallthrough; case FWS_R: right = rcu_dereference_protected(fn->right, 1); if (right) { w->node = right; w->state = FWS_INIT; continue; } w->state = FWS_C; w->leaf = rcu_dereference_protected(fn->leaf, 1); fallthrough; case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; if (w->skip) { w->skip--; goto skip; } err = w->func(w); if (err) return err; w->count++; continue; } skip: w->state = FWS_U; fallthrough; case FWS_U: if (fn == w->root) return 0; pn = rcu_dereference_protected(fn->parent, 1); left = rcu_dereference_protected(pn->left, 1); right = rcu_dereference_protected(pn->right, 1); w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); w->state = FWS_L; continue; } #endif if (left == fn) { w->state = FWS_R; continue; } if (right == fn) { w->state = FWS_C; w->leaf = rcu_dereference_protected(w->node->leaf, 1); continue; } #if RT6_DEBUG >= 2 WARN_ON(1); #endif } } } static int fib6_walk(struct net *net, struct fib6_walker *w) { int res; w->state = FWS_INIT; w->node = w->root; fib6_walker_link(net, w); res = fib6_walk_continue(w); if (res <= 0) fib6_walker_unlink(net, w); return res; } static int fib6_clean_node(struct fib6_walker *w) { int res; struct fib6_info *rt; struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, .skip_notify = c->skip_notify, }; if (c->sernum != FIB6_NO_SERNUM_CHANGE && READ_ONCE(w->node->fn_sernum) != c->sernum) WRITE_ONCE(w->node->fn_sernum, c->sernum); if (!c->func) { WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); w->leaf = NULL; return 0; } for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); if (res == -1) { w->leaf = rt; res = fib6_del(rt, &info); if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", __func__, rt, rcu_access_pointer(rt->fib6_node), res); #endif continue; } return 0; } else if (res == -2) { if (WARN_ON(!rt->fib6_nsiblings)) continue; rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); continue; } WARN_ON(res != 0); } w->leaf = rt; return 0; } /* * Convenient frontend to tree walker. * * func is called on each route. * It may return -2 -> skip multipath route. * -1 -> delete this route. * 0 -> continue walking */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct fib6_info *, void *arg), int sernum, void *arg, bool skip_notify) { struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; c.w.count = 0; c.w.skip = 0; c.w.skip_in_node = 0; c.func = func; c.sernum = sernum; c.arg = arg; c.net = net; c.skip_notify = skip_notify; fib6_walk(net, &c.w); } static void __fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), int sernum, void *arg, bool skip_notify) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, sernum, arg, skip_notify); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false); } void fib6_clean_all_skip_notify(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true); } static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); __fib6_clean_all(net, NULL, new_sernum, NULL, false); } /* * Garbage collection */ static int fib6_age(struct fib6_info *rt, void *arg) { struct fib6_gc_args *gc_args = arg; unsigned long now = jiffies; /* * check addrconf expiration here. * Routes are expired even if they are in use. */ if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { if (time_after(now, rt->expires)) { RT6_TRACE("expiring %p\n", rt); return -1; } gc_args->more++; } /* Also age clones in the exception table. * Note, that clones are aged out * only if they are not in use now. */ rt6_age_exceptions(rt, gc_args, now); return 0; } void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; unsigned long now; if (force) { spin_lock_bh(&net->ipv6.fib6_gc_lock); } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) { mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); return; } gc_args.timeout = expires ? (int)expires : net->ipv6.sysctl.ip6_rt_gc_interval; gc_args.more = 0; fib6_clean_all(net, fib6_age, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; if (gc_args.more) mod_timer(&net->ipv6.ip6_fib_timer, round_jiffies(now + net->ipv6.sysctl.ip6_rt_gc_interval)); else del_timer(&net->ipv6.ip6_fib_timer); spin_unlock_bh(&net->ipv6.fib6_gc_lock); } static void fib6_gc_timer_cb(struct timer_list *t) { struct net *arg = from_timer(arg, t, ipv6.ip6_fib_timer); fib6_run_gc(0, arg, true); } static int __net_init fib6_net_init(struct net *net) { size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; int err; err = fib6_notifier_init(net); if (err) return err; /* Default to 3-tuple */ net->ipv6.sysctl.multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; spin_lock_init(&net->ipv6.fib6_gc_lock); rwlock_init(&net->ipv6.fib6_walker_lock); INIT_LIST_HEAD(&net->ipv6.fib6_walkers); timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0); net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); if (!net->ipv6.rt6_stats) goto out_notifier; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv6.fib_table_hash) goto out_rt6_stats; net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), GFP_KERNEL); if (!net->ipv6.fib6_main_tbl) goto out_fib_table_hash; net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), GFP_KERNEL); if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); #endif fib6_tables_init(net); return 0; #ifdef CONFIG_IPV6_MULTIPLE_TABLES out_fib6_main_tbl: kfree(net->ipv6.fib6_main_tbl); #endif out_fib_table_hash: kfree(net->ipv6.fib_table_hash); out_rt6_stats: kfree(net->ipv6.rt6_stats); out_notifier: fib6_notifier_exit(net); return -ENOMEM; } static void fib6_net_exit(struct net *net) { unsigned int i; del_timer_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; struct hlist_node *tmp; struct fib6_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { hlist_del(&tb->tb6_hlist); fib6_free_table(tb); } } kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); fib6_notifier_exit(net); } static struct pernet_operations fib6_net_ops = { .init = fib6_net_init, .exit = fib6_net_exit, }; int __init fib6_init(void) { int ret = -ENOMEM; fib6_node_kmem = kmem_cache_create("fib6_nodes", sizeof(struct fib6_node), 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!fib6_node_kmem) goto out; ret = register_pernet_subsys(&fib6_net_ops); if (ret) goto out_kmem_cache_create; ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib, 0); if (ret) goto out_unregister_subsys; __fib6_flush_trees = fib6_flush_trees; out: return ret; out_unregister_subsys: unregister_pernet_subsys(&fib6_net_ops); out_kmem_cache_create: kmem_cache_destroy(fib6_node_kmem); goto out; } void fib6_gc_cleanup(void) { unregister_pernet_subsys(&fib6_net_ops); kmem_cache_destroy(fib6_node_kmem); } #ifdef CONFIG_PROC_FS static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; struct fib6_nh *fib6_nh = rt->fib6_nh; unsigned int flags = rt->fib6_flags; const struct net_device *dev; if (rt->nh) fib6_nh = nexthop_fib6_nh(rt->nh); seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen); #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif if (fib6_nh->fib_nh_gw_family) { flags |= RTF_GATEWAY; seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6); } else { seq_puts(seq, "00000000000000000000000000000000"); } dev = fib6_nh->fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, flags, dev ? dev->name : ""); iter->w.leaf = NULL; return 0; } static int ipv6_route_yield(struct fib6_walker *w) { struct ipv6_route_iter *iter = w->args; if (!iter->skip) return 1; do { iter->w.leaf = rcu_dereference_protected( iter->w.leaf->fib6_next, lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) return 1; } while (iter->w.leaf); return 0; } static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, struct net *net) { memset(&iter->w, 0, sizeof(iter->w)); iter->w.func = ipv6_route_yield; iter->w.root = &iter->tbl->tb6_root; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; iter->w.args = iter; iter->sernum = READ_ONCE(iter->w.root->fn_sernum); INIT_LIST_HEAD(&iter->w.lh); fib6_walker_link(net, &iter->w); } static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, struct net *net) { unsigned int h; struct hlist_node *node; if (tbl) { h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1; node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist)); } else { h = 0; node = NULL; } while (!node && h < FIB6_TABLE_HASHSZ) { node = rcu_dereference( hlist_first_rcu(&net->ipv6.fib_table_hash[h++])); } return hlist_entry_safe(node, struct fib6_table, tb6_hlist); } static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) { int sernum = READ_ONCE(iter->w.root->fn_sernum); if (iter->sernum != sernum) { iter->sernum = sernum; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; WARN_ON(iter->w.skip); iter->w.skip = iter->w.count; } } static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int r; struct fib6_info *n; struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; ++(*pos); if (!v) goto iter_table; n = rcu_dereference(((struct fib6_info *)v)->fib6_next); if (n) return n; iter_table: ipv6_route_check_sernum(iter); spin_lock_bh(&iter->tbl->tb6_lock); r = fib6_walk_continue(&iter->w); spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { return iter->w.leaf; } else if (r < 0) { fib6_walker_unlink(net, &iter->w); return NULL; } fib6_walker_unlink(net, &iter->w); iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); if (!iter->tbl) return NULL; ipv6_route_seq_setup_walk(iter, net); goto iter_table; } static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; rcu_read_lock(); iter->tbl = ipv6_route_seq_next_table(NULL, net); iter->skip = *pos; if (iter->tbl) { loff_t p = 0; ipv6_route_seq_setup_walk(iter, net); return ipv6_route_seq_next(seq, NULL, &p); } else { return NULL; } } static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) { struct fib6_walker *w = &iter->w; return w->node && !(w->state == FWS_U && w->node == w->root); } static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (ipv6_route_iter_active(iter)) fib6_walker_unlink(net, &iter->w); rcu_read_unlock(); } #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) static int ipv6_route_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) { struct bpf_iter__ipv6_route ctx; ctx.meta = meta; ctx.rt = v; return bpf_iter_run_prog(prog, &ctx); } static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct ipv6_route_iter *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; int ret; meta.seq = seq; prog = bpf_iter_get_info(&meta, false); if (!prog) return ipv6_route_native_seq_show(seq, v); ret = ipv6_route_prog_seq_show(prog, &meta, v); iter->w.leaf = NULL; return ret; } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)ipv6_route_prog_seq_show(prog, &meta, v); } ipv6_route_native_seq_stop(seq, v); } #else static int ipv6_route_seq_show(struct seq_file *seq, void *v) { return ipv6_route_native_seq_show(seq, v); } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { ipv6_route_native_seq_stop(seq, v); } #endif const struct seq_operations ipv6_route_seq_ops = { .start = ipv6_route_seq_start, .next = ipv6_route_seq_next, .stop = ipv6_route_seq_stop, .show = ipv6_route_seq_show }; #endif /* CONFIG_PROC_FS */
1928 1678 222 1905 36 1 1 2 5 4 2 1 68 283 277 9 267 243 28 3 4 3 6 5 5 432 280 93 243 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 // SPDX-License-Identifier: GPL-2.0-or-later /* * Generic parts * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/init.h> #include <linux/llc.h> #include <net/llc.h> #include <net/stp.h> #include <net/switchdev.h> #include "br_private.h" /* * Handle changes in state of network devices enslaved to a bridge. * * Note: don't care about up/down if bridge itself is down, because * port state is checked when bridge is brought up. */ static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct netdev_notifier_pre_changeaddr_info *prechaddr_info; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; bool notified = false; bool changed_addr; int err; if (netif_is_bridge_master(dev)) { err = br_vlan_bridge_event(dev, event, ptr); if (err) return notifier_from_errno(err); if (event == NETDEV_REGISTER) { /* register of bridge completed, add sysfs entries */ err = br_sysfs_addbr(dev); if (err) return notifier_from_errno(err); return NOTIFY_DONE; } } /* not a port of a bridge */ p = br_port_get_rtnl(dev); if (!p) return NOTIFY_DONE; br = p->br; switch (event) { case NETDEV_CHANGEMTU: br_mtu_auto_adjust(br); break; case NETDEV_PRE_CHANGEADDR: if (br->dev->addr_assign_type == NET_ADDR_SET) break; prechaddr_info = ptr; err = dev_pre_changeaddr_notify(br->dev, prechaddr_info->dev_addr, extack); if (err) return notifier_from_errno(err); break; case NETDEV_CHANGEADDR: spin_lock_bh(&br->lock); br_fdb_changeaddr(p, dev->dev_addr); changed_addr = br_stp_recalculate_bridge_id(br); spin_unlock_bh(&br->lock); if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); break; case NETDEV_CHANGE: br_port_carrier_check(p, &notified); break; case NETDEV_FEAT_CHANGE: netdev_update_features(br->dev); break; case NETDEV_DOWN: spin_lock_bh(&br->lock); if (br->dev->flags & IFF_UP) { br_stp_disable_port(p); notified = true; } spin_unlock_bh(&br->lock); break; case NETDEV_UP: if (netif_running(br->dev) && netif_oper_up(dev)) { spin_lock_bh(&br->lock); br_stp_enable_port(p); notified = true; spin_unlock_bh(&br->lock); } break; case NETDEV_UNREGISTER: br_del_if(br, dev); break; case NETDEV_CHANGENAME: err = br_sysfs_renameif(p); if (err) return notifier_from_errno(err); break; case NETDEV_PRE_TYPE_CHANGE: /* Forbid underlying device to change its type. */ return NOTIFY_BAD; case NETDEV_RESEND_IGMP: /* Propagate to master device */ call_netdevice_notifiers(event, br->dev); break; } if (event != NETDEV_UNREGISTER) br_vlan_port_event(p, event); /* Events that may cause spanning tree to refresh */ if (!notified && (event == NETDEV_CHANGEADDR || event == NETDEV_UP || event == NETDEV_CHANGE || event == NETDEV_DOWN)) br_ifinfo_notify(RTM_NEWLINK, NULL, p); return NOTIFY_DONE; } static struct notifier_block br_device_notifier = { .notifier_call = br_device_event }; /* called with RTNL or RCU */ static int br_switchdev_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct net_bridge_port *p; struct net_bridge *br; struct switchdev_notifier_fdb_info *fdb_info; int err = NOTIFY_DONE; p = br_port_get_rtnl_rcu(dev); if (!p) goto out; br = p->br; switch (event) { case SWITCHDEV_FDB_ADD_TO_BRIDGE: fdb_info = ptr; err = br_fdb_external_learn_add(br, p, fdb_info->addr, fdb_info->vid, fdb_info->locked, false); if (err) { err = notifier_from_errno(err); break; } br_fdb_offloaded_set(br, p, fdb_info->addr, fdb_info->vid, fdb_info->offloaded); break; case SWITCHDEV_FDB_DEL_TO_BRIDGE: fdb_info = ptr; err = br_fdb_external_learn_del(br, p, fdb_info->addr, fdb_info->vid, false); if (err) err = notifier_from_errno(err); break; case SWITCHDEV_FDB_OFFLOADED: fdb_info = ptr; br_fdb_offloaded_set(br, p, fdb_info->addr, fdb_info->vid, fdb_info->offloaded); break; case SWITCHDEV_FDB_FLUSH_TO_BRIDGE: fdb_info = ptr; /* Don't delete static entries */ br_fdb_delete_by_port(br, p, fdb_info->vid, 0); break; } out: return err; } static struct notifier_block br_switchdev_notifier = { .notifier_call = br_switchdev_event, }; /* called under rtnl_mutex */ static int br_switchdev_blocking_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(ptr); struct net_device *dev = switchdev_notifier_info_to_dev(ptr); struct switchdev_notifier_brport_info *brport_info; const struct switchdev_brport *b; struct net_bridge_port *p; int err = NOTIFY_DONE; p = br_port_get_rtnl(dev); if (!p) goto out; switch (event) { case SWITCHDEV_BRPORT_OFFLOADED: brport_info = ptr; b = &brport_info->brport; err = br_switchdev_port_offload(p, b->dev, b->ctx, b->atomic_nb, b->blocking_nb, b->tx_fwd_offload, extack); err = notifier_from_errno(err); break; case SWITCHDEV_BRPORT_UNOFFLOADED: brport_info = ptr; b = &brport_info->brport; br_switchdev_port_unoffload(p, b->ctx, b->atomic_nb, b->blocking_nb); break; case SWITCHDEV_BRPORT_REPLAY: brport_info = ptr; b = &brport_info->brport; err = br_switchdev_port_replay(p, b->dev, b->ctx, b->atomic_nb, b->blocking_nb, extack); err = notifier_from_errno(err); break; } out: return err; } static struct notifier_block br_switchdev_blocking_notifier = { .notifier_call = br_switchdev_blocking_event, }; /* br_boolopt_toggle - change user-controlled boolean option * * @br: bridge device * @opt: id of the option to change * @on: new option value * @extack: extack for error messages * * Changes the value of the respective boolean option to @on taking care of * any internal option value mapping and configuration. */ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, struct netlink_ext_ack *extack) { int err = 0; switch (opt) { case BR_BOOLOPT_NO_LL_LEARN: br_opt_toggle(br, BROPT_NO_LL_LEARN, on); break; case BR_BOOLOPT_MCAST_VLAN_SNOOPING: err = br_multicast_toggle_vlan_snooping(br, on, extack); break; case BR_BOOLOPT_MST_ENABLE: err = br_mst_set_enabled(br, on, extack); break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); break; } return err; } int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) { switch (opt) { case BR_BOOLOPT_NO_LL_LEARN: return br_opt_get(br, BROPT_NO_LL_LEARN); case BR_BOOLOPT_MCAST_VLAN_SNOOPING: return br_opt_get(br, BROPT_MCAST_VLAN_SNOOPING_ENABLED); case BR_BOOLOPT_MST_ENABLE: return br_opt_get(br, BROPT_MST_ENABLED); default: /* shouldn't be called with unsupported options */ WARN_ON(1); break; } return 0; } int br_boolopt_multi_toggle(struct net_bridge *br, struct br_boolopt_multi *bm, struct netlink_ext_ack *extack) { unsigned long bitmap = bm->optmask; int err = 0; int opt_id; for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) { bool on = !!(bm->optval & BIT(opt_id)); err = br_boolopt_toggle(br, opt_id, on, extack); if (err) { br_debug(br, "boolopt multi-toggle error: option: %d current: %d new: %d error: %d\n", opt_id, br_boolopt_get(br, opt_id), on, err); break; } } return err; } void br_boolopt_multi_get(const struct net_bridge *br, struct br_boolopt_multi *bm) { u32 optval = 0; int opt_id; for (opt_id = 0; opt_id < BR_BOOLOPT_MAX; opt_id++) optval |= (br_boolopt_get(br, opt_id) << opt_id); bm->optval = optval; bm->optmask = GENMASK((BR_BOOLOPT_MAX - 1), 0); } /* private bridge options, controlled by the kernel */ void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on) { bool cur = !!br_opt_get(br, opt); br_debug(br, "toggle option: %d state: %d -> %d\n", opt, cur, on); if (cur == on) return; if (on) set_bit(opt, &br->options); else clear_bit(opt, &br->options); } static void __net_exit br_net_exit_batch(struct list_head *net_list) { struct net_device *dev; struct net *net; LIST_HEAD(list); rtnl_lock(); list_for_each_entry(net, net_list, exit_list) for_each_netdev(net, dev) if (netif_is_bridge_master(dev)) br_dev_delete(dev, &list); unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations br_net_ops = { .exit_batch = br_net_exit_batch, }; static const struct stp_proto br_stp_proto = { .rcv = br_stp_rcv, }; static int __init br_init(void) { int err; BUILD_BUG_ON(sizeof(struct br_input_skb_cb) > sizeof_field(struct sk_buff, cb)); err = stp_proto_register(&br_stp_proto); if (err < 0) { pr_err("bridge: can't register sap for STP\n"); return err; } err = br_fdb_init(); if (err) goto err_out; err = register_pernet_subsys(&br_net_ops); if (err) goto err_out1; err = br_nf_core_init(); if (err) goto err_out2; err = register_netdevice_notifier(&br_device_notifier); if (err) goto err_out3; err = register_switchdev_notifier(&br_switchdev_notifier); if (err) goto err_out4; err = register_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); if (err) goto err_out5; err = br_netlink_init(); if (err) goto err_out6; brioctl_set(br_ioctl_stub); #if IS_ENABLED(CONFIG_ATM_LANE) br_fdb_test_addr_hook = br_fdb_test_addr; #endif #if IS_MODULE(CONFIG_BRIDGE_NETFILTER) pr_info("bridge: filtering via arp/ip/ip6tables is no longer available " "by default. Update your scripts to load br_netfilter if you " "need this.\n"); #endif return 0; err_out6: unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); err_out5: unregister_switchdev_notifier(&br_switchdev_notifier); err_out4: unregister_netdevice_notifier(&br_device_notifier); err_out3: br_nf_core_fini(); err_out2: unregister_pernet_subsys(&br_net_ops); err_out1: br_fdb_fini(); err_out: stp_proto_unregister(&br_stp_proto); return err; } static void __exit br_deinit(void) { stp_proto_unregister(&br_stp_proto); br_netlink_fini(); unregister_switchdev_blocking_notifier(&br_switchdev_blocking_notifier); unregister_switchdev_notifier(&br_switchdev_notifier); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); unregister_pernet_subsys(&br_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ br_nf_core_fini(); #if IS_ENABLED(CONFIG_ATM_LANE) br_fdb_test_addr_hook = NULL; #endif br_fdb_fini(); } module_init(br_init) module_exit(br_deinit) MODULE_LICENSE("GPL"); MODULE_VERSION(BR_VERSION); MODULE_ALIAS_RTNL_LINK("bridge"); MODULE_DESCRIPTION("Ethernet bridge driver");
635 351 592 356 642 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 #undef TRACE_SYSTEM #define TRACE_SYSTEM qdisc #if !defined(_TRACE_QDISC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_QDISC_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/tracepoint.h> #include <linux/ftrace.h> #include <linux/pkt_sched.h> #include <net/sch_generic.h> TRACE_EVENT(qdisc_dequeue, TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, int packets, struct sk_buff *skb), TP_ARGS(qdisc, txq, packets, skb), TP_STRUCT__entry( __field( struct Qdisc *, qdisc ) __field(const struct netdev_queue *, txq ) __field( int, packets ) __field( void *, skbaddr ) __field( int, ifindex ) __field( u32, handle ) __field( u32, parent ) __field( unsigned long, txq_state) ), /* skb==NULL indicate packets dequeued was 0, even when packets==1 */ TP_fast_assign( __entry->qdisc = qdisc; __entry->txq = txq; __entry->packets = skb ? packets : 0; __entry->skbaddr = skb; __entry->ifindex = txq->dev ? txq->dev->ifindex : 0; __entry->handle = qdisc->handle; __entry->parent = qdisc->parent; __entry->txq_state = txq->state; ), TP_printk("dequeue ifindex=%d qdisc handle=0x%X parent=0x%X txq_state=0x%lX packets=%d skbaddr=%p", __entry->ifindex, __entry->handle, __entry->parent, __entry->txq_state, __entry->packets, __entry->skbaddr ) ); TRACE_EVENT(qdisc_enqueue, TP_PROTO(struct Qdisc *qdisc, const struct netdev_queue *txq, struct sk_buff *skb), TP_ARGS(qdisc, txq, skb), TP_STRUCT__entry( __field(struct Qdisc *, qdisc) __field(const struct netdev_queue *, txq) __field(void *, skbaddr) __field(int, ifindex) __field(u32, handle) __field(u32, parent) ), TP_fast_assign( __entry->qdisc = qdisc; __entry->txq = txq; __entry->skbaddr = skb; __entry->ifindex = txq->dev ? txq->dev->ifindex : 0; __entry->handle = qdisc->handle; __entry->parent = qdisc->parent; ), TP_printk("enqueue ifindex=%d qdisc handle=0x%X parent=0x%X skbaddr=%p", __entry->ifindex, __entry->handle, __entry->parent, __entry->skbaddr) ); TRACE_EVENT(qdisc_reset, TP_PROTO(struct Qdisc *q), TP_ARGS(q), TP_STRUCT__entry( __string( dev, qdisc_dev(q) ) __string( kind, q->ops->id ) __field( u32, parent ) __field( u32, handle ) ), TP_fast_assign( __assign_str(dev, qdisc_dev(q)); __assign_str(kind, q->ops->id); __entry->parent = q->parent; __entry->handle = q->handle; ), TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) ); TRACE_EVENT(qdisc_destroy, TP_PROTO(struct Qdisc *q), TP_ARGS(q), TP_STRUCT__entry( __string( dev, qdisc_dev(q) ) __string( kind, q->ops->id ) __field( u32, parent ) __field( u32, handle ) ), TP_fast_assign( __assign_str(dev, qdisc_dev(q)); __assign_str(kind, q->ops->id); __entry->parent = q->parent; __entry->handle = q->handle; ), TP_printk("dev=%s kind=%s parent=%x:%x handle=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent), TC_H_MAJ(__entry->handle) >> 16, TC_H_MIN(__entry->handle)) ); TRACE_EVENT(qdisc_create, TP_PROTO(const struct Qdisc_ops *ops, struct net_device *dev, u32 parent), TP_ARGS(ops, dev, parent), TP_STRUCT__entry( __string( dev, dev->name ) __string( kind, ops->id ) __field( u32, parent ) ), TP_fast_assign( __assign_str(dev, dev->name); __assign_str(kind, ops->id); __entry->parent = parent; ), TP_printk("dev=%s kind=%s parent=%x:%x", __get_str(dev), __get_str(kind), TC_H_MAJ(__entry->parent) >> 16, TC_H_MIN(__entry->parent)) ); #endif /* _TRACE_QDISC_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 /* -*- linux-c -*- */ /* fs/reiserfs/procfs.c */ /* * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ /* proc info support a la one created by Sizif@Botik.RU for PGC */ #include <linux/module.h> #include <linux/time.h> #include <linux/seq_file.h> #include <linux/uaccess.h> #include "reiserfs.h" #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/blkdev.h> /* * LOCKING: * * These guys are evicted from procfs as the very first step in ->kill_sb(). * */ static int show_version(struct seq_file *m, void *unused) { struct super_block *sb = m->private; char *format; if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) { format = "3.6"; } else if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5)) { format = "3.5"; } else { format = "unknown"; } seq_printf(m, "%s format\twith checks %s\n", format, #if defined( CONFIG_REISERFS_CHECK ) "on" #else "off" #endif ); return 0; } #define SF( x ) ( r -> x ) #define SFP( x ) SF( s_proc_info_data.x ) #define SFPL( x ) SFP( x[ level ] ) #define SFPF( x ) SFP( scan_bitmap.x ) #define SFPJ( x ) SFP( journal.x ) #define D2C( x ) le16_to_cpu( x ) #define D4C( x ) le32_to_cpu( x ) #define DF( x ) D2C( rs -> s_v1.x ) #define DFL( x ) D4C( rs -> s_v1.x ) #define objectid_map( s, rs ) (old_format_only (s) ? \ (__le32 *)((struct reiserfs_super_block_v1 *)rs + 1) : \ (__le32 *)(rs + 1)) #define MAP( i ) D4C( objectid_map( sb, rs )[ i ] ) #define DJF( x ) le32_to_cpu( rs -> x ) #define DJP( x ) le32_to_cpu( jp -> x ) #define JF( x ) ( r -> s_journal -> x ) static int show_super(struct seq_file *m, void *unused) { struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); seq_printf(m, "state: \t%s\n" "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n" "gen. counter: \t%i\n" "s_disk_reads: \t%i\n" "s_disk_writes: \t%i\n" "s_fix_nodes: \t%i\n" "s_do_balance: \t%i\n" "s_unneeded_left_neighbor: \t%i\n" "s_good_search_by_key_reada: \t%i\n" "s_bmaps: \t%i\n" "s_bmaps_without_search: \t%i\n" "s_direct2indirect: \t%i\n" "s_indirect2direct: \t%i\n" "\n" "max_hash_collisions: \t%i\n" "breads: \t%lu\n" "bread_misses: \t%lu\n" "search_by_key: \t%lu\n" "search_by_key_fs_changed: \t%lu\n" "search_by_key_restarted: \t%lu\n" "insert_item_restarted: \t%lu\n" "paste_into_item_restarted: \t%lu\n" "cut_from_item_restarted: \t%lu\n" "delete_solid_item_restarted: \t%lu\n" "delete_item_restarted: \t%lu\n" "leaked_oid: \t%lu\n" "leaves_removable: \t%lu\n", SF(s_mount_state) == REISERFS_VALID_FS ? "REISERFS_VALID_FS" : "REISERFS_ERROR_FS", reiserfs_r5_hash(sb) ? "FORCE_R5 " : "", reiserfs_rupasov_hash(sb) ? "FORCE_RUPASOV " : "", reiserfs_tea_hash(sb) ? "FORCE_TEA " : "", reiserfs_hash_detect(sb) ? "DETECT_HASH " : "", reiserfs_no_border(sb) ? "NO_BORDER " : "BORDER ", reiserfs_no_unhashed_relocation(sb) ? "NO_UNHASHED_RELOCATION " : "", reiserfs_hashed_relocation(sb) ? "UNHASHED_RELOCATION " : "", reiserfs_test4(sb) ? "TEST4 " : "", have_large_tails(sb) ? "TAILS " : have_small_tails(sb) ? "SMALL_TAILS " : "NO_TAILS ", replay_only(sb) ? "REPLAY_ONLY " : "", convert_reiserfs(sb) ? "CONV " : "", atomic_read(&r->s_generation_counter), SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes), SF(s_do_balance), SF(s_unneeded_left_neighbor), SF(s_good_search_by_key_reada), SF(s_bmaps), SF(s_bmaps_without_search), SF(s_direct2indirect), SF(s_indirect2direct), SFP(max_hash_collisions), SFP(breads), SFP(bread_miss), SFP(search_by_key), SFP(search_by_key_fs_changed), SFP(search_by_key_restarted), SFP(insert_item_restarted), SFP(paste_into_item_restarted), SFP(cut_from_item_restarted), SFP(delete_solid_item_restarted), SFP(delete_item_restarted), SFP(leaked_oid), SFP(leaves_removable)); return 0; } static int show_per_level(struct seq_file *m, void *unused) { struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); int level; seq_printf(m, "level\t" " balances" " [sbk: reads" " fs_changed" " restarted]" " free space" " items" " can_remove" " lnum" " rnum" " lbytes" " rbytes" " get_neig" " get_neig_res" " need_l_neig" " need_r_neig" "\n"); for (level = 0; level < MAX_HEIGHT; ++level) { seq_printf(m, "%i\t" " %12lu" " %12lu" " %12lu" " %12lu" " %12lu" " %12lu" " %12lu" " %12li" " %12li" " %12li" " %12li" " %12lu" " %12lu" " %12lu" " %12lu" "\n", level, SFPL(balance_at), SFPL(sbk_read_at), SFPL(sbk_fs_changed), SFPL(sbk_restarted), SFPL(free_at), SFPL(items_at), SFPL(can_node_be_removed), SFPL(lnum), SFPL(rnum), SFPL(lbytes), SFPL(rbytes), SFPL(get_neighbors), SFPL(get_neighbors_restart), SFPL(need_l_neighbor), SFPL(need_r_neighbor) ); } return 0; } static int show_bitmap(struct seq_file *m, void *unused) { struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); seq_printf(m, "free_block: %lu\n" " scan_bitmap:" " wait" " bmap" " retry" " stolen" " journal_hint" "journal_nohint" "\n" " %14lu" " %14lu" " %14lu" " %14lu" " %14lu" " %14lu" " %14lu" "\n", SFP(free_block), SFPF(call), SFPF(wait), SFPF(bmap), SFPF(retry), SFPF(stolen), SFPF(in_journal_hint), SFPF(in_journal_nohint)); return 0; } static int show_on_disk_super(struct seq_file *m, void *unused) { struct super_block *sb = m->private; struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); struct reiserfs_super_block *rs = sb_info->s_rs; int hash_code = DFL(s_hash_function_code); __u32 flags = DJF(s_flags); seq_printf(m, "block_count: \t%i\n" "free_blocks: \t%i\n" "root_block: \t%i\n" "blocksize: \t%i\n" "oid_maxsize: \t%i\n" "oid_cursize: \t%i\n" "umount_state: \t%i\n" "magic: \t%10.10s\n" "fs_state: \t%i\n" "hash: \t%s\n" "tree_height: \t%i\n" "bmap_nr: \t%i\n" "version: \t%i\n" "flags: \t%x[%s]\n" "reserved_for_journal: \t%i\n", DFL(s_block_count), DFL(s_free_blocks), DFL(s_root_block), DF(s_blocksize), DF(s_oid_maxsize), DF(s_oid_cursize), DF(s_umount_state), rs->s_v1.s_magic, DF(s_fs_state), hash_code == TEA_HASH ? "tea" : (hash_code == YURA_HASH) ? "rupasov" : (hash_code == R5_HASH) ? "r5" : (hash_code == UNSET_HASH) ? "unset" : "unknown", DF(s_tree_height), DF(s_bmap_nr), DF(s_version), flags, (flags & reiserfs_attrs_cleared) ? "attrs_cleared" : "", DF(s_reserved_for_journal)); return 0; } static int show_oidmap(struct seq_file *m, void *unused) { struct super_block *sb = m->private; struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); struct reiserfs_super_block *rs = sb_info->s_rs; unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize); unsigned long total_used = 0; int i; for (i = 0; i < mapsize; ++i) { __u32 right; right = (i == mapsize - 1) ? MAX_KEY_OBJECTID : MAP(i + 1); seq_printf(m, "%s: [ %x .. %x )\n", (i & 1) ? "free" : "used", MAP(i), right); if (!(i & 1)) { total_used += right - MAP(i); } } #if defined( REISERFS_USE_OIDMAPF ) if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) { loff_t size = file_inode(sb_info->oidmap.mapf)->i_size; total_used += size / sizeof(reiserfs_oidinterval_d_t); } #endif seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n", mapsize, mapsize, le16_to_cpu(rs->s_v1.s_oid_maxsize), total_used); return 0; } static time64_t ktime_mono_to_real_seconds(time64_t mono) { ktime_t kt = ktime_set(mono, NSEC_PER_SEC/2); return ktime_divns(ktime_mono_to_real(kt), NSEC_PER_SEC); } static int show_journal(struct seq_file *m, void *unused) { struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); struct reiserfs_super_block *rs = r->s_rs; struct journal_params *jp = &rs->s_v1.s_journal; seq_printf(m, /* on-disk fields */ "jp_journal_1st_block: \t%i\n" "jp_journal_dev: \t%pg[%x]\n" "jp_journal_size: \t%i\n" "jp_journal_trans_max: \t%i\n" "jp_journal_magic: \t%i\n" "jp_journal_max_batch: \t%i\n" "jp_journal_max_commit_age: \t%i\n" "jp_journal_max_trans_age: \t%i\n" /* incore fields */ "j_1st_reserved_block: \t%i\n" "j_state: \t%li\n" "j_trans_id: \t%u\n" "j_mount_id: \t%lu\n" "j_start: \t%lu\n" "j_len: \t%lu\n" "j_len_alloc: \t%lu\n" "j_wcount: \t%i\n" "j_bcount: \t%lu\n" "j_first_unflushed_offset: \t%lu\n" "j_last_flush_trans_id: \t%u\n" "j_trans_start_time: \t%lli\n" "j_list_bitmap_index: \t%i\n" "j_must_wait: \t%i\n" "j_next_full_flush: \t%i\n" "j_next_async_flush: \t%i\n" "j_cnode_used: \t%i\n" "j_cnode_free: \t%i\n" "\n" /* reiserfs_proc_info_data_t.journal fields */ "in_journal: \t%12lu\n" "in_journal_bitmap: \t%12lu\n" "in_journal_reusable: \t%12lu\n" "lock_journal: \t%12lu\n" "lock_journal_wait: \t%12lu\n" "journal_begin: \t%12lu\n" "journal_relock_writers: \t%12lu\n" "journal_relock_wcount: \t%12lu\n" "mark_dirty: \t%12lu\n" "mark_dirty_already: \t%12lu\n" "mark_dirty_notjournal: \t%12lu\n" "restore_prepared: \t%12lu\n" "prepare: \t%12lu\n" "prepare_retry: \t%12lu\n", DJP(jp_journal_1st_block), SB_JOURNAL(sb)->j_bdev_handle->bdev, DJP(jp_journal_dev), DJP(jp_journal_size), DJP(jp_journal_trans_max), DJP(jp_journal_magic), DJP(jp_journal_max_batch), SB_JOURNAL(sb)->j_max_commit_age, DJP(jp_journal_max_trans_age), JF(j_1st_reserved_block), JF(j_state), JF(j_trans_id), JF(j_mount_id), JF(j_start), JF(j_len), JF(j_len_alloc), atomic_read(&r->s_journal->j_wcount), JF(j_bcount), JF(j_first_unflushed_offset), JF(j_last_flush_trans_id), ktime_mono_to_real_seconds(JF(j_trans_start_time)), JF(j_list_bitmap_index), JF(j_must_wait), JF(j_next_full_flush), JF(j_next_async_flush), JF(j_cnode_used), JF(j_cnode_free), SFPJ(in_journal), SFPJ(in_journal_bitmap), SFPJ(in_journal_reusable), SFPJ(lock_journal), SFPJ(lock_journal_wait), SFPJ(journal_being), SFPJ(journal_relock_writers), SFPJ(journal_relock_wcount), SFPJ(mark_dirty), SFPJ(mark_dirty_already), SFPJ(mark_dirty_notjournal), SFPJ(restore_prepared), SFPJ(prepare), SFPJ(prepare_retry) ); return 0; } static struct proc_dir_entry *proc_info_root = NULL; static const char proc_info_root_name[] = "fs/reiserfs"; static void add_file(struct super_block *sb, char *name, int (*func) (struct seq_file *, void *)) { proc_create_single_data(name, 0, REISERFS_SB(sb)->procdir, func, sb); } int reiserfs_proc_info_init(struct super_block *sb) { char b[BDEVNAME_SIZE]; char *s; /* Some block devices use /'s */ strscpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; spin_lock_init(&__PINFO(sb).lock); REISERFS_SB(sb)->procdir = proc_mkdir_data(b, 0, proc_info_root, sb); if (REISERFS_SB(sb)->procdir) { add_file(sb, "version", show_version); add_file(sb, "super", show_super); add_file(sb, "per-level", show_per_level); add_file(sb, "bitmap", show_bitmap); add_file(sb, "on-disk-super", show_on_disk_super); add_file(sb, "oidmap", show_oidmap); add_file(sb, "journal", show_journal); return 0; } reiserfs_warning(sb, "cannot create /proc/%s/%s", proc_info_root_name, b); return 1; } int reiserfs_proc_info_done(struct super_block *sb) { struct proc_dir_entry *de = REISERFS_SB(sb)->procdir; if (de) { char b[BDEVNAME_SIZE]; char *s; /* Some block devices use /'s */ strscpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; remove_proc_subtree(b, proc_info_root); REISERFS_SB(sb)->procdir = NULL; } return 0; } int reiserfs_proc_info_global_init(void) { if (proc_info_root == NULL) { proc_info_root = proc_mkdir(proc_info_root_name, NULL); if (!proc_info_root) { reiserfs_warning(NULL, "cannot create /proc/%s", proc_info_root_name); return 1; } } return 0; } int reiserfs_proc_info_global_done(void) { if (proc_info_root != NULL) { proc_info_root = NULL; remove_proc_entry(proc_info_root_name, NULL); } return 0; } /* * Revision 1.1.8.2 2001/07/15 17:08:42 god * . use get_super() in procfs.c * . remove remove_save_link() from reiserfs_do_truncate() * * I accept terms and conditions stated in the Legal Agreement * (available at http://www.namesys.com/legalese.html) * * Revision 1.1.8.1 2001/07/11 16:48:50 god * proc info support * * I accept terms and conditions stated in the Legal Agreement * (available at http://www.namesys.com/legalese.html) * */
1 1927 1889 38 1 1 1 1 1 6 2 1 1 2 1 1 3 1 1 1 3 5 1 1 3 1 1 1 1 4 2 1 1 1 1 4 4 4 3 1 3 2 1 2 1 1 1 1 49 11 2 1 3 2 1 2 1 1 9 1 2 1 4 1 2 3 1 4 1 1 3 2 1 2 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 // SPDX-License-Identifier: GPL-2.0-or-later /* * X.25 Packet Layer release 002 * * This is ALPHA test software. This code may break your machine, * randomly fail to work with new releases, misbehave and/or generally * screw up. It might even work. * * This code REQUIRES 2.1.15 or higher * * History * X.25 001 Jonathan Naylor Started coding. * X.25 002 Jonathan Naylor Centralised disconnect handling. * New timer architecture. * 2000-03-11 Henner Eisen MSG_EOR handling more POSIX compliant. * 2000-03-22 Daniela Squassoni Allowed disabling/enabling of * facilities negotiation and increased * the throughput upper limit. * 2000-08-27 Arnaldo C. Melo s/suser/capable/ + micro cleanups * 2000-09-04 Henner Eisen Set sock->state in x25_accept(). * Fixed x25_output() related skb leakage. * 2000-10-02 Henner Eisen Made x25_kick() single threaded per socket. * 2000-10-27 Henner Eisen MSG_DONTWAIT for fragment allocation. * 2000-11-14 Henner Eisen Closing datalink from NETDEV_GOING_DOWN * 2002-10-06 Arnaldo C. Melo Get rid of cli/sti, move proc stuff to * x25_proc.c, using seq_file * 2005-04-02 Shaun Pereira Selective sub address matching * with call user data * 2005-04-15 Shaun Pereira Fast select with no restriction on * response */ #define pr_fmt(fmt) "X25: " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <net/tcp_states.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/notifier.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/ctype.h> #include <net/x25.h> #include <net/compat.h> int sysctl_x25_restart_request_timeout = X25_DEFAULT_T20; int sysctl_x25_call_request_timeout = X25_DEFAULT_T21; int sysctl_x25_reset_request_timeout = X25_DEFAULT_T22; int sysctl_x25_clear_request_timeout = X25_DEFAULT_T23; int sysctl_x25_ack_holdback_timeout = X25_DEFAULT_T2; int sysctl_x25_forward = 0; HLIST_HEAD(x25_list); DEFINE_RWLOCK(x25_list_lock); static const struct proto_ops x25_proto_ops; static const struct x25_address null_x25_address = {" "}; #ifdef CONFIG_COMPAT struct compat_x25_subscrip_struct { char device[200-sizeof(compat_ulong_t)]; compat_ulong_t global_facil_mask; compat_uint_t extended; }; #endif int x25_parse_address_block(struct sk_buff *skb, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned char len; int needed; int rc; if (!pskb_may_pull(skb, 1)) { /* packet has no address block */ rc = 0; goto empty; } len = *skb->data; needed = 1 + ((len >> 4) + (len & 0x0f) + 1) / 2; if (!pskb_may_pull(skb, needed)) { /* packet is too short to hold the addresses it claims to hold */ rc = -1; goto empty; } return x25_addr_ntoa(skb->data, called_addr, calling_addr); empty: *called_addr->x25_addr = 0; *calling_addr->x25_addr = 0; return rc; } int x25_addr_ntoa(unsigned char *p, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned int called_len, calling_len; char *called, *calling; unsigned int i; called_len = (*p >> 0) & 0x0F; calling_len = (*p >> 4) & 0x0F; called = called_addr->x25_addr; calling = calling_addr->x25_addr; p++; for (i = 0; i < (called_len + calling_len); i++) { if (i < called_len) { if (i % 2 != 0) { *called++ = ((*p >> 0) & 0x0F) + '0'; p++; } else { *called++ = ((*p >> 4) & 0x0F) + '0'; } } else { if (i % 2 != 0) { *calling++ = ((*p >> 0) & 0x0F) + '0'; p++; } else { *calling++ = ((*p >> 4) & 0x0F) + '0'; } } } *called = *calling = '\0'; return 1 + (called_len + calling_len + 1) / 2; } int x25_addr_aton(unsigned char *p, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned int called_len, calling_len; char *called, *calling; int i; called = called_addr->x25_addr; calling = calling_addr->x25_addr; called_len = strlen(called); calling_len = strlen(calling); *p++ = (calling_len << 4) | (called_len << 0); for (i = 0; i < (called_len + calling_len); i++) { if (i < called_len) { if (i % 2 != 0) { *p |= (*called++ - '0') << 0; p++; } else { *p = 0x00; *p |= (*called++ - '0') << 4; } } else { if (i % 2 != 0) { *p |= (*calling++ - '0') << 0; p++; } else { *p = 0x00; *p |= (*calling++ - '0') << 4; } } } return 1 + (called_len + calling_len + 1) / 2; } /* * Socket removal during an interrupt is now safe. */ static void x25_remove_socket(struct sock *sk) { write_lock_bh(&x25_list_lock); sk_del_node_init(sk); write_unlock_bh(&x25_list_lock); } /* * Handle device status changes. */ static int x25_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct x25_neigh *nb; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (dev->type == ARPHRD_X25) { switch (event) { case NETDEV_REGISTER: case NETDEV_POST_TYPE_CHANGE: x25_link_device_up(dev); break; case NETDEV_DOWN: nb = x25_get_neigh(dev); if (nb) { x25_link_terminated(nb); x25_neigh_put(nb); } x25_route_device_down(dev); break; case NETDEV_PRE_TYPE_CHANGE: case NETDEV_UNREGISTER: x25_link_device_down(dev); break; case NETDEV_CHANGE: if (!netif_carrier_ok(dev)) { nb = x25_get_neigh(dev); if (nb) { x25_link_terminated(nb); x25_neigh_put(nb); } } break; } } return NOTIFY_DONE; } /* * Add a socket to the bound sockets list. */ static void x25_insert_socket(struct sock *sk) { write_lock_bh(&x25_list_lock); sk_add_node(sk, &x25_list); write_unlock_bh(&x25_list_lock); } /* * Find a socket that wants to accept the Call Request we just * received. Check the full list for an address/cud match. * If no cuds match return the next_best thing, an address match. * Note: if a listening socket has cud set it must only get calls * with matching cud. */ static struct sock *x25_find_listener(struct x25_address *addr, struct sk_buff *skb) { struct sock *s; struct sock *next_best; read_lock_bh(&x25_list_lock); next_best = NULL; sk_for_each(s, &x25_list) if ((!strcmp(addr->x25_addr, x25_sk(s)->source_addr.x25_addr) || !strcmp(x25_sk(s)->source_addr.x25_addr, null_x25_address.x25_addr)) && s->sk_state == TCP_LISTEN) { /* * Found a listening socket, now check the incoming * call user data vs this sockets call user data */ if (x25_sk(s)->cudmatchlength > 0 && skb->len >= x25_sk(s)->cudmatchlength) { if((memcmp(x25_sk(s)->calluserdata.cuddata, skb->data, x25_sk(s)->cudmatchlength)) == 0) { sock_hold(s); goto found; } } else next_best = s; } if (next_best) { s = next_best; sock_hold(s); goto found; } s = NULL; found: read_unlock_bh(&x25_list_lock); return s; } /* * Find a connected X.25 socket given my LCI and neighbour. */ static struct sock *__x25_find_socket(unsigned int lci, struct x25_neigh *nb) { struct sock *s; sk_for_each(s, &x25_list) if (x25_sk(s)->lci == lci && x25_sk(s)->neighbour == nb) { sock_hold(s); goto found; } s = NULL; found: return s; } struct sock *x25_find_socket(unsigned int lci, struct x25_neigh *nb) { struct sock *s; read_lock_bh(&x25_list_lock); s = __x25_find_socket(lci, nb); read_unlock_bh(&x25_list_lock); return s; } /* * Find a unique LCI for a given device. */ static unsigned int x25_new_lci(struct x25_neigh *nb) { unsigned int lci = 1; struct sock *sk; while ((sk = x25_find_socket(lci, nb)) != NULL) { sock_put(sk); if (++lci == 4096) { lci = 0; break; } cond_resched(); } return lci; } /* * Deferred destroy. */ static void __x25_destroy_socket(struct sock *); /* * handler for deferred kills. */ static void x25_destroy_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); x25_destroy_socket_from_timer(sk); } /* * This is called from user mode and the timers. Thus it protects itself * against interrupting users but doesn't worry about being called during * work. Once it is removed from the queue no interrupt or bottom half * will touch it and we are (fairly 8-) ) safe. * Not static as it's used by the timer */ static void __x25_destroy_socket(struct sock *sk) { struct sk_buff *skb; x25_stop_heartbeat(sk); x25_stop_timer(sk); x25_remove_socket(sk); x25_clear_queues(sk); /* Flush the queues */ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (skb->sk != sk) { /* A pending connection */ /* * Queue the unaccepted socket for death */ skb->sk->sk_state = TCP_LISTEN; sock_set_flag(skb->sk, SOCK_DEAD); x25_start_heartbeat(skb->sk); x25_sk(skb->sk)->state = X25_STATE_0; } kfree_skb(skb); } if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ sk->sk_timer.expires = jiffies + 10 * HZ; sk->sk_timer.function = x25_destroy_timer; add_timer(&sk->sk_timer); } else { /* drop last reference so sock_put will free */ __sock_put(sk); } } void x25_destroy_socket_from_timer(struct sock *sk) { sock_hold(sk); bh_lock_sock(sk); __x25_destroy_socket(sk); bh_unlock_sock(sk); sock_put(sk); } /* * Handling for system calls applied via the various interfaces to a * X.25 socket object. */ static int x25_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { int opt; struct sock *sk = sock->sk; int rc = -ENOPROTOOPT; if (level != SOL_X25 || optname != X25_QBITINCL) goto out; rc = -EINVAL; if (optlen < sizeof(int)) goto out; rc = -EFAULT; if (copy_from_sockptr(&opt, optval, sizeof(int))) goto out; if (opt) set_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); else clear_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); rc = 0; out: return rc; } static int x25_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int val, len, rc = -ENOPROTOOPT; if (level != SOL_X25 || optname != X25_QBITINCL) goto out; rc = -EFAULT; if (get_user(len, optlen)) goto out; len = min_t(unsigned int, len, sizeof(int)); rc = -EINVAL; if (len < 0) goto out; rc = -EFAULT; if (put_user(len, optlen)) goto out; val = test_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); rc = copy_to_user(optval, &val, len) ? -EFAULT : 0; out: return rc; } static int x25_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int rc = -EOPNOTSUPP; lock_sock(sk); if (sock->state != SS_UNCONNECTED) { rc = -EINVAL; release_sock(sk); return rc; } if (sk->sk_state != TCP_LISTEN) { memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN); sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; rc = 0; } release_sock(sk); return rc; } static struct proto x25_proto = { .name = "X25", .owner = THIS_MODULE, .obj_size = sizeof(struct x25_sock), }; static struct sock *x25_alloc_socket(struct net *net, int kern) { struct x25_sock *x25; struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto, kern); if (!sk) goto out; sock_init_data(NULL, sk); x25 = x25_sk(sk); skb_queue_head_init(&x25->ack_queue); skb_queue_head_init(&x25->fragment_queue); skb_queue_head_init(&x25->interrupt_in_queue); skb_queue_head_init(&x25->interrupt_out_queue); out: return sk; } static int x25_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct x25_sock *x25; int rc = -EAFNOSUPPORT; if (!net_eq(net, &init_net)) goto out; rc = -ESOCKTNOSUPPORT; if (sock->type != SOCK_SEQPACKET) goto out; rc = -EINVAL; if (protocol) goto out; rc = -ENOMEM; if ((sk = x25_alloc_socket(net, kern)) == NULL) goto out; x25 = x25_sk(sk); sock_init_data(sock, sk); x25_init_timers(sk); sock->ops = &x25_proto_ops; sk->sk_protocol = protocol; sk->sk_backlog_rcv = x25_backlog_rcv; x25->t21 = sysctl_x25_call_request_timeout; x25->t22 = sysctl_x25_reset_request_timeout; x25->t23 = sysctl_x25_clear_request_timeout; x25->t2 = sysctl_x25_ack_holdback_timeout; x25->state = X25_STATE_0; x25->cudmatchlength = 0; set_bit(X25_ACCPT_APPRV_FLAG, &x25->flags); /* normally no cud */ /* on call accept */ x25->facilities.winsize_in = X25_DEFAULT_WINDOW_SIZE; x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE; x25->facilities.pacsize_in = X25_DEFAULT_PACKET_SIZE; x25->facilities.pacsize_out = X25_DEFAULT_PACKET_SIZE; x25->facilities.throughput = 0; /* by default don't negotiate throughput */ x25->facilities.reverse = X25_DEFAULT_REVERSE; x25->dte_facilities.calling_len = 0; x25->dte_facilities.called_len = 0; memset(x25->dte_facilities.called_ae, '\0', sizeof(x25->dte_facilities.called_ae)); memset(x25->dte_facilities.calling_ae, '\0', sizeof(x25->dte_facilities.calling_ae)); rc = 0; out: return rc; } static struct sock *x25_make_new(struct sock *osk) { struct sock *sk = NULL; struct x25_sock *x25, *ox25; if (osk->sk_type != SOCK_SEQPACKET) goto out; if ((sk = x25_alloc_socket(sock_net(osk), 0)) == NULL) goto out; x25 = x25_sk(sk); sk->sk_type = osk->sk_type; sk->sk_priority = READ_ONCE(osk->sk_priority); sk->sk_protocol = osk->sk_protocol; sk->sk_rcvbuf = osk->sk_rcvbuf; sk->sk_sndbuf = osk->sk_sndbuf; sk->sk_state = TCP_ESTABLISHED; sk->sk_backlog_rcv = osk->sk_backlog_rcv; sock_copy_flags(sk, osk); ox25 = x25_sk(osk); x25->t21 = ox25->t21; x25->t22 = ox25->t22; x25->t23 = ox25->t23; x25->t2 = ox25->t2; x25->flags = ox25->flags; x25->facilities = ox25->facilities; x25->dte_facilities = ox25->dte_facilities; x25->cudmatchlength = ox25->cudmatchlength; clear_bit(X25_INTERRUPT_FLAG, &x25->flags); x25_init_timers(sk); out: return sk; } static int x25_release(struct socket *sock) { struct sock *sk = sock->sk; struct x25_sock *x25; if (!sk) return 0; x25 = x25_sk(sk); sock_hold(sk); lock_sock(sk); switch (x25->state) { case X25_STATE_0: case X25_STATE_2: x25_disconnect(sk, 0, 0, 0); __x25_destroy_socket(sk); goto out; case X25_STATE_1: case X25_STATE_3: case X25_STATE_4: x25_clear_queues(sk); x25_write_internal(sk, X25_CLEAR_REQUEST); x25_start_t23timer(sk); x25->state = X25_STATE_2; sk->sk_state = TCP_CLOSE; sk->sk_shutdown |= SEND_SHUTDOWN; sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); sock_set_flag(sk, SOCK_DESTROY); break; case X25_STATE_5: x25_write_internal(sk, X25_CLEAR_REQUEST); x25_disconnect(sk, 0, 0, 0); __x25_destroy_socket(sk); goto out; } sock_orphan(sk); out: release_sock(sk); sock_put(sk); return 0; } static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; int len, i, rc = 0; if (addr_len != sizeof(struct sockaddr_x25) || addr->sx25_family != AF_X25 || strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) { rc = -EINVAL; goto out; } /* check for the null_x25_address */ if (strcmp(addr->sx25_addr.x25_addr, null_x25_address.x25_addr)) { len = strlen(addr->sx25_addr.x25_addr); for (i = 0; i < len; i++) { if (!isdigit(addr->sx25_addr.x25_addr[i])) { rc = -EINVAL; goto out; } } } lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) { x25_sk(sk)->source_addr = addr->sx25_addr; x25_insert_socket(sk); sock_reset_flag(sk, SOCK_ZAPPED); } else { rc = -EINVAL; } release_sock(sk); net_dbg_ratelimited("x25_bind: socket is bound\n"); out: return rc; } static int x25_wait_for_connection_establishment(struct sock *sk) { DECLARE_WAITQUEUE(wait, current); int rc; add_wait_queue_exclusive(sk_sleep(sk), &wait); for (;;) { __set_current_state(TASK_INTERRUPTIBLE); rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = sock_error(sk); if (rc) { sk->sk_socket->state = SS_UNCONNECTED; break; } rc = -ENOTCONN; if (sk->sk_state == TCP_CLOSE) { sk->sk_socket->state = SS_UNCONNECTED; break; } rc = 0; if (sk->sk_state != TCP_ESTABLISHED) { release_sock(sk); schedule(); lock_sock(sk); } else break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return rc; } static int x25_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; struct x25_route *rt; int rc = 0; lock_sock(sk); if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { sock->state = SS_CONNECTED; goto out; /* Connect completed during a ERESTARTSYS event */ } rc = -ECONNREFUSED; if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { sock->state = SS_UNCONNECTED; goto out; } rc = -EISCONN; /* No reconnect on a seqpacket socket */ if (sk->sk_state == TCP_ESTABLISHED) goto out; rc = -EALREADY; /* Do nothing if call is already in progress */ if (sk->sk_state == TCP_SYN_SENT) goto out; sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; rc = -EINVAL; if (addr_len != sizeof(struct sockaddr_x25) || addr->sx25_family != AF_X25 || strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) goto out; rc = -ENETUNREACH; rt = x25_get_route(&addr->sx25_addr); if (!rt) goto out; x25->neighbour = x25_get_neigh(rt->dev); if (!x25->neighbour) goto out_put_route; x25_limit_facilities(&x25->facilities, x25->neighbour); x25->lci = x25_new_lci(x25->neighbour); if (!x25->lci) goto out_put_neigh; rc = -EINVAL; if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */ goto out_put_neigh; if (!strcmp(x25->source_addr.x25_addr, null_x25_address.x25_addr)) memset(&x25->source_addr, '\0', X25_ADDR_LEN); x25->dest_addr = addr->sx25_addr; /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; sk->sk_state = TCP_SYN_SENT; x25->state = X25_STATE_1; x25_write_internal(sk, X25_CALL_REQUEST); x25_start_heartbeat(sk); x25_start_t21timer(sk); /* Now the loop */ rc = -EINPROGRESS; if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) goto out; rc = x25_wait_for_connection_establishment(sk); if (rc) goto out_put_neigh; sock->state = SS_CONNECTED; rc = 0; out_put_neigh: if (rc && x25->neighbour) { read_lock_bh(&x25_list_lock); x25_neigh_put(x25->neighbour); x25->neighbour = NULL; read_unlock_bh(&x25_list_lock); x25->state = X25_STATE_0; } out_put_route: x25_route_put(rt); out: release_sock(sk); return rc; } static int x25_wait_for_data(struct sock *sk, long timeout) { DECLARE_WAITQUEUE(wait, current); int rc = 0; add_wait_queue_exclusive(sk_sleep(sk), &wait); for (;;) { __set_current_state(TASK_INTERRUPTIBLE); if (sk->sk_shutdown & RCV_SHUTDOWN) break; rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = -EAGAIN; if (!timeout) break; rc = 0; if (skb_queue_empty(&sk->sk_receive_queue)) { release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); } else break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return rc; } static int x25_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sock *sk = sock->sk; struct sock *newsk; struct sk_buff *skb; int rc = -EINVAL; if (!sk) goto out; rc = -EOPNOTSUPP; if (sk->sk_type != SOCK_SEQPACKET) goto out; lock_sock(sk); rc = -EINVAL; if (sk->sk_state != TCP_LISTEN) goto out2; rc = x25_wait_for_data(sk, sk->sk_rcvtimeo); if (rc) goto out2; skb = skb_dequeue(&sk->sk_receive_queue); rc = -EINVAL; if (!skb->sk) goto out2; newsk = skb->sk; sock_graft(newsk, newsock); /* Now attach up the new socket */ skb->sk = NULL; kfree_skb(skb); sk_acceptq_removed(sk); newsock->state = SS_CONNECTED; rc = 0; out2: release_sock(sk); out: return rc; } static int x25_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)uaddr; struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); int rc = 0; if (peer) { if (sk->sk_state != TCP_ESTABLISHED) { rc = -ENOTCONN; goto out; } sx25->sx25_addr = x25->dest_addr; } else sx25->sx25_addr = x25->source_addr; sx25->sx25_family = AF_X25; rc = sizeof(*sx25); out: return rc; } int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, unsigned int lci) { struct sock *sk; struct sock *make; struct x25_sock *makex25; struct x25_address source_addr, dest_addr; struct x25_facilities facilities; struct x25_dte_facilities dte_facilities; int len, addr_len, rc; /* * Remove the LCI and frame type. */ skb_pull(skb, X25_STD_MIN_LEN); /* * Extract the X.25 addresses and convert them to ASCII strings, * and remove them. * * Address block is mandatory in call request packets */ addr_len = x25_parse_address_block(skb, &source_addr, &dest_addr); if (addr_len <= 0) goto out_clear_request; skb_pull(skb, addr_len); /* * Get the length of the facilities, skip past them for the moment * get the call user data because this is needed to determine * the correct listener * * Facilities length is mandatory in call request packets */ if (!pskb_may_pull(skb, 1)) goto out_clear_request; len = skb->data[0] + 1; if (!pskb_may_pull(skb, len)) goto out_clear_request; skb_pull(skb,len); /* * Ensure that the amount of call user data is valid. */ if (skb->len > X25_MAX_CUD_LEN) goto out_clear_request; /* * Get all the call user data so it can be used in * x25_find_listener and skb_copy_from_linear_data up ahead. */ if (!pskb_may_pull(skb, skb->len)) goto out_clear_request; /* * Find a listener for the particular address/cud pair. */ sk = x25_find_listener(&source_addr,skb); skb_push(skb,len); if (sk != NULL && sk_acceptq_is_full(sk)) { goto out_sock_put; } /* * We dont have any listeners for this incoming call. * Try forwarding it. */ if (sk == NULL) { skb_push(skb, addr_len + X25_STD_MIN_LEN); if (sysctl_x25_forward && x25_forward_call(&dest_addr, nb, skb, lci) > 0) { /* Call was forwarded, dont process it any more */ kfree_skb(skb); rc = 1; goto out; } else { /* No listeners, can't forward, clear the call */ goto out_clear_request; } } /* * Try to reach a compromise on the requested facilities. */ len = x25_negotiate_facilities(skb, sk, &facilities, &dte_facilities); if (len == -1) goto out_sock_put; /* * current neighbour/link might impose additional limits * on certain facilities */ x25_limit_facilities(&facilities, nb); /* * Try to create a new socket. */ make = x25_make_new(sk); if (!make) goto out_sock_put; /* * Remove the facilities */ skb_pull(skb, len); skb->sk = make; make->sk_state = TCP_ESTABLISHED; makex25 = x25_sk(make); makex25->lci = lci; makex25->dest_addr = dest_addr; makex25->source_addr = source_addr; x25_neigh_hold(nb); makex25->neighbour = nb; makex25->facilities = facilities; makex25->dte_facilities= dte_facilities; makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask; /* ensure no reverse facil on accept */ makex25->vc_facil_mask &= ~X25_MASK_REVERSE; /* ensure no calling address extension on accept */ makex25->vc_facil_mask &= ~X25_MASK_CALLING_AE; makex25->cudmatchlength = x25_sk(sk)->cudmatchlength; /* Normally all calls are accepted immediately */ if (test_bit(X25_ACCPT_APPRV_FLAG, &makex25->flags)) { x25_write_internal(make, X25_CALL_ACCEPTED); makex25->state = X25_STATE_3; } else { makex25->state = X25_STATE_5; } /* * Incoming Call User Data. */ skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len); makex25->calluserdata.cudlength = skb->len; sk_acceptq_added(sk); x25_insert_socket(make); skb_queue_head(&sk->sk_receive_queue, skb); x25_start_heartbeat(make); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); rc = 1; sock_put(sk); out: return rc; out_sock_put: sock_put(sk); out_clear_request: rc = 0; x25_transmit_clear_request(nb, lci, 0x01); goto out; } static int x25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); DECLARE_SOCKADDR(struct sockaddr_x25 *, usx25, msg->msg_name); struct sockaddr_x25 sx25; struct sk_buff *skb; unsigned char *asmptr; int noblock = msg->msg_flags & MSG_DONTWAIT; size_t size; int qbit = 0, rc = -EINVAL; lock_sock(sk); if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_OOB|MSG_EOR|MSG_CMSG_COMPAT)) goto out; /* we currently don't support segmented records at the user interface */ if (!(msg->msg_flags & (MSG_EOR|MSG_OOB))) goto out; rc = -EADDRNOTAVAIL; if (sock_flag(sk, SOCK_ZAPPED)) goto out; rc = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) { send_sig(SIGPIPE, current, 0); goto out; } rc = -ENETUNREACH; if (!x25->neighbour) goto out; if (usx25) { rc = -EINVAL; if (msg->msg_namelen < sizeof(sx25)) goto out; memcpy(&sx25, usx25, sizeof(sx25)); rc = -EISCONN; if (strcmp(x25->dest_addr.x25_addr, sx25.sx25_addr.x25_addr)) goto out; rc = -EINVAL; if (sx25.sx25_family != AF_X25) goto out; } else { /* * FIXME 1003.1g - if the socket is like this because * it has become closed (not started closed) we ought * to SIGPIPE, EPIPE; */ rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) goto out; sx25.sx25_family = AF_X25; sx25.sx25_addr = x25->dest_addr; } /* Sanity check the packet size */ if (len > 65535) { rc = -EMSGSIZE; goto out; } net_dbg_ratelimited("x25_sendmsg: sendto: Addresses built.\n"); /* Build a packet */ net_dbg_ratelimited("x25_sendmsg: sendto: building packet.\n"); if ((msg->msg_flags & MSG_OOB) && len > 32) len = 32; size = len + X25_MAX_L2_LEN + X25_EXT_MIN_LEN; release_sock(sk); skb = sock_alloc_send_skb(sk, size, noblock, &rc); lock_sock(sk); if (!skb) goto out; X25_SKB_CB(skb)->flags = msg->msg_flags; skb_reserve(skb, X25_MAX_L2_LEN + X25_EXT_MIN_LEN); /* * Put the data on the end */ net_dbg_ratelimited("x25_sendmsg: Copying user data\n"); skb_reset_transport_header(skb); skb_put(skb, len); rc = memcpy_from_msg(skb_transport_header(skb), msg, len); if (rc) goto out_kfree_skb; /* * If the Q BIT Include socket option is in force, the first * byte of the user data is the logical value of the Q Bit. */ if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { if (!pskb_may_pull(skb, 1)) goto out_kfree_skb; qbit = skb->data[0]; skb_pull(skb, 1); } /* * Push down the X.25 header */ net_dbg_ratelimited("x25_sendmsg: Building X.25 Header.\n"); if (msg->msg_flags & MSG_OOB) { if (x25->neighbour->extended) { asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_INTERRUPT; } else { asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_INTERRUPT; } } else { if (x25->neighbour->extended) { /* Build an Extended X.25 header */ asmptr = skb_push(skb, X25_EXT_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_DATA; *asmptr++ = X25_DATA; } else { /* Build an Standard X.25 header */ asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_DATA; } if (qbit) skb->data[0] |= X25_Q_BIT; } net_dbg_ratelimited("x25_sendmsg: Built header.\n"); net_dbg_ratelimited("x25_sendmsg: Transmitting buffer\n"); rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) goto out_kfree_skb; if (msg->msg_flags & MSG_OOB) skb_queue_tail(&x25->interrupt_out_queue, skb); else { rc = x25_output(sk, skb); len = rc; if (rc < 0) kfree_skb(skb); else if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) len++; } x25_kick(sk); rc = len; out: release_sock(sk); return rc; out_kfree_skb: kfree_skb(skb); goto out; } static int x25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); DECLARE_SOCKADDR(struct sockaddr_x25 *, sx25, msg->msg_name); size_t copied; int qbit, header_len; struct sk_buff *skb; unsigned char *asmptr; int rc = -ENOTCONN; lock_sock(sk); if (x25->neighbour == NULL) goto out; header_len = x25->neighbour->extended ? X25_EXT_MIN_LEN : X25_STD_MIN_LEN; /* * This works for seqpacket too. The receiver has ordered the queue for * us! We do one quick check first though */ if (sk->sk_state != TCP_ESTABLISHED) goto out; if (flags & MSG_OOB) { rc = -EINVAL; if (sock_flag(sk, SOCK_URGINLINE) || !skb_peek(&x25->interrupt_in_queue)) goto out; skb = skb_dequeue(&x25->interrupt_in_queue); if (!pskb_may_pull(skb, X25_STD_MIN_LEN)) goto out_free_dgram; skb_pull(skb, X25_STD_MIN_LEN); /* * No Q bit information on Interrupt data. */ if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { asmptr = skb_push(skb, 1); *asmptr = 0x00; } msg->msg_flags |= MSG_OOB; } else { /* Now we can treat all alike */ release_sock(sk); skb = skb_recv_datagram(sk, flags, &rc); lock_sock(sk); if (!skb) goto out; if (!pskb_may_pull(skb, header_len)) goto out_free_dgram; qbit = (skb->data[0] & X25_Q_BIT) == X25_Q_BIT; skb_pull(skb, header_len); if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { asmptr = skb_push(skb, 1); *asmptr = qbit; } } skb_reset_transport_header(skb); copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } /* Currently, each datagram always contains a complete record */ msg->msg_flags |= MSG_EOR; rc = skb_copy_datagram_msg(skb, 0, msg, copied); if (rc) goto out_free_dgram; if (sx25) { sx25->sx25_family = AF_X25; sx25->sx25_addr = x25->dest_addr; msg->msg_namelen = sizeof(*sx25); } x25_check_rbuf(sk); rc = copied; out_free_dgram: skb_free_datagram(sk, skb); out: release_sock(sk); return rc; } static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); void __user *argp = (void __user *)arg; int rc; switch (cmd) { case TIOCOUTQ: { int amount; amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (amount < 0) amount = 0; rc = put_user(amount, (unsigned int __user *)argp); break; } case TIOCINQ: { struct sk_buff *skb; int amount = 0; /* * These two are safe on a single CPU system as * only user tasks fiddle here */ lock_sock(sk); if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) amount = skb->len; release_sock(sk); rc = put_user(amount, (unsigned int __user *)argp); break; } case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: rc = -EINVAL; break; case SIOCADDRT: case SIOCDELRT: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_route_ioctl(cmd, argp); break; case SIOCX25GSUBSCRIP: rc = x25_subscr_ioctl(cmd, argp); break; case SIOCX25SSUBSCRIP: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_subscr_ioctl(cmd, argp); break; case SIOCX25GFACILITIES: { lock_sock(sk); rc = copy_to_user(argp, &x25->facilities, sizeof(x25->facilities)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SFACILITIES: { struct x25_facilities facilities; rc = -EFAULT; if (copy_from_user(&facilities, argp, sizeof(facilities))) break; rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_CLOSE) goto out_fac_release; if (facilities.pacsize_in < X25_PS16 || facilities.pacsize_in > X25_PS4096) goto out_fac_release; if (facilities.pacsize_out < X25_PS16 || facilities.pacsize_out > X25_PS4096) goto out_fac_release; if (facilities.winsize_in < 1 || facilities.winsize_in > 127) goto out_fac_release; if (facilities.throughput) { int out = facilities.throughput & 0xf0; int in = facilities.throughput & 0x0f; if (!out) facilities.throughput |= X25_DEFAULT_THROUGHPUT << 4; else if (out < 0x30 || out > 0xD0) goto out_fac_release; if (!in) facilities.throughput |= X25_DEFAULT_THROUGHPUT; else if (in < 0x03 || in > 0x0D) goto out_fac_release; } if (facilities.reverse && (facilities.reverse & 0x81) != 0x81) goto out_fac_release; x25->facilities = facilities; rc = 0; out_fac_release: release_sock(sk); break; } case SIOCX25GDTEFACILITIES: { lock_sock(sk); rc = copy_to_user(argp, &x25->dte_facilities, sizeof(x25->dte_facilities)); release_sock(sk); if (rc) rc = -EFAULT; break; } case SIOCX25SDTEFACILITIES: { struct x25_dte_facilities dtefacs; rc = -EFAULT; if (copy_from_user(&dtefacs, argp, sizeof(dtefacs))) break; rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_CLOSE) goto out_dtefac_release; if (dtefacs.calling_len > X25_MAX_AE_LEN) goto out_dtefac_release; if (dtefacs.called_len > X25_MAX_AE_LEN) goto out_dtefac_release; x25->dte_facilities = dtefacs; rc = 0; out_dtefac_release: release_sock(sk); break; } case SIOCX25GCALLUSERDATA: { lock_sock(sk); rc = copy_to_user(argp, &x25->calluserdata, sizeof(x25->calluserdata)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SCALLUSERDATA: { struct x25_calluserdata calluserdata; rc = -EFAULT; if (copy_from_user(&calluserdata, argp, sizeof(calluserdata))) break; rc = -EINVAL; if (calluserdata.cudlength > X25_MAX_CUD_LEN) break; lock_sock(sk); x25->calluserdata = calluserdata; release_sock(sk); rc = 0; break; } case SIOCX25GCAUSEDIAG: { lock_sock(sk); rc = copy_to_user(argp, &x25->causediag, sizeof(x25->causediag)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SCAUSEDIAG: { struct x25_causediag causediag; rc = -EFAULT; if (copy_from_user(&causediag, argp, sizeof(causediag))) break; lock_sock(sk); x25->causediag = causediag; release_sock(sk); rc = 0; break; } case SIOCX25SCUDMATCHLEN: { struct x25_subaddr sub_addr; rc = -EINVAL; lock_sock(sk); if(sk->sk_state != TCP_CLOSE) goto out_cud_release; rc = -EFAULT; if (copy_from_user(&sub_addr, argp, sizeof(sub_addr))) goto out_cud_release; rc = -EINVAL; if (sub_addr.cudmatchlength > X25_MAX_CUD_LEN) goto out_cud_release; x25->cudmatchlength = sub_addr.cudmatchlength; rc = 0; out_cud_release: release_sock(sk); break; } case SIOCX25CALLACCPTAPPRV: { rc = -EINVAL; lock_sock(sk); if (sk->sk_state == TCP_CLOSE) { clear_bit(X25_ACCPT_APPRV_FLAG, &x25->flags); rc = 0; } release_sock(sk); break; } case SIOCX25SENDCALLACCPT: { rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_ESTABLISHED) goto out_sendcallaccpt_release; /* must call accptapprv above */ if (test_bit(X25_ACCPT_APPRV_FLAG, &x25->flags)) goto out_sendcallaccpt_release; x25_write_internal(sk, X25_CALL_ACCEPTED); x25->state = X25_STATE_3; rc = 0; out_sendcallaccpt_release: release_sock(sk); break; } default: rc = -ENOIOCTLCMD; break; } return rc; } static const struct net_proto_family x25_family_ops = { .family = AF_X25, .create = x25_create, .owner = THIS_MODULE, }; #ifdef CONFIG_COMPAT static int compat_x25_subscr_ioctl(unsigned int cmd, struct compat_x25_subscrip_struct __user *x25_subscr32) { struct compat_x25_subscrip_struct x25_subscr; struct x25_neigh *nb; struct net_device *dev; int rc = -EINVAL; rc = -EFAULT; if (copy_from_user(&x25_subscr, x25_subscr32, sizeof(*x25_subscr32))) goto out; rc = -EINVAL; dev = x25_dev_get(x25_subscr.device); if (dev == NULL) goto out; nb = x25_get_neigh(dev); if (nb == NULL) goto out_dev_put; dev_put(dev); if (cmd == SIOCX25GSUBSCRIP) { read_lock_bh(&x25_neigh_list_lock); x25_subscr.extended = nb->extended; x25_subscr.global_facil_mask = nb->global_facil_mask; read_unlock_bh(&x25_neigh_list_lock); rc = copy_to_user(x25_subscr32, &x25_subscr, sizeof(*x25_subscr32)) ? -EFAULT : 0; } else { rc = -EINVAL; if (x25_subscr.extended == 0 || x25_subscr.extended == 1) { rc = 0; write_lock_bh(&x25_neigh_list_lock); nb->extended = x25_subscr.extended; nb->global_facil_mask = x25_subscr.global_facil_mask; write_unlock_bh(&x25_neigh_list_lock); } } x25_neigh_put(nb); out: return rc; out_dev_put: dev_put(dev); goto out; } static int compat_x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); int rc = -ENOIOCTLCMD; switch(cmd) { case TIOCOUTQ: case TIOCINQ: rc = x25_ioctl(sock, cmd, (unsigned long)argp); break; case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: rc = -EINVAL; break; case SIOCADDRT: case SIOCDELRT: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_route_ioctl(cmd, argp); break; case SIOCX25GSUBSCRIP: rc = compat_x25_subscr_ioctl(cmd, argp); break; case SIOCX25SSUBSCRIP: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = compat_x25_subscr_ioctl(cmd, argp); break; case SIOCX25GFACILITIES: case SIOCX25SFACILITIES: case SIOCX25GDTEFACILITIES: case SIOCX25SDTEFACILITIES: case SIOCX25GCALLUSERDATA: case SIOCX25SCALLUSERDATA: case SIOCX25GCAUSEDIAG: case SIOCX25SCAUSEDIAG: case SIOCX25SCUDMATCHLEN: case SIOCX25CALLACCPTAPPRV: case SIOCX25SENDCALLACCPT: rc = x25_ioctl(sock, cmd, (unsigned long)argp); break; default: rc = -ENOIOCTLCMD; break; } return rc; } #endif static const struct proto_ops x25_proto_ops = { .family = AF_X25, .owner = THIS_MODULE, .release = x25_release, .bind = x25_bind, .connect = x25_connect, .socketpair = sock_no_socketpair, .accept = x25_accept, .getname = x25_getname, .poll = datagram_poll, .ioctl = x25_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_x25_ioctl, #endif .gettstamp = sock_gettstamp, .listen = x25_listen, .shutdown = sock_no_shutdown, .setsockopt = x25_setsockopt, .getsockopt = x25_getsockopt, .sendmsg = x25_sendmsg, .recvmsg = x25_recvmsg, .mmap = sock_no_mmap, }; static struct packet_type x25_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_X25), .func = x25_lapb_receive_frame, }; static struct notifier_block x25_dev_notifier = { .notifier_call = x25_device_event, }; void x25_kill_by_neigh(struct x25_neigh *nb) { struct sock *s; write_lock_bh(&x25_list_lock); sk_for_each(s, &x25_list) { if (x25_sk(s)->neighbour == nb) { write_unlock_bh(&x25_list_lock); lock_sock(s); x25_disconnect(s, ENETUNREACH, 0, 0); release_sock(s); write_lock_bh(&x25_list_lock); } } write_unlock_bh(&x25_list_lock); /* Remove any related forwards */ x25_clear_forward_by_dev(nb->dev); } static int __init x25_init(void) { int rc; rc = proto_register(&x25_proto, 0); if (rc) goto out; rc = sock_register(&x25_family_ops); if (rc) goto out_proto; dev_add_pack(&x25_packet_type); rc = register_netdevice_notifier(&x25_dev_notifier); if (rc) goto out_sock; rc = x25_register_sysctl(); if (rc) goto out_dev; rc = x25_proc_init(); if (rc) goto out_sysctl; pr_info("Linux Version 0.2\n"); out: return rc; out_sysctl: x25_unregister_sysctl(); out_dev: unregister_netdevice_notifier(&x25_dev_notifier); out_sock: dev_remove_pack(&x25_packet_type); sock_unregister(AF_X25); out_proto: proto_unregister(&x25_proto); goto out; } module_init(x25_init); static void __exit x25_exit(void) { x25_proc_exit(); x25_link_free(); x25_route_free(); x25_unregister_sysctl(); unregister_netdevice_notifier(&x25_dev_notifier); dev_remove_pack(&x25_packet_type); sock_unregister(AF_X25); proto_unregister(&x25_proto); } module_exit(x25_exit); MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The X.25 Packet Layer network layer protocol"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_X25);
2271 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 // SPDX-License-Identifier: GPL-2.0 /* * ACPI support * * Copyright (C) 2020, Intel Corporation * Author: Mika Westerberg <mika.westerberg@linux.intel.com> */ #include <linux/acpi.h> #include <linux/pm_runtime.h> #include "tb.h" static acpi_status tb_acpi_add_link(acpi_handle handle, u32 level, void *data, void **ret) { struct acpi_device *adev = acpi_fetch_acpi_dev(handle); struct fwnode_handle *fwnode; struct tb_nhi *nhi = data; struct pci_dev *pdev; struct device *dev; if (!adev) return AE_OK; fwnode = fwnode_find_reference(acpi_fwnode_handle(adev), "usb4-host-interface", 0); if (IS_ERR(fwnode)) return AE_OK; /* It needs to reference this NHI */ if (dev_fwnode(&nhi->pdev->dev) != fwnode) goto out_put; /* * Try to find physical device walking upwards to the hierarcy. * We need to do this because the xHCI driver might not yet be * bound so the USB3 SuperSpeed ports are not yet created. */ do { dev = acpi_get_first_physical_node(adev); if (dev) break; adev = acpi_dev_parent(adev); } while (adev); /* * Check that the device is PCIe. This is because USB3 * SuperSpeed ports have this property and they are not power * managed with the xHCI and the SuperSpeed hub so we create the * link from xHCI instead. */ while (dev && !dev_is_pci(dev)) dev = dev->parent; if (!dev) goto out_put; /* * Check that this actually matches the type of device we * expect. It should either be xHCI or PCIe root/downstream * port. */ pdev = to_pci_dev(dev); if (pdev->class == PCI_CLASS_SERIAL_USB_XHCI || (pci_is_pcie(pdev) && (pci_pcie_type(pdev) == PCI_EXP_TYPE_ROOT_PORT || pci_pcie_type(pdev) == PCI_EXP_TYPE_DOWNSTREAM))) { const struct device_link *link; /* * Make them both active first to make sure the NHI does * not runtime suspend before the consumer. The * pm_runtime_put() below then allows the consumer to * runtime suspend again (which then allows NHI runtime * suspend too now that the device link is established). */ pm_runtime_get_sync(&pdev->dev); link = device_link_add(&pdev->dev, &nhi->pdev->dev, DL_FLAG_AUTOREMOVE_SUPPLIER | DL_FLAG_RPM_ACTIVE | DL_FLAG_PM_RUNTIME); if (link) { dev_dbg(&nhi->pdev->dev, "created link from %s\n", dev_name(&pdev->dev)); *(bool *)ret = true; } else { dev_warn(&nhi->pdev->dev, "device link creation from %s failed\n", dev_name(&pdev->dev)); } pm_runtime_put(&pdev->dev); } out_put: fwnode_handle_put(fwnode); return AE_OK; } /** * tb_acpi_add_links() - Add device links based on ACPI description * @nhi: Pointer to NHI * * Goes over ACPI namespace finding tunneled ports that reference to * @nhi ACPI node. For each reference a device link is added. The link * is automatically removed by the driver core. * * Returns %true if at least one link was created. */ bool tb_acpi_add_links(struct tb_nhi *nhi) { acpi_status status; bool ret = false; if (!has_acpi_companion(&nhi->pdev->dev)) return false; /* * Find all devices that have usb4-host-controller interface * property that references to this NHI. */ status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, 32, tb_acpi_add_link, NULL, nhi, (void **)&ret); if (ACPI_FAILURE(status)) { dev_warn(&nhi->pdev->dev, "failed to enumerate tunneled ports\n"); return false; } return ret; } /** * tb_acpi_is_native() - Did the platform grant native TBT/USB4 control * * Returns %true if the platform granted OS native control over * TBT/USB4. In this case software based connection manager can be used, * otherwise there is firmware based connection manager running. */ bool tb_acpi_is_native(void) { return osc_sb_native_usb4_support_confirmed && osc_sb_native_usb4_control; } /** * tb_acpi_may_tunnel_usb3() - Is USB3 tunneling allowed by the platform * * When software based connection manager is used, this function * returns %true if platform allows native USB3 tunneling. */ bool tb_acpi_may_tunnel_usb3(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_USB3_TUNNELING; return true; } /** * tb_acpi_may_tunnel_dp() - Is DisplayPort tunneling allowed by the platform * * When software based connection manager is used, this function * returns %true if platform allows native DP tunneling. */ bool tb_acpi_may_tunnel_dp(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_DP_TUNNELING; return true; } /** * tb_acpi_may_tunnel_pcie() - Is PCIe tunneling allowed by the platform * * When software based connection manager is used, this function * returns %true if platform allows native PCIe tunneling. */ bool tb_acpi_may_tunnel_pcie(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_PCIE_TUNNELING; return true; } /** * tb_acpi_is_xdomain_allowed() - Are XDomain connections allowed * * When software based connection manager is used, this function * returns %true if platform allows XDomain connections. */ bool tb_acpi_is_xdomain_allowed(void) { if (tb_acpi_is_native()) return osc_sb_native_usb4_control & OSC_USB_XDOMAIN; return true; } /* UUID for retimer _DSM: e0053122-795b-4122-8a5e-57be1d26acb3 */ static const guid_t retimer_dsm_guid = GUID_INIT(0xe0053122, 0x795b, 0x4122, 0x8a, 0x5e, 0x57, 0xbe, 0x1d, 0x26, 0xac, 0xb3); #define RETIMER_DSM_QUERY_ONLINE_STATE 1 #define RETIMER_DSM_SET_ONLINE_STATE 2 static int tb_acpi_retimer_set_power(struct tb_port *port, bool power) { struct usb4_port *usb4 = port->usb4; union acpi_object argv4[2]; struct acpi_device *adev; union acpi_object *obj; int ret; if (!usb4->can_offline) return 0; adev = ACPI_COMPANION(&usb4->dev); if (WARN_ON(!adev)) return 0; /* Check if we are already powered on (and in correct mode) */ obj = acpi_evaluate_dsm_typed(adev->handle, &retimer_dsm_guid, 1, RETIMER_DSM_QUERY_ONLINE_STATE, NULL, ACPI_TYPE_INTEGER); if (!obj) { tb_port_warn(port, "ACPI: query online _DSM failed\n"); return -EIO; } ret = obj->integer.value; ACPI_FREE(obj); if (power == ret) return 0; tb_port_dbg(port, "ACPI: calling _DSM to power %s retimers\n", power ? "on" : "off"); argv4[0].type = ACPI_TYPE_PACKAGE; argv4[0].package.count = 1; argv4[0].package.elements = &argv4[1]; argv4[1].integer.type = ACPI_TYPE_INTEGER; argv4[1].integer.value = power; obj = acpi_evaluate_dsm_typed(adev->handle, &retimer_dsm_guid, 1, RETIMER_DSM_SET_ONLINE_STATE, argv4, ACPI_TYPE_INTEGER); if (!obj) { tb_port_warn(port, "ACPI: set online state _DSM evaluation failed\n"); return -EIO; } ret = obj->integer.value; ACPI_FREE(obj); if (ret >= 0) { if (power) return ret == 1 ? 0 : -EBUSY; return 0; } tb_port_warn(port, "ACPI: set online state _DSM failed with error %d\n", ret); return -EIO; } /** * tb_acpi_power_on_retimers() - Call platform to power on retimers * @port: USB4 port * * Calls platform to turn on power to all retimers behind this USB4 * port. After this function returns successfully the caller can * continue with the normal retimer flows (as specified in the USB4 * spec). Note if this returns %-EBUSY it means the type-C port is in * non-USB4/TBT mode (there is non-USB4/TBT device connected). * * This should only be called if the USB4/TBT link is not up. * * Returns %0 on success. */ int tb_acpi_power_on_retimers(struct tb_port *port) { return tb_acpi_retimer_set_power(port, true); } /** * tb_acpi_power_off_retimers() - Call platform to power off retimers * @port: USB4 port * * This is the opposite of tb_acpi_power_on_retimers(). After returning * successfully the normal operations with the @port can continue. * * Returns %0 on success. */ int tb_acpi_power_off_retimers(struct tb_port *port) { return tb_acpi_retimer_set_power(port, false); } static bool tb_acpi_bus_match(struct device *dev) { return tb_is_switch(dev) || tb_is_usb4_port_device(dev); } static struct acpi_device *tb_acpi_switch_find_companion(struct tb_switch *sw) { struct tb_switch *parent_sw = tb_switch_parent(sw); struct acpi_device *adev = NULL; /* * Device routers exists under the downstream facing USB4 port * of the parent router. Their _ADR is always 0. */ if (parent_sw) { struct tb_port *port = tb_switch_downstream_port(sw); struct acpi_device *port_adev; port_adev = acpi_find_child_by_adr(ACPI_COMPANION(&parent_sw->dev), port->port); if (port_adev) adev = acpi_find_child_device(port_adev, 0, false); } else { struct tb_nhi *nhi = sw->tb->nhi; struct acpi_device *parent_adev; parent_adev = ACPI_COMPANION(&nhi->pdev->dev); if (parent_adev) adev = acpi_find_child_device(parent_adev, 0, false); } return adev; } static struct acpi_device *tb_acpi_find_companion(struct device *dev) { /* * The Thunderbolt/USB4 hierarchy looks like following: * * Device (NHI) * Device (HR) // Host router _ADR == 0 * Device (DFP0) // Downstream port _ADR == lane 0 adapter * Device (DR) // Device router _ADR == 0 * Device (UFP) // Upstream port _ADR == lane 0 adapter * Device (DFP1) // Downstream port _ADR == lane 0 adapter number * * At the moment we bind the host router to the corresponding * Linux device. */ if (tb_is_switch(dev)) return tb_acpi_switch_find_companion(tb_to_switch(dev)); if (tb_is_usb4_port_device(dev)) return acpi_find_child_by_adr(ACPI_COMPANION(dev->parent), tb_to_usb4_port_device(dev)->port->port); return NULL; } static void tb_acpi_setup(struct device *dev) { struct acpi_device *adev = ACPI_COMPANION(dev); struct usb4_port *usb4 = tb_to_usb4_port_device(dev); if (!adev || !usb4) return; if (acpi_check_dsm(adev->handle, &retimer_dsm_guid, 1, BIT(RETIMER_DSM_QUERY_ONLINE_STATE) | BIT(RETIMER_DSM_SET_ONLINE_STATE))) usb4->can_offline = true; } static struct acpi_bus_type tb_acpi_bus = { .name = "thunderbolt", .match = tb_acpi_bus_match, .find_companion = tb_acpi_find_companion, .setup = tb_acpi_setup, }; int tb_acpi_init(void) { return register_acpi_bus_type(&tb_acpi_bus); } void tb_acpi_exit(void) { unregister_acpi_bus_type(&tb_acpi_bus); }
876 335 1181 1193 1170 863 325 47 41 41 42 41 149 150 143 113 16 16 339 339 2 44 66 6 60 34 49 12 1 47 2 3 12 46 17 29 6 29 6 33 35 23 47 8 11 44 53 119 4 5 1 86 17 16 2 11 68 63 45 93 90 13 61 7 8 19 45 49 83 69 18 83 72 36 22 5 9 346 287 213 206 292 287 212 208 292 44 81 61 50 48 63 40 72 80 16 7 9 8 9 3 3 980 980 980 979 1 978 981 44 36 8 44 8 40 36 8 41 4 52 53 44 9 53 54 1 1 51 1 54 3 9 42 28 26 44 44 41 41 9 9 8 7 13 10 13 12 12 35 17 2 7 12 7 2 7 14 5 8 8 3 2 8 1 2 1 2 8 3 15 26 1 1 9 2 11 14 3 21 1 18 10 24 18 6 3 2 21 1276 1143 367 35 2 28 5 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/pipe.c * * Copyright (C) 1991, 1992, 1999 Linus Torvalds */ #include <linux/mm.h> #include <linux/file.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/log2.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/magic.h> #include <linux/pipe_fs_i.h> #include <linux/uio.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/audit.h> #include <linux/syscalls.h> #include <linux/fcntl.h> #include <linux/memcontrol.h> #include <linux/watch_queue.h> #include <linux/sysctl.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include "internal.h" /* * New pipe buffers will be restricted to this size while the user is exceeding * their pipe buffer quota. The general pipe use case needs at least two * buffers: one for data yet to be read, and one for new data. If this is less * than two, then a write to a non-empty pipe may block even if the pipe is not * full. This can occur with GNU make jobserver or similar uses of pipes as * semaphores: multiple processes may be waiting to write tokens back to the * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/. * * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their * own risk, namely: pipe writes to non-full pipes may block until the pipe is * emptied. */ #define PIPE_MIN_DEF_BUFFERS 2 /* * The max size that a non-root user is allowed to grow the pipe. Can * be set by root in /proc/sys/fs/pipe-max-size */ static unsigned int pipe_max_size = 1048576; /* Maximum allocatable pages per user. Hard limit is unset by default, soft * matches default values. */ static unsigned long pipe_user_pages_hard; static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; /* * We use head and tail indices that aren't masked off, except at the point of * dereference, but rather they're allowed to wrap naturally. This means there * isn't a dead spot in the buffer, but the ring has to be a power of two and * <= 2^31. * -- David Howells 2019-09-23. * * Reads with count = 0 should always return 0. * -- Julian Bradfield 1999-06-07. * * FIFOs and Pipes now generate SIGIO for both readers and writers. * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16 * * pipe_read & write cleanup * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 */ static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) { if (pipe->files) mutex_lock_nested(&pipe->mutex, subclass); } void pipe_lock(struct pipe_inode_info *pipe) { /* * pipe_lock() nests non-pipe inode locks (for writing to a file) */ pipe_lock_nested(pipe, I_MUTEX_PARENT); } EXPORT_SYMBOL(pipe_lock); void pipe_unlock(struct pipe_inode_info *pipe) { if (pipe->files) mutex_unlock(&pipe->mutex); } EXPORT_SYMBOL(pipe_unlock); static inline void __pipe_lock(struct pipe_inode_info *pipe) { mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT); } static inline void __pipe_unlock(struct pipe_inode_info *pipe) { mutex_unlock(&pipe->mutex); } void pipe_double_lock(struct pipe_inode_info *pipe1, struct pipe_inode_info *pipe2) { BUG_ON(pipe1 == pipe2); if (pipe1 < pipe2) { pipe_lock_nested(pipe1, I_MUTEX_PARENT); pipe_lock_nested(pipe2, I_MUTEX_CHILD); } else { pipe_lock_nested(pipe2, I_MUTEX_PARENT); pipe_lock_nested(pipe1, I_MUTEX_CHILD); } } static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; /* * If nobody else uses this page, and we don't already have a * temporary page, let's keep track of it as a one-deep * allocation cache. (Otherwise just release our reference to it) */ if (page_count(page) == 1 && !pipe->tmp_page) pipe->tmp_page = page; else put_page(page); } static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; if (page_count(page) != 1) return false; memcg_kmem_uncharge_page(page, 0); __SetPageLocked(page); return true; } /** * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to attempt to steal * * Description: * This function attempts to steal the &struct page attached to * @buf. If successful, this function returns 0 and returns with * the page locked. The caller may then reuse the page for whatever * he wishes; the typical use is insertion into a different file * page cache. */ bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; /* * A reference of one is golden, that means that the owner of this * page is the only one holding a reference to it. lock the page * and return OK. */ if (page_count(page) == 1) { lock_page(page); return true; } return false; } EXPORT_SYMBOL(generic_pipe_buf_try_steal); /** * generic_pipe_buf_get - get a reference to a &struct pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to get a reference to * * Description: * This function grabs an extra reference to @buf. It's used in * the tee() system call, when we duplicate the buffers in one * pipe into another. */ bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return try_get_page(buf->page); } EXPORT_SYMBOL(generic_pipe_buf_get); /** * generic_pipe_buf_release - put a reference to a &struct pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to put a reference to * * Description: * This function releases a reference to @buf. */ void generic_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { put_page(buf->page); } EXPORT_SYMBOL(generic_pipe_buf_release); static const struct pipe_buf_operations anon_pipe_buf_ops = { .release = anon_pipe_buf_release, .try_steal = anon_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ static inline bool pipe_readable(const struct pipe_inode_info *pipe) { unsigned int head = READ_ONCE(pipe->head); unsigned int tail = READ_ONCE(pipe->tail); unsigned int writers = READ_ONCE(pipe->writers); return !pipe_empty(head, tail) || !writers; } static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe, struct pipe_buffer *buf, unsigned int tail) { pipe_buf_release(pipe, buf); /* * If the pipe has a watch_queue, we need additional protection * by the spinlock because notifications get posted with only * this spinlock, no mutex */ if (pipe_has_watch_queue(pipe)) { spin_lock_irq(&pipe->rd_wait.lock); #ifdef CONFIG_WATCH_QUEUE if (buf->flags & PIPE_BUF_FLAG_LOSS) pipe->note_loss = true; #endif pipe->tail = ++tail; spin_unlock_irq(&pipe->rd_wait.lock); return tail; } /* * Without a watch_queue, we can simply increment the tail * without the spinlock - the mutex is enough. */ pipe->tail = ++tail; return tail; } static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *to) { size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; bool was_full, wake_next_reader = false; ssize_t ret; /* Null read succeeds. */ if (unlikely(total_len == 0)) return 0; ret = 0; __pipe_lock(pipe); /* * We only wake up writers if the pipe was full when we started * reading in order to avoid unnecessary wakeups. * * But when we do wake up writers, we do so using a sync wakeup * (WF_SYNC), because we want them to get going and generate more * data for us. */ was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); for (;;) { /* Read ->head with a barrier vs post_one_notification() */ unsigned int head = smp_load_acquire(&pipe->head); unsigned int tail = pipe->tail; unsigned int mask = pipe->ring_size - 1; #ifdef CONFIG_WATCH_QUEUE if (pipe->note_loss) { struct watch_notification n; if (total_len < 8) { if (ret == 0) ret = -ENOBUFS; break; } n.type = WATCH_TYPE_META; n.subtype = WATCH_META_LOSS_NOTIFICATION; n.info = watch_sizeof(n); if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) { if (ret == 0) ret = -EFAULT; break; } ret += sizeof(n); total_len -= sizeof(n); pipe->note_loss = false; } #endif if (!pipe_empty(head, tail)) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t chars = buf->len; size_t written; int error; if (chars > total_len) { if (buf->flags & PIPE_BUF_FLAG_WHOLE) { if (ret == 0) ret = -ENOBUFS; break; } chars = total_len; } error = pipe_buf_confirm(pipe, buf); if (error) { if (!ret) ret = error; break; } written = copy_page_to_iter(buf->page, buf->offset, chars, to); if (unlikely(written < chars)) { if (!ret) ret = -EFAULT; break; } ret += chars; buf->offset += chars; buf->len -= chars; /* Was it a packet buffer? Clean up and exit */ if (buf->flags & PIPE_BUF_FLAG_PACKET) { total_len = chars; buf->len = 0; } if (!buf->len) tail = pipe_update_tail(pipe, buf, tail); total_len -= chars; if (!total_len) break; /* common path: read succeeded */ if (!pipe_empty(head, tail)) /* More to do? */ continue; } if (!pipe->writers) break; if (ret) break; if ((filp->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) { ret = -EAGAIN; break; } __pipe_unlock(pipe); /* * We only get here if we didn't actually read anything. * * However, we could have seen (and removed) a zero-sized * pipe buffer, and might have made space in the buffers * that way. * * You can't make zero-sized pipe buffers by doing an empty * write (not even in packet mode), but they can happen if * the writer gets an EFAULT when trying to fill a buffer * that already got allocated and inserted in the buffer * array. * * So we still need to wake up any pending writers in the * _very_ unlikely case that the pipe was full, but we got * no data. */ if (unlikely(was_full)) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); /* * But because we didn't read anything, at this point we can * just return directly with -ERESTARTSYS if we're interrupted, * since we've done any required wakeups and there's no need * to mark anything accessed. And we've dropped the lock. */ if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; __pipe_lock(pipe); was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); wake_next_reader = true; } if (pipe_empty(pipe->head, pipe->tail)) wake_next_reader = false; __pipe_unlock(pipe); if (was_full) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); if (wake_next_reader) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); if (ret > 0) file_accessed(filp); return ret; } static inline int is_packetized(struct file *file) { return (file->f_flags & O_DIRECT) != 0; } /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ static inline bool pipe_writable(const struct pipe_inode_info *pipe) { unsigned int head = READ_ONCE(pipe->head); unsigned int tail = READ_ONCE(pipe->tail); unsigned int max_usage = READ_ONCE(pipe->max_usage); return !pipe_full(head, tail, max_usage) || !READ_ONCE(pipe->readers); } static ssize_t pipe_write(struct kiocb *iocb, struct iov_iter *from) { struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; unsigned int head; ssize_t ret = 0; size_t total_len = iov_iter_count(from); ssize_t chars; bool was_empty = false; bool wake_next_writer = false; /* * Reject writing to watch queue pipes before the point where we lock * the pipe. * Otherwise, lockdep would be unhappy if the caller already has another * pipe locked. * If we had to support locking a normal pipe and a notification pipe at * the same time, we could set up lockdep annotations for that, but * since we don't actually need that, it's simpler to just bail here. */ if (pipe_has_watch_queue(pipe)) return -EXDEV; /* Null write succeeds. */ if (unlikely(total_len == 0)) return 0; __pipe_lock(pipe); if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } /* * If it wasn't empty we try to merge new data into * the last buffer. * * That naturally merges small writes, but it also * page-aligns the rest of the writes for large writes * spanning multiple pages. */ head = pipe->head; was_empty = pipe_empty(head, pipe->tail); chars = total_len & (PAGE_SIZE-1); if (chars && !was_empty) { unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; int offset = buf->offset + buf->len; if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) && offset + chars <= PAGE_SIZE) { ret = pipe_buf_confirm(pipe, buf); if (ret) goto out; ret = copy_page_from_iter(buf->page, offset, chars, from); if (unlikely(ret < chars)) { ret = -EFAULT; goto out; } buf->len += ret; if (!iov_iter_count(from)) goto out; } } for (;;) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } head = pipe->head; if (!pipe_full(head, pipe->tail, pipe->max_usage)) { unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf; struct page *page = pipe->tmp_page; int copied; if (!page) { page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); if (unlikely(!page)) { ret = ret ? : -ENOMEM; break; } pipe->tmp_page = page; } /* Allocate a slot in the ring in advance and attach an * empty buffer. If we fault or otherwise fail to use * it, either the reader will consume it or it'll still * be there for the next write. */ pipe->head = head + 1; /* Insert it into the buffer array */ buf = &pipe->bufs[head & mask]; buf->page = page; buf->ops = &anon_pipe_buf_ops; buf->offset = 0; buf->len = 0; if (is_packetized(filp)) buf->flags = PIPE_BUF_FLAG_PACKET; else buf->flags = PIPE_BUF_FLAG_CAN_MERGE; pipe->tmp_page = NULL; copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { if (!ret) ret = -EFAULT; break; } ret += copied; buf->len = copied; if (!iov_iter_count(from)) break; } if (!pipe_full(head, pipe->tail, pipe->max_usage)) continue; /* Wait for buffer space to become available. */ if ((filp->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) { if (!ret) ret = -EAGAIN; break; } if (signal_pending(current)) { if (!ret) ret = -ERESTARTSYS; break; } /* * We're going to release the pipe lock and wait for more * space. We wake up any readers if necessary, and then * after waiting we need to re-check whether the pipe * become empty while we dropped the lock. */ __pipe_unlock(pipe); if (was_empty) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe)); __pipe_lock(pipe); was_empty = pipe_empty(pipe->head, pipe->tail); wake_next_writer = true; } out: if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) wake_next_writer = false; __pipe_unlock(pipe); /* * If we do do a wakeup event, we do a 'sync' wakeup, because we * want the reader to start processing things asap, rather than * leave the data pending. * * This is particularly important for small writes, because of * how (for example) the GNU make jobserver uses small writes to * wake up pending jobs * * Epoll nonsensically wants a wakeup whether the pipe * was already empty or not. */ if (was_empty || pipe->poll_usage) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); if (wake_next_writer) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { int err = file_update_time(filp); if (err) ret = err; sb_end_write(file_inode(filp)->i_sb); } return ret; } static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe = filp->private_data; unsigned int count, head, tail, mask; switch (cmd) { case FIONREAD: __pipe_lock(pipe); count = 0; head = pipe->head; tail = pipe->tail; mask = pipe->ring_size - 1; while (tail != head) { count += pipe->bufs[tail & mask].len; tail++; } __pipe_unlock(pipe); return put_user(count, (int __user *)arg); #ifdef CONFIG_WATCH_QUEUE case IOC_WATCH_QUEUE_SET_SIZE: { int ret; __pipe_lock(pipe); ret = watch_queue_set_size(pipe, arg); __pipe_unlock(pipe); return ret; } case IOC_WATCH_QUEUE_SET_FILTER: return watch_queue_set_filter( pipe, (struct watch_notification_filter __user *)arg); #endif default: return -ENOIOCTLCMD; } } /* No kernel lock held - fine */ static __poll_t pipe_poll(struct file *filp, poll_table *wait) { __poll_t mask; struct pipe_inode_info *pipe = filp->private_data; unsigned int head, tail; /* Epoll has some historical nasty semantics, this enables them */ WRITE_ONCE(pipe->poll_usage, true); /* * Reading pipe state only -- no need for acquiring the semaphore. * * But because this is racy, the code has to add the * entry to the poll table _first_ .. */ if (filp->f_mode & FMODE_READ) poll_wait(filp, &pipe->rd_wait, wait); if (filp->f_mode & FMODE_WRITE) poll_wait(filp, &pipe->wr_wait, wait); /* * .. and only then can you do the racy tests. That way, * if something changes and you got it wrong, the poll * table entry will wake you up and fix it. */ head = READ_ONCE(pipe->head); tail = READ_ONCE(pipe->tail); mask = 0; if (filp->f_mode & FMODE_READ) { if (!pipe_empty(head, tail)) mask |= EPOLLIN | EPOLLRDNORM; if (!pipe->writers && filp->f_version != pipe->w_counter) mask |= EPOLLHUP; } if (filp->f_mode & FMODE_WRITE) { if (!pipe_full(head, tail, pipe->max_usage)) mask |= EPOLLOUT | EPOLLWRNORM; /* * Most Unices do not set EPOLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). */ if (!pipe->readers) mask |= EPOLLERR; } return mask; } static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe) { int kill = 0; spin_lock(&inode->i_lock); if (!--pipe->files) { inode->i_pipe = NULL; kill = 1; } spin_unlock(&inode->i_lock); if (kill) free_pipe_info(pipe); } static int pipe_release(struct inode *inode, struct file *file) { struct pipe_inode_info *pipe = file->private_data; __pipe_lock(pipe); if (file->f_mode & FMODE_READ) pipe->readers--; if (file->f_mode & FMODE_WRITE) pipe->writers--; /* Was that the last reader or writer, but not the other side? */ if (!pipe->readers != !pipe->writers) { wake_up_interruptible_all(&pipe->rd_wait); wake_up_interruptible_all(&pipe->wr_wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } __pipe_unlock(pipe); put_pipe_info(inode, pipe); return 0; } static int pipe_fasync(int fd, struct file *filp, int on) { struct pipe_inode_info *pipe = filp->private_data; int retval = 0; __pipe_lock(pipe); if (filp->f_mode & FMODE_READ) retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); if ((filp->f_mode & FMODE_WRITE) && retval >= 0) { retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); if (retval < 0 && (filp->f_mode & FMODE_READ)) /* this can happen only if on == T */ fasync_helper(-1, filp, 0, &pipe->fasync_readers); } __pipe_unlock(pipe); return retval; } unsigned long account_pipe_buffers(struct user_struct *user, unsigned long old, unsigned long new) { return atomic_long_add_return(new - old, &user->pipe_bufs); } bool too_many_pipe_buffers_soft(unsigned long user_bufs) { unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft); return soft_limit && user_bufs > soft_limit; } bool too_many_pipe_buffers_hard(unsigned long user_bufs) { unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard); return hard_limit && user_bufs > hard_limit; } bool pipe_is_unprivileged_user(void) { return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); } struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; unsigned long pipe_bufs = PIPE_DEF_BUFFERS; struct user_struct *user = get_current_user(); unsigned long user_bufs; unsigned int max_size = READ_ONCE(pipe_max_size); pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT); if (pipe == NULL) goto out_free_uid; if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE)) pipe_bufs = max_size >> PAGE_SHIFT; user_bufs = account_pipe_buffers(user, 0, pipe_bufs); if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) { user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS); pipe_bufs = PIPE_MIN_DEF_BUFFERS; } if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user()) goto out_revert_acct; pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), GFP_KERNEL_ACCOUNT); if (pipe->bufs) { init_waitqueue_head(&pipe->rd_wait); init_waitqueue_head(&pipe->wr_wait); pipe->r_counter = pipe->w_counter = 1; pipe->max_usage = pipe_bufs; pipe->ring_size = pipe_bufs; pipe->nr_accounted = pipe_bufs; pipe->user = user; mutex_init(&pipe->mutex); return pipe; } out_revert_acct: (void) account_pipe_buffers(user, pipe_bufs, 0); kfree(pipe); out_free_uid: free_uid(user); return NULL; } void free_pipe_info(struct pipe_inode_info *pipe) { unsigned int i; #ifdef CONFIG_WATCH_QUEUE if (pipe->watch_queue) watch_queue_clear(pipe->watch_queue); #endif (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0); free_uid(pipe->user); for (i = 0; i < pipe->ring_size; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) pipe_buf_release(pipe, buf); } #ifdef CONFIG_WATCH_QUEUE if (pipe->watch_queue) put_watch_queue(pipe->watch_queue); #endif if (pipe->tmp_page) __free_page(pipe->tmp_page); kfree(pipe->bufs); kfree(pipe); } static struct vfsmount *pipe_mnt __ro_after_init; /* * pipefs_dname() is called from d_path(). */ static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(buffer, buflen, "pipe:[%lu]", d_inode(dentry)->i_ino); } static const struct dentry_operations pipefs_dentry_operations = { .d_dname = pipefs_dname, }; static struct inode * get_pipe_inode(void) { struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); struct pipe_inode_info *pipe; if (!inode) goto fail_inode; inode->i_ino = get_next_ino(); pipe = alloc_pipe_info(); if (!pipe) goto fail_iput; inode->i_pipe = pipe; pipe->files = 2; pipe->readers = pipe->writers = 1; inode->i_fop = &pipefifo_fops; /* * Mark the inode dirty from the very beginning, * that way it will never be moved to the dirty * list because "mark_inode_dirty()" will think * that it already _is_ on the dirty list. */ inode->i_state = I_DIRTY; inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); simple_inode_init_ts(inode); return inode; fail_iput: iput(inode); fail_inode: return NULL; } int create_pipe_files(struct file **res, int flags) { struct inode *inode = get_pipe_inode(); struct file *f; int error; if (!inode) return -ENFILE; if (flags & O_NOTIFICATION_PIPE) { error = watch_queue_init(inode->i_pipe); if (error) { free_pipe_info(inode->i_pipe); iput(inode); return error; } } f = alloc_file_pseudo(inode, pipe_mnt, "", O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)), &pipefifo_fops); if (IS_ERR(f)) { free_pipe_info(inode->i_pipe); iput(inode); return PTR_ERR(f); } f->private_data = inode->i_pipe; res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK), &pipefifo_fops); if (IS_ERR(res[0])) { put_pipe_info(inode, inode->i_pipe); fput(f); return PTR_ERR(res[0]); } res[0]->private_data = inode->i_pipe; res[1] = f; stream_open(inode, res[0]); stream_open(inode, res[1]); return 0; } static int __do_pipe_flags(int *fd, struct file **files, int flags) { int error; int fdw, fdr; if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE)) return -EINVAL; error = create_pipe_files(files, flags); if (error) return error; error = get_unused_fd_flags(flags); if (error < 0) goto err_read_pipe; fdr = error; error = get_unused_fd_flags(flags); if (error < 0) goto err_fdr; fdw = error; audit_fd_pair(fdr, fdw); fd[0] = fdr; fd[1] = fdw; /* pipe groks IOCB_NOWAIT */ files[0]->f_mode |= FMODE_NOWAIT; files[1]->f_mode |= FMODE_NOWAIT; return 0; err_fdr: put_unused_fd(fdr); err_read_pipe: fput(files[0]); fput(files[1]); return error; } int do_pipe_flags(int *fd, int flags) { struct file *files[2]; int error = __do_pipe_flags(fd, files, flags); if (!error) { fd_install(fd[0], files[0]); fd_install(fd[1], files[1]); } return error; } /* * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way Unix traditionally does this, though. */ static int do_pipe2(int __user *fildes, int flags) { struct file *files[2]; int fd[2]; int error; error = __do_pipe_flags(fd, files, flags); if (!error) { if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) { fput(files[0]); fput(files[1]); put_unused_fd(fd[0]); put_unused_fd(fd[1]); error = -EFAULT; } else { fd_install(fd[0], files[0]); fd_install(fd[1], files[1]); } } return error; } SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) { return do_pipe2(fildes, flags); } SYSCALL_DEFINE1(pipe, int __user *, fildes) { return do_pipe2(fildes, 0); } /* * This is the stupid "wait for pipe to be readable or writable" * model. * * See pipe_read/write() for the proper kind of exclusive wait, * but that requires that we wake up any other readers/writers * if we then do not end up reading everything (ie the whole * "wake_next_reader/writer" logic in pipe_read/write()). */ void pipe_wait_readable(struct pipe_inode_info *pipe) { pipe_unlock(pipe); wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe)); pipe_lock(pipe); } void pipe_wait_writable(struct pipe_inode_info *pipe) { pipe_unlock(pipe); wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe)); pipe_lock(pipe); } /* * This depends on both the wait (here) and the wakeup (wake_up_partner) * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot * race with the count check and waitqueue prep. * * Normally in order to avoid races, you'd do the prepare_to_wait() first, * then check the condition you're waiting for, and only then sleep. But * because of the pipe lock, we can check the condition before being on * the wait queue. * * We use the 'rd_wait' waitqueue for pipe partner waiting. */ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) { DEFINE_WAIT(rdwait); int cur = *cnt; while (cur == *cnt) { prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE); pipe_unlock(pipe); schedule(); finish_wait(&pipe->rd_wait, &rdwait); pipe_lock(pipe); if (signal_pending(current)) break; } return cur == *cnt ? -ERESTARTSYS : 0; } static void wake_up_partner(struct pipe_inode_info *pipe) { wake_up_interruptible_all(&pipe->rd_wait); } static int fifo_open(struct inode *inode, struct file *filp) { struct pipe_inode_info *pipe; bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; int ret; filp->f_version = 0; spin_lock(&inode->i_lock); if (inode->i_pipe) { pipe = inode->i_pipe; pipe->files++; spin_unlock(&inode->i_lock); } else { spin_unlock(&inode->i_lock); pipe = alloc_pipe_info(); if (!pipe) return -ENOMEM; pipe->files = 1; spin_lock(&inode->i_lock); if (unlikely(inode->i_pipe)) { inode->i_pipe->files++; spin_unlock(&inode->i_lock); free_pipe_info(pipe); pipe = inode->i_pipe; } else { inode->i_pipe = pipe; spin_unlock(&inode->i_lock); } } filp->private_data = pipe; /* OK, we have a pipe and it's pinned down */ __pipe_lock(pipe); /* We can only do regular read/write on fifos */ stream_open(inode, filp); switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) { case FMODE_READ: /* * O_RDONLY * POSIX.1 says that O_NONBLOCK means return with the FIFO * opened, even when there is no process writing the FIFO. */ pipe->r_counter++; if (pipe->readers++ == 0) wake_up_partner(pipe); if (!is_pipe && !pipe->writers) { if ((filp->f_flags & O_NONBLOCK)) { /* suppress EPOLLHUP until we have * seen a writer */ filp->f_version = pipe->w_counter; } else { if (wait_for_partner(pipe, &pipe->w_counter)) goto err_rd; } } break; case FMODE_WRITE: /* * O_WRONLY * POSIX.1 says that O_NONBLOCK means return -1 with * errno=ENXIO when there is no process reading the FIFO. */ ret = -ENXIO; if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers) goto err; pipe->w_counter++; if (!pipe->writers++) wake_up_partner(pipe); if (!is_pipe && !pipe->readers) { if (wait_for_partner(pipe, &pipe->r_counter)) goto err_wr; } break; case FMODE_READ | FMODE_WRITE: /* * O_RDWR * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set. * This implementation will NEVER block on a O_RDWR open, since * the process can at least talk to itself. */ pipe->readers++; pipe->writers++; pipe->r_counter++; pipe->w_counter++; if (pipe->readers == 1 || pipe->writers == 1) wake_up_partner(pipe); break; default: ret = -EINVAL; goto err; } /* Ok! */ __pipe_unlock(pipe); return 0; err_rd: if (!--pipe->readers) wake_up_interruptible(&pipe->wr_wait); ret = -ERESTARTSYS; goto err; err_wr: if (!--pipe->writers) wake_up_interruptible_all(&pipe->rd_wait); ret = -ERESTARTSYS; goto err; err: __pipe_unlock(pipe); put_pipe_info(inode, pipe); return ret; } const struct file_operations pipefifo_fops = { .open = fifo_open, .llseek = no_llseek, .read_iter = pipe_read, .write_iter = pipe_write, .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, .fasync = pipe_fasync, .splice_write = iter_file_splice_write, }; /* * Currently we rely on the pipe array holding a power-of-2 number * of pages. Returns 0 on error. */ unsigned int round_pipe_size(unsigned int size) { if (size > (1U << 31)) return 0; /* Minimum pipe size, as required by POSIX */ if (size < PAGE_SIZE) return PAGE_SIZE; return roundup_pow_of_two(size); } /* * Resize the pipe ring to a number of slots. * * Note the pipe can be reduced in capacity, but only if the current * occupancy doesn't exceed nr_slots; if it does, EBUSY will be * returned instead. */ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) { struct pipe_buffer *bufs; unsigned int head, tail, mask, n; bufs = kcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (unlikely(!bufs)) return -ENOMEM; spin_lock_irq(&pipe->rd_wait.lock); mask = pipe->ring_size - 1; head = pipe->head; tail = pipe->tail; n = pipe_occupancy(head, tail); if (nr_slots < n) { spin_unlock_irq(&pipe->rd_wait.lock); kfree(bufs); return -EBUSY; } /* * The pipe array wraps around, so just start the new one at zero * and adjust the indices. */ if (n > 0) { unsigned int h = head & mask; unsigned int t = tail & mask; if (h > t) { memcpy(bufs, pipe->bufs + t, n * sizeof(struct pipe_buffer)); } else { unsigned int tsize = pipe->ring_size - t; if (h > 0) memcpy(bufs + tsize, pipe->bufs, h * sizeof(struct pipe_buffer)); memcpy(bufs, pipe->bufs + t, tsize * sizeof(struct pipe_buffer)); } } head = n; tail = 0; kfree(pipe->bufs); pipe->bufs = bufs; pipe->ring_size = nr_slots; if (pipe->max_usage > nr_slots) pipe->max_usage = nr_slots; pipe->tail = tail; pipe->head = head; if (!pipe_has_watch_queue(pipe)) { pipe->max_usage = nr_slots; pipe->nr_accounted = nr_slots; } spin_unlock_irq(&pipe->rd_wait.lock); /* This might have made more room for writers */ wake_up_interruptible(&pipe->wr_wait); return 0; } /* * Allocate a new array of pipe buffers and copy the info over. Returns the * pipe size if successful, or return -ERROR on error. */ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg) { unsigned long user_bufs; unsigned int nr_slots, size; long ret = 0; if (pipe_has_watch_queue(pipe)) return -EBUSY; size = round_pipe_size(arg); nr_slots = size >> PAGE_SHIFT; if (!nr_slots) return -EINVAL; /* * If trying to increase the pipe capacity, check that an * unprivileged user is not trying to exceed various limits * (soft limit check here, hard limit check just below). * Decreasing the pipe capacity is always permitted, even * if the user is currently over a limit. */ if (nr_slots > pipe->max_usage && size > pipe_max_size && !capable(CAP_SYS_RESOURCE)) return -EPERM; user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots); if (nr_slots > pipe->max_usage && (too_many_pipe_buffers_hard(user_bufs) || too_many_pipe_buffers_soft(user_bufs)) && pipe_is_unprivileged_user()) { ret = -EPERM; goto out_revert_acct; } ret = pipe_resize_ring(pipe, nr_slots); if (ret < 0) goto out_revert_acct; return pipe->max_usage * PAGE_SIZE; out_revert_acct: (void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted); return ret; } /* * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is * not enough to verify that this is a pipe. */ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) { struct pipe_inode_info *pipe = file->private_data; if (file->f_op != &pipefifo_fops || !pipe) return NULL; if (for_splice && pipe_has_watch_queue(pipe)) return NULL; return pipe; } long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg) { struct pipe_inode_info *pipe; long ret; pipe = get_pipe_info(file, false); if (!pipe) return -EBADF; __pipe_lock(pipe); switch (cmd) { case F_SETPIPE_SZ: ret = pipe_set_size(pipe, arg); break; case F_GETPIPE_SZ: ret = pipe->max_usage * PAGE_SIZE; break; default: ret = -EINVAL; break; } __pipe_unlock(pipe); return ret; } static const struct super_operations pipefs_ops = { .destroy_inode = free_inode_nonrcu, .statfs = simple_statfs, }; /* * pipefs should _never_ be mounted by userland - too much of security hassle, * no real gain from having the whole whorehouse mounted. So we don't need * any operations on the root directory. However, we need a non-trivial * d_name - pipe: will go nicely and kill the special-casing in procfs. */ static int pipefs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &pipefs_ops; ctx->dops = &pipefs_dentry_operations; return 0; } static struct file_system_type pipe_fs_type = { .name = "pipefs", .init_fs_context = pipefs_init_fs_context, .kill_sb = kill_anon_super, }; #ifdef CONFIG_SYSCTL static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, unsigned int *valp, int write, void *data) { if (write) { unsigned int val; val = round_pipe_size(*lvalp); if (val == 0) return -EINVAL; *valp = val; } else { unsigned int val = *valp; *lvalp = (unsigned long) val; } return 0; } static int proc_dopipe_max_size(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_dopipe_max_size_conv, NULL); } static struct ctl_table fs_pipe_sysctls[] = { { .procname = "pipe-max-size", .data = &pipe_max_size, .maxlen = sizeof(pipe_max_size), .mode = 0644, .proc_handler = proc_dopipe_max_size, }, { .procname = "pipe-user-pages-hard", .data = &pipe_user_pages_hard, .maxlen = sizeof(pipe_user_pages_hard), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "pipe-user-pages-soft", .data = &pipe_user_pages_soft, .maxlen = sizeof(pipe_user_pages_soft), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, }; #endif static int __init init_pipe_fs(void) { int err = register_filesystem(&pipe_fs_type); if (!err) { pipe_mnt = kern_mount(&pipe_fs_type); if (IS_ERR(pipe_mnt)) { err = PTR_ERR(pipe_mnt); unregister_filesystem(&pipe_fs_type); } } #ifdef CONFIG_SYSCTL register_sysctl_init("fs", fs_pipe_sysctls); #endif return err; } fs_initcall(init_pipe_fs);
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 // SPDX-License-Identifier: GPL-2.0 #include <linux/cpumask.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/proc_fs.h> #include <linux/sched.h> #include <linux/sched/stat.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/time_namespace.h> #include <linux/irqnr.h> #include <linux/sched/cputime.h> #include <linux/tick.h> #ifndef arch_irq_stat_cpu #define arch_irq_stat_cpu(cpu) 0 #endif #ifndef arch_irq_stat #define arch_irq_stat() 0 #endif u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle, idle_usecs = -1ULL; if (cpu_online(cpu)) idle_usecs = get_cpu_idle_time_us(cpu, NULL); if (idle_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ idle = kcs->cpustat[CPUTIME_IDLE]; else idle = idle_usecs * NSEC_PER_USEC; return idle; } static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) { u64 iowait, iowait_usecs = -1ULL; if (cpu_online(cpu)) iowait_usecs = get_cpu_iowait_time_us(cpu, NULL); if (iowait_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ iowait = kcs->cpustat[CPUTIME_IOWAIT]; else iowait = iowait_usecs * NSEC_PER_USEC; return iowait; } static void show_irq_gap(struct seq_file *p, unsigned int gap) { static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"; while (gap > 0) { unsigned int inc; inc = min_t(unsigned int, gap, ARRAY_SIZE(zeros) / 2); seq_write(p, zeros, 2 * inc); gap -= inc; } } static void show_all_irqs(struct seq_file *p) { unsigned int i, next = 0; for_each_active_irq(i) { show_irq_gap(p, i - next); seq_put_decimal_ull(p, " ", kstat_irqs_usr(i)); next = i + 1; } show_irq_gap(p, nr_irqs - next); } static int show_stat(struct seq_file *p, void *v) { int i, j; u64 user, nice, system, idle, iowait, irq, softirq, steal; u64 guest, guest_nice; u64 sum = 0; u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; struct timespec64 boottime; user = nice = system = idle = iowait = irq = softirq = steal = 0; guest = guest_nice = 0; getboottime64(&boottime); /* shift boot timestamp according to the timens offset */ timens_sub_boottime(&boottime); for_each_possible_cpu(i) { struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; kcpustat_cpu_fetch(&kcpustat, i); user += cpustat[CPUTIME_USER]; nice += cpustat[CPUTIME_NICE]; system += cpustat[CPUTIME_SYSTEM]; idle += get_idle_time(&kcpustat, i); iowait += get_iowait_time(&kcpustat, i); irq += cpustat[CPUTIME_IRQ]; softirq += cpustat[CPUTIME_SOFTIRQ]; steal += cpustat[CPUTIME_STEAL]; guest += cpustat[CPUTIME_GUEST]; guest_nice += cpustat[CPUTIME_GUEST_NICE]; sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); for (j = 0; j < NR_SOFTIRQS; j++) { unsigned int softirq_stat = kstat_softirqs_cpu(j, i); per_softirq_sums[j] += softirq_stat; sum_softirq += softirq_stat; } } sum += arch_irq_stat(); seq_put_decimal_ull(p, "cpu ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); seq_putc(p, '\n'); for_each_online_cpu(i) { struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; kcpustat_cpu_fetch(&kcpustat, i); /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ user = cpustat[CPUTIME_USER]; nice = cpustat[CPUTIME_NICE]; system = cpustat[CPUTIME_SYSTEM]; idle = get_idle_time(&kcpustat, i); iowait = get_iowait_time(&kcpustat, i); irq = cpustat[CPUTIME_IRQ]; softirq = cpustat[CPUTIME_SOFTIRQ]; steal = cpustat[CPUTIME_STEAL]; guest = cpustat[CPUTIME_GUEST]; guest_nice = cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); seq_putc(p, '\n'); } seq_put_decimal_ull(p, "intr ", (unsigned long long)sum); show_all_irqs(p); seq_printf(p, "\nctxt %llu\n" "btime %llu\n" "processes %lu\n" "procs_running %u\n" "procs_blocked %u\n", nr_context_switches(), (unsigned long long)boottime.tv_sec, total_forks, nr_running(), nr_iowait()); seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq); for (i = 0; i < NR_SOFTIRQS; i++) seq_put_decimal_ull(p, " ", per_softirq_sums[i]); seq_putc(p, '\n'); return 0; } static int stat_open(struct inode *inode, struct file *file) { unsigned int size = 1024 + 128 * num_online_cpus(); /* minimum size to display an interrupt count : 2 bytes */ size += 2 * nr_irqs; return single_open_size(file, show_stat, NULL, size); } static const struct proc_ops stat_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = stat_open, .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = single_release, }; static int __init proc_stat_init(void) { proc_create("stat", 0, NULL, &stat_proc_ops); return 0; } fs_initcall(proc_stat_init);
28 28 1 19 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 // SPDX-License-Identifier: GPL-2.0-only /* loopback transport for vsock using virtio_transport_common APIs * * Copyright (C) 2013-2019 Red Hat, Inc. * Authors: Asias He <asias@redhat.com> * Stefan Hajnoczi <stefanha@redhat.com> * Stefano Garzarella <sgarzare@redhat.com> * */ #include <linux/spinlock.h> #include <linux/module.h> #include <linux/list.h> #include <linux/virtio_vsock.h> struct vsock_loopback { struct workqueue_struct *workqueue; struct sk_buff_head pkt_queue; struct work_struct pkt_work; }; static struct vsock_loopback the_vsock_loopback; static u32 vsock_loopback_get_local_cid(void) { return VMADDR_CID_LOCAL; } static int vsock_loopback_send_pkt(struct sk_buff *skb) { struct vsock_loopback *vsock = &the_vsock_loopback; int len = skb->len; virtio_vsock_skb_queue_tail(&vsock->pkt_queue, skb); queue_work(vsock->workqueue, &vsock->pkt_work); return len; } static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk) { struct vsock_loopback *vsock = &the_vsock_loopback; virtio_transport_purge_skbs(vsk, &vsock->pkt_queue); return 0; } static bool vsock_loopback_seqpacket_allow(u32 remote_cid); static bool vsock_loopback_msgzerocopy_allow(void) { return true; } static struct virtio_transport loopback_transport = { .transport = { .module = THIS_MODULE, .get_local_cid = vsock_loopback_get_local_cid, .init = virtio_transport_do_socket_init, .destruct = virtio_transport_destruct, .release = virtio_transport_release, .connect = virtio_transport_connect, .shutdown = virtio_transport_shutdown, .cancel_pkt = vsock_loopback_cancel_pkt, .dgram_bind = virtio_transport_dgram_bind, .dgram_dequeue = virtio_transport_dgram_dequeue, .dgram_enqueue = virtio_transport_dgram_enqueue, .dgram_allow = virtio_transport_dgram_allow, .stream_dequeue = virtio_transport_stream_dequeue, .stream_enqueue = virtio_transport_stream_enqueue, .stream_has_data = virtio_transport_stream_has_data, .stream_has_space = virtio_transport_stream_has_space, .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, .stream_is_active = virtio_transport_stream_is_active, .stream_allow = virtio_transport_stream_allow, .seqpacket_dequeue = virtio_transport_seqpacket_dequeue, .seqpacket_enqueue = virtio_transport_seqpacket_enqueue, .seqpacket_allow = vsock_loopback_seqpacket_allow, .seqpacket_has_data = virtio_transport_seqpacket_has_data, .msgzerocopy_allow = vsock_loopback_msgzerocopy_allow, .notify_poll_in = virtio_transport_notify_poll_in, .notify_poll_out = virtio_transport_notify_poll_out, .notify_recv_init = virtio_transport_notify_recv_init, .notify_recv_pre_block = virtio_transport_notify_recv_pre_block, .notify_recv_pre_dequeue = virtio_transport_notify_recv_pre_dequeue, .notify_recv_post_dequeue = virtio_transport_notify_recv_post_dequeue, .notify_send_init = virtio_transport_notify_send_init, .notify_send_pre_block = virtio_transport_notify_send_pre_block, .notify_send_pre_enqueue = virtio_transport_notify_send_pre_enqueue, .notify_send_post_enqueue = virtio_transport_notify_send_post_enqueue, .notify_buffer_size = virtio_transport_notify_buffer_size, .notify_set_rcvlowat = virtio_transport_notify_set_rcvlowat, .read_skb = virtio_transport_read_skb, }, .send_pkt = vsock_loopback_send_pkt, }; static bool vsock_loopback_seqpacket_allow(u32 remote_cid) { return true; } static void vsock_loopback_work(struct work_struct *work) { struct vsock_loopback *vsock = container_of(work, struct vsock_loopback, pkt_work); struct sk_buff_head pkts; struct sk_buff *skb; skb_queue_head_init(&pkts); spin_lock_bh(&vsock->pkt_queue.lock); skb_queue_splice_init(&vsock->pkt_queue, &pkts); spin_unlock_bh(&vsock->pkt_queue.lock); while ((skb = __skb_dequeue(&pkts))) { virtio_transport_deliver_tap_pkt(skb); virtio_transport_recv_pkt(&loopback_transport, skb); } } static int __init vsock_loopback_init(void) { struct vsock_loopback *vsock = &the_vsock_loopback; int ret; vsock->workqueue = alloc_workqueue("vsock-loopback", 0, 0); if (!vsock->workqueue) return -ENOMEM; skb_queue_head_init(&vsock->pkt_queue); INIT_WORK(&vsock->pkt_work, vsock_loopback_work); ret = vsock_core_register(&loopback_transport.transport, VSOCK_TRANSPORT_F_LOCAL); if (ret) goto out_wq; return 0; out_wq: destroy_workqueue(vsock->workqueue); return ret; } static void __exit vsock_loopback_exit(void) { struct vsock_loopback *vsock = &the_vsock_loopback; vsock_core_unregister(&loopback_transport.transport); flush_work(&vsock->pkt_work); virtio_vsock_skb_queue_purge(&vsock->pkt_queue); destroy_workqueue(vsock->workqueue); } module_init(vsock_loopback_init); module_exit(vsock_loopback_exit); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Stefano Garzarella <sgarzare@redhat.com>"); MODULE_DESCRIPTION("loopback transport for vsock"); MODULE_ALIAS_NETPROTO(PF_VSOCK);
560 7768 20 7759 2261 2265 5863 5867 2588 10066 1938 1937 12 8 437 314 427 428 812 4 807 3535 1265 35 1265 35 2990 35 2977 470 346 11 3535 3535 923 64 231 752 2 20 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 // SPDX-License-Identifier: GPL-2.0-only #include <linux/bitmap.h> #include <linux/bug.h> #include <linux/export.h> #include <linux/idr.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/xarray.h> /** * idr_alloc_u32() - Allocate an ID. * @idr: IDR handle. * @ptr: Pointer to be associated with the new ID. * @nextid: Pointer to an ID. * @max: The maximum ID to allocate (inclusive). * @gfp: Memory allocation flags. * * Allocates an unused ID in the range specified by @nextid and @max. * Note that @max is inclusive whereas the @end parameter to idr_alloc() * is exclusive. The new ID is assigned to @nextid before the pointer * is inserted into the IDR, so if @nextid points into the object pointed * to by @ptr, a concurrent lookup will not find an uninitialised ID. * * The caller should provide their own locking to ensure that two * concurrent modifications to the IDR are not possible. Read-only * accesses to the IDR may be done under the RCU read lock or may * exclude simultaneous writers. * * Return: 0 if an ID was allocated, -ENOMEM if memory allocation failed, * or -ENOSPC if no free IDs could be found. If an error occurred, * @nextid is unchanged. */ int idr_alloc_u32(struct idr *idr, void *ptr, u32 *nextid, unsigned long max, gfp_t gfp) { struct radix_tree_iter iter; void __rcu **slot; unsigned int base = idr->idr_base; unsigned int id = *nextid; if (WARN_ON_ONCE(!(idr->idr_rt.xa_flags & ROOT_IS_IDR))) idr->idr_rt.xa_flags |= IDR_RT_MARKER; id = (id < base) ? 0 : id - base; radix_tree_iter_init(&iter, id); slot = idr_get_free(&idr->idr_rt, &iter, gfp, max - base); if (IS_ERR(slot)) return PTR_ERR(slot); *nextid = iter.index + base; /* there is a memory barrier inside radix_tree_iter_replace() */ radix_tree_iter_replace(&idr->idr_rt, &iter, slot, ptr); radix_tree_iter_tag_clear(&idr->idr_rt, &iter, IDR_FREE); return 0; } EXPORT_SYMBOL_GPL(idr_alloc_u32); /** * idr_alloc() - Allocate an ID. * @idr: IDR handle. * @ptr: Pointer to be associated with the new ID. * @start: The minimum ID (inclusive). * @end: The maximum ID (exclusive). * @gfp: Memory allocation flags. * * Allocates an unused ID in the range specified by @start and @end. If * @end is <= 0, it is treated as one larger than %INT_MAX. This allows * callers to use @start + N as @end as long as N is within integer range. * * The caller should provide their own locking to ensure that two * concurrent modifications to the IDR are not possible. Read-only * accesses to the IDR may be done under the RCU read lock or may * exclude simultaneous writers. * * Return: The newly allocated ID, -ENOMEM if memory allocation failed, * or -ENOSPC if no free IDs could be found. */ int idr_alloc(struct idr *idr, void *ptr, int start, int end, gfp_t gfp) { u32 id = start; int ret; if (WARN_ON_ONCE(start < 0)) return -EINVAL; ret = idr_alloc_u32(idr, ptr, &id, end > 0 ? end - 1 : INT_MAX, gfp); if (ret) return ret; return id; } EXPORT_SYMBOL_GPL(idr_alloc); /** * idr_alloc_cyclic() - Allocate an ID cyclically. * @idr: IDR handle. * @ptr: Pointer to be associated with the new ID. * @start: The minimum ID (inclusive). * @end: The maximum ID (exclusive). * @gfp: Memory allocation flags. * * Allocates an unused ID in the range specified by @start and @end. If * @end is <= 0, it is treated as one larger than %INT_MAX. This allows * callers to use @start + N as @end as long as N is within integer range. * The search for an unused ID will start at the last ID allocated and will * wrap around to @start if no free IDs are found before reaching @end. * * The caller should provide their own locking to ensure that two * concurrent modifications to the IDR are not possible. Read-only * accesses to the IDR may be done under the RCU read lock or may * exclude simultaneous writers. * * Return: The newly allocated ID, -ENOMEM if memory allocation failed, * or -ENOSPC if no free IDs could be found. */ int idr_alloc_cyclic(struct idr *idr, void *ptr, int start, int end, gfp_t gfp) { u32 id = idr->idr_next; int err, max = end > 0 ? end - 1 : INT_MAX; if ((int)id < start) id = start; err = idr_alloc_u32(idr, ptr, &id, max, gfp); if ((err == -ENOSPC) && (id > start)) { id = start; err = idr_alloc_u32(idr, ptr, &id, max, gfp); } if (err) return err; idr->idr_next = id + 1; return id; } EXPORT_SYMBOL(idr_alloc_cyclic); /** * idr_remove() - Remove an ID from the IDR. * @idr: IDR handle. * @id: Pointer ID. * * Removes this ID from the IDR. If the ID was not previously in the IDR, * this function returns %NULL. * * Since this function modifies the IDR, the caller should provide their * own locking to ensure that concurrent modification of the same IDR is * not possible. * * Return: The pointer formerly associated with this ID. */ void *idr_remove(struct idr *idr, unsigned long id) { return radix_tree_delete_item(&idr->idr_rt, id - idr->idr_base, NULL); } EXPORT_SYMBOL_GPL(idr_remove); /** * idr_find() - Return pointer for given ID. * @idr: IDR handle. * @id: Pointer ID. * * Looks up the pointer associated with this ID. A %NULL pointer may * indicate that @id is not allocated or that the %NULL pointer was * associated with this ID. * * This function can be called under rcu_read_lock(), given that the leaf * pointers lifetimes are correctly managed. * * Return: The pointer associated with this ID. */ void *idr_find(const struct idr *idr, unsigned long id) { return radix_tree_lookup(&idr->idr_rt, id - idr->idr_base); } EXPORT_SYMBOL_GPL(idr_find); /** * idr_for_each() - Iterate through all stored pointers. * @idr: IDR handle. * @fn: Function to be called for each pointer. * @data: Data passed to callback function. * * The callback function will be called for each entry in @idr, passing * the ID, the entry and @data. * * If @fn returns anything other than %0, the iteration stops and that * value is returned from this function. * * idr_for_each() can be called concurrently with idr_alloc() and * idr_remove() if protected by RCU. Newly added entries may not be * seen and deleted entries may be seen, but adding and removing entries * will not cause other entries to be skipped, nor spurious ones to be seen. */ int idr_for_each(const struct idr *idr, int (*fn)(int id, void *p, void *data), void *data) { struct radix_tree_iter iter; void __rcu **slot; int base = idr->idr_base; radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, 0) { int ret; unsigned long id = iter.index + base; if (WARN_ON_ONCE(id > INT_MAX)) break; ret = fn(id, rcu_dereference_raw(*slot), data); if (ret) return ret; } return 0; } EXPORT_SYMBOL(idr_for_each); /** * idr_get_next_ul() - Find next populated entry. * @idr: IDR handle. * @nextid: Pointer to an ID. * * Returns the next populated entry in the tree with an ID greater than * or equal to the value pointed to by @nextid. On exit, @nextid is updated * to the ID of the found value. To use in a loop, the value pointed to by * nextid must be incremented by the user. */ void *idr_get_next_ul(struct idr *idr, unsigned long *nextid) { struct radix_tree_iter iter; void __rcu **slot; void *entry = NULL; unsigned long base = idr->idr_base; unsigned long id = *nextid; id = (id < base) ? 0 : id - base; radix_tree_for_each_slot(slot, &idr->idr_rt, &iter, id) { entry = rcu_dereference_raw(*slot); if (!entry) continue; if (!xa_is_internal(entry)) break; if (slot != &idr->idr_rt.xa_head && !xa_is_retry(entry)) break; slot = radix_tree_iter_retry(&iter); } if (!slot) return NULL; *nextid = iter.index + base; return entry; } EXPORT_SYMBOL(idr_get_next_ul); /** * idr_get_next() - Find next populated entry. * @idr: IDR handle. * @nextid: Pointer to an ID. * * Returns the next populated entry in the tree with an ID greater than * or equal to the value pointed to by @nextid. On exit, @nextid is updated * to the ID of the found value. To use in a loop, the value pointed to by * nextid must be incremented by the user. */ void *idr_get_next(struct idr *idr, int *nextid) { unsigned long id = *nextid; void *entry = idr_get_next_ul(idr, &id); if (WARN_ON_ONCE(id > INT_MAX)) return NULL; *nextid = id; return entry; } EXPORT_SYMBOL(idr_get_next); /** * idr_replace() - replace pointer for given ID. * @idr: IDR handle. * @ptr: New pointer to associate with the ID. * @id: ID to change. * * Replace the pointer registered with an ID and return the old value. * This function can be called under the RCU read lock concurrently with * idr_alloc() and idr_remove() (as long as the ID being removed is not * the one being replaced!). * * Returns: the old value on success. %-ENOENT indicates that @id was not * found. %-EINVAL indicates that @ptr was not valid. */ void *idr_replace(struct idr *idr, void *ptr, unsigned long id) { struct radix_tree_node *node; void __rcu **slot = NULL; void *entry; id -= idr->idr_base; entry = __radix_tree_lookup(&idr->idr_rt, id, &node, &slot); if (!slot || radix_tree_tag_get(&idr->idr_rt, id, IDR_FREE)) return ERR_PTR(-ENOENT); __radix_tree_replace(&idr->idr_rt, node, slot, ptr); return entry; } EXPORT_SYMBOL(idr_replace); /** * DOC: IDA description * * The IDA is an ID allocator which does not provide the ability to * associate an ID with a pointer. As such, it only needs to store one * bit per ID, and so is more space efficient than an IDR. To use an IDA, * define it using DEFINE_IDA() (or embed a &struct ida in a data structure, * then initialise it using ida_init()). To allocate a new ID, call * ida_alloc(), ida_alloc_min(), ida_alloc_max() or ida_alloc_range(). * To free an ID, call ida_free(). * * ida_destroy() can be used to dispose of an IDA without needing to * free the individual IDs in it. You can use ida_is_empty() to find * out whether the IDA has any IDs currently allocated. * * The IDA handles its own locking. It is safe to call any of the IDA * functions without synchronisation in your code. * * IDs are currently limited to the range [0-INT_MAX]. If this is an awkward * limitation, it should be quite straightforward to raise the maximum. */ /* * Developer's notes: * * The IDA uses the functionality provided by the XArray to store bitmaps in * each entry. The XA_FREE_MARK is only cleared when all bits in the bitmap * have been set. * * I considered telling the XArray that each slot is an order-10 node * and indexing by bit number, but the XArray can't allow a single multi-index * entry in the head, which would significantly increase memory consumption * for the IDA. So instead we divide the index by the number of bits in the * leaf bitmap before doing a radix tree lookup. * * As an optimisation, if there are only a few low bits set in any given * leaf, instead of allocating a 128-byte bitmap, we store the bits * as a value entry. Value entries never have the XA_FREE_MARK cleared * because we can always convert them into a bitmap entry. * * It would be possible to optimise further; once we've run out of a * single 128-byte bitmap, we currently switch to a 576-byte node, put * the 128-byte bitmap in the first entry and then start allocating extra * 128-byte entries. We could instead use the 512 bytes of the node's * data as a bitmap before moving to that scheme. I do not believe this * is a worthwhile optimisation; Rasmus Villemoes surveyed the current * users of the IDA and almost none of them use more than 1024 entries. * Those that do use more than the 8192 IDs that the 512 bytes would * provide. * * The IDA always uses a lock to alloc/free. If we add a 'test_bit' * equivalent, it will still need locking. Going to RCU lookup would require * using RCU to free bitmaps, and that's not trivial without embedding an * RCU head in the bitmap, which adds a 2-pointer overhead to each 128-byte * bitmap, which is excessive. */ /** * ida_alloc_range() - Allocate an unused ID. * @ida: IDA handle. * @min: Lowest ID to allocate. * @max: Highest ID to allocate. * @gfp: Memory allocation flags. * * Allocate an ID between @min and @max, inclusive. The allocated ID will * not exceed %INT_MAX, even if @max is larger. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ int ida_alloc_range(struct ida *ida, unsigned int min, unsigned int max, gfp_t gfp) { XA_STATE(xas, &ida->xa, min / IDA_BITMAP_BITS); unsigned bit = min % IDA_BITMAP_BITS; unsigned long flags; struct ida_bitmap *bitmap, *alloc = NULL; if ((int)min < 0) return -ENOSPC; if ((int)max < 0) max = INT_MAX; retry: xas_lock_irqsave(&xas, flags); next: bitmap = xas_find_marked(&xas, max / IDA_BITMAP_BITS, XA_FREE_MARK); if (xas.xa_index > min / IDA_BITMAP_BITS) bit = 0; if (xas.xa_index * IDA_BITMAP_BITS + bit > max) goto nospc; if (xa_is_value(bitmap)) { unsigned long tmp = xa_to_value(bitmap); if (bit < BITS_PER_XA_VALUE) { bit = find_next_zero_bit(&tmp, BITS_PER_XA_VALUE, bit); if (xas.xa_index * IDA_BITMAP_BITS + bit > max) goto nospc; if (bit < BITS_PER_XA_VALUE) { tmp |= 1UL << bit; xas_store(&xas, xa_mk_value(tmp)); goto out; } } bitmap = alloc; if (!bitmap) bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT); if (!bitmap) goto alloc; bitmap->bitmap[0] = tmp; xas_store(&xas, bitmap); if (xas_error(&xas)) { bitmap->bitmap[0] = 0; goto out; } } if (bitmap) { bit = find_next_zero_bit(bitmap->bitmap, IDA_BITMAP_BITS, bit); if (xas.xa_index * IDA_BITMAP_BITS + bit > max) goto nospc; if (bit == IDA_BITMAP_BITS) goto next; __set_bit(bit, bitmap->bitmap); if (bitmap_full(bitmap->bitmap, IDA_BITMAP_BITS)) xas_clear_mark(&xas, XA_FREE_MARK); } else { if (bit < BITS_PER_XA_VALUE) { bitmap = xa_mk_value(1UL << bit); } else { bitmap = alloc; if (!bitmap) bitmap = kzalloc(sizeof(*bitmap), GFP_NOWAIT); if (!bitmap) goto alloc; __set_bit(bit, bitmap->bitmap); } xas_store(&xas, bitmap); } out: xas_unlock_irqrestore(&xas, flags); if (xas_nomem(&xas, gfp)) { xas.xa_index = min / IDA_BITMAP_BITS; bit = min % IDA_BITMAP_BITS; goto retry; } if (bitmap != alloc) kfree(alloc); if (xas_error(&xas)) return xas_error(&xas); return xas.xa_index * IDA_BITMAP_BITS + bit; alloc: xas_unlock_irqrestore(&xas, flags); alloc = kzalloc(sizeof(*bitmap), gfp); if (!alloc) return -ENOMEM; xas_set(&xas, min / IDA_BITMAP_BITS); bit = min % IDA_BITMAP_BITS; goto retry; nospc: xas_unlock_irqrestore(&xas, flags); kfree(alloc); return -ENOSPC; } EXPORT_SYMBOL(ida_alloc_range); /** * ida_free() - Release an allocated ID. * @ida: IDA handle. * @id: Previously allocated ID. * * Context: Any context. It is safe to call this function without * locking in your code. */ void ida_free(struct ida *ida, unsigned int id) { XA_STATE(xas, &ida->xa, id / IDA_BITMAP_BITS); unsigned bit = id % IDA_BITMAP_BITS; struct ida_bitmap *bitmap; unsigned long flags; if ((int)id < 0) return; xas_lock_irqsave(&xas, flags); bitmap = xas_load(&xas); if (xa_is_value(bitmap)) { unsigned long v = xa_to_value(bitmap); if (bit >= BITS_PER_XA_VALUE) goto err; if (!(v & (1UL << bit))) goto err; v &= ~(1UL << bit); if (!v) goto delete; xas_store(&xas, xa_mk_value(v)); } else { if (!bitmap || !test_bit(bit, bitmap->bitmap)) goto err; __clear_bit(bit, bitmap->bitmap); xas_set_mark(&xas, XA_FREE_MARK); if (bitmap_empty(bitmap->bitmap, IDA_BITMAP_BITS)) { kfree(bitmap); delete: xas_store(&xas, NULL); } } xas_unlock_irqrestore(&xas, flags); return; err: xas_unlock_irqrestore(&xas, flags); WARN(1, "ida_free called for id=%d which is not allocated.\n", id); } EXPORT_SYMBOL(ida_free); /** * ida_destroy() - Free all IDs. * @ida: IDA handle. * * Calling this function frees all IDs and releases all resources used * by an IDA. When this call returns, the IDA is empty and can be reused * or freed. If the IDA is already empty, there is no need to call this * function. * * Context: Any context. It is safe to call this function without * locking in your code. */ void ida_destroy(struct ida *ida) { XA_STATE(xas, &ida->xa, 0); struct ida_bitmap *bitmap; unsigned long flags; xas_lock_irqsave(&xas, flags); xas_for_each(&xas, bitmap, ULONG_MAX) { if (!xa_is_value(bitmap)) kfree(bitmap); xas_store(&xas, NULL); } xas_unlock_irqrestore(&xas, flags); } EXPORT_SYMBOL(ida_destroy); #ifndef __KERNEL__ extern void xa_dump_index(unsigned long index, unsigned int shift); #define IDA_CHUNK_SHIFT ilog2(IDA_BITMAP_BITS) static void ida_dump_entry(void *entry, unsigned long index) { unsigned long i; if (!entry) return; if (xa_is_node(entry)) { struct xa_node *node = xa_to_node(entry); unsigned int shift = node->shift + IDA_CHUNK_SHIFT + XA_CHUNK_SHIFT; xa_dump_index(index * IDA_BITMAP_BITS, shift); xa_dump_node(node); for (i = 0; i < XA_CHUNK_SIZE; i++) ida_dump_entry(node->slots[i], index | (i << node->shift)); } else if (xa_is_value(entry)) { xa_dump_index(index * IDA_BITMAP_BITS, ilog2(BITS_PER_LONG)); pr_cont("value: data %lx [%px]\n", xa_to_value(entry), entry); } else { struct ida_bitmap *bitmap = entry; xa_dump_index(index * IDA_BITMAP_BITS, IDA_CHUNK_SHIFT); pr_cont("bitmap: %p data", bitmap); for (i = 0; i < IDA_BITMAP_LONGS; i++) pr_cont(" %lx", bitmap->bitmap[i]); pr_cont("\n"); } } static void ida_dump(struct ida *ida) { struct xarray *xa = &ida->xa; pr_debug("ida: %p node %p free %d\n", ida, xa->xa_head, xa->xa_flags >> ROOT_TAG_SHIFT); ida_dump_entry(xa->xa_head, 0); } #endif
10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_IA32_H #define _ASM_X86_IA32_H #ifdef CONFIG_IA32_EMULATION #include <linux/compat.h> /* * 32 bit structures for IA32 support. */ #include <uapi/asm/sigcontext.h> /* signal.h */ struct ucontext_ia32 { unsigned int uc_flags; unsigned int uc_link; compat_stack_t uc_stack; struct sigcontext_32 uc_mcontext; compat_sigset_t uc_sigmask; /* mask last for extensibility */ }; /* This matches struct stat64 in glibc2.2, hence the absolutely * insane amounts of padding around dev_t's. */ struct stat64 { unsigned long long st_dev; unsigned char __pad0[4]; #define STAT64_HAS_BROKEN_ST_INO 1 unsigned int __st_ino; unsigned int st_mode; unsigned int st_nlink; unsigned int st_uid; unsigned int st_gid; unsigned long long st_rdev; unsigned char __pad3[4]; long long st_size; unsigned int st_blksize; long long st_blocks;/* Number 512-byte blocks allocated */ unsigned st_atime; unsigned st_atime_nsec; unsigned st_mtime; unsigned st_mtime_nsec; unsigned st_ctime; unsigned st_ctime_nsec; unsigned long long st_ino; } __attribute__((packed)); #define IA32_STACK_TOP IA32_PAGE_OFFSET #ifdef __KERNEL__ struct linux_binprm; extern int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, int exec_stack); struct mm_struct; extern void ia32_pick_mmap_layout(struct mm_struct *mm); #endif extern bool __ia32_enabled; static inline bool ia32_enabled(void) { return __ia32_enabled; } static inline void ia32_disable(void) { __ia32_enabled = false; } #else /* !CONFIG_IA32_EMULATION */ static inline bool ia32_enabled(void) { return IS_ENABLED(CONFIG_X86_32); } static inline void ia32_disable(void) {} #endif static inline bool ia32_enabled_verbose(void) { bool enabled = ia32_enabled(); if (IS_ENABLED(CONFIG_IA32_EMULATION) && !enabled) pr_notice_once("32-bit emulation disabled. You can reenable with ia32_emulation=on\n"); return enabled; } #endif /* _ASM_X86_IA32_H */
580 556 531 17 44 41 40 16 4 10 19 3 1 2 2 2 20 4 1 5 2 3 4 557 557 487 69 2 69 536 22 540 525 17 524 198 2 1 9 1 3 10 2 12 1 1 27 27 14 6 2 6 1 27 27 25 25 3 3 3 12 1 1 1 2 2 119 9 10 10 10 10 10 10 10 10 10 10 3 7 3 1 54 13 125 126 126 31 31 23 8 295 295 295 1 2 7 50 1 119 128 8 2 1 38 2 3 243 238 7 232 5 385 384 4 3 355 1 2 215 214 11 6 123 44 92 295 3 333 325 2 7 331 331 155 125 30 148 33 4 144 41 298 17 1 271 299 261 41 25 257 21 10 10 36 37 2 324 38 335 76 14 14 2 1 1 21 21 21 3 6 44 6 97 1 97 6 1 39 21 21 21 21 21 21 21 21 56 56 56 56 28 597 56 54 2 2 56 36 7 7 4 3 7 4 4 4 6 1 96 93 1 79 79 78 3 17 5 62 63 24 39 63 61 1 1 60 75 49 27 6 5 9 9 11 9 9 3 3 8 16 4 1 1 7 19 8 11 11 19 14 4 11 3 19 13 6 9 6 6 570 532 39 36 2 39 39 57 6 50 51 41 7 3 44 45 21 11 11 19 1 18 9 9 2 11 11 11 11 1 19 19 8 8 15 15 15 15 11 7 7 4 29 2 27 12 8 4 34 35 33 1 27 27 31 10 2 8 10 15 1 2 1 2 3 7 7 4 8 9 40 39 40 37 6 5 9 17 13 8 2 2 30 54 54 54 44 9 2 43 12 8 1 2 155 94 3 3 55 2 2 7 9 7 4 5 4 5 2 2 3 1 4 2 3 3 2 452 72 382 27 2 3 1 3 3 3 3 3 3 3 1 21 58 7 50 49 44 9 49 5 5 5 5 6 8 8 8 8 8 6 6 6 6 6 6 3 3 8 8 8 8 6 8 8 7 4 3 3 2 470 470 470 470 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * The User Datagram Protocol (UDP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Alan Cox, <alan@lxorguk.ukuu.org.uk> * Hirokazu Takahashi, <taka@valinux.co.jp> * * Fixes: * Alan Cox : verify_area() calls * Alan Cox : stopped close while in use off icmp * messages. Not a fix but a botch that * for udp at least is 'valid'. * Alan Cox : Fixed icmp handling properly * Alan Cox : Correct error for oversized datagrams * Alan Cox : Tidied select() semantics. * Alan Cox : udp_err() fixed properly, also now * select and read wake correctly on errors * Alan Cox : udp_send verify_area moved to avoid mem leak * Alan Cox : UDP can count its memory * Alan Cox : send to an unknown connection causes * an ECONNREFUSED off the icmp, but * does NOT close. * Alan Cox : Switched to new sk_buff handlers. No more backlog! * Alan Cox : Using generic datagram code. Even smaller and the PEEK * bug no longer crashes it. * Fred Van Kempen : Net2e support for sk->broadcast. * Alan Cox : Uses skb_free_datagram * Alan Cox : Added get/set sockopt support. * Alan Cox : Broadcasting without option set returns EACCES. * Alan Cox : No wakeup calls. Instead we now use the callbacks. * Alan Cox : Use ip_tos and ip_ttl * Alan Cox : SNMP Mibs * Alan Cox : MSG_DONTROUTE, and 0.0.0.0 support. * Matt Dillon : UDP length checks. * Alan Cox : Smarter af_inet used properly. * Alan Cox : Use new kernel side addressing. * Alan Cox : Incorrect return on truncated datagram receive. * Arnt Gulbrandsen : New udp_send and stuff * Alan Cox : Cache last socket * Alan Cox : Route cache * Jon Peatfield : Minor efficiency fix to sendto(). * Mike Shaver : RFC1122 checks. * Alan Cox : Nonblocking error fix. * Willy Konynenberg : Transparent proxying support. * Mike McLagan : Routing by source * David S. Miller : New socket lookup architecture. * Last socket cache retained as it * does have a high hit rate. * Olaf Kirch : Don't linearise iovec on sendmsg. * Andi Kleen : Some cleanups, cache destination entry * for connect. * Vitaly E. Lavrov : Transparent proxy revived after year coma. * Melvin Smith : Check msg_name not msg_namelen in sendto(), * return ENOTCONN for unconnected sockets (POSIX) * Janos Farkas : don't deliver multi/broadcasts to a different * bound-to-device socket * Hirokazu Takahashi : HW checksumming for outgoing UDP * datagrams. * Hirokazu Takahashi : sendfile() on UDP works now. * Arnaldo C. Melo : convert /proc/net/udp to seq_file * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind * a single port at the same time. * Derek Atkins <derek@ihtfp.com>: Add Encapulation Support * James Chapman : Add L2TP encapsulation type. */ #define pr_fmt(fmt) "UDP: " fmt #include <linux/bpf-cgroup.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <linux/memblock.h> #include <linux/highmem.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/module.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/igmp.h> #include <linux/inetdevice.h> #include <linux/in.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/mm.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/slab.h> #include <net/tcp_states.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/net_namespace.h> #include <net/icmp.h> #include <net/inet_hashtables.h> #include <net/ip_tunnels.h> #include <net/route.h> #include <net/checksum.h> #include <net/gso.h> #include <net/xfrm.h> #include <trace/events/udp.h> #include <linux/static_key.h> #include <linux/btf_ids.h> #include <trace/events/skb.h> #include <net/busy_poll.h> #include "udp_impl.h" #include <net/sock_reuseport.h> #include <net/addrconf.h> #include <net/udp_tunnel.h> #include <net/gro.h> #if IS_ENABLED(CONFIG_IPV6) #include <net/ipv6_stubs.h> #endif struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); long sysctl_udp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_udp_mem); atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp; EXPORT_SYMBOL(udp_memory_allocated); DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc); #define MAX_UDP_PORTS 65536 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN_PERNET) static struct udp_table *udp_get_table_prot(struct sock *sk) { return sk->sk_prot->h.udp_table ? : sock_net(sk)->ipv4.udp_table; } static int udp_lib_lport_inuse(struct net *net, __u16 num, const struct udp_hslot *hslot, unsigned long *bitmap, struct sock *sk, unsigned int log) { struct sock *sk2; kuid_t uid = sock_i_uid(sk); sk_for_each(sk2, &hslot->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (bitmap || udp_sk(sk2)->udp_port_hash == num) && (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && inet_rcv_saddr_equal(sk, sk2, true)) { if (sk2->sk_reuseport && sk->sk_reuseport && !rcu_access_pointer(sk->sk_reuseport_cb) && uid_eq(uid, sock_i_uid(sk2))) { if (!bitmap) return 0; } else { if (!bitmap) return 1; __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap); } } } return 0; } /* * Note: we still hold spinlock of primary hash chain, so no other writer * can insert/delete a socket with local_port == num */ static int udp_lib_lport_inuse2(struct net *net, __u16 num, struct udp_hslot *hslot2, struct sock *sk) { struct sock *sk2; kuid_t uid = sock_i_uid(sk); int res = 0; spin_lock(&hslot2->lock); udp_portaddr_for_each_entry(sk2, &hslot2->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && (udp_sk(sk2)->udp_port_hash == num) && (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && inet_rcv_saddr_equal(sk, sk2, true)) { if (sk2->sk_reuseport && sk->sk_reuseport && !rcu_access_pointer(sk->sk_reuseport_cb) && uid_eq(uid, sock_i_uid(sk2))) { res = 0; } else { res = 1; } break; } } spin_unlock(&hslot2->lock); return res; } static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) { struct net *net = sock_net(sk); kuid_t uid = sock_i_uid(sk); struct sock *sk2; sk_for_each(sk2, &hslot->head) { if (net_eq(sock_net(sk2), net) && sk2 != sk && sk2->sk_family == sk->sk_family && ipv6_only_sock(sk2) == ipv6_only_sock(sk) && (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) && (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) { return reuseport_add_sock(sk, sk2, inet_rcv_saddr_any(sk)); } } return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } /** * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 * * @sk: socket struct in question * @snum: port number to look up * @hash2_nulladdr: AF-dependent hash value in secondary hash chains, * with NULL address */ int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned int hash2_nulladdr) { struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2; struct net *net = sock_net(sk); int error = -EADDRINUSE; if (!snum) { DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); unsigned short first, last; int low, high, remaining; unsigned int rand; inet_sk_get_local_port_range(sk, &low, &high); remaining = (high - low) + 1; rand = get_random_u32(); first = reciprocal_scale(rand, remaining) + low; /* * force rand to be an odd multiple of UDP_HTABLE_SIZE */ rand = (rand | 1) * (udptable->mask + 1); last = first + udptable->mask + 1; do { hslot = udp_hashslot(udptable, net, first); bitmap_zero(bitmap, PORTS_PER_CHAIN); spin_lock_bh(&hslot->lock); udp_lib_lport_inuse(net, snum, hslot, bitmap, sk, udptable->log); snum = first; /* * Iterate on all possible values of snum for this hash. * Using steps of an odd multiple of UDP_HTABLE_SIZE * give us randomization and full range coverage. */ do { if (low <= snum && snum <= high && !test_bit(snum >> udptable->log, bitmap) && !inet_is_local_reserved_port(net, snum)) goto found; snum += rand; } while (snum != first); spin_unlock_bh(&hslot->lock); cond_resched(); } while (++first != last); goto fail; } else { hslot = udp_hashslot(udptable, net, snum); spin_lock_bh(&hslot->lock); if (hslot->count > 10) { int exist; unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum; slot2 &= udptable->mask; hash2_nulladdr &= udptable->mask; hslot2 = udp_hashslot2(udptable, slot2); if (hslot->count < hslot2->count) goto scan_primary_hash; exist = udp_lib_lport_inuse2(net, snum, hslot2, sk); if (!exist && (hash2_nulladdr != slot2)) { hslot2 = udp_hashslot2(udptable, hash2_nulladdr); exist = udp_lib_lport_inuse2(net, snum, hslot2, sk); } if (exist) goto fail_unlock; else goto found; } scan_primary_hash: if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, 0)) goto fail_unlock; } found: inet_sk(sk)->inet_num = snum; udp_sk(sk)->udp_port_hash = snum; udp_sk(sk)->udp_portaddr_hash ^= snum; if (sk_unhashed(sk)) { if (sk->sk_reuseport && udp_reuseport_add_sock(sk, hslot)) { inet_sk(sk)->inet_num = 0; udp_sk(sk)->udp_port_hash = 0; udp_sk(sk)->udp_portaddr_hash ^= snum; goto fail_unlock; } sk_add_node_rcu(sk, &hslot->head); hslot->count++; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock(&hslot2->lock); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) hlist_add_tail_rcu(&udp_sk(sk)->udp_portaddr_node, &hslot2->head); else hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &hslot2->head); hslot2->count++; spin_unlock(&hslot2->lock); } sock_set_flag(sk, SOCK_RCU_FREE); error = 0; fail_unlock: spin_unlock_bh(&hslot->lock); fail: return error; } EXPORT_SYMBOL(udp_lib_get_port); int udp_v4_get_port(struct sock *sk, unsigned short snum) { unsigned int hash2_nulladdr = ipv4_portaddr_hash(sock_net(sk), htonl(INADDR_ANY), snum); unsigned int hash2_partial = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); /* precompute partial secondary hash */ udp_sk(sk)->udp_portaddr_hash = hash2_partial; return udp_lib_get_port(sk, snum, hash2_nulladdr); } static int compute_score(struct sock *sk, struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned short hnum, int dif, int sdif) { int score; struct inet_sock *inet; bool dev_match; if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || ipv6_only_sock(sk)) return -1; if (sk->sk_rcv_saddr != daddr) return -1; score = (sk->sk_family == PF_INET) ? 2 : 1; inet = inet_sk(sk); if (inet->inet_daddr) { if (inet->inet_daddr != saddr) return -1; score += 4; } if (inet->inet_dport) { if (inet->inet_dport != sport) return -1; score += 4; } dev_match = udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif); if (!dev_match) return -1; if (sk->sk_bound_dev_if) score += 4; if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) score++; return score; } INDIRECT_CALLABLE_SCOPE u32 udp_ehashfn(const struct net *net, const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport) { static u32 udp_ehash_secret __read_mostly; net_get_random_once(&udp_ehash_secret, sizeof(udp_ehash_secret)); return __inet_ehashfn(laddr, lport, faddr, fport, udp_ehash_secret + net_hash_mix(net)); } /* called with rcu_read_lock() */ static struct sock *udp4_lib_lookup2(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, unsigned int hnum, int dif, int sdif, struct udp_hslot *hslot2, struct sk_buff *skb) { struct sock *sk, *result; int score, badness; result = NULL; badness = 0; udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif); if (score > badness) { badness = score; if (sk->sk_state == TCP_ESTABLISHED) { result = sk; continue; } result = inet_lookup_reuseport(net, sk, skb, sizeof(struct udphdr), saddr, sport, daddr, hnum, udp_ehashfn); if (!result) { result = sk; continue; } /* Fall back to scoring if group has connections */ if (!reuseport_has_conns(sk)) return result; /* Reuseport logic returned an error, keep original score. */ if (IS_ERR(result)) continue; badness = compute_score(result, net, saddr, sport, daddr, hnum, dif, sdif); } } return result; } /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. -DaveM */ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif, struct udp_table *udptable, struct sk_buff *skb) { unsigned short hnum = ntohs(dport); unsigned int hash2, slot2; struct udp_hslot *hslot2; struct sock *result, *sk; hash2 = ipv4_portaddr_hash(net, daddr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; /* Lookup connected or non-wildcard socket */ result = udp4_lib_lookup2(net, saddr, sport, daddr, hnum, dif, sdif, hslot2, skb); if (!IS_ERR_OR_NULL(result) && result->sk_state == TCP_ESTABLISHED) goto done; /* Lookup redirect from BPF */ if (static_branch_unlikely(&bpf_sk_lookup_enabled) && udptable == net->ipv4.udp_table) { sk = inet_lookup_run_sk_lookup(net, IPPROTO_UDP, skb, sizeof(struct udphdr), saddr, sport, daddr, hnum, dif, udp_ehashfn); if (sk) { result = sk; goto done; } } /* Got non-wildcard socket or error on first lookup */ if (result) goto done; /* Lookup wildcard sockets */ hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; result = udp4_lib_lookup2(net, saddr, sport, htonl(INADDR_ANY), hnum, dif, sdif, hslot2, skb); done: if (IS_ERR(result)) return NULL; return result; } EXPORT_SYMBOL_GPL(__udp4_lib_lookup); static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport, struct udp_table *udptable) { const struct iphdr *iph = ip_hdr(skb); return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, iph->daddr, dport, inet_iif(skb), inet_sdif(skb), udptable, skb); } struct sock *udp4_lib_lookup_skb(const struct sk_buff *skb, __be16 sport, __be16 dport) { const struct iphdr *iph = ip_hdr(skb); struct net *net = dev_net(skb->dev); int iif, sdif; inet_get_iif_sdif(skb, &iif, &sdif); return __udp4_lib_lookup(net, iph->saddr, sport, iph->daddr, dport, iif, sdif, net->ipv4.udp_table, NULL); } /* Must be called under rcu_read_lock(). * Does increment socket refcount. */ #if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4) struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { struct sock *sk; sk = __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, 0, net->ipv4.udp_table, NULL); if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; return sk; } EXPORT_SYMBOL_GPL(udp4_lib_lookup); #endif static inline bool __udp_is_mcast_sock(struct net *net, const struct sock *sk, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, int dif, int sdif, unsigned short hnum) { const struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net) || udp_sk(sk)->udp_port_hash != hnum || (inet->inet_daddr && inet->inet_daddr != rmt_addr) || (inet->inet_dport != rmt_port && inet->inet_dport) || (inet->inet_rcv_saddr && inet->inet_rcv_saddr != loc_addr) || ipv6_only_sock(sk) || !udp_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return false; if (!ip_mc_sf_allow(sk, loc_addr, rmt_addr, dif, sdif)) return false; return true; } DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); void udp_encap_enable(void) { static_branch_inc(&udp_encap_needed_key); } EXPORT_SYMBOL(udp_encap_enable); void udp_encap_disable(void) { static_branch_dec(&udp_encap_needed_key); } EXPORT_SYMBOL(udp_encap_disable); /* Handler for tunnels with arbitrary destination ports: no socket lookup, go * through error handlers in encapsulations looking for a match. */ static int __udp4_lib_err_encap_no_sk(struct sk_buff *skb, u32 info) { int i; for (i = 0; i < MAX_IPTUN_ENCAP_OPS; i++) { int (*handler)(struct sk_buff *skb, u32 info); const struct ip_tunnel_encap_ops *encap; encap = rcu_dereference(iptun_encaps[i]); if (!encap) continue; handler = encap->err_handler; if (handler && !handler(skb, info)) return 0; } return -ENOENT; } /* Try to match ICMP errors to UDP tunnels by looking up a socket without * reversing source and destination port: this will match tunnels that force the * same destination port on both endpoints (e.g. VXLAN, GENEVE). Note that * lwtunnels might actually break this assumption by being configured with * different destination ports on endpoints, in this case we won't be able to * trace ICMP messages back to them. * * If this doesn't match any socket, probe tunnels with arbitrary destination * ports (e.g. FoU, GUE): there, the receiving socket is useless, as the port * we've sent packets to won't necessarily match the local destination port. * * Then ask the tunnel implementation to match the error against a valid * association. * * Return an error if we can't find a match, the socket if we need further * processing, zero otherwise. */ static struct sock *__udp4_lib_err_encap(struct net *net, const struct iphdr *iph, struct udphdr *uh, struct udp_table *udptable, struct sock *sk, struct sk_buff *skb, u32 info) { int (*lookup)(struct sock *sk, struct sk_buff *skb); int network_offset, transport_offset; struct udp_sock *up; network_offset = skb_network_offset(skb); transport_offset = skb_transport_offset(skb); /* Network header needs to point to the outer IPv4 header inside ICMP */ skb_reset_network_header(skb); /* Transport header needs to point to the UDP header */ skb_set_transport_header(skb, iph->ihl << 2); if (sk) { up = udp_sk(sk); lookup = READ_ONCE(up->encap_err_lookup); if (lookup && lookup(sk, skb)) sk = NULL; goto out; } sk = __udp4_lib_lookup(net, iph->daddr, uh->source, iph->saddr, uh->dest, skb->dev->ifindex, 0, udptable, NULL); if (sk) { up = udp_sk(sk); lookup = READ_ONCE(up->encap_err_lookup); if (!lookup || lookup(sk, skb)) sk = NULL; } out: if (!sk) sk = ERR_PTR(__udp4_lib_err_encap_no_sk(skb, info)); skb_set_transport_header(skb, transport_offset); skb_set_network_header(skb, network_offset); return sk; } /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should * be closed and the error returned to the user. If err > 0 * it's just the icmp type << 8 | icmp code. * Header points to the ip header of the error packet. We move * on past this. Then (as it used to claim before adjustment) * header points to the first 8 bytes of the udp header. We need * to find the appropriate port. */ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) { struct inet_sock *inet; const struct iphdr *iph = (const struct iphdr *)skb->data; struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2)); const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; bool tunnel = false; struct sock *sk; int harderr; int err; struct net *net = dev_net(skb->dev); sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex, inet_sdif(skb), udptable, NULL); if (!sk || READ_ONCE(udp_sk(sk)->encap_type)) { /* No socket for error: try tunnels before discarding */ if (static_branch_unlikely(&udp_encap_needed_key)) { sk = __udp4_lib_err_encap(net, iph, uh, udptable, sk, skb, info); if (!sk) return 0; } else sk = ERR_PTR(-ENOENT); if (IS_ERR(sk)) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return PTR_ERR(sk); } tunnel = true; } err = 0; harderr = 0; inet = inet_sk(sk); switch (type) { default: case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; case ICMP_SOURCE_QUENCH: goto out; case ICMP_PARAMETERPROB: err = EPROTO; harderr = 1; break; case ICMP_DEST_UNREACH: if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ ipv4_sk_update_pmtu(skb, sk, info); if (READ_ONCE(inet->pmtudisc) != IP_PMTUDISC_DONT) { err = EMSGSIZE; harderr = 1; break; } goto out; } err = EHOSTUNREACH; if (code <= NR_ICMP_UNREACH) { harderr = icmp_err_convert[code].fatal; err = icmp_err_convert[code].errno; } break; case ICMP_REDIRECT: ipv4_sk_redirect(skb, sk); goto out; } /* * RFC1122: OK. Passes ICMP errors back to application, as per * 4.1.3.3. */ if (tunnel) { /* ...not for tunnels though: we don't have a sending socket */ if (udp_sk(sk)->encap_err_rcv) udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest, info, (u8 *)(uh+1)); goto out; } if (!inet_test_bit(RECVERR, sk)) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) goto out; } else ip_icmp_error(sk, skb, err, uh->dest, info, (u8 *)(uh+1)); sk->sk_err = err; sk_error_report(sk); out: return 0; } int udp_err(struct sk_buff *skb, u32 info) { return __udp4_lib_err(skb, info, dev_net(skb->dev)->ipv4.udp_table); } /* * Throw away all pending data and cancel the corking. Socket is locked. */ void udp_flush_pending_frames(struct sock *sk) { struct udp_sock *up = udp_sk(sk); if (up->pending) { up->len = 0; WRITE_ONCE(up->pending, 0); ip_flush_pending_frames(sk); } } EXPORT_SYMBOL(udp_flush_pending_frames); /** * udp4_hwcsum - handle outgoing HW checksumming * @skb: sk_buff containing the filled-in UDP header * (checksum field must be zeroed out) * @src: source IP address * @dst: destination IP address */ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst) { struct udphdr *uh = udp_hdr(skb); int offset = skb_transport_offset(skb); int len = skb->len - offset; int hlen = len; __wsum csum = 0; if (!skb_has_frag_list(skb)) { /* * Only one fragment on the socket. */ skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, 0); } else { struct sk_buff *frags; /* * HW-checksum won't work as there are two or more * fragments on the socket so that all csums of sk_buffs * should be together */ skb_walk_frags(skb, frags) { csum = csum_add(csum, frags->csum); hlen -= frags->len; } csum = skb_checksum(skb, offset, hlen, csum); skb->ip_summed = CHECKSUM_NONE; uh->check = csum_tcpudp_magic(src, dst, len, IPPROTO_UDP, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } } EXPORT_SYMBOL_GPL(udp4_hwcsum); /* Function to set UDP checksum for an IPv4 UDP packet. This is intended * for the simple case like when setting the checksum for a UDP tunnel. */ void udp_set_csum(bool nocheck, struct sk_buff *skb, __be32 saddr, __be32 daddr, int len) { struct udphdr *uh = udp_hdr(skb); if (nocheck) { uh->check = 0; } else if (skb_is_gso(skb)) { uh->check = ~udp_v4_check(len, saddr, daddr, 0); } else if (skb->ip_summed == CHECKSUM_PARTIAL) { uh->check = 0; uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb)); if (uh->check == 0) uh->check = CSUM_MANGLED_0; } else { skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct udphdr, check); uh->check = ~udp_v4_check(len, saddr, daddr, 0); } } EXPORT_SYMBOL(udp_set_csum); static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, struct inet_cork *cork) { struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); struct udphdr *uh; int err; int is_udplite = IS_UDPLITE(sk); int offset = skb_transport_offset(skb); int len = skb->len - offset; int datalen = len - sizeof(*uh); __wsum csum = 0; /* * Create a UDP header */ uh = udp_hdr(skb); uh->source = inet->inet_sport; uh->dest = fl4->fl4_dport; uh->len = htons(len); uh->check = 0; if (cork->gso_size) { const int hlen = skb_network_header_len(skb) + sizeof(struct udphdr); if (hlen + cork->gso_size > cork->fragsize) { kfree_skb(skb); return -EINVAL; } if (datalen > cork->gso_size * UDP_MAX_SEGMENTS) { kfree_skb(skb); return -EINVAL; } if (sk->sk_no_check_tx) { kfree_skb(skb); return -EINVAL; } if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite || dst_xfrm(skb_dst(skb))) { kfree_skb(skb); return -EIO; } if (datalen > cork->gso_size) { skb_shinfo(skb)->gso_size = cork->gso_size; skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(datalen, cork->gso_size); } goto csum_partial; } if (is_udplite) /* UDP-Lite */ csum = udplite_csum(skb); else if (sk->sk_no_check_tx) { /* UDP csum off */ skb->ip_summed = CHECKSUM_NONE; goto send; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ csum_partial: udp4_hwcsum(skb, fl4->saddr, fl4->daddr); goto send; } else csum = udp_csum(skb); /* add protocol-dependent pseudo-header */ uh->check = csum_tcpudp_magic(fl4->saddr, fl4->daddr, len, sk->sk_protocol, csum); if (uh->check == 0) uh->check = CSUM_MANGLED_0; send: err = ip_send_skb(sock_net(sk), skb); if (err) { if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) { UDP_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS, is_udplite); err = 0; } } else UDP_INC_STATS(sock_net(sk), UDP_MIB_OUTDATAGRAMS, is_udplite); return err; } /* * Push out all pending data as one UDP datagram. Socket is locked. */ int udp_push_pending_frames(struct sock *sk) { struct udp_sock *up = udp_sk(sk); struct inet_sock *inet = inet_sk(sk); struct flowi4 *fl4 = &inet->cork.fl.u.ip4; struct sk_buff *skb; int err = 0; skb = ip_finish_skb(sk, fl4); if (!skb) goto out; err = udp_send_skb(skb, fl4, &inet->cork.base); out: up->len = 0; WRITE_ONCE(up->pending, 0); return err; } EXPORT_SYMBOL(udp_push_pending_frames); static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size) { switch (cmsg->cmsg_type) { case UDP_SEGMENT: if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u16))) return -EINVAL; *gso_size = *(__u16 *)CMSG_DATA(cmsg); return 0; default: return -EINVAL; } } int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size) { struct cmsghdr *cmsg; bool need_ip = false; int err; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_UDP) { need_ip = true; continue; } err = __udp_cmsg_send(cmsg, gso_size); if (err) return err; } return need_ip; } EXPORT_SYMBOL_GPL(udp_cmsg_send); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); struct flowi4 fl4_stack; struct flowi4 *fl4; int ulen = len; struct ipcm_cookie ipc; struct rtable *rt = NULL; int free = 0; int connected = 0; __be32 daddr, faddr, saddr; u8 tos, scope; __be16 dport; int err, is_udplite = IS_UDPLITE(sk); int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sk_buff *skb; struct ip_options_data opt_copy; int uc_index; if (len > 0xFFFF) return -EMSGSIZE; /* * Check the flags. */ if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message compatibility */ return -EOPNOTSUPP; getfrag = is_udplite ? udplite_getfrag : ip_generic_getfrag; fl4 = &inet->cork.fl.u.ip4; if (READ_ONCE(up->pending)) { /* * There are pending frames. * The socket lock must be held while it's corked. */ lock_sock(sk); if (likely(up->pending)) { if (unlikely(up->pending != AF_INET)) { release_sock(sk); return -EINVAL; } goto do_append_data; } release_sock(sk); } ulen += sizeof(struct udphdr); /* * Get and verify the address. */ if (usin) { if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { if (usin->sin_family != AF_UNSPEC) return -EAFNOSUPPORT; } daddr = usin->sin_addr.s_addr; dport = usin->sin_port; if (dport == 0) return -EINVAL; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; daddr = inet->inet_daddr; dport = inet->inet_dport; /* Open fast path for connected socket. Route will not be used, if at least one option is set. */ connected = 1; } ipcm_init_sk(&ipc, inet); ipc.gso_size = READ_ONCE(up->gso_size); if (msg->msg_controllen) { err = udp_cmsg_send(sk, msg, &ipc.gso_size); if (err > 0) err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6); if (unlikely(err < 0)) { kfree(ipc.opt); return err; } if (ipc.opt) free = 1; connected = 0; } if (!ipc.opt) { struct ip_options_rcu *inet_opt; rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { memcpy(&opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); ipc.opt = &opt_copy.opt; } rcu_read_unlock(); } if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &msg->msg_namelen, &ipc.addr); if (err) goto out_free; if (usin) { if (usin->sin_port == 0) { /* BPF program set invalid port. Reject it. */ err = -EINVAL; goto out_free; } daddr = usin->sin_addr.s_addr; dport = usin->sin_port; } } saddr = ipc.addr; ipc.addr = faddr = daddr; if (ipc.opt && ipc.opt->opt.srr) { if (!daddr) { err = -EINVAL; goto out_free; } faddr = ipc.opt->opt.faddr; connected = 0; } tos = get_rttos(&ipc, inet); scope = ip_sendmsg_scope(inet, &ipc, msg); if (scope == RT_SCOPE_LINK) connected = 0; uc_index = READ_ONCE(inet->uc_index); if (ipv4_is_multicast(daddr)) { if (!ipc.oif || netif_index_is_l3_master(sock_net(sk), ipc.oif)) ipc.oif = READ_ONCE(inet->mc_index); if (!saddr) saddr = READ_ONCE(inet->mc_addr); connected = 0; } else if (!ipc.oif) { ipc.oif = uc_index; } else if (ipv4_is_lbcast(daddr) && uc_index) { /* oif is set, packet is to local broadcast and * uc_index is set. oif is most likely set * by sk_bound_dev_if. If uc_index != oif check if the * oif is an L3 master and uc_index is an L3 slave. * If so, we want to allow the send using the uc_index. */ if (ipc.oif != uc_index && ipc.oif == l3mdev_master_ifindex_by_index(sock_net(sk), uc_index)) { ipc.oif = uc_index; } } if (connected) rt = (struct rtable *)sk_dst_check(sk, 0); if (!rt) { struct net *net = sock_net(sk); __u8 flow_flags = inet_sk_flowi_flags(sk); fl4 = &fl4_stack; flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, tos, scope, sk->sk_protocol, flow_flags, faddr, saddr, dport, inet->inet_sport, sk->sk_uid); security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; if (err == -ENETUNREACH) IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); goto out; } err = -EACCES; if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) goto out; if (connected) sk_dst_set(sk, dst_clone(&rt->dst)); } if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; back_from_confirm: saddr = fl4->saddr; if (!ipc.addr) daddr = ipc.addr = fl4->daddr; /* Lockless fast path for the non-corking case. */ if (!corkreq) { struct inet_cork cork; skb = ip_make_skb(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, &cork, msg->msg_flags); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) err = udp_send_skb(skb, fl4, &cork); goto out; } lock_sock(sk); if (unlikely(up->pending)) { /* The socket is already corked while preparing it. */ /* ... which is an evident application bug. --ANK */ release_sock(sk); net_dbg_ratelimited("socket already corked\n"); err = -EINVAL; goto out; } /* * Now cork the socket to pend data. */ fl4 = &inet->cork.fl.u.ip4; fl4->daddr = daddr; fl4->saddr = saddr; fl4->fl4_dport = dport; fl4->fl4_sport = inet->inet_sport; WRITE_ONCE(up->pending, AF_INET); do_append_data: up->len += ulen; err = ip_append_data(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); if (err) udp_flush_pending_frames(sk); else if (!corkreq) err = udp_push_pending_frames(sk); else if (unlikely(skb_queue_empty(&sk->sk_write_queue))) WRITE_ONCE(up->pending, 0); release_sock(sk); out: ip_rt_put(rt); out_free: if (free) kfree(ipc.opt); if (!err) return len; /* * ENOBUFS = no kernel mem, SOCK_NOSPACE = no sndbuf space. Reporting * ENOBUFS might not be good (it's not tunable per se), but otherwise * we don't have a good statistic (IpOutDiscards but it can be too many * things). We could add another new stat but at least for now that * seems like overkill. */ if (err == -ENOBUFS || test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { UDP_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS, is_udplite); } return err; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(&rt->dst, &fl4->daddr); if (!(msg->msg_flags&MSG_PROBE) || len) goto back_from_confirm; err = 0; goto out; } EXPORT_SYMBOL(udp_sendmsg); void udp_splice_eof(struct socket *sock) { struct sock *sk = sock->sk; struct udp_sock *up = udp_sk(sk); if (!READ_ONCE(up->pending) || udp_test_bit(CORK, sk)) return; lock_sock(sk); if (up->pending && !udp_test_bit(CORK, sk)) udp_push_pending_frames(sk); release_sock(sk); } EXPORT_SYMBOL_GPL(udp_splice_eof); #define UDP_SKB_IS_STATELESS 0x80000000 /* all head states (dst, sk, nf conntrack) except skb extensions are * cleared by udp_rcv(). * * We need to preserve secpath, if present, to eventually process * IP_CMSG_PASSSEC at recvmsg() time. * * Other extensions can be cleared. */ static bool udp_try_make_stateless(struct sk_buff *skb) { if (!skb_has_extensions(skb)) return true; if (!secpath_exists(skb)) { skb_ext_reset(skb); return true; } return false; } static void udp_set_dev_scratch(struct sk_buff *skb) { struct udp_dev_scratch *scratch = udp_skb_scratch(skb); BUILD_BUG_ON(sizeof(struct udp_dev_scratch) > sizeof(long)); scratch->_tsize_state = skb->truesize; #if BITS_PER_LONG == 64 scratch->len = skb->len; scratch->csum_unnecessary = !!skb_csum_unnecessary(skb); scratch->is_linear = !skb_is_nonlinear(skb); #endif if (udp_try_make_stateless(skb)) scratch->_tsize_state |= UDP_SKB_IS_STATELESS; } static void udp_skb_csum_unnecessary_set(struct sk_buff *skb) { /* We come here after udp_lib_checksum_complete() returned 0. * This means that __skb_checksum_complete() might have * set skb->csum_valid to 1. * On 64bit platforms, we can set csum_unnecessary * to true, but only if the skb is not shared. */ #if BITS_PER_LONG == 64 if (!skb_shared(skb)) udp_skb_scratch(skb)->csum_unnecessary = true; #endif } static int udp_skb_truesize(struct sk_buff *skb) { return udp_skb_scratch(skb)->_tsize_state & ~UDP_SKB_IS_STATELESS; } static bool udp_skb_has_head_state(struct sk_buff *skb) { return !(udp_skb_scratch(skb)->_tsize_state & UDP_SKB_IS_STATELESS); } /* fully reclaim rmem/fwd memory allocated for skb */ static void udp_rmem_release(struct sock *sk, int size, int partial, bool rx_queue_lock_held) { struct udp_sock *up = udp_sk(sk); struct sk_buff_head *sk_queue; int amt; if (likely(partial)) { up->forward_deficit += size; size = up->forward_deficit; if (size < READ_ONCE(up->forward_threshold) && !skb_queue_empty(&up->reader_queue)) return; } else { size += up->forward_deficit; } up->forward_deficit = 0; /* acquire the sk_receive_queue for fwd allocated memory scheduling, * if the called don't held it already */ sk_queue = &sk->sk_receive_queue; if (!rx_queue_lock_held) spin_lock(&sk_queue->lock); sk_forward_alloc_add(sk, size); amt = (sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1); sk_forward_alloc_add(sk, -amt); if (amt) __sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT); atomic_sub(size, &sk->sk_rmem_alloc); /* this can save us from acquiring the rx queue lock on next receive */ skb_queue_splice_tail_init(sk_queue, &up->reader_queue); if (!rx_queue_lock_held) spin_unlock(&sk_queue->lock); } /* Note: called with reader_queue.lock held. * Instead of using skb->truesize here, find a copy of it in skb->dev_scratch * This avoids a cache line miss while receive_queue lock is held. * Look at __udp_enqueue_schedule_skb() to find where this copy is done. */ void udp_skb_destructor(struct sock *sk, struct sk_buff *skb) { prefetch(&skb->data); udp_rmem_release(sk, udp_skb_truesize(skb), 1, false); } EXPORT_SYMBOL(udp_skb_destructor); /* as above, but the caller held the rx queue lock, too */ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) { prefetch(&skb->data); udp_rmem_release(sk, udp_skb_truesize(skb), 1, true); } /* Idea of busylocks is to let producers grab an extra spinlock * to relieve pressure on the receive_queue spinlock shared by consumer. * Under flood, this means that only one producer can be in line * trying to acquire the receive_queue spinlock. * These busylock can be allocated on a per cpu manner, instead of a * per socket one (that would consume a cache line per socket) */ static int udp_busylocks_log __read_mostly; static spinlock_t *udp_busylocks __read_mostly; static spinlock_t *busylock_acquire(void *ptr) { spinlock_t *busy; busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log); spin_lock(busy); return busy; } static void busylock_release(spinlock_t *busy) { if (busy) spin_unlock(busy); } static int udp_rmem_schedule(struct sock *sk, int size) { int delta; delta = size - sk->sk_forward_alloc; if (delta > 0 && !__sk_mem_schedule(sk, delta, SK_MEM_RECV)) return -ENOBUFS; return 0; } int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; int rmem, err = -ENOMEM; spinlock_t *busy = NULL; int size; /* try to avoid the costly atomic add/sub pair when the receive * queue is full; always allow at least a packet */ rmem = atomic_read(&sk->sk_rmem_alloc); if (rmem > sk->sk_rcvbuf) goto drop; /* Under mem pressure, it might be helpful to help udp_recvmsg() * having linear skbs : * - Reduce memory overhead and thus increase receive queue capacity * - Less cache line misses at copyout() time * - Less work at consume_skb() (less alien page frag freeing) */ if (rmem > (sk->sk_rcvbuf >> 1)) { skb_condense(skb); busy = busylock_acquire(sk); } size = skb->truesize; udp_set_dev_scratch(skb); /* we drop only if the receive buf is full and the receive * queue contains some other skb */ rmem = atomic_add_return(size, &sk->sk_rmem_alloc); if (rmem > (size + (unsigned int)sk->sk_rcvbuf)) goto uncharge_drop; spin_lock(&list->lock); err = udp_rmem_schedule(sk, size); if (err) { spin_unlock(&list->lock); goto uncharge_drop; } sk_forward_alloc_add(sk, -size); /* no need to setup a destructor, we will explicitly release the * forward allocated memory on dequeue */ sock_skb_set_dropcount(sk, skb); __skb_queue_tail(list, skb); spin_unlock(&list->lock); if (!sock_flag(sk, SOCK_DEAD)) INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); busylock_release(busy); return 0; uncharge_drop: atomic_sub(skb->truesize, &sk->sk_rmem_alloc); drop: atomic_inc(&sk->sk_drops); busylock_release(busy); return err; } EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb); void udp_destruct_common(struct sock *sk) { /* reclaim completely the forward allocated memory */ struct udp_sock *up = udp_sk(sk); unsigned int total = 0; struct sk_buff *skb; skb_queue_splice_tail_init(&sk->sk_receive_queue, &up->reader_queue); while ((skb = __skb_dequeue(&up->reader_queue)) != NULL) { total += skb->truesize; kfree_skb(skb); } udp_rmem_release(sk, total, 0, true); } EXPORT_SYMBOL_GPL(udp_destruct_common); static void udp_destruct_sock(struct sock *sk) { udp_destruct_common(sk); inet_sock_destruct(sk); } int udp_init_sock(struct sock *sk) { udp_lib_init_sock(sk); sk->sk_destruct = udp_destruct_sock; set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); return 0; } void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) { if (unlikely(READ_ONCE(sk->sk_peek_off) >= 0)) { bool slow = lock_sock_fast(sk); sk_peek_offset_bwd(sk, len); unlock_sock_fast(sk, slow); } if (!skb_unref(skb)) return; /* In the more common cases we cleared the head states previously, * see __udp_queue_rcv_skb(). */ if (unlikely(udp_skb_has_head_state(skb))) skb_release_head_state(skb); __consume_stateless_skb(skb); } EXPORT_SYMBOL_GPL(skb_consume_udp); static struct sk_buff *__first_packet_length(struct sock *sk, struct sk_buff_head *rcvq, int *total) { struct sk_buff *skb; while ((skb = skb_peek(rcvq)) != NULL) { if (udp_lib_checksum_complete(skb)) { __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, IS_UDPLITE(sk)); __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, IS_UDPLITE(sk)); atomic_inc(&sk->sk_drops); __skb_unlink(skb, rcvq); *total += skb->truesize; kfree_skb(skb); } else { udp_skb_csum_unnecessary_set(skb); break; } } return skb; } /** * first_packet_length - return length of first packet in receive queue * @sk: socket * * Drops all bad checksum frames, until a valid one is found. * Returns the length of found skb, or -1 if none is found. */ static int first_packet_length(struct sock *sk) { struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue; struct sk_buff_head *sk_queue = &sk->sk_receive_queue; struct sk_buff *skb; int total = 0; int res; spin_lock_bh(&rcvq->lock); skb = __first_packet_length(sk, rcvq, &total); if (!skb && !skb_queue_empty_lockless(sk_queue)) { spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, rcvq); spin_unlock(&sk_queue->lock); skb = __first_packet_length(sk, rcvq, &total); } res = skb ? skb->len : -1; if (total) udp_rmem_release(sk, total, 1, false); spin_unlock_bh(&rcvq->lock); return res; } /* * IOCTL requests applicable to the UDP protocol */ int udp_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); return 0; } case SIOCINQ: { *karg = max_t(int, 0, first_packet_length(sk)); return 0; } default: return -ENOIOCTLCMD; } return 0; } EXPORT_SYMBOL(udp_ioctl); struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int *off, int *err) { struct sk_buff_head *sk_queue = &sk->sk_receive_queue; struct sk_buff_head *queue; struct sk_buff *last; long timeo; int error; queue = &udp_sk(sk)->reader_queue; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); do { struct sk_buff *skb; error = sock_error(sk); if (error) break; error = -EAGAIN; do { spin_lock_bh(&queue->lock); skb = __skb_try_recv_from_queue(sk, queue, flags, off, err, &last); if (skb) { if (!(flags & MSG_PEEK)) udp_skb_destructor(sk, skb); spin_unlock_bh(&queue->lock); return skb; } if (skb_queue_empty_lockless(sk_queue)) { spin_unlock_bh(&queue->lock); goto busy_check; } /* refill the reader queue and walk it again * keep both queues locked to avoid re-acquiring * the sk_receive_queue lock if fwd memory scheduling * is needed. */ spin_lock(&sk_queue->lock); skb_queue_splice_tail_init(sk_queue, queue); skb = __skb_try_recv_from_queue(sk, queue, flags, off, err, &last); if (skb && !(flags & MSG_PEEK)) udp_skb_dtor_locked(sk, skb); spin_unlock(&sk_queue->lock); spin_unlock_bh(&queue->lock); if (skb) return skb; busy_check: if (!sk_can_busy_loop(sk)) break; sk_busy_loop(sk, flags & MSG_DONTWAIT); } while (!skb_queue_empty_lockless(sk_queue)); /* sk_queue is empty, reader_queue may contain peeked packets */ } while (timeo && !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue, &error, &timeo, (struct sk_buff *)sk_queue)); *err = error; return NULL; } EXPORT_SYMBOL(__skb_recv_udp); int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct sk_buff *skb; int err; try_again: skb = skb_recv_udp(sk, MSG_DONTWAIT, &err); if (!skb) return err; if (udp_lib_checksum_complete(skb)) { int is_udplite = IS_UDPLITE(sk); struct net *net = sock_net(sk); __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite); __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); kfree_skb(skb); goto try_again; } WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); return recv_actor(sk, skb); } EXPORT_SYMBOL(udp_read_skb); /* * This should be easy, if there is something there we * return it, otherwise we block. */ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; unsigned int ulen, copied; int off, err, peeking = flags & MSG_PEEK; int is_udplite = IS_UDPLITE(sk); bool checksum_valid = false; if (flags & MSG_ERRQUEUE) return ip_recv_error(sk, msg, len, addr_len); try_again: off = sk_peek_offset(sk, flags); skb = __skb_recv_udp(sk, flags, &off, &err); if (!skb) return err; ulen = udp_skb_len(skb); copied = len; if (copied > ulen - off) copied = ulen - off; else if (copied < ulen) msg->msg_flags |= MSG_TRUNC; /* * If checksum is needed at all, try to do it while copying the * data. If the data is truncated, or if we only want a partial * coverage checksum (UDP-Lite), do it before the copy. */ if (copied < ulen || peeking || (is_udplite && UDP_SKB_CB(skb)->partial_cov)) { checksum_valid = udp_skb_csum_unnecessary(skb) || !__udp_lib_checksum_complete(skb); if (!checksum_valid) goto csum_copy_err; } if (checksum_valid || udp_skb_csum_unnecessary(skb)) { if (udp_skb_is_linear(skb)) err = copy_linear_skb(skb, copied, off, &msg->msg_iter); else err = skb_copy_datagram_msg(skb, off, msg, copied); } else { err = skb_copy_and_csum_datagram_msg(skb, off, msg); if (err == -EINVAL) goto csum_copy_err; } if (unlikely(err)) { if (!peeking) { atomic_inc(&sk->sk_drops); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } kfree_skb(skb); return err; } if (!peeking) UDP_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, is_udplite); sock_recv_cmsgs(msg, sk, skb); /* Copy the address. */ if (sin) { sin->sin_family = AF_INET; sin->sin_port = udp_hdr(skb)->source; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, (struct sockaddr *)sin, addr_len); } if (udp_test_bit(GRO_ENABLED, sk)) udp_cmsg_recv(msg, sk, skb); if (inet_cmsg_flags(inet)) ip_cmsg_recv_offset(msg, sk, skb, sizeof(struct udphdr), off); err = copied; if (flags & MSG_TRUNC) err = ulen; skb_consume_udp(sk, skb, peeking ? -err : err); return err; csum_copy_err: if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags, udp_skb_destructor)) { UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } kfree_skb(skb); /* starting over for a new packet, but check if we need to yield */ cond_resched(); msg->msg_flags &= ~MSG_TRUNC; goto try_again; } int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { /* This check is replicated from __ip4_datagram_connect() and * intended to prevent BPF program called below from accessing bytes * that are out of the bound specified by user in addr_len. */ if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len); } EXPORT_SYMBOL(udp_pre_connect); int __udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); /* * 1003.1g - break association. */ sk->sk_state = TCP_CLOSE; inet->inet_daddr = 0; inet->inet_dport = 0; sock_rps_reset_rxhash(sk); sk->sk_bound_dev_if = 0; if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) { inet_reset_saddr(sk); if (sk->sk_prot->rehash && (sk->sk_userlocks & SOCK_BINDPORT_LOCK)) sk->sk_prot->rehash(sk); } if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) { sk->sk_prot->unhash(sk); inet->inet_sport = 0; } sk_dst_reset(sk); return 0; } EXPORT_SYMBOL(__udp_disconnect); int udp_disconnect(struct sock *sk, int flags) { lock_sock(sk); __udp_disconnect(sk, flags); release_sock(sk); return 0; } EXPORT_SYMBOL(udp_disconnect); void udp_lib_unhash(struct sock *sk) { if (sk_hashed(sk)) { struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2; hslot = udp_hashslot(udptable, sock_net(sk), udp_sk(sk)->udp_port_hash); hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock_bh(&hslot->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); if (sk_del_node_init_rcu(sk)) { hslot->count--; inet_sk(sk)->inet_num = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_lock(&hslot2->lock); hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); } spin_unlock_bh(&hslot->lock); } } EXPORT_SYMBOL(udp_lib_unhash); /* * inet_rcv_saddr was changed, we must rehash secondary hash */ void udp_lib_rehash(struct sock *sk, u16 newhash) { if (sk_hashed(sk)) { struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2, *nhslot2; hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); nhslot2 = udp_hashslot2(udptable, newhash); udp_sk(sk)->udp_portaddr_hash = newhash; if (hslot2 != nhslot2 || rcu_access_pointer(sk->sk_reuseport_cb)) { hslot = udp_hashslot(udptable, sock_net(sk), udp_sk(sk)->udp_port_hash); /* we must lock primary chain too */ spin_lock_bh(&hslot->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); if (hslot2 != nhslot2) { spin_lock(&hslot2->lock); hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); hslot2->count--; spin_unlock(&hslot2->lock); spin_lock(&nhslot2->lock); hlist_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, &nhslot2->head); nhslot2->count++; spin_unlock(&nhslot2->lock); } spin_unlock_bh(&hslot->lock); } } } EXPORT_SYMBOL(udp_lib_rehash); void udp_v4_rehash(struct sock *sk) { u16 new_hash = ipv4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num); udp_lib_rehash(sk, new_hash); } static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { int rc; if (inet_sk(sk)->inet_daddr) { sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); sk_incoming_cpu_update(sk); } else { sk_mark_napi_id_once(sk, skb); } rc = __udp_enqueue_schedule_skb(sk, skb); if (rc < 0) { int is_udplite = IS_UDPLITE(sk); int drop_reason; /* Note that an ENOMEM error is charged twice */ if (rc == -ENOMEM) { UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite); drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; } else { UDP_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS, is_udplite); drop_reason = SKB_DROP_REASON_PROTO_MEM; } UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); kfree_skb_reason(skb, drop_reason); trace_udp_fail_queue_rcv_skb(rc, sk); return -1; } return 0; } /* returns: * -1: error * 0: success * >0: "udp encap" protocol resubmission * * Note that in the success and error cases, the skb is assumed to * have either been requeued or freed. */ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) { int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); /* * Charge it to the socket, dropping if the queue is full. */ if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; goto drop; } nf_reset_ct(skb); if (static_branch_unlikely(&udp_encap_needed_key) && READ_ONCE(up->encap_type)) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); /* * This is an encapsulation socket so pass the skb to * the socket's udp_encap_rcv() hook. Otherwise, just * fall through and pass this up the UDP socket. * up->encap_rcv() returns the following value: * =0 if skb was successfully passed to the encap * handler or was discarded by it. * >0 if skb should be passed on to UDP. * <0 if skb should be resubmitted as proto -N */ /* if we're overly short, let UDP handle it */ encap_rcv = READ_ONCE(up->encap_rcv); if (encap_rcv) { int ret; /* Verify checksum before giving to encap */ if (udp_lib_checksum_complete(skb)) goto csum_error; ret = encap_rcv(sk, skb); if (ret <= 0) { __UDP_INC_STATS(sock_net(sk), UDP_MIB_INDATAGRAMS, is_udplite); return -ret; } } /* FALLTHROUGH -- it's a UDP Packet */ } /* * UDP-Lite specific tests, ignored on UDP sockets */ if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) { u16 pcrlen = READ_ONCE(up->pcrlen); /* * MIB statistics other than incrementing the error count are * disabled for the following two types of errors: these depend * on the application settings, not on the functioning of the * protocol stack as such. * * RFC 3828 here recommends (sec 3.3): "There should also be a * way ... to ... at least let the receiving application block * delivery of packets with coverage values less than a value * provided by the application." */ if (pcrlen == 0) { /* full coverage was set */ net_dbg_ratelimited("UDPLite: partial coverage %d while full coverage %d requested\n", UDP_SKB_CB(skb)->cscov, skb->len); goto drop; } /* The next case involves violating the min. coverage requested * by the receiver. This is subtle: if receiver wants x and x is * greater than the buffersize/MTU then receiver will complain * that it wants x while sender emits packets of smaller size y. * Therefore the above ...()->partial_cov statement is essential. */ if (UDP_SKB_CB(skb)->cscov < pcrlen) { net_dbg_ratelimited("UDPLite: coverage %d too small, need min %d\n", UDP_SKB_CB(skb)->cscov, pcrlen); goto drop; } } prefetch(&sk->sk_rmem_alloc); if (rcu_access_pointer(sk->sk_filter) && udp_lib_checksum_complete(skb)) goto csum_error; if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto drop; } udp_csum_pull_header(skb); ipv4_pktinfo_prepare(sk, skb, true); return __udp_queue_rcv_skb(sk, skb); csum_error: drop_reason = SKB_DROP_REASON_UDP_CSUM; __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); kfree_skb_reason(skb, drop_reason); return -1; } static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff *next, *segs; int ret; if (likely(!udp_unexpected_gso(sk, skb))) return udp_queue_rcv_one_skb(sk, skb); BUILD_BUG_ON(sizeof(struct udp_skb_cb) > SKB_GSO_CB_OFFSET); __skb_push(skb, -skb_mac_offset(skb)); segs = udp_rcv_segment(sk, skb, true); skb_list_walk_safe(segs, skb, next) { __skb_pull(skb, skb_transport_offset(skb)); udp_post_segment_fix_csum(skb); ret = udp_queue_rcv_one_skb(sk, skb); if (ret > 0) ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret); } return 0; } /* For TCP sockets, sk_rx_dst is protected by socket lock * For UDP, we use xchg() to guard against concurrent changes. */ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old; if (dst_hold_safe(dst)) { old = xchg((__force struct dst_entry **)&sk->sk_rx_dst, dst); dst_release(old); return old != dst; } return false; } EXPORT_SYMBOL(udp_sk_rx_dst_set); /* * Multicasts and broadcasts go to each listener. * * Note: called only from the BH handler context. */ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udphdr *uh, __be32 saddr, __be32 daddr, struct udp_table *udptable, int proto) { struct sock *sk, *first = NULL; unsigned short hnum = ntohs(uh->dest); struct udp_hslot *hslot = udp_hashslot(udptable, net, hnum); unsigned int hash2 = 0, hash2_any = 0, use_hash2 = (hslot->count > 10); unsigned int offset = offsetof(typeof(*sk), sk_node); int dif = skb->dev->ifindex; int sdif = inet_sdif(skb); struct hlist_node *node; struct sk_buff *nskb; if (use_hash2) { hash2_any = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum) & udptable->mask; hash2 = ipv4_portaddr_hash(net, daddr, hnum) & udptable->mask; start_lookup: hslot = &udptable->hash2[hash2]; offset = offsetof(typeof(*sk), __sk_common.skc_portaddr_node); } sk_for_each_entry_offset_rcu(sk, node, &hslot->head, offset) { if (!__udp_is_mcast_sock(net, sk, uh->dest, daddr, uh->source, saddr, dif, sdif, hnum)) continue; if (!first) { first = sk; continue; } nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { atomic_inc(&sk->sk_drops); __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP_INC_STATS(net, UDP_MIB_INERRORS, IS_UDPLITE(sk)); continue; } if (udp_queue_rcv_skb(sk, nskb) > 0) consume_skb(nskb); } /* Also lookup *:port if we are using hash2 and haven't done so yet. */ if (use_hash2 && hash2 != hash2_any) { hash2 = hash2_any; goto start_lookup; } if (first) { if (udp_queue_rcv_skb(first, skb) > 0) consume_skb(skb); } else { kfree_skb(skb); __UDP_INC_STATS(net, UDP_MIB_IGNOREDMULTI, proto == IPPROTO_UDPLITE); } return 0; } /* Initialize UDP checksum. If exited with zero value (success), * CHECKSUM_UNNECESSARY means, that no more checks are required. * Otherwise, csum completion requires checksumming packet body, * including udp header and folding it to skb->csum. */ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) { int err; UDP_SKB_CB(skb)->partial_cov = 0; UDP_SKB_CB(skb)->cscov = skb->len; if (proto == IPPROTO_UDPLITE) { err = udplite_checksum_init(skb, uh); if (err) return err; if (UDP_SKB_CB(skb)->partial_cov) { skb->csum = inet_compute_pseudo(skb, proto); return 0; } } /* Note, we are only interested in != 0 or == 0, thus the * force to int. */ err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, inet_compute_pseudo); if (err) return err; if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) { /* If SW calculated the value, we know it's bad */ if (skb->csum_complete_sw) return 1; /* HW says the value is bad. Let's validate that. * skb->csum is no longer the full packet checksum, * so don't treat it as such. */ skb_checksum_complete_unset(skb); } return 0; } /* wrapper for udp_queue_rcv_skb tacking care of csum conversion and * return code conversion for ip layer consumption */ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, struct udphdr *uh) { int ret; if (inet_get_convert_csum(sk) && uh->check && !IS_UDPLITE(sk)) skb_checksum_try_convert(skb, IPPROTO_UDP, inet_compute_pseudo); ret = udp_queue_rcv_skb(sk, skb); /* a return value > 0 means to resubmit the input, but * it wants the return to be -protocol, or 0 */ if (ret > 0) return -ret; return 0; } /* * All we need to do is get the socket, and then do a checksum. */ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, int proto) { struct sock *sk; struct udphdr *uh; unsigned short ulen; struct rtable *rt = skb_rtable(skb); __be32 saddr, daddr; struct net *net = dev_net(skb->dev); bool refcounted; int drop_reason; drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; /* * Validate the packet. */ if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto drop; /* No space for header. */ uh = udp_hdr(skb); ulen = ntohs(uh->len); saddr = ip_hdr(skb)->saddr; daddr = ip_hdr(skb)->daddr; if (ulen > skb->len) goto short_packet; if (proto == IPPROTO_UDP) { /* UDP validates ulen. */ if (ulen < sizeof(*uh) || pskb_trim_rcsum(skb, ulen)) goto short_packet; uh = udp_hdr(skb); } if (udp4_csum_init(skb, uh, proto)) goto csum_error; sk = inet_steal_sock(net, skb, sizeof(struct udphdr), saddr, uh->source, daddr, uh->dest, &refcounted, udp_ehashfn); if (IS_ERR(sk)) goto no_sk; if (sk) { struct dst_entry *dst = skb_dst(skb); int ret; if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst)) udp_sk_rx_dst_set(sk, dst); ret = udp_unicast_rcv_skb(sk, skb, uh); if (refcounted) sock_put(sk); return ret; } if (rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return __udp4_lib_mcast_deliver(net, skb, uh, saddr, daddr, udptable, proto); sk = __udp4_lib_lookup_skb(skb, uh->source, uh->dest, udptable); if (sk) return udp_unicast_rcv_skb(sk, skb, uh); no_sk: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; nf_reset_ct(skb); /* No socket. Drop packet silently, if checksum is wrong */ if (udp_lib_checksum_complete(skb)) goto csum_error; drop_reason = SKB_DROP_REASON_NO_SOCKET; __UDP_INC_STATS(net, UDP_MIB_NOPORTS, proto == IPPROTO_UDPLITE); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); /* * Hmm. We got an UDP packet to a port to which we * don't wanna listen. Ignore it. */ kfree_skb_reason(skb, drop_reason); return 0; short_packet: drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; net_dbg_ratelimited("UDP%s: short packet: From %pI4:%u %d/%d to %pI4:%u\n", proto == IPPROTO_UDPLITE ? "Lite" : "", &saddr, ntohs(uh->source), ulen, skb->len, &daddr, ntohs(uh->dest)); goto drop; csum_error: /* * RFC1122: OK. Discards the bad packet silently (as far as * the network is concerned, anyway) as per 4.1.3.4 (MUST). */ drop_reason = SKB_DROP_REASON_UDP_CSUM; net_dbg_ratelimited("UDP%s: bad checksum. From %pI4:%u to %pI4:%u ulen %d\n", proto == IPPROTO_UDPLITE ? "Lite" : "", &saddr, ntohs(uh->source), &daddr, ntohs(uh->dest), ulen); __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, proto == IPPROTO_UDPLITE); drop: __UDP_INC_STATS(net, UDP_MIB_INERRORS, proto == IPPROTO_UDPLITE); kfree_skb_reason(skb, drop_reason); return 0; } /* We can only early demux multicast if there is a single matching socket. * If more than one socket found returns NULL */ static struct sock *__udp4_lib_mcast_demux_lookup(struct net *net, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, int dif, int sdif) { struct udp_table *udptable = net->ipv4.udp_table; unsigned short hnum = ntohs(loc_port); struct sock *sk, *result; struct udp_hslot *hslot; unsigned int slot; slot = udp_hashfn(net, hnum, udptable->mask); hslot = &udptable->hash[slot]; /* Do not bother scanning a too big list */ if (hslot->count > 10) return NULL; result = NULL; sk_for_each_rcu(sk, &hslot->head) { if (__udp_is_mcast_sock(net, sk, loc_port, loc_addr, rmt_port, rmt_addr, dif, sdif, hnum)) { if (result) return NULL; result = sk; } } return result; } /* For unicast we should only early demux connected sockets or we can * break forwarding setups. The chains here can be long so only check * if the first socket is an exact match and if not move on. */ static struct sock *__udp4_lib_demux_lookup(struct net *net, __be16 loc_port, __be32 loc_addr, __be16 rmt_port, __be32 rmt_addr, int dif, int sdif) { struct udp_table *udptable = net->ipv4.udp_table; INET_ADDR_COOKIE(acookie, rmt_addr, loc_addr); unsigned short hnum = ntohs(loc_port); unsigned int hash2, slot2; struct udp_hslot *hslot2; __portpair ports; struct sock *sk; hash2 = ipv4_portaddr_hash(net, loc_addr, hnum); slot2 = hash2 & udptable->mask; hslot2 = &udptable->hash2[slot2]; ports = INET_COMBINED_PORTS(rmt_port, hnum); udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { if (inet_match(net, sk, acookie, ports, dif, sdif)) return sk; /* Only check first socket in chain */ break; } return NULL; } int udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct in_device *in_dev = NULL; const struct iphdr *iph; const struct udphdr *uh; struct sock *sk = NULL; struct dst_entry *dst; int dif = skb->dev->ifindex; int sdif = inet_sdif(skb); int ours; /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) return 0; iph = ip_hdr(skb); uh = udp_hdr(skb); if (skb->pkt_type == PACKET_MULTICAST) { in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) return 0; ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, iph->protocol); if (!ours) return 0; sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, dif, sdif); } else if (skb->pkt_type == PACKET_HOST) { sk = __udp4_lib_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, dif, sdif); } if (!sk || !refcount_inc_not_zero(&sk->sk_refcnt)) return 0; skb->sk = sk; skb->destructor = sock_efree; dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, 0); if (dst) { u32 itag = 0; /* set noref for now. * any place which wants to hold dst has to call * dst_hold_safe() */ skb_dst_set_noref(skb, dst); /* for unconnected multicast sockets we need to validate * the source on each packet */ if (!inet_sk(sk)->inet_daddr && in_dev) return ip_mc_validate_source(skb, iph->daddr, iph->saddr, iph->tos & IPTOS_RT_MASK, skb->dev, in_dev, &itag); } return 0; } int udp_rcv(struct sk_buff *skb) { return __udp4_lib_rcv(skb, dev_net(skb->dev)->ipv4.udp_table, IPPROTO_UDP); } void udp_destroy_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); bool slow = lock_sock_fast(sk); /* protects from races with udp_abort() */ sock_set_flag(sk, SOCK_DEAD); udp_flush_pending_frames(sk); unlock_sock_fast(sk, slow); if (static_branch_unlikely(&udp_encap_needed_key)) { if (up->encap_type) { void (*encap_destroy)(struct sock *sk); encap_destroy = READ_ONCE(up->encap_destroy); if (encap_destroy) encap_destroy(sk); } if (udp_test_bit(ENCAP_ENABLED, sk)) static_branch_dec(&udp_encap_needed_key); } } static void set_xfrm_gro_udp_encap_rcv(__u16 encap_type, unsigned short family, struct sock *sk) { #ifdef CONFIG_XFRM if (udp_test_bit(GRO_ENABLED, sk) && encap_type == UDP_ENCAP_ESPINUDP) { if (family == AF_INET) WRITE_ONCE(udp_sk(sk)->gro_receive, xfrm4_gro_udp_encap_rcv); else if (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) WRITE_ONCE(udp_sk(sk)->gro_receive, ipv6_stub->xfrm6_gro_udp_encap_rcv); } #endif } /* * Socket option code for UDP */ int udp_lib_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen, int (*push_pending_frames)(struct sock *)) { struct udp_sock *up = udp_sk(sk); int val, valbool; int err = 0; int is_udplite = IS_UDPLITE(sk); if (level == SOL_SOCKET) { err = sk_setsockopt(sk, level, optname, optval, optlen); if (optname == SO_RCVBUF || optname == SO_RCVBUFFORCE) { sockopt_lock_sock(sk); /* paired with READ_ONCE in udp_rmem_release() */ WRITE_ONCE(up->forward_threshold, sk->sk_rcvbuf >> 2); sockopt_release_sock(sk); } return err; } if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; valbool = val ? 1 : 0; switch (optname) { case UDP_CORK: if (val != 0) { udp_set_bit(CORK, sk); } else { udp_clear_bit(CORK, sk); lock_sock(sk); push_pending_frames(sk); release_sock(sk); } break; case UDP_ENCAP: switch (val) { case 0: #ifdef CONFIG_XFRM case UDP_ENCAP_ESPINUDP: set_xfrm_gro_udp_encap_rcv(val, sk->sk_family, sk); fallthrough; case UDP_ENCAP_ESPINUDP_NON_IKE: #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) WRITE_ONCE(up->encap_rcv, ipv6_stub->xfrm6_udp_encap_rcv); else #endif WRITE_ONCE(up->encap_rcv, xfrm4_udp_encap_rcv); #endif fallthrough; case UDP_ENCAP_L2TPINUDP: WRITE_ONCE(up->encap_type, val); udp_tunnel_encap_enable(sk); break; default: err = -ENOPROTOOPT; break; } break; case UDP_NO_CHECK6_TX: udp_set_no_check6_tx(sk, valbool); break; case UDP_NO_CHECK6_RX: udp_set_no_check6_rx(sk, valbool); break; case UDP_SEGMENT: if (val < 0 || val > USHRT_MAX) return -EINVAL; WRITE_ONCE(up->gso_size, val); break; case UDP_GRO: /* when enabling GRO, accept the related GSO packet type */ if (valbool) udp_tunnel_encap_enable(sk); udp_assign_bit(GRO_ENABLED, sk, valbool); udp_assign_bit(ACCEPT_L4, sk, valbool); set_xfrm_gro_udp_encap_rcv(up->encap_type, sk->sk_family, sk); break; /* * UDP-Lite's partial checksum coverage (RFC 3828). */ /* The sender sets actual checksum coverage length via this option. * The case coverage > packet length is handled by send module. */ case UDPLITE_SEND_CSCOV: if (!is_udplite) /* Disable the option on UDP sockets */ return -ENOPROTOOPT; if (val != 0 && val < 8) /* Illegal coverage: use default (8) */ val = 8; else if (val > USHRT_MAX) val = USHRT_MAX; WRITE_ONCE(up->pcslen, val); udp_set_bit(UDPLITE_SEND_CC, sk); break; /* The receiver specifies a minimum checksum coverage value. To make * sense, this should be set to at least 8 (as done below). If zero is * used, this again means full checksum coverage. */ case UDPLITE_RECV_CSCOV: if (!is_udplite) /* Disable the option on UDP sockets */ return -ENOPROTOOPT; if (val != 0 && val < 8) /* Avoid silly minimal values. */ val = 8; else if (val > USHRT_MAX) val = USHRT_MAX; WRITE_ONCE(up->pcrlen, val); udp_set_bit(UDPLITE_RECV_CC, sk); break; default: err = -ENOPROTOOPT; break; } return err; } EXPORT_SYMBOL(udp_lib_setsockopt); int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET) return udp_lib_setsockopt(sk, level, optname, optval, optlen, udp_push_pending_frames); return ip_setsockopt(sk, level, optname, optval, optlen); } int udp_lib_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct udp_sock *up = udp_sk(sk); int val, len; if (get_user(len, optlen)) return -EFAULT; len = min_t(unsigned int, len, sizeof(int)); if (len < 0) return -EINVAL; switch (optname) { case UDP_CORK: val = udp_test_bit(CORK, sk); break; case UDP_ENCAP: val = READ_ONCE(up->encap_type); break; case UDP_NO_CHECK6_TX: val = udp_get_no_check6_tx(sk); break; case UDP_NO_CHECK6_RX: val = udp_get_no_check6_rx(sk); break; case UDP_SEGMENT: val = READ_ONCE(up->gso_size); break; case UDP_GRO: val = udp_test_bit(GRO_ENABLED, sk); break; /* The following two cannot be changed on UDP sockets, the return is * always 0 (which corresponds to the full checksum coverage of UDP). */ case UDPLITE_SEND_CSCOV: val = READ_ONCE(up->pcslen); break; case UDPLITE_RECV_CSCOV: val = READ_ONCE(up->pcrlen); break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } EXPORT_SYMBOL(udp_lib_getsockopt); int udp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { if (level == SOL_UDP || level == SOL_UDPLITE) return udp_lib_getsockopt(sk, level, optname, optval, optlen); return ip_getsockopt(sk, level, optname, optval, optlen); } /** * udp_poll - wait for a UDP event. * @file: - file struct * @sock: - socket * @wait: - poll table * * This is same as datagram poll, except for the special case of * blocking sockets. If application is using a blocking fd * and a packet with checksum error is in the queue; * then it could get return from select indicating data available * but then block when reading it. Add special case code * to work around these arguably broken applications. */ __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait) { __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; if (!skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Check for false positives due to checksum errors */ if ((mask & EPOLLRDNORM) && !(file->f_flags & O_NONBLOCK) && !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1) mask &= ~(EPOLLIN | EPOLLRDNORM); /* psock ingress_msg queue should not contain any bad checksum frames */ if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; return mask; } EXPORT_SYMBOL(udp_poll); int udp_abort(struct sock *sk, int err) { if (!has_current_bpf_ctx()) lock_sock(sk); /* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing * with close() */ if (sock_flag(sk, SOCK_DEAD)) goto out; sk->sk_err = err; sk_error_report(sk); __udp_disconnect(sk, 0); out: if (!has_current_bpf_ctx()) release_sock(sk); return 0; } EXPORT_SYMBOL_GPL(udp_abort); struct proto udp_prot = { .name = "UDP", .owner = THIS_MODULE, .close = udp_lib_close, .pre_connect = udp_pre_connect, .connect = ip4_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, .init = udp_init_sock, .destroy = udp_destroy_sock, .setsockopt = udp_setsockopt, .getsockopt = udp_getsockopt, .sendmsg = udp_sendmsg, .recvmsg = udp_recvmsg, .splice_eof = udp_splice_eof, .release_cb = ip4_datagram_release_cb, .hash = udp_lib_hash, .unhash = udp_lib_unhash, .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, .put_port = udp_lib_unhash, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = udp_bpf_update_proto, #endif .memory_allocated = &udp_memory_allocated, .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, .sysctl_mem = sysctl_udp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp_sock), .h.udp_table = NULL, .diag_destroy = udp_abort, }; EXPORT_SYMBOL(udp_prot); /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS static unsigned short seq_file_family(const struct seq_file *seq); static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) { unsigned short family = seq_file_family(seq); /* AF_UNSPEC is used as a match all */ return ((family == AF_UNSPEC || family == sk->sk_family) && net_eq(sock_net(sk), seq_file_net(seq))); } #ifdef CONFIG_BPF_SYSCALL static const struct seq_operations bpf_iter_udp_seq_ops; #endif static struct udp_table *udp_get_table_seq(struct seq_file *seq, struct net *net) { const struct udp_seq_afinfo *afinfo; #ifdef CONFIG_BPF_SYSCALL if (seq->op == &bpf_iter_udp_seq_ops) return net->ipv4.udp_table; #endif afinfo = pde_data(file_inode(seq->file)); return afinfo->udp_table ? : net->ipv4.udp_table; } static struct sock *udp_get_first(struct seq_file *seq, int start) { struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); struct udp_table *udptable; struct sock *sk; udptable = udp_get_table_seq(seq, net); for (state->bucket = start; state->bucket <= udptable->mask; ++state->bucket) { struct udp_hslot *hslot = &udptable->hash[state->bucket]; if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); sk_for_each(sk, &hslot->head) { if (seq_sk_match(seq, sk)) goto found; } spin_unlock_bh(&hslot->lock); } sk = NULL; found: return sk; } static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) { struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); struct udp_table *udptable; do { sk = sk_next(sk); } while (sk && !seq_sk_match(seq, sk)); if (!sk) { udptable = udp_get_table_seq(seq, net); if (state->bucket <= udptable->mask) spin_unlock_bh(&udptable->hash[state->bucket].lock); return udp_get_first(seq, state->bucket + 1); } return sk; } static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos) { struct sock *sk = udp_get_first(seq, 0); if (sk) while (pos && (sk = udp_get_next(seq, sk)) != NULL) --pos; return pos ? NULL : sk; } void *udp_seq_start(struct seq_file *seq, loff_t *pos) { struct udp_iter_state *state = seq->private; state->bucket = MAX_UDP_PORTS; return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } EXPORT_SYMBOL(udp_seq_start); void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; if (v == SEQ_START_TOKEN) sk = udp_get_idx(seq, 0); else sk = udp_get_next(seq, v); ++*pos; return sk; } EXPORT_SYMBOL(udp_seq_next); void udp_seq_stop(struct seq_file *seq, void *v) { struct udp_iter_state *state = seq->private; struct udp_table *udptable; udptable = udp_get_table_seq(seq, seq_file_net(seq)); if (state->bucket <= udptable->mask) spin_unlock_bh(&udptable->hash[state->bucket].lock); } EXPORT_SYMBOL(udp_seq_stop); /* ------------------------------------------------------------------------ */ static void udp4_format_sock(struct sock *sp, struct seq_file *f, int bucket) { struct inet_sock *inet = inet_sk(sp); __be32 dest = inet->inet_daddr; __be32 src = inet->inet_rcv_saddr; __u16 destp = ntohs(inet->inet_dport); __u16 srcp = ntohs(inet->inet_sport); seq_printf(f, "%5d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %u", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), udp_rqueue_get(sp), 0, 0L, 0, from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); } int udp4_seq_show(struct seq_file *seq, void *v) { seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) seq_puts(seq, " sl local_address rem_address st tx_queue " "rx_queue tr tm->when retrnsmt uid timeout " "inode ref pointer drops"); else { struct udp_iter_state *state = seq->private; udp4_format_sock(v, seq, state->bucket); } seq_pad(seq, '\n'); return 0; } #ifdef CONFIG_BPF_SYSCALL struct bpf_iter__udp { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct udp_sock *, udp_sk); uid_t uid __aligned(8); int bucket __aligned(8); }; struct bpf_udp_iter_state { struct udp_iter_state state; unsigned int cur_sk; unsigned int end_sk; unsigned int max_sk; int offset; struct sock **batch; bool st_bucket_done; }; static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, unsigned int new_batch_sz); static struct sock *bpf_iter_udp_batch(struct seq_file *seq) { struct bpf_udp_iter_state *iter = seq->private; struct udp_iter_state *state = &iter->state; struct net *net = seq_file_net(seq); int resume_bucket, resume_offset; struct udp_table *udptable; unsigned int batch_sks = 0; bool resized = false; struct sock *sk; resume_bucket = state->bucket; resume_offset = iter->offset; /* The current batch is done, so advance the bucket. */ if (iter->st_bucket_done) state->bucket++; udptable = udp_get_table_seq(seq, net); again: /* New batch for the next bucket. * Iterate over the hash table to find a bucket with sockets matching * the iterator attributes, and return the first matching socket from * the bucket. The remaining matched sockets from the bucket are batched * before releasing the bucket lock. This allows BPF programs that are * called in seq_show to acquire the bucket lock if needed. */ iter->cur_sk = 0; iter->end_sk = 0; iter->st_bucket_done = false; batch_sks = 0; for (; state->bucket <= udptable->mask; state->bucket++) { struct udp_hslot *hslot2 = &udptable->hash2[state->bucket]; if (hlist_empty(&hslot2->head)) continue; iter->offset = 0; spin_lock_bh(&hslot2->lock); udp_portaddr_for_each_entry(sk, &hslot2->head) { if (seq_sk_match(seq, sk)) { /* Resume from the last iterated socket at the * offset in the bucket before iterator was stopped. */ if (state->bucket == resume_bucket && iter->offset < resume_offset) { ++iter->offset; continue; } if (iter->end_sk < iter->max_sk) { sock_hold(sk); iter->batch[iter->end_sk++] = sk; } batch_sks++; } } spin_unlock_bh(&hslot2->lock); if (iter->end_sk) break; } /* All done: no batch made. */ if (!iter->end_sk) return NULL; if (iter->end_sk == batch_sks) { /* Batching is done for the current bucket; return the first * socket to be iterated from the batch. */ iter->st_bucket_done = true; goto done; } if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2)) { resized = true; /* After allocating a larger batch, retry one more time to grab * the whole bucket. */ goto again; } done: return iter->batch[0]; } static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_udp_iter_state *iter = seq->private; struct sock *sk; /* Whenever seq_next() is called, the iter->cur_sk is * done with seq_show(), so unref the iter->cur_sk. */ if (iter->cur_sk < iter->end_sk) { sock_put(iter->batch[iter->cur_sk++]); ++iter->offset; } /* After updating iter->cur_sk, check if there are more sockets * available in the current bucket batch. */ if (iter->cur_sk < iter->end_sk) sk = iter->batch[iter->cur_sk]; else /* Prepare a new batch. */ sk = bpf_iter_udp_batch(seq); ++*pos; return sk; } static void *bpf_iter_udp_seq_start(struct seq_file *seq, loff_t *pos) { /* bpf iter does not support lseek, so it always * continue from where it was stop()-ped. */ if (*pos) return bpf_iter_udp_batch(seq); return SEQ_START_TOKEN; } static int udp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, struct udp_sock *udp_sk, uid_t uid, int bucket) { struct bpf_iter__udp ctx; meta->seq_num--; /* skip SEQ_START_TOKEN */ ctx.meta = meta; ctx.udp_sk = udp_sk; ctx.uid = uid; ctx.bucket = bucket; return bpf_iter_run_prog(prog, &ctx); } static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v) { struct udp_iter_state *state = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; uid_t uid; int ret; if (v == SEQ_START_TOKEN) return 0; lock_sock(sk); if (unlikely(sk_unhashed(sk))) { ret = SEQ_SKIP; goto unlock; } uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); meta.seq = seq; prog = bpf_iter_get_info(&meta, false); ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket); unlock: release_sock(sk); return ret; } static void bpf_iter_udp_put_batch(struct bpf_udp_iter_state *iter) { while (iter->cur_sk < iter->end_sk) sock_put(iter->batch[iter->cur_sk++]); } static void bpf_iter_udp_seq_stop(struct seq_file *seq, void *v) { struct bpf_udp_iter_state *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)udp_prog_seq_show(prog, &meta, v, 0, 0); } if (iter->cur_sk < iter->end_sk) { bpf_iter_udp_put_batch(iter); iter->st_bucket_done = false; } } static const struct seq_operations bpf_iter_udp_seq_ops = { .start = bpf_iter_udp_seq_start, .next = bpf_iter_udp_seq_next, .stop = bpf_iter_udp_seq_stop, .show = bpf_iter_udp_seq_show, }; #endif static unsigned short seq_file_family(const struct seq_file *seq) { const struct udp_seq_afinfo *afinfo; #ifdef CONFIG_BPF_SYSCALL /* BPF iterator: bpf programs to filter sockets. */ if (seq->op == &bpf_iter_udp_seq_ops) return AF_UNSPEC; #endif /* Proc fs iterator */ afinfo = pde_data(file_inode(seq->file)); return afinfo->family; } const struct seq_operations udp_seq_ops = { .start = udp_seq_start, .next = udp_seq_next, .stop = udp_seq_stop, .show = udp4_seq_show, }; EXPORT_SYMBOL(udp_seq_ops); static struct udp_seq_afinfo udp4_seq_afinfo = { .family = AF_INET, .udp_table = NULL, }; static int __net_init udp4_proc_init_net(struct net *net) { if (!proc_create_net_data("udp", 0444, net->proc_net, &udp_seq_ops, sizeof(struct udp_iter_state), &udp4_seq_afinfo)) return -ENOMEM; return 0; } static void __net_exit udp4_proc_exit_net(struct net *net) { remove_proc_entry("udp", net->proc_net); } static struct pernet_operations udp4_net_ops = { .init = udp4_proc_init_net, .exit = udp4_proc_exit_net, }; int __init udp4_proc_init(void) { return register_pernet_subsys(&udp4_net_ops); } void udp4_proc_exit(void) { unregister_pernet_subsys(&udp4_net_ops); } #endif /* CONFIG_PROC_FS */ static __initdata unsigned long uhash_entries; static int __init set_uhash_entries(char *str) { ssize_t ret; if (!str) return 0; ret = kstrtoul(str, 0, &uhash_entries); if (ret) return 0; if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN) uhash_entries = UDP_HTABLE_SIZE_MIN; return 1; } __setup("uhash_entries=", set_uhash_entries); void __init udp_table_init(struct udp_table *table, const char *name) { unsigned int i; table->hash = alloc_large_system_hash(name, 2 * sizeof(struct udp_hslot), uhash_entries, 21, /* one slot per 2 MB */ 0, &table->log, &table->mask, UDP_HTABLE_SIZE_MIN, UDP_HTABLE_SIZE_MAX); table->hash2 = table->hash + (table->mask + 1); for (i = 0; i <= table->mask; i++) { INIT_HLIST_HEAD(&table->hash[i].head); table->hash[i].count = 0; spin_lock_init(&table->hash[i].lock); } for (i = 0; i <= table->mask; i++) { INIT_HLIST_HEAD(&table->hash2[i].head); table->hash2[i].count = 0; spin_lock_init(&table->hash2[i].lock); } } u32 udp_flow_hashrnd(void) { static u32 hashrnd __read_mostly; net_get_random_once(&hashrnd, sizeof(hashrnd)); return hashrnd; } EXPORT_SYMBOL(udp_flow_hashrnd); static void __net_init udp_sysctl_init(struct net *net) { net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE; net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE; #ifdef CONFIG_NET_L3_MASTER_DEV net->ipv4.sysctl_udp_l3mdev_accept = 0; #endif } static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries) { struct udp_table *udptable; int i; udptable = kmalloc(sizeof(*udptable), GFP_KERNEL); if (!udptable) goto out; udptable->hash = vmalloc_huge(hash_entries * 2 * sizeof(struct udp_hslot), GFP_KERNEL_ACCOUNT); if (!udptable->hash) goto free_table; udptable->hash2 = udptable->hash + hash_entries; udptable->mask = hash_entries - 1; udptable->log = ilog2(hash_entries); for (i = 0; i < hash_entries; i++) { INIT_HLIST_HEAD(&udptable->hash[i].head); udptable->hash[i].count = 0; spin_lock_init(&udptable->hash[i].lock); INIT_HLIST_HEAD(&udptable->hash2[i].head); udptable->hash2[i].count = 0; spin_lock_init(&udptable->hash2[i].lock); } return udptable; free_table: kfree(udptable); out: return NULL; } static void __net_exit udp_pernet_table_free(struct net *net) { struct udp_table *udptable = net->ipv4.udp_table; if (udptable == &udp_table) return; kvfree(udptable->hash); kfree(udptable); } static void __net_init udp_set_table(struct net *net) { struct udp_table *udptable; unsigned int hash_entries; struct net *old_net; if (net_eq(net, &init_net)) goto fallback; old_net = current->nsproxy->net_ns; hash_entries = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries); if (!hash_entries) goto fallback; /* Set min to keep the bitmap on stack in udp_lib_get_port() */ if (hash_entries < UDP_HTABLE_SIZE_MIN_PERNET) hash_entries = UDP_HTABLE_SIZE_MIN_PERNET; else hash_entries = roundup_pow_of_two(hash_entries); udptable = udp_pernet_table_alloc(hash_entries); if (udptable) { net->ipv4.udp_table = udptable; } else { pr_warn("Failed to allocate UDP hash table (entries: %u) " "for a netns, fallback to the global one\n", hash_entries); fallback: net->ipv4.udp_table = &udp_table; } } static int __net_init udp_pernet_init(struct net *net) { udp_sysctl_init(net); udp_set_table(net); return 0; } static void __net_exit udp_pernet_exit(struct net *net) { udp_pernet_table_free(net); } static struct pernet_operations __net_initdata udp_sysctl_ops = { .init = udp_pernet_init, .exit = udp_pernet_exit, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta, struct udp_sock *udp_sk, uid_t uid, int bucket) static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter, unsigned int new_batch_sz) { struct sock **new_batch; new_batch = kvmalloc_array(new_batch_sz, sizeof(*new_batch), GFP_USER | __GFP_NOWARN); if (!new_batch) return -ENOMEM; bpf_iter_udp_put_batch(iter); kvfree(iter->batch); iter->batch = new_batch; iter->max_sk = new_batch_sz; return 0; } #define INIT_BATCH_SZ 16 static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_udp_iter_state *iter = priv_data; int ret; ret = bpf_iter_init_seq_net(priv_data, aux); if (ret) return ret; ret = bpf_iter_udp_realloc_batch(iter, INIT_BATCH_SZ); if (ret) bpf_iter_fini_seq_net(priv_data); return ret; } static void bpf_iter_fini_udp(void *priv_data) { struct bpf_udp_iter_state *iter = priv_data; bpf_iter_fini_seq_net(priv_data); kvfree(iter->batch); } static const struct bpf_iter_seq_info udp_seq_info = { .seq_ops = &bpf_iter_udp_seq_ops, .init_seq_private = bpf_iter_init_udp, .fini_seq_private = bpf_iter_fini_udp, .seq_priv_size = sizeof(struct bpf_udp_iter_state), }; static struct bpf_iter_reg udp_reg_info = { .target = "udp", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__udp, udp_sk), PTR_TO_BTF_ID_OR_NULL | PTR_TRUSTED }, }, .seq_info = &udp_seq_info, }; static void __init bpf_iter_register(void) { udp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UDP]; if (bpf_iter_reg_target(&udp_reg_info)) pr_warn("Warning: could not register bpf iterator udp\n"); } #endif void __init udp_init(void) { unsigned long limit; unsigned int i; udp_table_init(&udp_table, "UDP"); limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); sysctl_udp_mem[0] = limit / 4 * 3; sysctl_udp_mem[1] = limit; sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; /* 16 spinlocks per cpu */ udp_busylocks_log = ilog2(nr_cpu_ids) + 4; udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log, GFP_KERNEL); if (!udp_busylocks) panic("UDP: failed to alloc udp_busylocks\n"); for (i = 0; i < (1U << udp_busylocks_log); i++) spin_lock_init(udp_busylocks + i); if (register_pernet_subsys(&udp_sysctl_ops)) panic("UDP: failed to init sysctl parameters.\n"); #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) bpf_iter_register(); #endif }
21 3 2 1 45 10 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM cgroup #if !defined(_TRACE_CGROUP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_CGROUP_H #include <linux/cgroup.h> #include <linux/tracepoint.h> DECLARE_EVENT_CLASS(cgroup_root, TP_PROTO(struct cgroup_root *root), TP_ARGS(root), TP_STRUCT__entry( __field( int, root ) __field( u16, ss_mask ) __string( name, root->name ) ), TP_fast_assign( __entry->root = root->hierarchy_id; __entry->ss_mask = root->subsys_mask; __assign_str(name, root->name); ), TP_printk("root=%d ss_mask=%#x name=%s", __entry->root, __entry->ss_mask, __get_str(name)) ); DEFINE_EVENT(cgroup_root, cgroup_setup_root, TP_PROTO(struct cgroup_root *root), TP_ARGS(root) ); DEFINE_EVENT(cgroup_root, cgroup_destroy_root, TP_PROTO(struct cgroup_root *root), TP_ARGS(root) ); DEFINE_EVENT(cgroup_root, cgroup_remount, TP_PROTO(struct cgroup_root *root), TP_ARGS(root) ); DECLARE_EVENT_CLASS(cgroup, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path), TP_STRUCT__entry( __field( int, root ) __field( int, level ) __field( u64, id ) __string( path, path ) ), TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; __assign_str(path, path); ), TP_printk("root=%d id=%llu level=%d path=%s", __entry->root, __entry->id, __entry->level, __get_str(path)) ); DEFINE_EVENT(cgroup, cgroup_mkdir, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_rmdir, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_release, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_rename, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_freeze, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_unfreeze, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DECLARE_EVENT_CLASS(cgroup_migrate, TP_PROTO(struct cgroup *dst_cgrp, const char *path, struct task_struct *task, bool threadgroup), TP_ARGS(dst_cgrp, path, task, threadgroup), TP_STRUCT__entry( __field( int, dst_root ) __field( int, dst_level ) __field( u64, dst_id ) __field( int, pid ) __string( dst_path, path ) __string( comm, task->comm ) ), TP_fast_assign( __entry->dst_root = dst_cgrp->root->hierarchy_id; __entry->dst_id = cgroup_id(dst_cgrp); __entry->dst_level = dst_cgrp->level; __assign_str(dst_path, path); __entry->pid = task->pid; __assign_str(comm, task->comm); ), TP_printk("dst_root=%d dst_id=%llu dst_level=%d dst_path=%s pid=%d comm=%s", __entry->dst_root, __entry->dst_id, __entry->dst_level, __get_str(dst_path), __entry->pid, __get_str(comm)) ); DEFINE_EVENT(cgroup_migrate, cgroup_attach_task, TP_PROTO(struct cgroup *dst_cgrp, const char *path, struct task_struct *task, bool threadgroup), TP_ARGS(dst_cgrp, path, task, threadgroup) ); DEFINE_EVENT(cgroup_migrate, cgroup_transfer_tasks, TP_PROTO(struct cgroup *dst_cgrp, const char *path, struct task_struct *task, bool threadgroup), TP_ARGS(dst_cgrp, path, task, threadgroup) ); DECLARE_EVENT_CLASS(cgroup_event, TP_PROTO(struct cgroup *cgrp, const char *path, int val), TP_ARGS(cgrp, path, val), TP_STRUCT__entry( __field( int, root ) __field( int, level ) __field( u64, id ) __string( path, path ) __field( int, val ) ), TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; __assign_str(path, path); __entry->val = val; ), TP_printk("root=%d id=%llu level=%d path=%s val=%d", __entry->root, __entry->id, __entry->level, __get_str(path), __entry->val) ); DEFINE_EVENT(cgroup_event, cgroup_notify_populated, TP_PROTO(struct cgroup *cgrp, const char *path, int val), TP_ARGS(cgrp, path, val) ); DEFINE_EVENT(cgroup_event, cgroup_notify_frozen, TP_PROTO(struct cgroup *cgrp, const char *path, int val), TP_ARGS(cgrp, path, val) ); #endif /* _TRACE_CGROUP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
20 43 51 16 19 32 2 2 20 5 5 5 20 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef ARCH_X86_KVM_CPUID_H #define ARCH_X86_KVM_CPUID_H #include "x86.h" #include "reverse_cpuid.h" #include <asm/cpu.h> #include <asm/processor.h> #include <uapi/asm/kvm_para.h> extern u32 kvm_cpu_caps[NR_KVM_CPU_CAPS] __read_mostly; void kvm_set_cpu_caps(void); void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); void kvm_update_pv_runtime(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry_index(struct kvm_vcpu *vcpu, u32 function, u32 index); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries, unsigned int type); int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, struct kvm_cpuid_entry __user *entries); int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool exact_only); u32 xstate_required_size(u64 xstate_bv, bool compacted); int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu); u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu); static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) { return vcpu->arch.maxphyaddr; } static inline bool kvm_vcpu_is_legal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) { return !(gpa & vcpu->arch.reserved_gpa_bits); } static inline bool kvm_vcpu_is_legal_aligned_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, gpa_t alignment) { return IS_ALIGNED(gpa, alignment) && kvm_vcpu_is_legal_gpa(vcpu, gpa); } static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) { return kvm_vcpu_is_legal_aligned_gpa(vcpu, gpa, PAGE_SIZE); } static __always_inline void cpuid_entry_override(struct kvm_cpuid_entry2 *entry, unsigned int leaf) { u32 *reg = cpuid_entry_get_reg(entry, leaf * 32); BUILD_BUG_ON(leaf >= ARRAY_SIZE(kvm_cpu_caps)); *reg = kvm_cpu_caps[leaf]; } static __always_inline u32 *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsigned int x86_feature) { const struct cpuid_reg cpuid = x86_feature_cpuid(x86_feature); struct kvm_cpuid_entry2 *entry; entry = kvm_find_cpuid_entry_index(vcpu, cpuid.function, cpuid.index); if (!entry) return NULL; return __cpuid_entry_get_reg(entry, cpuid.reg); } static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned int x86_feature) { u32 *reg; reg = guest_cpuid_get_register(vcpu, x86_feature); if (!reg) return false; return *reg & __feature_bit(x86_feature); } static __always_inline void guest_cpuid_clear(struct kvm_vcpu *vcpu, unsigned int x86_feature) { u32 *reg; reg = guest_cpuid_get_register(vcpu, x86_feature); if (reg) *reg &= ~__feature_bit(x86_feature); } static inline bool guest_cpuid_is_amd_or_hygon(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; best = kvm_find_cpuid_entry(vcpu, 0); return best && (is_guest_vendor_amd(best->ebx, best->ecx, best->edx) || is_guest_vendor_hygon(best->ebx, best->ecx, best->edx)); } static inline bool guest_cpuid_is_intel(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; best = kvm_find_cpuid_entry(vcpu, 0); return best && is_guest_vendor_intel(best->ebx, best->ecx, best->edx); } static inline int guest_cpuid_family(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; return x86_family(best->eax); } static inline int guest_cpuid_model(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; return x86_model(best->eax); } static inline bool cpuid_model_is_consistent(struct kvm_vcpu *vcpu) { return boot_cpu_data.x86_model == guest_cpuid_model(vcpu); } static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; best = kvm_find_cpuid_entry(vcpu, 0x1); if (!best) return -1; return x86_stepping(best->eax); } static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu) { return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) || guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) || guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) || guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)); } static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu) { return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) || guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB) || guest_cpuid_has(vcpu, X86_FEATURE_SBPB)); } static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu) { return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT; } static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu) { return vcpu->arch.msr_misc_features_enables & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; } static __always_inline void kvm_cpu_cap_clear(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); kvm_cpu_caps[x86_leaf] &= ~__feature_bit(x86_feature); } static __always_inline void kvm_cpu_cap_set(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); kvm_cpu_caps[x86_leaf] |= __feature_bit(x86_feature); } static __always_inline u32 kvm_cpu_cap_get(unsigned int x86_feature) { unsigned int x86_leaf = __feature_leaf(x86_feature); reverse_cpuid_check(x86_leaf); return kvm_cpu_caps[x86_leaf] & __feature_bit(x86_feature); } static __always_inline bool kvm_cpu_cap_has(unsigned int x86_feature) { return !!kvm_cpu_cap_get(x86_feature); } static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature) { if (boot_cpu_has(x86_feature)) kvm_cpu_cap_set(x86_feature); } static __always_inline bool guest_pv_has(struct kvm_vcpu *vcpu, unsigned int kvm_feature) { if (!vcpu->arch.pv_cpuid.enforce) return true; return vcpu->arch.pv_cpuid.features & (1u << kvm_feature); } enum kvm_governed_features { #define KVM_GOVERNED_FEATURE(x) KVM_GOVERNED_##x, #include "governed_features.h" KVM_NR_GOVERNED_FEATURES }; static __always_inline int kvm_governed_feature_index(unsigned int x86_feature) { switch (x86_feature) { #define KVM_GOVERNED_FEATURE(x) case x: return KVM_GOVERNED_##x; #include "governed_features.h" default: return -1; } } static __always_inline bool kvm_is_governed_feature(unsigned int x86_feature) { return kvm_governed_feature_index(x86_feature) >= 0; } static __always_inline void kvm_governed_feature_set(struct kvm_vcpu *vcpu, unsigned int x86_feature) { BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature)); __set_bit(kvm_governed_feature_index(x86_feature), vcpu->arch.governed_features.enabled); } static __always_inline void kvm_governed_feature_check_and_set(struct kvm_vcpu *vcpu, unsigned int x86_feature) { if (kvm_cpu_cap_has(x86_feature) && guest_cpuid_has(vcpu, x86_feature)) kvm_governed_feature_set(vcpu, x86_feature); } static __always_inline bool guest_can_use(struct kvm_vcpu *vcpu, unsigned int x86_feature) { BUILD_BUG_ON(!kvm_is_governed_feature(x86_feature)); return test_bit(kvm_governed_feature_index(x86_feature), vcpu->arch.governed_features.enabled); } static inline bool kvm_vcpu_is_legal_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (guest_can_use(vcpu, X86_FEATURE_LAM)) cr3 &= ~(X86_CR3_LAM_U48 | X86_CR3_LAM_U57); return kvm_vcpu_is_legal_gpa(vcpu, cr3); } #endif
34 34 34 2 2 4 88 88 85 1 8 8 4 3 3 3 3 54 64 82 75 75 67 3 67 47 47 4 41 37 34 34 82 82 3 3 3 3 12 3 76 1 77 77 76 74 74 2 17 6 17 17 17 16 3 18 19 16 13 6 19 19 2 83 83 83 83 83 83 82 82 82 82 82 1 40 2 65 2 15 12 68 78 61 2 76 76 4 60 59 2 2 2 60 60 60 76 79 77 36 59 66 15 17 33 62 79 3 17 1 76 65 59 18 65 65 79 4 18 18 12 1 1 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2011 Novell Inc. * Copyright (C) 2016 Red Hat, Inc. */ #include <linux/fs.h> #include <linux/cred.h> #include <linux/ctype.h> #include <linux/namei.h> #include <linux/xattr.h> #include <linux/ratelimit.h> #include <linux/mount.h> #include <linux/exportfs.h> #include "overlayfs.h" #include "../internal.h" /* for vfs_path_lookup */ struct ovl_lookup_data { struct super_block *sb; const struct ovl_layer *layer; struct qstr name; bool is_dir; bool opaque; bool xwhiteouts; bool stop; bool last; char *redirect; int metacopy; /* Referring to last redirect xattr */ bool absolute_redirect; }; static int ovl_check_redirect(const struct path *path, struct ovl_lookup_data *d, size_t prelen, const char *post) { int res; char *buf; struct ovl_fs *ofs = OVL_FS(d->sb); d->absolute_redirect = false; buf = ovl_get_redirect_xattr(ofs, path, prelen + strlen(post)); if (IS_ERR_OR_NULL(buf)) return PTR_ERR(buf); if (buf[0] == '/') { d->absolute_redirect = true; /* * One of the ancestor path elements in an absolute path * lookup in ovl_lookup_layer() could have been opaque and * that will stop further lookup in lower layers (d->stop=true) * But we have found an absolute redirect in descendant path * element and that should force continue lookup in lower * layers (reset d->stop). */ d->stop = false; } else { res = strlen(buf) + 1; memmove(buf + prelen, buf, res); memcpy(buf, d->name.name, prelen); } strcat(buf, post); kfree(d->redirect); d->redirect = buf; d->name.name = d->redirect; d->name.len = strlen(d->redirect); return 0; } static int ovl_acceptable(void *ctx, struct dentry *dentry) { /* * A non-dir origin may be disconnected, which is fine, because * we only need it for its unique inode number. */ if (!d_is_dir(dentry)) return 1; /* Don't decode a deleted empty directory */ if (d_unhashed(dentry)) return 0; /* Check if directory belongs to the layer we are decoding from */ return is_subdir(dentry, ((struct vfsmount *)ctx)->mnt_root); } /* * Check validity of an overlay file handle buffer. * * Return 0 for a valid file handle. * Return -ENODATA for "origin unknown". * Return <0 for an invalid file handle. */ int ovl_check_fb_len(struct ovl_fb *fb, int fb_len) { if (fb_len < sizeof(struct ovl_fb) || fb_len < fb->len) return -EINVAL; if (fb->magic != OVL_FH_MAGIC) return -EINVAL; /* Treat larger version and unknown flags as "origin unknown" */ if (fb->version > OVL_FH_VERSION || fb->flags & ~OVL_FH_FLAG_ALL) return -ENODATA; /* Treat endianness mismatch as "origin unknown" */ if (!(fb->flags & OVL_FH_FLAG_ANY_ENDIAN) && (fb->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) return -ENODATA; return 0; } static struct ovl_fh *ovl_get_fh(struct ovl_fs *ofs, struct dentry *upperdentry, enum ovl_xattr ox) { int res, err; struct ovl_fh *fh = NULL; res = ovl_getxattr_upper(ofs, upperdentry, ox, NULL, 0); if (res < 0) { if (res == -ENODATA || res == -EOPNOTSUPP) return NULL; goto fail; } /* Zero size value means "copied up but origin unknown" */ if (res == 0) return NULL; fh = kzalloc(res + OVL_FH_WIRE_OFFSET, GFP_KERNEL); if (!fh) return ERR_PTR(-ENOMEM); res = ovl_getxattr_upper(ofs, upperdentry, ox, fh->buf, res); if (res < 0) goto fail; err = ovl_check_fb_len(&fh->fb, res); if (err < 0) { if (err == -ENODATA) goto out; goto invalid; } return fh; out: kfree(fh); return NULL; fail: pr_warn_ratelimited("failed to get origin (%i)\n", res); goto out; invalid: pr_warn_ratelimited("invalid origin (%*phN)\n", res, fh); goto out; } struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh, struct vfsmount *mnt, bool connected) { struct dentry *real; int bytes; if (!capable(CAP_DAC_READ_SEARCH)) return NULL; /* * Make sure that the stored uuid matches the uuid of the lower * layer where file handle will be decoded. * In case of uuid=off option just make sure that stored uuid is null. */ if (ovl_origin_uuid(ofs) ? !uuid_equal(&fh->fb.uuid, &mnt->mnt_sb->s_uuid) : !uuid_is_null(&fh->fb.uuid)) return NULL; bytes = (fh->fb.len - offsetof(struct ovl_fb, fid)); real = exportfs_decode_fh(mnt, (struct fid *)fh->fb.fid, bytes >> 2, (int)fh->fb.type, connected ? ovl_acceptable : NULL, mnt); if (IS_ERR(real)) { /* * Treat stale file handle to lower file as "origin unknown". * upper file handle could become stale when upper file is * unlinked and this information is needed to handle stale * index entries correctly. */ if (real == ERR_PTR(-ESTALE) && !(fh->fb.flags & OVL_FH_FLAG_PATH_UPPER)) real = NULL; return real; } if (ovl_dentry_weird(real)) { dput(real); return NULL; } return real; } static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d, const char *name, struct dentry *base, int len, bool drop_negative) { struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), name, base, len); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { if (drop_negative && ret->d_lockref.count == 1) { spin_lock(&ret->d_lock); /* Recheck condition under lock */ if (d_is_negative(ret) && ret->d_lockref.count == 1) __d_drop(ret); spin_unlock(&ret->d_lock); } dput(ret); ret = ERR_PTR(-ENOENT); } return ret; } static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, const char *name, unsigned int namelen, size_t prelen, const char *post, struct dentry **ret, bool drop_negative) { struct ovl_fs *ofs = OVL_FS(d->sb); struct dentry *this; struct path path; int err; bool last_element = !post[0]; bool is_upper = d->layer->idx == 0; char val; this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative); if (IS_ERR(this)) { err = PTR_ERR(this); this = NULL; if (err == -ENOENT || err == -ENAMETOOLONG) goto out; goto out_err; } if (ovl_dentry_weird(this)) { /* Don't support traversing automounts and other weirdness */ err = -EREMOTE; goto out_err; } path.dentry = this; path.mnt = d->layer->mnt; if (ovl_path_is_whiteout(ofs, &path)) { d->stop = d->opaque = true; goto put_and_out; } /* * This dentry should be a regular file if previous layer lookup * found a metacopy dentry. */ if (last_element && d->metacopy && !d_is_reg(this)) { d->stop = true; goto put_and_out; } if (!d_can_lookup(this)) { if (d->is_dir || !last_element) { d->stop = true; goto put_and_out; } err = ovl_check_metacopy_xattr(ofs, &path, NULL); if (err < 0) goto out_err; d->metacopy = err; d->stop = !d->metacopy; if (!d->metacopy || d->last) goto out; } else { if (ovl_lookup_trap_inode(d->sb, this)) { /* Caught in a trap of overlapping layers */ err = -ELOOP; goto out_err; } if (last_element) d->is_dir = true; if (d->last) goto out; /* overlay.opaque=x means xwhiteouts directory */ val = ovl_get_opaquedir_val(ofs, &path); if (last_element && !is_upper && val == 'x') { d->xwhiteouts = true; ovl_layer_set_xwhiteouts(ofs, d->layer); } else if (val == 'y') { d->stop = true; if (last_element) d->opaque = true; goto out; } } err = ovl_check_redirect(&path, d, prelen, post); if (err) goto out_err; out: *ret = this; return 0; put_and_out: dput(this); this = NULL; goto out; out_err: dput(this); return err; } static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, struct dentry **ret, bool drop_negative) { /* Counting down from the end, since the prefix can change */ size_t rem = d->name.len - 1; struct dentry *dentry = NULL; int err; if (d->name.name[0] != '/') return ovl_lookup_single(base, d, d->name.name, d->name.len, 0, "", ret, drop_negative); while (!IS_ERR_OR_NULL(base) && d_can_lookup(base)) { const char *s = d->name.name + d->name.len - rem; const char *next = strchrnul(s, '/'); size_t thislen = next - s; bool end = !next[0]; /* Verify we did not go off the rails */ if (WARN_ON(s[-1] != '/')) return -EIO; err = ovl_lookup_single(base, d, s, thislen, d->name.len - rem, next, &base, drop_negative); dput(dentry); if (err) return err; dentry = base; if (end) break; rem -= thislen + 1; if (WARN_ON(rem >= d->name.len)) return -EIO; } *ret = dentry; return 0; } static int ovl_lookup_data_layer(struct dentry *dentry, const char *redirect, const struct ovl_layer *layer, struct path *datapath) { int err; err = vfs_path_lookup(layer->mnt->mnt_root, layer->mnt, redirect, LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS | LOOKUP_NO_XDEV, datapath); pr_debug("lookup lowerdata (%pd2, redirect=\"%s\", layer=%d, err=%i)\n", dentry, redirect, layer->idx, err); if (err) return err; err = -EREMOTE; if (ovl_dentry_weird(datapath->dentry)) goto out_path_put; err = -ENOENT; /* Only regular file is acceptable as lower data */ if (!d_is_reg(datapath->dentry)) goto out_path_put; return 0; out_path_put: path_put(datapath); return err; } /* Lookup in data-only layers by absolute redirect to layer root */ static int ovl_lookup_data_layers(struct dentry *dentry, const char *redirect, struct ovl_path *lowerdata) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); const struct ovl_layer *layer; struct path datapath; int err = -ENOENT; int i; layer = &ofs->layers[ofs->numlayer - ofs->numdatalayer]; for (i = 0; i < ofs->numdatalayer; i++, layer++) { err = ovl_lookup_data_layer(dentry, redirect, layer, &datapath); if (!err) { mntput(datapath.mnt); lowerdata->dentry = datapath.dentry; lowerdata->layer = layer; return 0; } } return err; } int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected, struct dentry *upperdentry, struct ovl_path **stackp) { struct dentry *origin = NULL; int i; for (i = 1; i <= ovl_numlowerlayer(ofs); i++) { /* * If lower fs uuid is not unique among lower fs we cannot match * fh->uuid to layer. */ if (ofs->layers[i].fsid && ofs->layers[i].fs->bad_uuid) continue; origin = ovl_decode_real_fh(ofs, fh, ofs->layers[i].mnt, connected); if (origin) break; } if (!origin) return -ESTALE; else if (IS_ERR(origin)) return PTR_ERR(origin); if (upperdentry && !ovl_upper_is_whiteout(ofs, upperdentry) && inode_wrong_type(d_inode(upperdentry), d_inode(origin)->i_mode)) goto invalid; if (!*stackp) *stackp = kmalloc(sizeof(struct ovl_path), GFP_KERNEL); if (!*stackp) { dput(origin); return -ENOMEM; } **stackp = (struct ovl_path){ .dentry = origin, .layer = &ofs->layers[i] }; return 0; invalid: pr_warn_ratelimited("invalid origin (%pd2, ftype=%x, origin ftype=%x).\n", upperdentry, d_inode(upperdentry)->i_mode & S_IFMT, d_inode(origin)->i_mode & S_IFMT); dput(origin); return -ESTALE; } static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry, struct ovl_path **stackp) { struct ovl_fh *fh = ovl_get_fh(ofs, upperdentry, OVL_XATTR_ORIGIN); int err; if (IS_ERR_OR_NULL(fh)) return PTR_ERR(fh); err = ovl_check_origin_fh(ofs, fh, false, upperdentry, stackp); kfree(fh); if (err) { if (err == -ESTALE) return 0; return err; } return 0; } /* * Verify that @fh matches the file handle stored in xattr @name. * Return 0 on match, -ESTALE on mismatch, < 0 on error. */ static int ovl_verify_fh(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox, const struct ovl_fh *fh) { struct ovl_fh *ofh = ovl_get_fh(ofs, dentry, ox); int err = 0; if (!ofh) return -ENODATA; if (IS_ERR(ofh)) return PTR_ERR(ofh); if (fh->fb.len != ofh->fb.len || memcmp(&fh->fb, &ofh->fb, fh->fb.len)) err = -ESTALE; kfree(ofh); return err; } int ovl_verify_set_fh(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox, const struct ovl_fh *fh, bool is_upper, bool set) { int err; err = ovl_verify_fh(ofs, dentry, ox, fh); if (set && err == -ENODATA) err = ovl_setxattr(ofs, dentry, ox, fh->buf, fh->fb.len); return err; } /* * Verify that @real dentry matches the file handle stored in xattr @name. * * If @set is true and there is no stored file handle, encode @real and store * file handle in xattr @name. * * Return 0 on match, -ESTALE on mismatch, -ENODATA on no xattr, < 0 on error. */ int ovl_verify_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox, struct dentry *real, bool is_upper, bool set) { struct inode *inode; struct ovl_fh *fh; int err; fh = ovl_encode_real_fh(ofs, real, is_upper); err = PTR_ERR(fh); if (IS_ERR(fh)) { fh = NULL; goto fail; } err = ovl_verify_set_fh(ofs, dentry, ox, fh, is_upper, set); if (err) goto fail; out: kfree(fh); return err; fail: inode = d_inode(real); pr_warn_ratelimited("failed to verify %s (%pd2, ino=%lu, err=%i)\n", is_upper ? "upper" : "origin", real, inode ? inode->i_ino : 0, err); goto out; } /* Get upper dentry from index */ struct dentry *ovl_index_upper(struct ovl_fs *ofs, struct dentry *index, bool connected) { struct ovl_fh *fh; struct dentry *upper; if (!d_is_dir(index)) return dget(index); fh = ovl_get_fh(ofs, index, OVL_XATTR_UPPER); if (IS_ERR_OR_NULL(fh)) return ERR_CAST(fh); upper = ovl_decode_real_fh(ofs, fh, ovl_upper_mnt(ofs), connected); kfree(fh); if (IS_ERR_OR_NULL(upper)) return upper ?: ERR_PTR(-ESTALE); if (!d_is_dir(upper)) { pr_warn_ratelimited("invalid index upper (%pd2, upper=%pd2).\n", index, upper); dput(upper); return ERR_PTR(-EIO); } return upper; } /* * Verify that an index entry name matches the origin file handle stored in * OVL_XATTR_ORIGIN and that origin file handle can be decoded to lower path. * Return 0 on match, -ESTALE on mismatch or stale origin, < 0 on error. */ int ovl_verify_index(struct ovl_fs *ofs, struct dentry *index) { struct ovl_fh *fh = NULL; size_t len; struct ovl_path origin = { }; struct ovl_path *stack = &origin; struct dentry *upper = NULL; int err; if (!d_inode(index)) return 0; err = -EINVAL; if (index->d_name.len < sizeof(struct ovl_fb)*2) goto fail; err = -ENOMEM; len = index->d_name.len / 2; fh = kzalloc(len + OVL_FH_WIRE_OFFSET, GFP_KERNEL); if (!fh) goto fail; err = -EINVAL; if (hex2bin(fh->buf, index->d_name.name, len)) goto fail; err = ovl_check_fb_len(&fh->fb, len); if (err) goto fail; /* * Whiteout index entries are used as an indication that an exported * overlay file handle should be treated as stale (i.e. after unlink * of the overlay inode). These entries contain no origin xattr. */ if (ovl_is_whiteout(index)) goto out; /* * Verifying directory index entries are not stale is expensive, so * only verify stale dir index if NFS export is enabled. */ if (d_is_dir(index) && !ofs->config.nfs_export) goto out; /* * Directory index entries should have 'upper' xattr pointing to the * real upper dir. Non-dir index entries are hardlinks to the upper * real inode. For non-dir index, we can read the copy up origin xattr * directly from the index dentry, but for dir index we first need to * decode the upper directory. */ upper = ovl_index_upper(ofs, index, false); if (IS_ERR_OR_NULL(upper)) { err = PTR_ERR(upper); /* * Directory index entries with no 'upper' xattr need to be * removed. When dir index entry has a stale 'upper' xattr, * we assume that upper dir was removed and we treat the dir * index as orphan entry that needs to be whited out. */ if (err == -ESTALE) goto orphan; else if (!err) err = -ESTALE; goto fail; } err = ovl_verify_fh(ofs, upper, OVL_XATTR_ORIGIN, fh); dput(upper); if (err) goto fail; /* Check if non-dir index is orphan and don't warn before cleaning it */ if (!d_is_dir(index) && d_inode(index)->i_nlink == 1) { err = ovl_check_origin_fh(ofs, fh, false, index, &stack); if (err) goto fail; if (ovl_get_nlink(ofs, origin.dentry, index, 0) == 0) goto orphan; } out: dput(origin.dentry); kfree(fh); return err; fail: pr_warn_ratelimited("failed to verify index (%pd2, ftype=%x, err=%i)\n", index, d_inode(index)->i_mode & S_IFMT, err); goto out; orphan: pr_warn_ratelimited("orphan index entry (%pd2, ftype=%x, nlink=%u)\n", index, d_inode(index)->i_mode & S_IFMT, d_inode(index)->i_nlink); err = -ENOENT; goto out; } int ovl_get_index_name_fh(const struct ovl_fh *fh, struct qstr *name) { char *n, *s; n = kcalloc(fh->fb.len, 2, GFP_KERNEL); if (!n) return -ENOMEM; s = bin2hex(n, fh->buf, fh->fb.len); *name = (struct qstr) QSTR_INIT(n, s - n); return 0; } /* * Lookup in indexdir for the index entry of a lower real inode or a copy up * origin inode. The index entry name is the hex representation of the lower * inode file handle. * * If the index dentry in negative, then either no lower aliases have been * copied up yet, or aliases have been copied up in older kernels and are * not indexed. * * If the index dentry for a copy up origin inode is positive, but points * to an inode different than the upper inode, then either the upper inode * has been copied up and not indexed or it was indexed, but since then * index dir was cleared. Either way, that index cannot be used to identify * the overlay inode. */ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin, struct qstr *name) { struct ovl_fh *fh; int err; fh = ovl_encode_real_fh(ofs, origin, false); if (IS_ERR(fh)) return PTR_ERR(fh); err = ovl_get_index_name_fh(fh, name); kfree(fh); return err; } /* Lookup index by file handle for NFS export */ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh) { struct dentry *index; struct qstr name; int err; err = ovl_get_index_name_fh(fh, &name); if (err) return ERR_PTR(err); index = lookup_positive_unlocked(name.name, ofs->workdir, name.len); kfree(name.name); if (IS_ERR(index)) { if (PTR_ERR(index) == -ENOENT) index = NULL; return index; } if (ovl_is_whiteout(index)) err = -ESTALE; else if (ovl_dentry_weird(index)) err = -EIO; else return index; dput(index); return ERR_PTR(err); } struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, struct dentry *origin, bool verify) { struct dentry *index; struct inode *inode; struct qstr name; bool is_dir = d_is_dir(origin); int err; err = ovl_get_index_name(ofs, origin, &name); if (err) return ERR_PTR(err); index = lookup_one_positive_unlocked(ovl_upper_mnt_idmap(ofs), name.name, ofs->workdir, name.len); if (IS_ERR(index)) { err = PTR_ERR(index); if (err == -ENOENT) { index = NULL; goto out; } pr_warn_ratelimited("failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n" "overlayfs: mount with '-o index=off' to disable inodes index.\n", d_inode(origin)->i_ino, name.len, name.name, err); goto out; } inode = d_inode(index); if (ovl_is_whiteout(index) && !verify) { /* * When index lookup is called with !verify for decoding an * overlay file handle, a whiteout index implies that decode * should treat file handle as stale and no need to print a * warning about it. */ dput(index); index = ERR_PTR(-ESTALE); goto out; } else if (ovl_dentry_weird(index) || ovl_is_whiteout(index) || inode_wrong_type(inode, d_inode(origin)->i_mode)) { /* * Index should always be of the same file type as origin * except for the case of a whiteout index. A whiteout * index should only exist if all lower aliases have been * unlinked, which means that finding a lower origin on lookup * whose index is a whiteout should be treated as an error. */ pr_warn_ratelimited("bad index found (index=%pd2, ftype=%x, origin ftype=%x).\n", index, d_inode(index)->i_mode & S_IFMT, d_inode(origin)->i_mode & S_IFMT); goto fail; } else if (is_dir && verify) { if (!upper) { pr_warn_ratelimited("suspected uncovered redirected dir found (origin=%pd2, index=%pd2).\n", origin, index); goto fail; } /* Verify that dir index 'upper' xattr points to upper dir */ err = ovl_verify_upper(ofs, index, upper, false); if (err) { if (err == -ESTALE) { pr_warn_ratelimited("suspected multiply redirected dir found (upper=%pd2, origin=%pd2, index=%pd2).\n", upper, origin, index); } goto fail; } } else if (upper && d_inode(upper) != inode) { goto out_dput; } out: kfree(name.name); return index; out_dput: dput(index); index = NULL; goto out; fail: dput(index); index = ERR_PTR(-EIO); goto out; } /* * Returns next layer in stack starting from top. * Returns -1 if this is the last layer. */ int ovl_path_next(int idx, struct dentry *dentry, struct path *path, const struct ovl_layer **layer) { struct ovl_entry *oe = OVL_E(dentry); struct ovl_path *lowerstack = ovl_lowerstack(oe); BUG_ON(idx < 0); if (idx == 0) { ovl_path_upper(dentry, path); if (path->dentry) { *layer = &OVL_FS(dentry->d_sb)->layers[0]; return ovl_numlower(oe) ? 1 : -1; } idx++; } BUG_ON(idx > ovl_numlower(oe)); path->dentry = lowerstack[idx - 1].dentry; *layer = lowerstack[idx - 1].layer; path->mnt = (*layer)->mnt; return (idx < ovl_numlower(oe)) ? idx + 1 : -1; } /* Fix missing 'origin' xattr */ static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry, struct dentry *lower, struct dentry *upper) { const struct ovl_fh *fh; int err; if (ovl_check_origin_xattr(ofs, upper)) return 0; fh = ovl_get_origin_fh(ofs, lower); if (IS_ERR(fh)) return PTR_ERR(fh); err = ovl_want_write(dentry); if (err) goto out; err = ovl_set_origin_fh(ofs, fh, upper); if (!err) err = ovl_set_impure(dentry->d_parent, upper->d_parent); ovl_drop_write(dentry); out: kfree(fh); return err; } static int ovl_maybe_validate_verity(struct dentry *dentry) { struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct inode *inode = d_inode(dentry); struct path datapath, metapath; int err; if (!ofs->config.verity_mode || !ovl_is_metacopy_dentry(dentry) || ovl_test_flag(OVL_VERIFIED_DIGEST, inode)) return 0; if (!ovl_test_flag(OVL_HAS_DIGEST, inode)) { if (ofs->config.verity_mode == OVL_VERITY_REQUIRE) { pr_warn_ratelimited("metacopy file '%pd' has no digest specified\n", dentry); return -EIO; } return 0; } ovl_path_lowerdata(dentry, &datapath); if (!datapath.dentry) return -EIO; ovl_path_real(dentry, &metapath); if (!metapath.dentry) return -EIO; err = ovl_inode_lock_interruptible(inode); if (err) return err; if (!ovl_test_flag(OVL_VERIFIED_DIGEST, inode)) { const struct cred *old_cred; old_cred = ovl_override_creds(dentry->d_sb); err = ovl_validate_verity(ofs, &metapath, &datapath); if (err == 0) ovl_set_flag(OVL_VERIFIED_DIGEST, inode); revert_creds(old_cred); } ovl_inode_unlock(inode); return err; } /* Lazy lookup of lowerdata */ static int ovl_maybe_lookup_lowerdata(struct dentry *dentry) { struct inode *inode = d_inode(dentry); const char *redirect = ovl_lowerdata_redirect(inode); struct ovl_path datapath = {}; const struct cred *old_cred; int err; if (!redirect || ovl_dentry_lowerdata(dentry)) return 0; if (redirect[0] != '/') return -EIO; err = ovl_inode_lock_interruptible(inode); if (err) return err; err = 0; /* Someone got here before us? */ if (ovl_dentry_lowerdata(dentry)) goto out; old_cred = ovl_override_creds(dentry->d_sb); err = ovl_lookup_data_layers(dentry, redirect, &datapath); revert_creds(old_cred); if (err) goto out_err; err = ovl_dentry_set_lowerdata(dentry, &datapath); if (err) goto out_err; out: ovl_inode_unlock(inode); dput(datapath.dentry); return err; out_err: pr_warn_ratelimited("lazy lowerdata lookup failed (%pd2, err=%i)\n", dentry, err); goto out; } int ovl_verify_lowerdata(struct dentry *dentry) { int err; err = ovl_maybe_lookup_lowerdata(dentry); if (err) return err; return ovl_maybe_validate_verity(dentry); } struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ovl_entry *oe = NULL; const struct cred *old_cred; struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct ovl_entry *poe = OVL_E(dentry->d_parent); struct ovl_entry *roe = OVL_E(dentry->d_sb->s_root); struct ovl_path *stack = NULL, *origin_path = NULL; struct dentry *upperdir, *upperdentry = NULL; struct dentry *origin = NULL; struct dentry *index = NULL; unsigned int ctr = 0; struct inode *inode = NULL; bool upperopaque = false; char *upperredirect = NULL; struct dentry *this; unsigned int i; int err; bool uppermetacopy = false; int metacopy_size = 0; struct ovl_lookup_data d = { .sb = dentry->d_sb, .name = dentry->d_name, .is_dir = false, .opaque = false, .stop = false, .last = ovl_redirect_follow(ofs) ? false : !ovl_numlower(poe), .redirect = NULL, .metacopy = 0, }; if (dentry->d_name.len > ofs->namelen) return ERR_PTR(-ENAMETOOLONG); old_cred = ovl_override_creds(dentry->d_sb); upperdir = ovl_dentry_upper(dentry->d_parent); if (upperdir) { d.layer = &ofs->layers[0]; err = ovl_lookup_layer(upperdir, &d, &upperdentry, true); if (err) goto out; if (upperdentry && upperdentry->d_flags & DCACHE_OP_REAL) { dput(upperdentry); err = -EREMOTE; goto out; } if (upperdentry && !d.is_dir) { /* * Lookup copy up origin by decoding origin file handle. * We may get a disconnected dentry, which is fine, * because we only need to hold the origin inode in * cache and use its inode number. We may even get a * connected dentry, that is not under any of the lower * layers root. That is also fine for using it's inode * number - it's the same as if we held a reference * to a dentry in lower layer that was moved under us. */ err = ovl_check_origin(ofs, upperdentry, &origin_path); if (err) goto out_put_upper; if (d.metacopy) uppermetacopy = true; metacopy_size = d.metacopy; } if (d.redirect) { err = -ENOMEM; upperredirect = kstrdup(d.redirect, GFP_KERNEL); if (!upperredirect) goto out_put_upper; if (d.redirect[0] == '/') poe = roe; } upperopaque = d.opaque; } if (!d.stop && ovl_numlower(poe)) { err = -ENOMEM; stack = ovl_stack_alloc(ofs->numlayer - 1); if (!stack) goto out_put_upper; } for (i = 0; !d.stop && i < ovl_numlower(poe); i++) { struct ovl_path lower = ovl_lowerstack(poe)[i]; if (!ovl_redirect_follow(ofs)) d.last = i == ovl_numlower(poe) - 1; else if (d.is_dir || !ofs->numdatalayer) d.last = lower.layer->idx == ovl_numlower(roe); d.layer = lower.layer; err = ovl_lookup_layer(lower.dentry, &d, &this, false); if (err) goto out_put; if (!this) continue; if ((uppermetacopy || d.metacopy) && !ofs->config.metacopy) { dput(this); err = -EPERM; pr_warn_ratelimited("refusing to follow metacopy origin for (%pd2)\n", dentry); goto out_put; } /* * If no origin fh is stored in upper of a merge dir, store fh * of lower dir and set upper parent "impure". */ if (upperdentry && !ctr && !ofs->noxattr && d.is_dir) { err = ovl_fix_origin(ofs, dentry, this, upperdentry); if (err) { dput(this); goto out_put; } } /* * When "verify_lower" feature is enabled, do not merge with a * lower dir that does not match a stored origin xattr. In any * case, only verified origin is used for index lookup. * * For non-dir dentry, if index=on, then ensure origin * matches the dentry found using path based lookup, * otherwise error out. */ if (upperdentry && !ctr && ((d.is_dir && ovl_verify_lower(dentry->d_sb)) || (!d.is_dir && ofs->config.index && origin_path))) { err = ovl_verify_origin(ofs, upperdentry, this, false); if (err) { dput(this); if (d.is_dir) break; goto out_put; } origin = this; } if (!upperdentry && !d.is_dir && !ctr && d.metacopy) metacopy_size = d.metacopy; if (d.metacopy && ctr) { /* * Do not store intermediate metacopy dentries in * lower chain, except top most lower metacopy dentry. * Continue the loop so that if there is an absolute * redirect on this dentry, poe can be reset to roe. */ dput(this); this = NULL; } else { stack[ctr].dentry = this; stack[ctr].layer = lower.layer; ctr++; } /* * Following redirects can have security consequences: it's like * a symlink into the lower layer without the permission checks. * This is only a problem if the upper layer is untrusted (e.g * comes from an USB drive). This can allow a non-readable file * or directory to become readable. * * Only following redirects when redirects are enabled disables * this attack vector when not necessary. */ err = -EPERM; if (d.redirect && !ovl_redirect_follow(ofs)) { pr_warn_ratelimited("refusing to follow redirect for (%pd2)\n", dentry); goto out_put; } if (d.stop) break; if (d.redirect && d.redirect[0] == '/' && poe != roe) { poe = roe; /* Find the current layer on the root dentry */ i = lower.layer->idx - 1; } } /* Defer lookup of lowerdata in data-only layers to first access */ if (d.metacopy && ctr && ofs->numdatalayer && d.absolute_redirect) { d.metacopy = 0; ctr++; } /* * For regular non-metacopy upper dentries, there is no lower * path based lookup, hence ctr will be zero. If a dentry is found * using ORIGIN xattr on upper, install it in stack. * * For metacopy dentry, path based lookup will find lower dentries. * Just make sure a corresponding data dentry has been found. */ if (d.metacopy || (uppermetacopy && !ctr)) { pr_warn_ratelimited("metacopy with no lower data found - abort lookup (%pd2)\n", dentry); err = -EIO; goto out_put; } else if (!d.is_dir && upperdentry && !ctr && origin_path) { if (WARN_ON(stack != NULL)) { err = -EIO; goto out_put; } stack = origin_path; ctr = 1; origin = origin_path->dentry; origin_path = NULL; } /* * Always lookup index if there is no-upperdentry. * * For the case of upperdentry, we have set origin by now if it * needed to be set. There are basically three cases. * * For directories, lookup index by lower inode and verify it matches * upper inode. We only trust dir index if we verified that lower dir * matches origin, otherwise dir index entries may be inconsistent * and we ignore them. * * For regular upper, we already set origin if upper had ORIGIN * xattr. There is no verification though as there is no path * based dentry lookup in lower in this case. * * For metacopy upper, we set a verified origin already if index * is enabled and if upper had an ORIGIN xattr. * */ if (!upperdentry && ctr) origin = stack[0].dentry; if (origin && ovl_indexdir(dentry->d_sb) && (!d.is_dir || ovl_index_all(dentry->d_sb))) { index = ovl_lookup_index(ofs, upperdentry, origin, true); if (IS_ERR(index)) { err = PTR_ERR(index); index = NULL; goto out_put; } } if (ctr) { oe = ovl_alloc_entry(ctr); err = -ENOMEM; if (!oe) goto out_put; ovl_stack_cpy(ovl_lowerstack(oe), stack, ctr); } if (upperopaque) ovl_dentry_set_opaque(dentry); if (d.xwhiteouts) ovl_dentry_set_xwhiteouts(dentry); if (upperdentry) ovl_dentry_set_upper_alias(dentry); else if (index) { struct path upperpath = { .dentry = upperdentry = dget(index), .mnt = ovl_upper_mnt(ofs), }; /* * It's safe to assign upperredirect here: the previous * assignment of happens only if upperdentry is non-NULL, and * this one only if upperdentry is NULL. */ upperredirect = ovl_get_redirect_xattr(ofs, &upperpath, 0); if (IS_ERR(upperredirect)) { err = PTR_ERR(upperredirect); upperredirect = NULL; goto out_free_oe; } err = ovl_check_metacopy_xattr(ofs, &upperpath, NULL); if (err < 0) goto out_free_oe; uppermetacopy = err; metacopy_size = err; } if (upperdentry || ctr) { struct ovl_inode_params oip = { .upperdentry = upperdentry, .oe = oe, .index = index, .redirect = upperredirect, }; /* Store lowerdata redirect for lazy lookup */ if (ctr > 1 && !d.is_dir && !stack[ctr - 1].dentry) { oip.lowerdata_redirect = d.redirect; d.redirect = NULL; } inode = ovl_get_inode(dentry->d_sb, &oip); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_free_oe; if (upperdentry && !uppermetacopy) ovl_set_flag(OVL_UPPERDATA, inode); if (metacopy_size > OVL_METACOPY_MIN_SIZE) ovl_set_flag(OVL_HAS_DIGEST, inode); } ovl_dentry_init_reval(dentry, upperdentry, OVL_I_E(inode)); revert_creds(old_cred); if (origin_path) { dput(origin_path->dentry); kfree(origin_path); } dput(index); ovl_stack_free(stack, ctr); kfree(d.redirect); return d_splice_alias(inode, dentry); out_free_oe: ovl_free_entry(oe); out_put: dput(index); ovl_stack_free(stack, ctr); out_put_upper: if (origin_path) { dput(origin_path->dentry); kfree(origin_path); } dput(upperdentry); kfree(upperredirect); out: kfree(d.redirect); revert_creds(old_cred); return ERR_PTR(err); } bool ovl_lower_positive(struct dentry *dentry) { struct ovl_entry *poe = OVL_E(dentry->d_parent); const struct qstr *name = &dentry->d_name; const struct cred *old_cred; unsigned int i; bool positive = false; bool done = false; /* * If dentry is negative, then lower is positive iff this is a * whiteout. */ if (!dentry->d_inode) return ovl_dentry_is_opaque(dentry); /* Negative upper -> positive lower */ if (!ovl_dentry_upper(dentry)) return true; old_cred = ovl_override_creds(dentry->d_sb); /* Positive upper -> have to look up lower to see whether it exists */ for (i = 0; !done && !positive && i < ovl_numlower(poe); i++) { struct dentry *this; struct ovl_path *parentpath = &ovl_lowerstack(poe)[i]; this = lookup_one_positive_unlocked( mnt_idmap(parentpath->layer->mnt), name->name, parentpath->dentry, name->len); if (IS_ERR(this)) { switch (PTR_ERR(this)) { case -ENOENT: case -ENAMETOOLONG: break; default: /* * Assume something is there, we just couldn't * access it. */ positive = true; break; } } else { struct path path = { .dentry = this, .mnt = parentpath->layer->mnt, }; positive = !ovl_path_is_whiteout(OVL_FS(dentry->d_sb), &path); done = true; dput(this); } } revert_creds(old_cred); return positive; }
87 87 87 1 86 1 3 82 65 22 22 65 178 3 176 3 1 17 16 2 1 42 41 13 13 32 6 7 58 57 1 17 16 1 460 314 125 72 226 149 8 86 123 10 17 3 93 12 84 107 365 114 12 102 375 482 485 1 1 16 16 20 1 41 42 49 2 21 7 28 3 36 1 20 19 40 5 367 30 8 416 7 51 421 21 447 484 85 32 25 5 1 1 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 // SPDX-License-Identifier: GPL-2.0-or-later /* Manage a process's keyrings * * Copyright (C) 2004-2005, 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/init.h> #include <linux/sched.h> #include <linux/sched/user.h> #include <linux/keyctl.h> #include <linux/fs.h> #include <linux/err.h> #include <linux/mutex.h> #include <linux/security.h> #include <linux/user_namespace.h> #include <linux/uaccess.h> #include <linux/init_task.h> #include <keys/request_key_auth-type.h> #include "internal.h" /* Session keyring create vs join semaphore */ static DEFINE_MUTEX(key_session_mutex); /* The root user's tracking struct */ struct key_user root_key_user = { .usage = REFCOUNT_INIT(3), .cons_lock = __MUTEX_INITIALIZER(root_key_user.cons_lock), .lock = __SPIN_LOCK_UNLOCKED(root_key_user.lock), .nkeys = ATOMIC_INIT(2), .nikeys = ATOMIC_INIT(2), .uid = GLOBAL_ROOT_UID, }; /* * Get or create a user register keyring. */ static struct key *get_user_register(struct user_namespace *user_ns) { struct key *reg_keyring = READ_ONCE(user_ns->user_keyring_register); if (reg_keyring) return reg_keyring; down_write(&user_ns->keyring_sem); /* Make sure there's a register keyring. It gets owned by the * user_namespace's owner. */ reg_keyring = user_ns->user_keyring_register; if (!reg_keyring) { reg_keyring = keyring_alloc(".user_reg", user_ns->owner, INVALID_GID, &init_cred, KEY_POS_WRITE | KEY_POS_SEARCH | KEY_USR_VIEW | KEY_USR_READ, 0, NULL, NULL); if (!IS_ERR(reg_keyring)) smp_store_release(&user_ns->user_keyring_register, reg_keyring); } up_write(&user_ns->keyring_sem); /* We don't return a ref since the keyring is pinned by the user_ns */ return reg_keyring; } /* * Look up the user and user session keyrings for the current process's UID, * creating them if they don't exist. */ int look_up_user_keyrings(struct key **_user_keyring, struct key **_user_session_keyring) { const struct cred *cred = current_cred(); struct user_namespace *user_ns = current_user_ns(); struct key *reg_keyring, *uid_keyring, *session_keyring; key_perm_t user_keyring_perm; key_ref_t uid_keyring_r, session_keyring_r; uid_t uid = from_kuid(user_ns, cred->user->uid); char buf[20]; int ret; user_keyring_perm = (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_ALL; kenter("%u", uid); reg_keyring = get_user_register(user_ns); if (IS_ERR(reg_keyring)) return PTR_ERR(reg_keyring); down_write(&user_ns->keyring_sem); ret = 0; /* Get the user keyring. Note that there may be one in existence * already as it may have been pinned by a session, but the user_struct * pointing to it may have been destroyed by setuid. */ snprintf(buf, sizeof(buf), "_uid.%u", uid); uid_keyring_r = keyring_search(make_key_ref(reg_keyring, true), &key_type_keyring, buf, false); kdebug("_uid %p", uid_keyring_r); if (uid_keyring_r == ERR_PTR(-EAGAIN)) { uid_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, reg_keyring); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); goto error; } } else if (IS_ERR(uid_keyring_r)) { ret = PTR_ERR(uid_keyring_r); goto error; } else { uid_keyring = key_ref_to_ptr(uid_keyring_r); } /* Get a default session keyring (which might also exist already) */ snprintf(buf, sizeof(buf), "_uid_ses.%u", uid); session_keyring_r = keyring_search(make_key_ref(reg_keyring, true), &key_type_keyring, buf, false); kdebug("_uid_ses %p", session_keyring_r); if (session_keyring_r == ERR_PTR(-EAGAIN)) { session_keyring = keyring_alloc(buf, cred->user->uid, INVALID_GID, cred, user_keyring_perm, KEY_ALLOC_UID_KEYRING | KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error_release; } /* We install a link from the user session keyring to * the user keyring. */ ret = key_link(session_keyring, uid_keyring); if (ret < 0) goto error_release_session; /* And only then link the user-session keyring to the * register. */ ret = key_link(reg_keyring, session_keyring); if (ret < 0) goto error_release_session; } else if (IS_ERR(session_keyring_r)) { ret = PTR_ERR(session_keyring_r); goto error_release; } else { session_keyring = key_ref_to_ptr(session_keyring_r); } up_write(&user_ns->keyring_sem); if (_user_session_keyring) *_user_session_keyring = session_keyring; else key_put(session_keyring); if (_user_keyring) *_user_keyring = uid_keyring; else key_put(uid_keyring); kleave(" = 0"); return 0; error_release_session: key_put(session_keyring); error_release: key_put(uid_keyring); error: up_write(&user_ns->keyring_sem); kleave(" = %d", ret); return ret; } /* * Get the user session keyring if it exists, but don't create it if it * doesn't. */ struct key *get_user_session_keyring_rcu(const struct cred *cred) { struct key *reg_keyring = READ_ONCE(cred->user_ns->user_keyring_register); key_ref_t session_keyring_r; char buf[20]; struct keyring_search_context ctx = { .index_key.type = &key_type_keyring, .index_key.description = buf, .cred = cred, .match_data.cmp = key_default_cmp, .match_data.raw_data = buf, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = KEYRING_SEARCH_DO_STATE_CHECK, }; if (!reg_keyring) return NULL; ctx.index_key.desc_len = snprintf(buf, sizeof(buf), "_uid_ses.%u", from_kuid(cred->user_ns, cred->user->uid)); session_keyring_r = keyring_search_rcu(make_key_ref(reg_keyring, true), &ctx); if (IS_ERR(session_keyring_r)) return NULL; return key_ref_to_ptr(session_keyring_r); } /* * Install a thread keyring to the given credentials struct if it didn't have * one already. This is allowed to overrun the quota. * * Return: 0 if a thread keyring is now present; -errno on failure. */ int install_thread_keyring_to_cred(struct cred *new) { struct key *keyring; if (new->thread_keyring) return 0; keyring = keyring_alloc("_tid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); new->thread_keyring = keyring; return 0; } /* * Install a thread keyring to the current task if it didn't have one already. * * Return: 0 if a thread keyring is now present; -errno on failure. */ static int install_thread_keyring(void) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_thread_keyring_to_cred(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Install a process keyring to the given credentials struct if it didn't have * one already. This is allowed to overrun the quota. * * Return: 0 if a process keyring is now present; -errno on failure. */ int install_process_keyring_to_cred(struct cred *new) { struct key *keyring; if (new->process_keyring) return 0; keyring = keyring_alloc("_pid", new->uid, new->gid, new, KEY_POS_ALL | KEY_USR_VIEW, KEY_ALLOC_QUOTA_OVERRUN, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); new->process_keyring = keyring; return 0; } /* * Install a process keyring to the current task if it didn't have one already. * * Return: 0 if a process keyring is now present; -errno on failure. */ static int install_process_keyring(void) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_process_keyring_to_cred(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Install the given keyring as the session keyring of the given credentials * struct, replacing the existing one if any. If the given keyring is NULL, * then install a new anonymous session keyring. * @cred can not be in use by any task yet. * * Return: 0 on success; -errno on failure. */ int install_session_keyring_to_cred(struct cred *cred, struct key *keyring) { unsigned long flags; struct key *old; might_sleep(); /* create an empty session keyring */ if (!keyring) { flags = KEY_ALLOC_QUOTA_OVERRUN; if (cred->session_keyring) flags = KEY_ALLOC_IN_QUOTA; keyring = keyring_alloc("_ses", cred->uid, cred->gid, cred, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ, flags, NULL, NULL); if (IS_ERR(keyring)) return PTR_ERR(keyring); } else { __key_get(keyring); } /* install the keyring */ old = cred->session_keyring; cred->session_keyring = keyring; if (old) key_put(old); return 0; } /* * Install the given keyring as the session keyring of the current task, * replacing the existing one if any. If the given keyring is NULL, then * install a new anonymous session keyring. * * Return: 0 on success; -errno on failure. */ static int install_session_keyring(struct key *keyring) { struct cred *new; int ret; new = prepare_creds(); if (!new) return -ENOMEM; ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); } /* * Handle the fsuid changing. */ void key_fsuid_changed(struct cred *new_cred) { /* update the ownership of the thread keyring */ if (new_cred->thread_keyring) { down_write(&new_cred->thread_keyring->sem); new_cred->thread_keyring->uid = new_cred->fsuid; up_write(&new_cred->thread_keyring->sem); } } /* * Handle the fsgid changing. */ void key_fsgid_changed(struct cred *new_cred) { /* update the ownership of the thread keyring */ if (new_cred->thread_keyring) { down_write(&new_cred->thread_keyring->sem); new_cred->thread_keyring->gid = new_cred->fsgid; up_write(&new_cred->thread_keyring->sem); } } /* * Search the process keyrings attached to the supplied cred for the first * matching key under RCU conditions (the caller must be holding the RCU read * lock). * * The search criteria are the type and the match function. The description is * given to the match function as a parameter, but doesn't otherwise influence * the search. Typically the match function will compare the description * parameter to the key's description. * * This can only search keyrings that grant Search permission to the supplied * credentials. Keyrings linked to searched keyrings will also be searched if * they grant Search permission too. Keys can only be found if they grant * Search permission to the credentials. * * Returns a pointer to the key with the key usage count incremented if * successful, -EAGAIN if we didn't find any matching key or -ENOKEY if we only * matched negative keys. * * In the case of a successful return, the possession attribute is set on the * returned key reference. */ key_ref_t search_cred_keyrings_rcu(struct keyring_search_context *ctx) { struct key *user_session; key_ref_t key_ref, ret, err; const struct cred *cred = ctx->cred; /* we want to return -EAGAIN or -ENOKEY if any of the keyrings were * searchable, but we failed to find a key or we found a negative key; * otherwise we want to return a sample error (probably -EACCES) if * none of the keyrings were searchable * * in terms of priority: success > -ENOKEY > -EAGAIN > other error */ key_ref = NULL; ret = NULL; err = ERR_PTR(-EAGAIN); /* search the thread keyring first */ if (cred->thread_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->thread_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* search the process keyring second */ if (cred->process_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->process_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* search the session keyring */ if (cred->session_keyring) { key_ref = keyring_search_rcu( make_key_ref(cred->session_keyring, 1), ctx); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* or search the user-session keyring */ else if ((user_session = get_user_session_keyring_rcu(cred))) { key_ref = keyring_search_rcu(make_key_ref(user_session, 1), ctx); key_put(user_session); if (!IS_ERR(key_ref)) goto found; switch (PTR_ERR(key_ref)) { case -EAGAIN: /* no key */ if (ret) break; fallthrough; case -ENOKEY: /* negative key */ ret = key_ref; break; default: err = key_ref; break; } } /* no key - decide on the error we're going to go for */ key_ref = ret ? ret : err; found: return key_ref; } /* * Search the process keyrings attached to the supplied cred for the first * matching key in the manner of search_my_process_keyrings(), but also search * the keys attached to the assumed authorisation key using its credentials if * one is available. * * The caller must be holding the RCU read lock. * * Return same as search_cred_keyrings_rcu(). */ key_ref_t search_process_keyrings_rcu(struct keyring_search_context *ctx) { struct request_key_auth *rka; key_ref_t key_ref, ret = ERR_PTR(-EACCES), err; key_ref = search_cred_keyrings_rcu(ctx); if (!IS_ERR(key_ref)) goto found; err = key_ref; /* if this process has an instantiation authorisation key, then we also * search the keyrings of the process mentioned there * - we don't permit access to request_key auth keys via this method */ if (ctx->cred->request_key_auth && ctx->cred == current_cred() && ctx->index_key.type != &key_type_request_key_auth ) { const struct cred *cred = ctx->cred; if (key_validate(cred->request_key_auth) == 0) { rka = ctx->cred->request_key_auth->payload.data[0]; //// was search_process_keyrings() [ie. recursive] ctx->cred = rka->cred; key_ref = search_cred_keyrings_rcu(ctx); ctx->cred = cred; if (!IS_ERR(key_ref)) goto found; ret = key_ref; } } /* no key - decide on the error we're going to go for */ if (err == ERR_PTR(-ENOKEY) || ret == ERR_PTR(-ENOKEY)) key_ref = ERR_PTR(-ENOKEY); else if (err == ERR_PTR(-EACCES)) key_ref = ret; else key_ref = err; found: return key_ref; } /* * See if the key we're looking at is the target key. */ bool lookup_user_key_possessed(const struct key *key, const struct key_match_data *match_data) { return key == match_data->raw_data; } /* * Look up a key ID given us by userspace with a given permissions mask to get * the key it refers to. * * Flags can be passed to request that special keyrings be created if referred * to directly, to permit partially constructed keys to be found and to skip * validity and permission checks on the found key. * * Returns a pointer to the key with an incremented usage count if successful; * -EINVAL if the key ID is invalid; -ENOKEY if the key ID does not correspond * to a key or the best found key was a negative key; -EKEYREVOKED or * -EKEYEXPIRED if the best found key was revoked or expired; -EACCES if the * found key doesn't grant the requested permit or the LSM denied access to it; * or -ENOMEM if a special keyring couldn't be created. * * In the case of a successful return, the possession attribute is set on the * returned key reference. */ key_ref_t lookup_user_key(key_serial_t id, unsigned long lflags, enum key_need_perm need_perm) { struct keyring_search_context ctx = { .match_data.cmp = lookup_user_key_possessed, .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, .flags = (KEYRING_SEARCH_NO_STATE_CHECK | KEYRING_SEARCH_RECURSE), }; struct request_key_auth *rka; struct key *key, *user_session; key_ref_t key_ref, skey_ref; int ret; try_again: ctx.cred = get_current_cred(); key_ref = ERR_PTR(-ENOKEY); switch (id) { case KEY_SPEC_THREAD_KEYRING: if (!ctx.cred->thread_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_thread_keyring(); if (ret < 0) { key_ref = ERR_PTR(ret); goto error; } goto reget_creds; } key = ctx.cred->thread_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_PROCESS_KEYRING: if (!ctx.cred->process_keyring) { if (!(lflags & KEY_LOOKUP_CREATE)) goto error; ret = install_process_keyring(); if (ret < 0) { key_ref = ERR_PTR(ret); goto error; } goto reget_creds; } key = ctx.cred->process_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_SESSION_KEYRING: if (!ctx.cred->session_keyring) { /* always install a session keyring upon access if one * doesn't exist yet */ ret = look_up_user_keyrings(NULL, &user_session); if (ret < 0) goto error; if (lflags & KEY_LOOKUP_CREATE) ret = join_session_keyring(NULL); else ret = install_session_keyring(user_session); key_put(user_session); if (ret < 0) goto error; goto reget_creds; } else if (test_bit(KEY_FLAG_UID_KEYRING, &ctx.cred->session_keyring->flags) && lflags & KEY_LOOKUP_CREATE) { ret = join_session_keyring(NULL); if (ret < 0) goto error; goto reget_creds; } key = ctx.cred->session_keyring; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_KEYRING: ret = look_up_user_keyrings(&key, NULL); if (ret < 0) goto error; key_ref = make_key_ref(key, 1); break; case KEY_SPEC_USER_SESSION_KEYRING: ret = look_up_user_keyrings(NULL, &key); if (ret < 0) goto error; key_ref = make_key_ref(key, 1); break; case KEY_SPEC_GROUP_KEYRING: /* group keyrings are not yet supported */ key_ref = ERR_PTR(-EINVAL); goto error; case KEY_SPEC_REQKEY_AUTH_KEY: key = ctx.cred->request_key_auth; if (!key) goto error; __key_get(key); key_ref = make_key_ref(key, 1); break; case KEY_SPEC_REQUESTOR_KEYRING: if (!ctx.cred->request_key_auth) goto error; down_read(&ctx.cred->request_key_auth->sem); if (test_bit(KEY_FLAG_REVOKED, &ctx.cred->request_key_auth->flags)) { key_ref = ERR_PTR(-EKEYREVOKED); key = NULL; } else { rka = ctx.cred->request_key_auth->payload.data[0]; key = rka->dest_keyring; __key_get(key); } up_read(&ctx.cred->request_key_auth->sem); if (!key) goto error; key_ref = make_key_ref(key, 1); break; default: key_ref = ERR_PTR(-EINVAL); if (id < 1) goto error; key = key_lookup(id); if (IS_ERR(key)) { key_ref = ERR_CAST(key); goto error; } key_ref = make_key_ref(key, 0); /* check to see if we possess the key */ ctx.index_key = key->index_key; ctx.match_data.raw_data = key; kdebug("check possessed"); rcu_read_lock(); skey_ref = search_process_keyrings_rcu(&ctx); rcu_read_unlock(); kdebug("possessed=%p", skey_ref); if (!IS_ERR(skey_ref)) { key_put(key); key_ref = skey_ref; } break; } /* unlink does not use the nominated key in any way, so can skip all * the permission checks as it is only concerned with the keyring */ if (need_perm != KEY_NEED_UNLINK) { if (!(lflags & KEY_LOOKUP_PARTIAL)) { ret = wait_for_key_construction(key, true); switch (ret) { case -ERESTARTSYS: goto invalid_key; default: if (need_perm != KEY_AUTHTOKEN_OVERRIDE && need_perm != KEY_DEFER_PERM_CHECK) goto invalid_key; break; case 0: break; } } else if (need_perm != KEY_DEFER_PERM_CHECK) { ret = key_validate(key); if (ret < 0) goto invalid_key; } ret = -EIO; if (!(lflags & KEY_LOOKUP_PARTIAL) && key_read_state(key) == KEY_IS_UNINSTANTIATED) goto invalid_key; } /* check the permissions */ ret = key_task_permission(key_ref, ctx.cred, need_perm); if (ret < 0) goto invalid_key; key->last_used_at = ktime_get_real_seconds(); error: put_cred(ctx.cred); return key_ref; invalid_key: key_ref_put(key_ref); key_ref = ERR_PTR(ret); goto error; /* if we attempted to install a keyring, then it may have caused new * creds to be installed */ reget_creds: put_cred(ctx.cred); goto try_again; } EXPORT_SYMBOL(lookup_user_key); /* * Join the named keyring as the session keyring if possible else attempt to * create a new one of that name and join that. * * If the name is NULL, an empty anonymous keyring will be installed as the * session keyring. * * Named session keyrings are joined with a semaphore held to prevent the * keyrings from going away whilst the attempt is made to going them and also * to prevent a race in creating compatible session keyrings. */ long join_session_keyring(const char *name) { const struct cred *old; struct cred *new; struct key *keyring; long ret, serial; new = prepare_creds(); if (!new) return -ENOMEM; old = current_cred(); /* if no name is provided, install an anonymous keyring */ if (!name) { ret = install_session_keyring_to_cred(new, NULL); if (ret < 0) goto error; serial = new->session_keyring->serial; ret = commit_creds(new); if (ret == 0) ret = serial; goto okay; } /* allow the user to join or create a named keyring */ mutex_lock(&key_session_mutex); /* look for an existing keyring of this name */ keyring = find_keyring_by_name(name, false); if (PTR_ERR(keyring) == -ENOKEY) { /* not found - try and create a new one */ keyring = keyring_alloc( name, old->uid, old->gid, old, KEY_POS_ALL | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_LINK, KEY_ALLOC_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } } else if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto error2; } else if (keyring == new->session_keyring) { ret = 0; goto error3; } /* we've got a keyring - now to install it */ ret = install_session_keyring_to_cred(new, keyring); if (ret < 0) goto error3; commit_creds(new); mutex_unlock(&key_session_mutex); ret = keyring->serial; key_put(keyring); okay: return ret; error3: key_put(keyring); error2: mutex_unlock(&key_session_mutex); error: abort_creds(new); return ret; } /* * Replace a process's session keyring on behalf of one of its children when * the target process is about to resume userspace execution. */ void key_change_session_keyring(struct callback_head *twork) { const struct cred *old = current_cred(); struct cred *new = container_of(twork, struct cred, rcu); if (unlikely(current->flags & PF_EXITING)) { put_cred(new); return; } /* If get_ucounts fails more bits are needed in the refcount */ if (unlikely(!get_ucounts(old->ucounts))) { WARN_ONCE(1, "In %s get_ucounts failed\n", __func__); put_cred(new); return; } new-> uid = old-> uid; new-> euid = old-> euid; new-> suid = old-> suid; new->fsuid = old->fsuid; new-> gid = old-> gid; new-> egid = old-> egid; new-> sgid = old-> sgid; new->fsgid = old->fsgid; new->user = get_uid(old->user); new->ucounts = old->ucounts; new->user_ns = get_user_ns(old->user_ns); new->group_info = get_group_info(old->group_info); new->securebits = old->securebits; new->cap_inheritable = old->cap_inheritable; new->cap_permitted = old->cap_permitted; new->cap_effective = old->cap_effective; new->cap_ambient = old->cap_ambient; new->cap_bset = old->cap_bset; new->jit_keyring = old->jit_keyring; new->thread_keyring = key_get(old->thread_keyring); new->process_keyring = key_get(old->process_keyring); security_transfer_creds(new, old); commit_creds(new); } /* * Make sure that root's user and user-session keyrings exist. */ static int __init init_root_keyring(void) { return look_up_user_keyrings(NULL, NULL); } late_initcall(init_root_keyring);
9 1 8 8 8 3 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 /* * Routines to compress and uncompress tcp packets (for transmission * over low speed serial lines). * * Copyright (c) 1989 Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms are permitted * provided that the above copyright notice and this paragraph are * duplicated in all such forms and that any documentation, * advertising materials, and other materials related to such * distribution and use acknowledge that the software was developed * by the University of California, Berkeley. The name of the * University may not be used to endorse or promote products derived * from this software without specific prior written permission. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. * * Van Jacobson (van@helios.ee.lbl.gov), Dec 31, 1989: * - Initial distribution. * * * modified for KA9Q Internet Software Package by * Katie Stevens (dkstevens@ucdavis.edu) * University of California, Davis * Computing Services * - 01-31-90 initial adaptation (from 1.19) * PPP.05 02-15-90 [ks] * PPP.08 05-02-90 [ks] use PPP protocol field to signal compression * PPP.15 09-90 [ks] improve mbuf handling * PPP.16 11-02 [karn] substantially rewritten to use NOS facilities * * - Feb 1991 Bill_Simpson@um.cc.umich.edu * variable number of conversation slots * allow zero or one slots * separate routines * status display * - Jul 1994 Dmitry Gorodchanin * Fixes for memory leaks. * - Oct 1994 Dmitry Gorodchanin * Modularization. * - Jan 1995 Bjorn Ekwall * Use ip_fast_csum from ip.h * - July 1995 Christos A. Polyzols * Spotted bug in tcp option checking * * * This module is a difficult issue. It's clearly inet code but it's also clearly * driver code belonging close to PPP and SLIP */ #include <linux/module.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/kernel.h> #include <net/slhc_vj.h> #ifdef CONFIG_INET /* Entire module is for IP only */ #include <linux/mm.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/termios.h> #include <linux/in.h> #include <linux/fcntl.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <net/ip.h> #include <net/protocol.h> #include <net/icmp.h> #include <net/tcp.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/timer.h> #include <linux/uaccess.h> #include <net/checksum.h> #include <asm/unaligned.h> static unsigned char *encode(unsigned char *cp, unsigned short n); static long decode(unsigned char **cpp); static unsigned char * put16(unsigned char *cp, unsigned short x); static unsigned short pull16(unsigned char **cpp); /* Allocate compression data structure * slots must be in range 0 to 255 (zero meaning no compression) * Returns pointer to structure or ERR_PTR() on error. */ struct slcompress * slhc_init(int rslots, int tslots) { short i; struct cstate *ts; struct slcompress *comp; if (rslots < 0 || rslots > 255 || tslots < 0 || tslots > 255) return ERR_PTR(-EINVAL); comp = kzalloc(sizeof(struct slcompress), GFP_KERNEL); if (! comp) goto out_fail; if (rslots > 0) { size_t rsize = rslots * sizeof(struct cstate); comp->rstate = kzalloc(rsize, GFP_KERNEL); if (! comp->rstate) goto out_free; comp->rslot_limit = rslots - 1; } if (tslots > 0) { size_t tsize = tslots * sizeof(struct cstate); comp->tstate = kzalloc(tsize, GFP_KERNEL); if (! comp->tstate) goto out_free2; comp->tslot_limit = tslots - 1; } comp->xmit_oldest = 0; comp->xmit_current = 255; comp->recv_current = 255; /* * don't accept any packets with implicit index until we get * one with an explicit index. Otherwise the uncompress code * will try to use connection 255, which is almost certainly * out of range */ comp->flags |= SLF_TOSS; if ( tslots > 0 ) { ts = comp->tstate; for(i = comp->tslot_limit; i > 0; --i){ ts[i].cs_this = i; ts[i].next = &(ts[i - 1]); } ts[0].next = &(ts[comp->tslot_limit]); ts[0].cs_this = 0; } return comp; out_free2: kfree(comp->rstate); out_free: kfree(comp); out_fail: return ERR_PTR(-ENOMEM); } /* Free a compression data structure */ void slhc_free(struct slcompress *comp) { if ( IS_ERR_OR_NULL(comp) ) return; if ( comp->tstate != NULLSLSTATE ) kfree( comp->tstate ); if ( comp->rstate != NULLSLSTATE ) kfree( comp->rstate ); kfree( comp ); } /* Put a short in host order into a char array in network order */ static inline unsigned char * put16(unsigned char *cp, unsigned short x) { *cp++ = x >> 8; *cp++ = x; return cp; } /* Encode a number */ static unsigned char * encode(unsigned char *cp, unsigned short n) { if(n >= 256 || n == 0){ *cp++ = 0; cp = put16(cp,n); } else { *cp++ = n; } return cp; } /* Pull a 16-bit integer in host order from buffer in network byte order */ static unsigned short pull16(unsigned char **cpp) { short rval; rval = *(*cpp)++; rval <<= 8; rval |= *(*cpp)++; return rval; } /* Decode a number */ static long decode(unsigned char **cpp) { int x; x = *(*cpp)++; if(x == 0){ return pull16(cpp) & 0xffff; /* pull16 returns -1 on error */ } else { return x & 0xff; /* -1 if PULLCHAR returned error */ } } /* * icp and isize are the original packet. * ocp is a place to put a copy if necessary. * cpp is initially a pointer to icp. If the copy is used, * change it to ocp. */ int slhc_compress(struct slcompress *comp, unsigned char *icp, int isize, unsigned char *ocp, unsigned char **cpp, int compress_cid) { struct cstate *ocs = &(comp->tstate[comp->xmit_oldest]); struct cstate *lcs = ocs; struct cstate *cs = lcs->next; unsigned long deltaS, deltaA; short changes = 0; int nlen, hlen; unsigned char new_seq[16]; unsigned char *cp = new_seq; struct iphdr *ip; struct tcphdr *th, *oth; __sum16 csum; /* * Don't play with runt packets. */ if(isize<sizeof(struct iphdr)) return isize; ip = (struct iphdr *) icp; if (ip->version != 4 || ip->ihl < 5) return isize; /* Bail if this packet isn't TCP, or is an IP fragment */ if (ip->protocol != IPPROTO_TCP || (ntohs(ip->frag_off) & 0x3fff)) { /* Send as regular IP */ if(ip->protocol != IPPROTO_TCP) comp->sls_o_nontcp++; else comp->sls_o_tcp++; return isize; } nlen = ip->ihl * 4; if (isize < nlen + sizeof(*th)) return isize; th = (struct tcphdr *)(icp + nlen); if (th->doff < sizeof(struct tcphdr) / 4) return isize; hlen = nlen + th->doff * 4; /* Bail if the TCP packet isn't `compressible' (i.e., ACK isn't set or * some other control bit is set). Also uncompressible if * it's a runt. */ if(hlen > isize || th->syn || th->fin || th->rst || ! (th->ack)){ /* TCP connection stuff; send as regular IP */ comp->sls_o_tcp++; return isize; } /* * Packet is compressible -- we're going to send either a * COMPRESSED_TCP or UNCOMPRESSED_TCP packet. Either way, * we need to locate (or create) the connection state. * * States are kept in a circularly linked list with * xmit_oldest pointing to the end of the list. The * list is kept in lru order by moving a state to the * head of the list whenever it is referenced. Since * the list is short and, empirically, the connection * we want is almost always near the front, we locate * states via linear search. If we don't find a state * for the datagram, the oldest state is (re-)used. */ for ( ; ; ) { if( ip->saddr == cs->cs_ip.saddr && ip->daddr == cs->cs_ip.daddr && th->source == cs->cs_tcp.source && th->dest == cs->cs_tcp.dest) goto found; /* if current equal oldest, at end of list */ if ( cs == ocs ) break; lcs = cs; cs = cs->next; comp->sls_o_searches++; } /* * Didn't find it -- re-use oldest cstate. Send an * uncompressed packet that tells the other side what * connection number we're using for this conversation. * * Note that since the state list is circular, the oldest * state points to the newest and we only need to set * xmit_oldest to update the lru linkage. */ comp->sls_o_misses++; comp->xmit_oldest = lcs->cs_this; goto uncompressed; found: /* * Found it -- move to the front on the connection list. */ if(lcs == ocs) { /* found at most recently used */ } else if (cs == ocs) { /* found at least recently used */ comp->xmit_oldest = lcs->cs_this; } else { /* more than 2 elements */ lcs->next = cs->next; cs->next = ocs->next; ocs->next = cs; } /* * Make sure that only what we expect to change changed. * Check the following: * IP protocol version, header length & type of service. * The "Don't fragment" bit. * The time-to-live field. * The TCP header length. * IP options, if any. * TCP options, if any. * If any of these things are different between the previous & * current datagram, we send the current datagram `uncompressed'. */ oth = &cs->cs_tcp; if(ip->version != cs->cs_ip.version || ip->ihl != cs->cs_ip.ihl || ip->tos != cs->cs_ip.tos || (ip->frag_off & htons(0x4000)) != (cs->cs_ip.frag_off & htons(0x4000)) || ip->ttl != cs->cs_ip.ttl || th->doff != cs->cs_tcp.doff || (ip->ihl > 5 && memcmp(ip+1,cs->cs_ipopt,((ip->ihl)-5)*4) != 0) || (th->doff > 5 && memcmp(th+1,cs->cs_tcpopt,((th->doff)-5)*4) != 0)){ goto uncompressed; } /* * Figure out which of the changing fields changed. The * receiver expects changes in the order: urgent, window, * ack, seq (the order minimizes the number of temporaries * needed in this section of code). */ if(th->urg){ deltaS = ntohs(th->urg_ptr); cp = encode(cp,deltaS); changes |= NEW_U; } else if(th->urg_ptr != oth->urg_ptr){ /* argh! URG not set but urp changed -- a sensible * implementation should never do this but RFC793 * doesn't prohibit the change so we have to deal * with it. */ goto uncompressed; } if((deltaS = ntohs(th->window) - ntohs(oth->window)) != 0){ cp = encode(cp,deltaS); changes |= NEW_W; } if((deltaA = ntohl(th->ack_seq) - ntohl(oth->ack_seq)) != 0L){ if(deltaA > 0x0000ffff) goto uncompressed; cp = encode(cp,deltaA); changes |= NEW_A; } if((deltaS = ntohl(th->seq) - ntohl(oth->seq)) != 0L){ if(deltaS > 0x0000ffff) goto uncompressed; cp = encode(cp,deltaS); changes |= NEW_S; } switch(changes){ case 0: /* Nothing changed. If this packet contains data and the * last one didn't, this is probably a data packet following * an ack (normal on an interactive connection) and we send * it compressed. Otherwise it's probably a retransmit, * retransmitted ack or window probe. Send it uncompressed * in case the other side missed the compressed version. */ if(ip->tot_len != cs->cs_ip.tot_len && ntohs(cs->cs_ip.tot_len) == hlen) break; goto uncompressed; case SPECIAL_I: case SPECIAL_D: /* actual changes match one of our special case encodings -- * send packet uncompressed. */ goto uncompressed; case NEW_S|NEW_A: if(deltaS == deltaA && deltaS == ntohs(cs->cs_ip.tot_len) - hlen){ /* special case for echoed terminal traffic */ changes = SPECIAL_I; cp = new_seq; } break; case NEW_S: if(deltaS == ntohs(cs->cs_ip.tot_len) - hlen){ /* special case for data xfer */ changes = SPECIAL_D; cp = new_seq; } break; } deltaS = ntohs(ip->id) - ntohs(cs->cs_ip.id); if(deltaS != 1){ cp = encode(cp,deltaS); changes |= NEW_I; } if(th->psh) changes |= TCP_PUSH_BIT; /* Grab the cksum before we overwrite it below. Then update our * state with this packet's header. */ csum = th->check; memcpy(&cs->cs_ip,ip,20); memcpy(&cs->cs_tcp,th,20); /* We want to use the original packet as our compressed packet. * (cp - new_seq) is the number of bytes we need for compressed * sequence numbers. In addition we need one byte for the change * mask, one for the connection id and two for the tcp checksum. * So, (cp - new_seq) + 4 bytes of header are needed. */ deltaS = cp - new_seq; if(compress_cid == 0 || comp->xmit_current != cs->cs_this){ cp = ocp; *cpp = ocp; *cp++ = changes | NEW_C; *cp++ = cs->cs_this; comp->xmit_current = cs->cs_this; } else { cp = ocp; *cpp = ocp; *cp++ = changes; } *(__sum16 *)cp = csum; cp += 2; /* deltaS is now the size of the change section of the compressed header */ memcpy(cp,new_seq,deltaS); /* Write list of deltas */ memcpy(cp+deltaS,icp+hlen,isize-hlen); comp->sls_o_compressed++; ocp[0] |= SL_TYPE_COMPRESSED_TCP; return isize - hlen + deltaS + (cp - ocp); /* Update connection state cs & send uncompressed packet (i.e., * a regular ip/tcp packet but with the 'conversation id' we hope * to use on future compressed packets in the protocol field). */ uncompressed: memcpy(&cs->cs_ip,ip,20); memcpy(&cs->cs_tcp,th,20); if (ip->ihl > 5) memcpy(cs->cs_ipopt, ip+1, ((ip->ihl) - 5) * 4); if (th->doff > 5) memcpy(cs->cs_tcpopt, th+1, ((th->doff) - 5) * 4); comp->xmit_current = cs->cs_this; comp->sls_o_uncompressed++; memcpy(ocp, icp, isize); *cpp = ocp; ocp[9] = cs->cs_this; ocp[0] |= SL_TYPE_UNCOMPRESSED_TCP; return isize; } int slhc_uncompress(struct slcompress *comp, unsigned char *icp, int isize) { int changes; long x; struct tcphdr *thp; struct iphdr *ip; struct cstate *cs; int len, hdrlen; unsigned char *cp = icp; /* We've got a compressed packet; read the change byte */ comp->sls_i_compressed++; if(isize < 3){ comp->sls_i_error++; return 0; } changes = *cp++; if(changes & NEW_C){ /* Make sure the state index is in range, then grab the state. * If we have a good state index, clear the 'discard' flag. */ x = *cp++; /* Read conn index */ if(x < 0 || x > comp->rslot_limit) goto bad; /* Check if the cstate is initialized */ if (!comp->rstate[x].initialized) goto bad; comp->flags &=~ SLF_TOSS; comp->recv_current = x; } else { /* this packet has an implicit state index. If we've * had a line error since the last time we got an * explicit state index, we have to toss the packet. */ if(comp->flags & SLF_TOSS){ comp->sls_i_tossed++; return 0; } } cs = &comp->rstate[comp->recv_current]; thp = &cs->cs_tcp; ip = &cs->cs_ip; thp->check = *(__sum16 *)cp; cp += 2; thp->psh = (changes & TCP_PUSH_BIT) ? 1 : 0; /* * we can use the same number for the length of the saved header and * the current one, because the packet wouldn't have been sent * as compressed unless the options were the same as the previous one */ hdrlen = ip->ihl * 4 + thp->doff * 4; switch(changes & SPECIALS_MASK){ case SPECIAL_I: /* Echoed terminal traffic */ { short i; i = ntohs(ip->tot_len) - hdrlen; thp->ack_seq = htonl( ntohl(thp->ack_seq) + i); thp->seq = htonl( ntohl(thp->seq) + i); } break; case SPECIAL_D: /* Unidirectional data */ thp->seq = htonl( ntohl(thp->seq) + ntohs(ip->tot_len) - hdrlen); break; default: if(changes & NEW_U){ thp->urg = 1; if((x = decode(&cp)) == -1) { goto bad; } thp->urg_ptr = htons(x); } else thp->urg = 0; if(changes & NEW_W){ if((x = decode(&cp)) == -1) { goto bad; } thp->window = htons( ntohs(thp->window) + x); } if(changes & NEW_A){ if((x = decode(&cp)) == -1) { goto bad; } thp->ack_seq = htonl( ntohl(thp->ack_seq) + x); } if(changes & NEW_S){ if((x = decode(&cp)) == -1) { goto bad; } thp->seq = htonl( ntohl(thp->seq) + x); } break; } if(changes & NEW_I){ if((x = decode(&cp)) == -1) { goto bad; } ip->id = htons (ntohs (ip->id) + x); } else ip->id = htons (ntohs (ip->id) + 1); /* * At this point, cp points to the first byte of data in the * packet. Put the reconstructed TCP and IP headers back on the * packet. Recalculate IP checksum (but not TCP checksum). */ len = isize - (cp - icp); if (len < 0) goto bad; len += hdrlen; ip->tot_len = htons(len); ip->check = 0; memmove(icp + hdrlen, cp, len - hdrlen); cp = icp; memcpy(cp, ip, 20); cp += 20; if (ip->ihl > 5) { memcpy(cp, cs->cs_ipopt, (ip->ihl - 5) * 4); cp += (ip->ihl - 5) * 4; } put_unaligned(ip_fast_csum(icp, ip->ihl), &((struct iphdr *)icp)->check); memcpy(cp, thp, 20); cp += 20; if (thp->doff > 5) { memcpy(cp, cs->cs_tcpopt, ((thp->doff) - 5) * 4); cp += ((thp->doff) - 5) * 4; } return len; bad: comp->sls_i_error++; return slhc_toss( comp ); } int slhc_remember(struct slcompress *comp, unsigned char *icp, int isize) { struct cstate *cs; unsigned ihl; unsigned char index; if(isize < 20) { /* The packet is shorter than a legal IP header */ comp->sls_i_runt++; return slhc_toss( comp ); } /* Peek at the IP header's IHL field to find its length */ ihl = icp[0] & 0xf; if(ihl < 20 / 4){ /* The IP header length field is too small */ comp->sls_i_runt++; return slhc_toss( comp ); } index = icp[9]; icp[9] = IPPROTO_TCP; if (ip_fast_csum(icp, ihl)) { /* Bad IP header checksum; discard */ comp->sls_i_badcheck++; return slhc_toss( comp ); } if(index > comp->rslot_limit) { comp->sls_i_error++; return slhc_toss(comp); } /* Update local state */ cs = &comp->rstate[comp->recv_current = index]; comp->flags &=~ SLF_TOSS; memcpy(&cs->cs_ip,icp,20); memcpy(&cs->cs_tcp,icp + ihl*4,20); if (ihl > 5) memcpy(cs->cs_ipopt, icp + sizeof(struct iphdr), (ihl - 5) * 4); if (cs->cs_tcp.doff > 5) memcpy(cs->cs_tcpopt, icp + ihl*4 + sizeof(struct tcphdr), (cs->cs_tcp.doff - 5) * 4); cs->cs_hsize = ihl*2 + cs->cs_tcp.doff*2; cs->initialized = true; /* Put headers back on packet * Neither header checksum is recalculated */ comp->sls_i_uncompressed++; return isize; } int slhc_toss(struct slcompress *comp) { if ( comp == NULLSLCOMPR ) return 0; comp->flags |= SLF_TOSS; return 0; } #else /* CONFIG_INET */ int slhc_toss(struct slcompress *comp) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_toss"); return -EINVAL; } int slhc_uncompress(struct slcompress *comp, unsigned char *icp, int isize) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_uncompress"); return -EINVAL; } int slhc_compress(struct slcompress *comp, unsigned char *icp, int isize, unsigned char *ocp, unsigned char **cpp, int compress_cid) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_compress"); return -EINVAL; } int slhc_remember(struct slcompress *comp, unsigned char *icp, int isize) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_remember"); return -EINVAL; } void slhc_free(struct slcompress *comp) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_free"); } struct slcompress * slhc_init(int rslots, int tslots) { printk(KERN_DEBUG "Called IP function on non IP-system: slhc_init"); return NULL; } #endif /* CONFIG_INET */ /* VJ header compression */ EXPORT_SYMBOL(slhc_init); EXPORT_SYMBOL(slhc_free); EXPORT_SYMBOL(slhc_remember); EXPORT_SYMBOL(slhc_compress); EXPORT_SYMBOL(slhc_uncompress); EXPORT_SYMBOL(slhc_toss); MODULE_DESCRIPTION("Compression helpers for SLIP (serial line)"); MODULE_LICENSE("Dual BSD/GPL");
27 27 23 15 1 2 12 1 14 14 14 1 9 6 3 51 2 2 3 4 14 2 15 5 3 10 8 1 25 4 8 7 13 14 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 /* * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <net/sock.h> #include <linux/in.h> #include <linux/ipv6.h> #include <linux/if_arp.h> #include <linux/jhash.h> #include <linux/ratelimit.h> #include "rds.h" static struct rhashtable bind_hash_table; static const struct rhashtable_params ht_parms = { .nelem_hint = 768, .key_len = RDS_BOUND_KEY_LEN, .key_offset = offsetof(struct rds_sock, rs_bound_key), .head_offset = offsetof(struct rds_sock, rs_bound_node), .max_size = 16384, .min_size = 1024, }; /* Create a key for the bind hash table manipulation. Port is in network byte * order. */ static inline void __rds_create_bind_key(u8 *key, const struct in6_addr *addr, __be16 port, __u32 scope_id) { memcpy(key, addr, sizeof(*addr)); key += sizeof(*addr); memcpy(key, &port, sizeof(port)); key += sizeof(port); memcpy(key, &scope_id, sizeof(scope_id)); } /* * Return the rds_sock bound at the given local address. * * The rx path can race with rds_release. We notice if rds_release() has * marked this socket and don't return a rs ref to the rx path. */ struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port, __u32 scope_id) { u8 key[RDS_BOUND_KEY_LEN]; struct rds_sock *rs; __rds_create_bind_key(key, addr, port, scope_id); rcu_read_lock(); rs = rhashtable_lookup(&bind_hash_table, key, ht_parms); if (rs && (sock_flag(rds_rs_to_sk(rs), SOCK_DEAD) || !refcount_inc_not_zero(&rds_rs_to_sk(rs)->sk_refcnt))) rs = NULL; rcu_read_unlock(); rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr, ntohs(port)); return rs; } /* returns -ve errno or +ve port */ static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr, __be16 *port, __u32 scope_id) { int ret = -EADDRINUSE; u16 rover, last; u8 key[RDS_BOUND_KEY_LEN]; if (*port != 0) { rover = be16_to_cpu(*port); if (rover == RDS_FLAG_PROBE_PORT) return -EINVAL; last = rover; } else { rover = max_t(u16, get_random_u16(), 2); last = rover - 1; } do { if (rover == 0) rover++; if (rover == RDS_FLAG_PROBE_PORT) continue; __rds_create_bind_key(key, addr, cpu_to_be16(rover), scope_id); if (rhashtable_lookup_fast(&bind_hash_table, key, ht_parms)) continue; memcpy(rs->rs_bound_key, key, sizeof(rs->rs_bound_key)); rs->rs_bound_addr = *addr; net_get_random_once(&rs->rs_hash_initval, sizeof(rs->rs_hash_initval)); rs->rs_bound_port = cpu_to_be16(rover); rs->rs_bound_node.next = NULL; rds_sock_addref(rs); if (!rhashtable_insert_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms)) { *port = rs->rs_bound_port; rs->rs_bound_scope_id = scope_id; ret = 0; rdsdebug("rs %p binding to %pI6c:%d\n", rs, addr, (int)ntohs(*port)); break; } else { rs->rs_bound_addr = in6addr_any; rds_sock_put(rs); ret = -ENOMEM; break; } } while (rover++ != last); return ret; } void rds_remove_bound(struct rds_sock *rs) { if (ipv6_addr_any(&rs->rs_bound_addr)) return; rdsdebug("rs %p unbinding from %pI6c:%d\n", rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port)); rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms); rds_sock_put(rs); rs->rs_bound_addr = in6addr_any; } int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); struct in6_addr v6addr, *binding_addr; struct rds_transport *trans; __u32 scope_id = 0; int ret = 0; __be16 port; /* We allow an RDS socket to be bound to either IPv4 or IPv6 * address. */ if (addr_len < offsetofend(struct sockaddr, sa_family)) return -EINVAL; if (uaddr->sa_family == AF_INET) { struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; if (addr_len < sizeof(struct sockaddr_in) || sin->sin_addr.s_addr == htonl(INADDR_ANY) || sin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || ipv4_is_multicast(sin->sin_addr.s_addr)) return -EINVAL; ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr); binding_addr = &v6addr; port = sin->sin_port; #if IS_ENABLED(CONFIG_IPV6) } else if (uaddr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr; int addr_type; if (addr_len < sizeof(struct sockaddr_in6)) return -EINVAL; addr_type = ipv6_addr_type(&sin6->sin6_addr); if (!(addr_type & IPV6_ADDR_UNICAST)) { __be32 addr4; if (!(addr_type & IPV6_ADDR_MAPPED)) return -EINVAL; /* It is a mapped address. Need to do some sanity * checks. */ addr4 = sin6->sin6_addr.s6_addr32[3]; if (addr4 == htonl(INADDR_ANY) || addr4 == htonl(INADDR_BROADCAST) || ipv4_is_multicast(addr4)) return -EINVAL; } /* The scope ID must be specified for link local address. */ if (addr_type & IPV6_ADDR_LINKLOCAL) { if (sin6->sin6_scope_id == 0) return -EINVAL; scope_id = sin6->sin6_scope_id; } binding_addr = &sin6->sin6_addr; port = sin6->sin6_port; #endif } else { return -EINVAL; } lock_sock(sk); /* RDS socket does not allow re-binding. */ if (!ipv6_addr_any(&rs->rs_bound_addr)) { ret = -EINVAL; goto out; } /* Socket is connected. The binding address should have the same * scope ID as the connected address, except the case when one is * non-link local address (scope_id is 0). */ if (!ipv6_addr_any(&rs->rs_conn_addr) && scope_id && rs->rs_bound_scope_id && scope_id != rs->rs_bound_scope_id) { ret = -EINVAL; goto out; } /* The transport can be set using SO_RDS_TRANSPORT option before the * socket is bound. */ if (rs->rs_transport) { trans = rs->rs_transport; if (!trans->laddr_check || trans->laddr_check(sock_net(sock->sk), binding_addr, scope_id) != 0) { ret = -ENOPROTOOPT; goto out; } } else { trans = rds_trans_get_preferred(sock_net(sock->sk), binding_addr, scope_id); if (!trans) { ret = -EADDRNOTAVAIL; pr_info_ratelimited("RDS: %s could not find a transport for %pI6c, load rds_tcp or rds_rdma?\n", __func__, binding_addr); goto out; } rs->rs_transport = trans; } sock_set_flag(sk, SOCK_RCU_FREE); ret = rds_add_bound(rs, binding_addr, &port, scope_id); if (ret) rs->rs_transport = NULL; out: release_sock(sk); return ret; } void rds_bind_lock_destroy(void) { rhashtable_destroy(&bind_hash_table); } int rds_bind_lock_init(void) { return rhashtable_init(&bind_hash_table, &ht_parms); }
1 176 763 5266 1692 300 2305 155 1 170 2615 19 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_DCACHE_H #define __LINUX_DCACHE_H #include <linux/atomic.h> #include <linux/list.h> #include <linux/math.h> #include <linux/rculist.h> #include <linux/rculist_bl.h> #include <linux/spinlock.h> #include <linux/seqlock.h> #include <linux/cache.h> #include <linux/rcupdate.h> #include <linux/lockref.h> #include <linux/stringhash.h> #include <linux/wait.h> struct path; struct file; struct vfsmount; /* * linux/include/linux/dcache.h * * Dirent cache data structures * * (C) Copyright 1997 Thomas Schoebel-Theuer, * with heavy changes by Linus Torvalds */ #define IS_ROOT(x) ((x) == (x)->d_parent) /* The hash is always the low bits of hash_len */ #ifdef __LITTLE_ENDIAN #define HASH_LEN_DECLARE u32 hash; u32 len #define bytemask_from_count(cnt) (~(~0ul << (cnt)*8)) #else #define HASH_LEN_DECLARE u32 len; u32 hash #define bytemask_from_count(cnt) (~(~0ul >> (cnt)*8)) #endif /* * "quick string" -- eases parameter passing, but more importantly * saves "metadata" about the string (ie length and the hash). * * hash comes first so it snuggles against d_parent in the * dentry. */ struct qstr { union { struct { HASH_LEN_DECLARE; }; u64 hash_len; }; const unsigned char *name; }; #define QSTR_INIT(n,l) { { { .len = l } }, .name = n } extern const struct qstr empty_name; extern const struct qstr slash_name; extern const struct qstr dotdot_name; /* * Try to keep struct dentry aligned on 64 byte cachelines (this will * give reasonable cacheline footprint with larger lines without the * large memory footprint increase). */ #ifdef CONFIG_64BIT # define DNAME_INLINE_LEN 40 /* 192 bytes */ #else # ifdef CONFIG_SMP # define DNAME_INLINE_LEN 40 /* 128 bytes */ # else # define DNAME_INLINE_LEN 44 /* 128 bytes */ # endif #endif #define d_lock d_lockref.lock struct dentry { /* RCU lookup touched fields */ unsigned int d_flags; /* protected by d_lock */ seqcount_spinlock_t d_seq; /* per dentry seqlock */ struct hlist_bl_node d_hash; /* lookup hash list */ struct dentry *d_parent; /* parent directory */ struct qstr d_name; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ /* Ref lookup also touches following */ struct lockref d_lockref; /* per-dentry lock and refcount */ const struct dentry_operations *d_op; struct super_block *d_sb; /* The root of the dentry tree */ unsigned long d_time; /* used by d_revalidate */ void *d_fsdata; /* fs-specific data */ union { struct list_head d_lru; /* LRU list */ wait_queue_head_t *d_wait; /* in-lookup ones only */ }; struct hlist_node d_sib; /* child of parent list */ struct hlist_head d_children; /* our children */ /* * d_alias and d_rcu can share memory */ union { struct hlist_node d_alias; /* inode alias list */ struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ struct rcu_head d_rcu; } d_u; }; /* * dentry->d_lock spinlock nesting subclasses: * * 0: normal * 1: nested */ enum dentry_d_lock_class { DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */ DENTRY_D_LOCK_NESTED }; struct dentry_operations { int (*d_revalidate)(struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); int (*d_hash)(const struct dentry *, struct qstr *); int (*d_compare)(const struct dentry *, unsigned int, const char *, const struct qstr *); int (*d_delete)(const struct dentry *); int (*d_init)(struct dentry *); void (*d_release)(struct dentry *); void (*d_prune)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *); } ____cacheline_aligned; /* * Locking rules for dentry_operations callbacks are to be found in * Documentation/filesystems/locking.rst. Keep it updated! * * FUrther descriptions are found in Documentation/filesystems/vfs.rst. * Keep it updated too! */ /* d_flags entries */ #define DCACHE_OP_HASH BIT(0) #define DCACHE_OP_COMPARE BIT(1) #define DCACHE_OP_REVALIDATE BIT(2) #define DCACHE_OP_DELETE BIT(3) #define DCACHE_OP_PRUNE BIT(4) #define DCACHE_DISCONNECTED BIT(5) /* This dentry is possibly not currently connected to the dcache tree, in * which case its parent will either be itself, or will have this flag as * well. nfsd will not use a dentry with this bit set, but will first * endeavour to clear the bit either by discovering that it is connected, * or by performing lookup operations. Any filesystem which supports * nfsd_operations MUST have a lookup function which, if it finds a * directory inode with a DCACHE_DISCONNECTED dentry, will d_move that * dentry into place and return that dentry rather than the passed one, * typically using d_splice_alias. */ #define DCACHE_REFERENCED BIT(6) /* Recently used, don't discard. */ #define DCACHE_DONTCACHE BIT(7) /* Purge from memory on final dput() */ #define DCACHE_CANT_MOUNT BIT(8) #define DCACHE_SHRINK_LIST BIT(10) #define DCACHE_OP_WEAK_REVALIDATE BIT(11) #define DCACHE_NFSFS_RENAMED BIT(12) /* this dentry has been "silly renamed" and has to be deleted on the last * dput() */ #define DCACHE_FSNOTIFY_PARENT_WATCHED BIT(14) /* Parent inode is watched by some fsnotify listener */ #define DCACHE_DENTRY_KILLED BIT(15) #define DCACHE_MOUNTED BIT(16) /* is a mountpoint */ #define DCACHE_NEED_AUTOMOUNT BIT(17) /* handle automount on this dir */ #define DCACHE_MANAGE_TRANSIT BIT(18) /* manage transit from this dirent */ #define DCACHE_MANAGED_DENTRY \ (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) #define DCACHE_LRU_LIST BIT(19) #define DCACHE_ENTRY_TYPE (7 << 20) /* bits 20..22 are for storing type: */ #define DCACHE_MISS_TYPE (0 << 20) /* Negative dentry */ #define DCACHE_WHITEOUT_TYPE (1 << 20) /* Whiteout dentry (stop pathwalk) */ #define DCACHE_DIRECTORY_TYPE (2 << 20) /* Normal directory */ #define DCACHE_AUTODIR_TYPE (3 << 20) /* Lookupless directory (presumed automount) */ #define DCACHE_REGULAR_TYPE (4 << 20) /* Regular file type */ #define DCACHE_SPECIAL_TYPE (5 << 20) /* Other file type */ #define DCACHE_SYMLINK_TYPE (6 << 20) /* Symlink */ #define DCACHE_NOKEY_NAME BIT(25) /* Encrypted name encoded without key */ #define DCACHE_OP_REAL BIT(26) #define DCACHE_PAR_LOOKUP BIT(28) /* being looked up (with parent locked shared) */ #define DCACHE_DENTRY_CURSOR BIT(29) #define DCACHE_NORCU BIT(30) /* No RCU delay for freeing */ extern seqlock_t rename_lock; /* * These are the low-level FS interfaces to the dcache.. */ extern void d_instantiate(struct dentry *, struct inode *); extern void d_instantiate_new(struct dentry *, struct inode *); extern void __d_drop(struct dentry *dentry); extern void d_drop(struct dentry *dentry); extern void d_delete(struct dentry *); extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op); /* allocate/de-allocate */ extern struct dentry * d_alloc(struct dentry *, const struct qstr *); extern struct dentry * d_alloc_anon(struct super_block *); extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *, wait_queue_head_t *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent, const struct qstr *name); extern struct dentry * d_exact_alias(struct dentry *, struct inode *); extern struct dentry *d_find_any_alias(struct inode *inode); extern struct dentry * d_obtain_alias(struct inode *); extern struct dentry * d_obtain_root(struct inode *); extern void shrink_dcache_sb(struct super_block *); extern void shrink_dcache_parent(struct dentry *); extern void d_invalidate(struct dentry *); /* only used at mount-time */ extern struct dentry * d_make_root(struct inode *); extern void d_mark_tmpfile(struct file *, struct inode *); extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); extern void d_prune_aliases(struct inode *); extern struct dentry *d_find_alias_rcu(struct inode *); /* test whether we have any submounts in a subdir tree */ extern int path_has_submounts(const struct path *); /* * This adds the entry to the hash queues. */ extern void d_rehash(struct dentry *); extern void d_add(struct dentry *, struct inode *); /* used for rename() and baskets */ extern void d_move(struct dentry *, struct dentry *); extern void d_exchange(struct dentry *, struct dentry *); extern struct dentry *d_ancestor(struct dentry *, struct dentry *); extern struct dentry *d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); static inline unsigned d_count(const struct dentry *dentry) { return dentry->d_lockref.count; } /* * helper function for dentry_operations.d_dname() members */ extern __printf(3, 4) char *dynamic_dname(char *, int, const char *, ...); extern char *__d_path(const struct path *, const struct path *, char *, int); extern char *d_absolute_path(const struct path *, char *, int); extern char *d_path(const struct path *, char *, int); extern char *dentry_path_raw(const struct dentry *, char *, int); extern char *dentry_path(const struct dentry *, char *, int); /* Allocation counts.. */ /** * dget_dlock - get a reference to a dentry * @dentry: dentry to get a reference to * * Given a live dentry, increment the reference count and return the dentry. * Caller must hold @dentry->d_lock. Making sure that dentry is alive is * caller's resonsibility. There are many conditions sufficient to guarantee * that; e.g. anything with non-negative refcount is alive, so's anything * hashed, anything positive, anyone's parent, etc. */ static inline struct dentry *dget_dlock(struct dentry *dentry) { dentry->d_lockref.count++; return dentry; } /** * dget - get a reference to a dentry * @dentry: dentry to get a reference to * * Given a dentry or %NULL pointer increment the reference count * if appropriate and return the dentry. A dentry will not be * destroyed when it has references. Conversely, a dentry with * no references can disappear for any number of reasons, starting * with memory pressure. In other words, that primitive is * used to clone an existing reference; using it on something with * zero refcount is a bug. * * NOTE: it will spin if @dentry->d_lock is held. From the deadlock * avoidance point of view it is equivalent to spin_lock()/increment * refcount/spin_unlock(), so calling it under @dentry->d_lock is * always a bug; so's calling it under ->d_lock on any of its descendents. * */ static inline struct dentry *dget(struct dentry *dentry) { if (dentry) lockref_get(&dentry->d_lockref); return dentry; } extern struct dentry *dget_parent(struct dentry *dentry); /** * d_unhashed - is dentry hashed * @dentry: entry to check * * Returns true if the dentry passed is not currently hashed. */ static inline int d_unhashed(const struct dentry *dentry) { return hlist_bl_unhashed(&dentry->d_hash); } static inline int d_unlinked(const struct dentry *dentry) { return d_unhashed(dentry) && !IS_ROOT(dentry); } static inline int cant_mount(const struct dentry *dentry) { return (dentry->d_flags & DCACHE_CANT_MOUNT); } static inline void dont_mount(struct dentry *dentry) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_CANT_MOUNT; spin_unlock(&dentry->d_lock); } extern void __d_lookup_unhash_wake(struct dentry *dentry); static inline int d_in_lookup(const struct dentry *dentry) { return dentry->d_flags & DCACHE_PAR_LOOKUP; } static inline void d_lookup_done(struct dentry *dentry) { if (unlikely(d_in_lookup(dentry))) __d_lookup_unhash_wake(dentry); } extern void dput(struct dentry *); static inline bool d_managed(const struct dentry *dentry) { return dentry->d_flags & DCACHE_MANAGED_DENTRY; } static inline bool d_mountpoint(const struct dentry *dentry) { return dentry->d_flags & DCACHE_MOUNTED; } /* * Directory cache entry type accessor functions. */ static inline unsigned __d_entry_type(const struct dentry *dentry) { return dentry->d_flags & DCACHE_ENTRY_TYPE; } static inline bool d_is_miss(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_MISS_TYPE; } static inline bool d_is_whiteout(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_WHITEOUT_TYPE; } static inline bool d_can_lookup(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_DIRECTORY_TYPE; } static inline bool d_is_autodir(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_AUTODIR_TYPE; } static inline bool d_is_dir(const struct dentry *dentry) { return d_can_lookup(dentry) || d_is_autodir(dentry); } static inline bool d_is_symlink(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_SYMLINK_TYPE; } static inline bool d_is_reg(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_REGULAR_TYPE; } static inline bool d_is_special(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_SPECIAL_TYPE; } static inline bool d_is_file(const struct dentry *dentry) { return d_is_reg(dentry) || d_is_special(dentry); } static inline bool d_is_negative(const struct dentry *dentry) { // TODO: check d_is_whiteout(dentry) also. return d_is_miss(dentry); } static inline bool d_flags_negative(unsigned flags) { return (flags & DCACHE_ENTRY_TYPE) == DCACHE_MISS_TYPE; } static inline bool d_is_positive(const struct dentry *dentry) { return !d_is_negative(dentry); } /** * d_really_is_negative - Determine if a dentry is really negative (ignoring fallthroughs) * @dentry: The dentry in question * * Returns true if the dentry represents either an absent name or a name that * doesn't map to an inode (ie. ->d_inode is NULL). The dentry could represent * a true miss, a whiteout that isn't represented by a 0,0 chardev or a * fallthrough marker in an opaque directory. * * Note! (1) This should be used *only* by a filesystem to examine its own * dentries. It should not be used to look at some other filesystem's * dentries. (2) It should also be used in combination with d_inode() to get * the inode. (3) The dentry may have something attached to ->d_lower and the * type field of the flags may be set to something other than miss or whiteout. */ static inline bool d_really_is_negative(const struct dentry *dentry) { return dentry->d_inode == NULL; } /** * d_really_is_positive - Determine if a dentry is really positive (ignoring fallthroughs) * @dentry: The dentry in question * * Returns true if the dentry represents a name that maps to an inode * (ie. ->d_inode is not NULL). The dentry might still represent a whiteout if * that is represented on medium as a 0,0 chardev. * * Note! (1) This should be used *only* by a filesystem to examine its own * dentries. It should not be used to look at some other filesystem's * dentries. (2) It should also be used in combination with d_inode() to get * the inode. */ static inline bool d_really_is_positive(const struct dentry *dentry) { return dentry->d_inode != NULL; } static inline int simple_positive(const struct dentry *dentry) { return d_really_is_positive(dentry) && !d_unhashed(dentry); } extern int sysctl_vfs_cache_pressure; static inline unsigned long vfs_pressure_ratio(unsigned long val) { return mult_frac(val, sysctl_vfs_cache_pressure, 100); } /** * d_inode - Get the actual inode of this dentry * @dentry: The dentry to query * * This is the helper normal filesystems should use to get at their own inodes * in their own dentries and ignore the layering superimposed upon them. */ static inline struct inode *d_inode(const struct dentry *dentry) { return dentry->d_inode; } /** * d_inode_rcu - Get the actual inode of this dentry with READ_ONCE() * @dentry: The dentry to query * * This is the helper normal filesystems should use to get at their own inodes * in their own dentries and ignore the layering superimposed upon them. */ static inline struct inode *d_inode_rcu(const struct dentry *dentry) { return READ_ONCE(dentry->d_inode); } /** * d_backing_inode - Get upper or lower inode we should be using * @upper: The upper layer * * This is the helper that should be used to get at the inode that will be used * if this dentry were to be opened as a file. The inode may be on the upper * dentry or it may be on a lower dentry pinned by the upper. * * Normal filesystems should not use this to access their own inodes. */ static inline struct inode *d_backing_inode(const struct dentry *upper) { struct inode *inode = upper->d_inode; return inode; } /** * d_real - Return the real dentry * @dentry: the dentry to query * @inode: inode to select the dentry from multiple layers (can be NULL) * * If dentry is on a union/overlay, then return the underlying, real dentry. * Otherwise return the dentry itself. * * See also: Documentation/filesystems/vfs.rst */ static inline struct dentry *d_real(struct dentry *dentry, const struct inode *inode) { if (unlikely(dentry->d_flags & DCACHE_OP_REAL)) return dentry->d_op->d_real(dentry, inode); else return dentry; } /** * d_real_inode - Return the real inode * @dentry: The dentry to query * * If dentry is on a union/overlay, then return the underlying, real inode. * Otherwise return d_inode(). */ static inline struct inode *d_real_inode(const struct dentry *dentry) { /* This usage of d_real() results in const dentry */ return d_backing_inode(d_real((struct dentry *) dentry, NULL)); } struct name_snapshot { struct qstr name; unsigned char inline_name[DNAME_INLINE_LEN]; }; void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *); void release_dentry_name_snapshot(struct name_snapshot *); static inline struct dentry *d_first_child(const struct dentry *dentry) { return hlist_entry_safe(dentry->d_children.first, struct dentry, d_sib); } static inline struct dentry *d_next_sibling(const struct dentry *dentry) { return hlist_entry_safe(dentry->d_sib.next, struct dentry, d_sib); } #endif /* __LINUX_DCACHE_H */
316 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_ICMPV6_H #define _LINUX_ICMPV6_H #include <linux/skbuff.h> #include <linux/ipv6.h> #include <uapi/linux/icmpv6.h> static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb) { return (struct icmp6hdr *)skb_transport_header(skb); } #include <linux/netdevice.h> #if IS_ENABLED(CONFIG_IPV6) typedef void ip6_icmp_send_t(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct in6_addr *force_saddr, const struct inet6_skb_parm *parm); void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct in6_addr *force_saddr, const struct inet6_skb_parm *parm); #if IS_BUILTIN(CONFIG_IPV6) static inline void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct inet6_skb_parm *parm) { icmp6_send(skb, type, code, info, NULL, parm); } static inline int inet6_register_icmp_sender(ip6_icmp_send_t *fn) { BUILD_BUG_ON(fn != icmp6_send); return 0; } static inline int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) { BUILD_BUG_ON(fn != icmp6_send); return 0; } #else extern void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct inet6_skb_parm *parm); extern int inet6_register_icmp_sender(ip6_icmp_send_t *fn); extern int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn); #endif static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { __icmpv6_send(skb, type, code, info, IP6CB(skb)); } int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, unsigned int data_len); #if IS_ENABLED(CONFIG_NF_NAT) void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info); #else static inline void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) { struct inet6_skb_parm parm = { 0 }; __icmpv6_send(skb_in, type, code, info, &parm); } #endif #else static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { } static inline void icmpv6_ndo_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { } #endif extern int icmpv6_init(void); extern int icmpv6_err_convert(u8 type, u8 code, int *err); extern void icmpv6_cleanup(void); extern void icmpv6_param_prob_reason(struct sk_buff *skb, u8 code, int pos, enum skb_drop_reason reason); struct flowi6; struct in6_addr; void icmpv6_flow_init(const struct sock *sk, struct flowi6 *fl6, u8 type, const struct in6_addr *saddr, const struct in6_addr *daddr, int oif); static inline void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos) { icmpv6_param_prob_reason(skb, code, pos, SKB_DROP_REASON_NOT_SPECIFIED); } static inline bool icmpv6_is_err(int type) { switch (type) { case ICMPV6_DEST_UNREACH: case ICMPV6_PKT_TOOBIG: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: return true; } return false; } #endif
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ #include <linux/types.h> #include <linux/bpf.h> #include <linux/bpf_local_storage.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> DEFINE_BPF_STORAGE_CACHE(cgroup_cache); static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy); static void bpf_cgrp_storage_lock(void) { migrate_disable(); this_cpu_inc(bpf_cgrp_storage_busy); } static void bpf_cgrp_storage_unlock(void) { this_cpu_dec(bpf_cgrp_storage_busy); migrate_enable(); } static bool bpf_cgrp_storage_trylock(void) { migrate_disable(); if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) { this_cpu_dec(bpf_cgrp_storage_busy); migrate_enable(); return false; } return true; } static struct bpf_local_storage __rcu **cgroup_storage_ptr(void *owner) { struct cgroup *cg = owner; return &cg->bpf_cgrp_storage; } void bpf_cgrp_storage_free(struct cgroup *cgroup) { struct bpf_local_storage *local_storage; rcu_read_lock(); local_storage = rcu_dereference(cgroup->bpf_cgrp_storage); if (!local_storage) { rcu_read_unlock(); return; } bpf_cgrp_storage_lock(); bpf_local_storage_destroy(local_storage); bpf_cgrp_storage_unlock(); rcu_read_unlock(); } static struct bpf_local_storage_data * cgroup_storage_lookup(struct cgroup *cgroup, struct bpf_map *map, bool cacheit_lockit) { struct bpf_local_storage *cgroup_storage; struct bpf_local_storage_map *smap; cgroup_storage = rcu_dereference_check(cgroup->bpf_cgrp_storage, bpf_rcu_lock_held()); if (!cgroup_storage) return NULL; smap = (struct bpf_local_storage_map *)map; return bpf_local_storage_lookup(cgroup_storage, smap, cacheit_lockit); } static void *bpf_cgrp_storage_lookup_elem(struct bpf_map *map, void *key) { struct bpf_local_storage_data *sdata; struct cgroup *cgroup; int fd; fd = *(int *)key; cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return ERR_CAST(cgroup); bpf_cgrp_storage_lock(); sdata = cgroup_storage_lookup(cgroup, map, true); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return sdata ? sdata->data : NULL; } static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_local_storage_data *sdata; struct cgroup *cgroup; int fd; fd = *(int *)key; cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return PTR_ERR(cgroup); bpf_cgrp_storage_lock(); sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, map_flags, GFP_ATOMIC); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return PTR_ERR_OR_ZERO(sdata); } static int cgroup_storage_delete(struct cgroup *cgroup, struct bpf_map *map) { struct bpf_local_storage_data *sdata; sdata = cgroup_storage_lookup(cgroup, map, false); if (!sdata) return -ENOENT; bpf_selem_unlink(SELEM(sdata), false); return 0; } static long bpf_cgrp_storage_delete_elem(struct bpf_map *map, void *key) { struct cgroup *cgroup; int err, fd; fd = *(int *)key; cgroup = cgroup_v1v2_get_from_fd(fd); if (IS_ERR(cgroup)) return PTR_ERR(cgroup); bpf_cgrp_storage_lock(); err = cgroup_storage_delete(cgroup, map); bpf_cgrp_storage_unlock(); cgroup_put(cgroup); return err; } static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -ENOTSUPP; } static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { return bpf_local_storage_map_alloc(attr, &cgroup_cache, true); } static void cgroup_storage_map_free(struct bpf_map *map) { bpf_local_storage_map_free(map, &cgroup_cache, NULL); } /* *gfp_flags* is a hidden argument provided by the verifier */ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup, void *, value, u64, flags, gfp_t, gfp_flags) { struct bpf_local_storage_data *sdata; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) return (unsigned long)NULL; if (!cgroup) return (unsigned long)NULL; if (!bpf_cgrp_storage_trylock()) return (unsigned long)NULL; sdata = cgroup_storage_lookup(cgroup, map, true); if (sdata) goto unlock; /* only allocate new storage, when the cgroup is refcounted */ if (!percpu_ref_is_dying(&cgroup->self.refcnt) && (flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map, value, BPF_NOEXIST, gfp_flags); unlock: bpf_cgrp_storage_unlock(); return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data; } BPF_CALL_2(bpf_cgrp_storage_delete, struct bpf_map *, map, struct cgroup *, cgroup) { int ret; WARN_ON_ONCE(!bpf_rcu_lock_held()); if (!cgroup) return -EINVAL; if (!bpf_cgrp_storage_trylock()) return -EBUSY; ret = cgroup_storage_delete(cgroup, map); bpf_cgrp_storage_unlock(); return ret; } const struct bpf_map_ops cgrp_storage_map_ops = { .map_meta_equal = bpf_map_meta_equal, .map_alloc_check = bpf_local_storage_map_alloc_check, .map_alloc = cgroup_storage_map_alloc, .map_free = cgroup_storage_map_free, .map_get_next_key = notsupp_get_next_key, .map_lookup_elem = bpf_cgrp_storage_lookup_elem, .map_update_elem = bpf_cgrp_storage_update_elem, .map_delete_elem = bpf_cgrp_storage_delete_elem, .map_check_btf = bpf_local_storage_map_check_btf, .map_mem_usage = bpf_local_storage_map_mem_usage, .map_btf_id = &bpf_local_storage_map_btf_id[0], .map_owner_storage_ptr = cgroup_storage_ptr, }; const struct bpf_func_proto bpf_cgrp_storage_get_proto = { .func = bpf_cgrp_storage_get, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_cgroup_btf_id[0], .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, .arg4_type = ARG_ANYTHING, }; const struct bpf_func_proto bpf_cgrp_storage_delete_proto = { .func = bpf_cgrp_storage_delete, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_BTF_ID_OR_NULL, .arg2_btf_id = &bpf_cgroup_btf_id[0], };
683 75 77 82 80 19 19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 // SPDX-License-Identifier: GPL-2.0 /* * Creating audit events from TTY input. * * Copyright (C) 2007 Red Hat, Inc. All rights reserved. * * Authors: Miloslav Trmac <mitr@redhat.com> */ #include <linux/audit.h> #include <linux/slab.h> #include <linux/tty.h> #include "tty.h" struct tty_audit_buf { struct mutex mutex; /* Protects all data below */ dev_t dev; /* The TTY which the data is from */ bool icanon; size_t valid; u8 *data; /* Allocated size N_TTY_BUF_SIZE */ }; static struct tty_audit_buf *tty_audit_buf_ref(void) { struct tty_audit_buf *buf; buf = current->signal->tty_audit_buf; WARN_ON(buf == ERR_PTR(-ESRCH)); return buf; } static struct tty_audit_buf *tty_audit_buf_alloc(void) { struct tty_audit_buf *buf; buf = kzalloc(sizeof(*buf), GFP_KERNEL); if (!buf) goto err; buf->data = kmalloc(N_TTY_BUF_SIZE, GFP_KERNEL); if (!buf->data) goto err_buf; mutex_init(&buf->mutex); return buf; err_buf: kfree(buf); err: return NULL; } static void tty_audit_buf_free(struct tty_audit_buf *buf) { WARN_ON(buf->valid != 0); kfree(buf->data); kfree(buf); } static void tty_audit_log(const char *description, dev_t dev, const u8 *data, size_t size) { struct audit_buffer *ab; pid_t pid = task_pid_nr(current); uid_t uid = from_kuid(&init_user_ns, task_uid(current)); uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); unsigned int sessionid = audit_get_sessionid(current); char name[TASK_COMM_LEN]; ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_TTY); if (!ab) return; audit_log_format(ab, "%s pid=%u uid=%u auid=%u ses=%u major=%d minor=%d comm=", description, pid, uid, loginuid, sessionid, MAJOR(dev), MINOR(dev)); get_task_comm(name, current); audit_log_untrustedstring(ab, name); audit_log_format(ab, " data="); audit_log_n_hex(ab, data, size); audit_log_end(ab); } /* * tty_audit_buf_push - Push buffered data out * * Generate an audit message from the contents of @buf, which is owned by * the current task. @buf->mutex must be locked. */ static void tty_audit_buf_push(struct tty_audit_buf *buf) { if (buf->valid == 0) return; if (audit_enabled == AUDIT_OFF) { buf->valid = 0; return; } tty_audit_log("tty", buf->dev, buf->data, buf->valid); buf->valid = 0; } /** * tty_audit_exit - Handle a task exit * * Make sure all buffered data is written out and deallocate the buffer. * Only needs to be called if current->signal->tty_audit_buf != %NULL. * * The process is single-threaded at this point; no other threads share * current->signal. */ void tty_audit_exit(void) { struct tty_audit_buf *buf; buf = xchg(&current->signal->tty_audit_buf, ERR_PTR(-ESRCH)); if (!buf) return; tty_audit_buf_push(buf); tty_audit_buf_free(buf); } /* * tty_audit_fork - Copy TTY audit state for a new task * * Set up TTY audit state in @sig from current. @sig needs no locking. */ void tty_audit_fork(struct signal_struct *sig) { sig->audit_tty = current->signal->audit_tty; } /* * tty_audit_tiocsti - Log TIOCSTI */ void tty_audit_tiocsti(const struct tty_struct *tty, u8 ch) { dev_t dev; dev = MKDEV(tty->driver->major, tty->driver->minor_start) + tty->index; if (tty_audit_push()) return; if (audit_enabled) tty_audit_log("ioctl=TIOCSTI", dev, &ch, 1); } /* * tty_audit_push - Flush current's pending audit data * * Returns 0 if success, -EPERM if tty audit is disabled */ int tty_audit_push(void) { struct tty_audit_buf *buf; if (~current->signal->audit_tty & AUDIT_TTY_ENABLE) return -EPERM; buf = tty_audit_buf_ref(); if (!IS_ERR_OR_NULL(buf)) { mutex_lock(&buf->mutex); tty_audit_buf_push(buf); mutex_unlock(&buf->mutex); } return 0; } /* * tty_audit_buf_get - Get an audit buffer. * * Get an audit buffer, allocate it if necessary. Return %NULL * if out of memory or ERR_PTR(-ESRCH) if tty_audit_exit() has already * occurred. Otherwise, return a new reference to the buffer. */ static struct tty_audit_buf *tty_audit_buf_get(void) { struct tty_audit_buf *buf; buf = tty_audit_buf_ref(); if (buf) return buf; buf = tty_audit_buf_alloc(); if (buf == NULL) { audit_log_lost("out of memory in TTY auditing"); return NULL; } /* Race to use this buffer, free it if another wins */ if (cmpxchg(&current->signal->tty_audit_buf, NULL, buf) != NULL) tty_audit_buf_free(buf); return tty_audit_buf_ref(); } /* * tty_audit_add_data - Add data for TTY auditing. * * Audit @data of @size from @tty, if necessary. */ void tty_audit_add_data(const struct tty_struct *tty, const void *data, size_t size) { struct tty_audit_buf *buf; unsigned int audit_tty; bool icanon = L_ICANON(tty); dev_t dev; audit_tty = READ_ONCE(current->signal->audit_tty); if (~audit_tty & AUDIT_TTY_ENABLE) return; if (unlikely(size == 0)) return; if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) return; if ((~audit_tty & AUDIT_TTY_LOG_PASSWD) && icanon && !L_ECHO(tty)) return; buf = tty_audit_buf_get(); if (IS_ERR_OR_NULL(buf)) return; mutex_lock(&buf->mutex); dev = MKDEV(tty->driver->major, tty->driver->minor_start) + tty->index; if (buf->dev != dev || buf->icanon != icanon) { tty_audit_buf_push(buf); buf->dev = dev; buf->icanon = icanon; } do { size_t run; run = N_TTY_BUF_SIZE - buf->valid; if (run > size) run = size; memcpy(buf->data + buf->valid, data, run); buf->valid += run; data += run; size -= run; if (buf->valid == N_TTY_BUF_SIZE) tty_audit_buf_push(buf); } while (size != 0); mutex_unlock(&buf->mutex); }
1725 559 8 1485 172 4 157 1485 1608 40 1589 1586 1072 326 325 326 5 6 23 23 8 8 8 174 172 172 171 6 23 6 1 6 6 165 166 1624 560 1621 9242 9123 1623 1622 1624 1622 1623 1731 2 1729 1733 2 1725 557 559 560 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved. * Authors: David Chinner and Glauber Costa * * Generic LRU infrastructure */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/list_lru.h> #include <linux/slab.h> #include <linux/mutex.h> #include <linux/memcontrol.h> #include "slab.h" #include "internal.h" #ifdef CONFIG_MEMCG_KMEM static LIST_HEAD(memcg_list_lrus); static DEFINE_MUTEX(list_lrus_mutex); static inline bool list_lru_memcg_aware(struct list_lru *lru) { return lru->memcg_aware; } static void list_lru_register(struct list_lru *lru) { if (!list_lru_memcg_aware(lru)) return; mutex_lock(&list_lrus_mutex); list_add(&lru->list, &memcg_list_lrus); mutex_unlock(&list_lrus_mutex); } static void list_lru_unregister(struct list_lru *lru) { if (!list_lru_memcg_aware(lru)) return; mutex_lock(&list_lrus_mutex); list_del(&lru->list); mutex_unlock(&list_lrus_mutex); } static int lru_shrinker_id(struct list_lru *lru) { return lru->shrinker_id; } static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { if (list_lru_memcg_aware(lru) && idx >= 0) { struct list_lru_memcg *mlru = xa_load(&lru->xa, idx); return mlru ? &mlru->node[nid] : NULL; } return &lru->node[nid].lru; } #else static void list_lru_register(struct list_lru *lru) { } static void list_lru_unregister(struct list_lru *lru) { } static int lru_shrinker_id(struct list_lru *lru) { return -1; } static inline bool list_lru_memcg_aware(struct list_lru *lru) { return false; } static inline struct list_lru_one * list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { return &lru->node[nid].lru; } #endif /* CONFIG_MEMCG_KMEM */ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; spin_lock(&nlru->lock); if (list_empty(item)) { l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); nlru->nr_items++; spin_unlock(&nlru->lock); return true; } spin_unlock(&nlru->lock); return false; } EXPORT_SYMBOL_GPL(list_lru_add); bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); struct mem_cgroup *memcg = list_lru_memcg_aware(lru) ? mem_cgroup_from_slab_obj(item) : NULL; return list_lru_add(lru, item, nid, memcg); } EXPORT_SYMBOL_GPL(list_lru_add_obj); bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; spin_lock(&nlru->lock); if (!list_empty(item)) { l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); list_del_init(item); l->nr_items--; nlru->nr_items--; spin_unlock(&nlru->lock); return true; } spin_unlock(&nlru->lock); return false; } EXPORT_SYMBOL_GPL(list_lru_del); bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) { int nid = page_to_nid(virt_to_page(item)); struct mem_cgroup *memcg = list_lru_memcg_aware(lru) ? mem_cgroup_from_slab_obj(item) : NULL; return list_lru_del(lru, item, nid, memcg); } EXPORT_SYMBOL_GPL(list_lru_del_obj); void list_lru_isolate(struct list_lru_one *list, struct list_head *item) { list_del_init(item); list->nr_items--; } EXPORT_SYMBOL_GPL(list_lru_isolate); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head) { list_move(item, head); list->nr_items--; } EXPORT_SYMBOL_GPL(list_lru_isolate_move); void list_lru_putback(struct list_lru *lru, struct list_head *item, int nid, struct mem_cgroup *memcg) { struct list_lru_one *list = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); if (list_empty(item)) { list_add_tail(item, &list->list); if (!list->nr_items++) set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); } } EXPORT_SYMBOL_GPL(list_lru_putback); unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg) { struct list_lru_one *l; long count; rcu_read_lock(); l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); count = l ? READ_ONCE(l->nr_items) : 0; rcu_read_unlock(); if (unlikely(count < 0)) count = 0; return count; } EXPORT_SYMBOL_GPL(list_lru_count_one); unsigned long list_lru_count_node(struct list_lru *lru, int nid) { struct list_lru_node *nlru; nlru = &lru->node[nid]; return nlru->nr_items; } EXPORT_SYMBOL_GPL(list_lru_count_node); static unsigned long __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; struct list_head *item, *n; unsigned long isolated = 0; restart: l = list_lru_from_memcg_idx(lru, nid, memcg_idx); if (!l) goto out; list_for_each_safe(item, n, &l->list) { enum lru_status ret; /* * decrement nr_to_walk first so that we don't livelock if we * get stuck on large numbers of LRU_RETRY items */ if (!*nr_to_walk) break; --*nr_to_walk; ret = isolate(item, l, &nlru->lock, cb_arg); switch (ret) { case LRU_REMOVED_RETRY: assert_spin_locked(&nlru->lock); fallthrough; case LRU_REMOVED: isolated++; nlru->nr_items--; /* * If the lru lock has been dropped, our list * traversal is now invalid and so we have to * restart from scratch. */ if (ret == LRU_REMOVED_RETRY) goto restart; break; case LRU_ROTATE: list_move_tail(item, &l->list); break; case LRU_SKIP: break; case LRU_RETRY: /* * The lru lock has been dropped, our list traversal is * now invalid and so we have to restart from scratch. */ assert_spin_locked(&nlru->lock); goto restart; default: BUG(); } } out: return isolated; } unsigned long list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { struct list_lru_node *nlru = &lru->node[nid]; unsigned long ret; spin_lock(&nlru->lock); ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate, cb_arg, nr_to_walk); spin_unlock(&nlru->lock); return ret; } EXPORT_SYMBOL_GPL(list_lru_walk_one); unsigned long list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { struct list_lru_node *nlru = &lru->node[nid]; unsigned long ret; spin_lock_irq(&nlru->lock); ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate, cb_arg, nr_to_walk); spin_unlock_irq(&nlru->lock); return ret; } unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { long isolated = 0; isolated += list_lru_walk_one(lru, nid, NULL, isolate, cb_arg, nr_to_walk); #ifdef CONFIG_MEMCG_KMEM if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { struct list_lru_memcg *mlru; unsigned long index; xa_for_each(&lru->xa, index, mlru) { struct list_lru_node *nlru = &lru->node[nid]; spin_lock(&nlru->lock); isolated += __list_lru_walk_one(lru, nid, index, isolate, cb_arg, nr_to_walk); spin_unlock(&nlru->lock); if (*nr_to_walk <= 0) break; } } #endif return isolated; } EXPORT_SYMBOL_GPL(list_lru_walk_node); static void init_one_lru(struct list_lru_one *l) { INIT_LIST_HEAD(&l->list); l->nr_items = 0; } #ifdef CONFIG_MEMCG_KMEM static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) { int nid; struct list_lru_memcg *mlru; mlru = kmalloc(struct_size(mlru, node, nr_node_ids), gfp); if (!mlru) return NULL; for_each_node(nid) init_one_lru(&mlru->node[nid]); return mlru; } static void memcg_list_lru_free(struct list_lru *lru, int src_idx) { struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx); /* * The __list_lru_walk_one() can walk the list of this node. * We need kvfree_rcu() here. And the walking of the list * is under lru->node[nid]->lock, which can serve as a RCU * read-side critical section. */ if (mlru) kvfree_rcu(mlru, rcu); } static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { if (memcg_aware) xa_init_flags(&lru->xa, XA_FLAGS_LOCK_IRQ); lru->memcg_aware = memcg_aware; } static void memcg_destroy_list_lru(struct list_lru *lru) { XA_STATE(xas, &lru->xa, 0); struct list_lru_memcg *mlru; if (!list_lru_memcg_aware(lru)) return; xas_lock_irq(&xas); xas_for_each(&xas, mlru, ULONG_MAX) { kfree(mlru); xas_store(&xas, NULL); } xas_unlock_irq(&xas); } static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, int src_idx, struct mem_cgroup *dst_memcg) { struct list_lru_node *nlru = &lru->node[nid]; int dst_idx = dst_memcg->kmemcg_id; struct list_lru_one *src, *dst; /* * Since list_lru_{add,del} may be called under an IRQ-safe lock, * we have to use IRQ-safe primitives here to avoid deadlock. */ spin_lock_irq(&nlru->lock); src = list_lru_from_memcg_idx(lru, nid, src_idx); if (!src) goto out; dst = list_lru_from_memcg_idx(lru, nid, dst_idx); list_splice_init(&src->list, &dst->list); if (src->nr_items) { dst->nr_items += src->nr_items; set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); src->nr_items = 0; } out: spin_unlock_irq(&nlru->lock); } static void memcg_reparent_list_lru(struct list_lru *lru, int src_idx, struct mem_cgroup *dst_memcg) { int i; for_each_node(i) memcg_reparent_list_lru_node(lru, i, src_idx, dst_memcg); memcg_list_lru_free(lru, src_idx); } void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) { struct cgroup_subsys_state *css; struct list_lru *lru; int src_idx = memcg->kmemcg_id; /* * Change kmemcg_id of this cgroup and all its descendants to the * parent's id, and then move all entries from this cgroup's list_lrus * to ones of the parent. * * After we have finished, all list_lrus corresponding to this cgroup * are guaranteed to remain empty. So we can safely free this cgroup's * list lrus in memcg_list_lru_free(). * * Changing ->kmemcg_id to the parent can prevent memcg_list_lru_alloc() * from allocating list lrus for this cgroup after memcg_list_lru_free() * call. */ rcu_read_lock(); css_for_each_descendant_pre(css, &memcg->css) { struct mem_cgroup *child; child = mem_cgroup_from_css(css); WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id); } rcu_read_unlock(); mutex_lock(&list_lrus_mutex); list_for_each_entry(lru, &memcg_list_lrus, list) memcg_reparent_list_lru(lru, src_idx, parent); mutex_unlock(&list_lrus_mutex); } static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, struct list_lru *lru) { int idx = memcg->kmemcg_id; return idx < 0 || xa_load(&lru->xa, idx); } int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp) { int i; unsigned long flags; struct list_lru_memcg_table { struct list_lru_memcg *mlru; struct mem_cgroup *memcg; } *table; XA_STATE(xas, &lru->xa, 0); if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) return 0; gfp &= GFP_RECLAIM_MASK; table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp); if (!table) return -ENOMEM; /* * Because the list_lru can be reparented to the parent cgroup's * list_lru, we should make sure that this cgroup and all its * ancestors have allocated list_lru_memcg. */ for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) { if (memcg_list_lru_allocated(memcg, lru)) break; table[i].memcg = memcg; table[i].mlru = memcg_init_list_lru_one(gfp); if (!table[i].mlru) { while (i--) kfree(table[i].mlru); kfree(table); return -ENOMEM; } } xas_lock_irqsave(&xas, flags); while (i--) { int index = READ_ONCE(table[i].memcg->kmemcg_id); struct list_lru_memcg *mlru = table[i].mlru; xas_set(&xas, index); retry: if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) { kfree(mlru); } else { xas_store(&xas, mlru); if (xas_error(&xas) == -ENOMEM) { xas_unlock_irqrestore(&xas, flags); if (xas_nomem(&xas, gfp)) xas_set_err(&xas, 0); xas_lock_irqsave(&xas, flags); /* * The xas lock has been released, this memcg * can be reparented before us. So reload * memcg id. More details see the comments * in memcg_reparent_list_lrus(). */ index = READ_ONCE(table[i].memcg->kmemcg_id); if (index < 0) xas_set_err(&xas, 0); else if (!xas_error(&xas) && index != xas.xa_index) xas_set(&xas, index); goto retry; } } } /* xas_nomem() is used to free memory instead of memory allocation. */ if (xas.xa_alloc) xas_nomem(&xas, gfp); xas_unlock_irqrestore(&xas, flags); kfree(table); return xas_error(&xas); } #else static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { } static void memcg_destroy_list_lru(struct list_lru *lru) { } #endif /* CONFIG_MEMCG_KMEM */ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct lock_class_key *key, struct shrinker *shrinker) { int i; #ifdef CONFIG_MEMCG_KMEM if (shrinker) lru->shrinker_id = shrinker->id; else lru->shrinker_id = -1; #endif lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL); if (!lru->node) return -ENOMEM; for_each_node(i) { spin_lock_init(&lru->node[i].lock); if (key) lockdep_set_class(&lru->node[i].lock, key); init_one_lru(&lru->node[i].lru); } memcg_init_list_lru(lru, memcg_aware); list_lru_register(lru); return 0; } EXPORT_SYMBOL_GPL(__list_lru_init); void list_lru_destroy(struct list_lru *lru) { /* Already destroyed or not yet initialized? */ if (!lru->node) return; list_lru_unregister(lru); memcg_destroy_list_lru(lru); kfree(lru->node); lru->node = NULL; #ifdef CONFIG_MEMCG_KMEM lru->shrinker_id = -1; #endif } EXPORT_SYMBOL_GPL(list_lru_destroy);
16 16 16 16 16 16 16 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 // SPDX-License-Identifier: GPL-2.0-or-later /* * index.c - NTFS kernel index handling. Part of the Linux-NTFS project. * * Copyright (c) 2004-2005 Anton Altaparmakov */ #include <linux/slab.h> #include "aops.h" #include "collate.h" #include "debug.h" #include "index.h" #include "ntfs.h" /** * ntfs_index_ctx_get - allocate and initialize a new index context * @idx_ni: ntfs index inode with which to initialize the context * * Allocate a new index context, initialize it with @idx_ni and return it. * Return NULL if allocation failed. * * Locking: Caller must hold i_mutex on the index inode. */ ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni) { ntfs_index_context *ictx; ictx = kmem_cache_alloc(ntfs_index_ctx_cache, GFP_NOFS); if (ictx) *ictx = (ntfs_index_context){ .idx_ni = idx_ni }; return ictx; } /** * ntfs_index_ctx_put - release an index context * @ictx: index context to free * * Release the index context @ictx, releasing all associated resources. * * Locking: Caller must hold i_mutex on the index inode. */ void ntfs_index_ctx_put(ntfs_index_context *ictx) { if (ictx->entry) { if (ictx->is_in_root) { if (ictx->actx) ntfs_attr_put_search_ctx(ictx->actx); if (ictx->base_ni) unmap_mft_record(ictx->base_ni); } else { struct page *page = ictx->page; if (page) { BUG_ON(!PageLocked(page)); unlock_page(page); ntfs_unmap_page(page); } } } kmem_cache_free(ntfs_index_ctx_cache, ictx); return; } /** * ntfs_index_lookup - find a key in an index and return its index entry * @key: [IN] key for which to search in the index * @key_len: [IN] length of @key in bytes * @ictx: [IN/OUT] context describing the index and the returned entry * * Before calling ntfs_index_lookup(), @ictx must have been obtained from a * call to ntfs_index_ctx_get(). * * Look for the @key in the index specified by the index lookup context @ictx. * ntfs_index_lookup() walks the contents of the index looking for the @key. * * If the @key is found in the index, 0 is returned and @ictx is setup to * describe the index entry containing the matching @key. @ictx->entry is the * index entry and @ictx->data and @ictx->data_len are the index entry data and * its length in bytes, respectively. * * If the @key is not found in the index, -ENOENT is returned and @ictx is * setup to describe the index entry whose key collates immediately after the * search @key, i.e. this is the position in the index at which an index entry * with a key of @key would need to be inserted. * * If an error occurs return the negative error code and @ictx is left * untouched. * * When finished with the entry and its data, call ntfs_index_ctx_put() to free * the context and other associated resources. * * If the index entry was modified, call flush_dcache_index_entry_page() * immediately after the modification and either ntfs_index_entry_mark_dirty() * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to * ensure that the changes are written to disk. * * Locking: - Caller must hold i_mutex on the index inode. * - Each page cache page in the index allocation mapping must be * locked whilst being accessed otherwise we may find a corrupt * page due to it being under ->writepage at the moment which * applies the mst protection fixups before writing out and then * removes them again after the write is complete after which it * unlocks the page. */ int ntfs_index_lookup(const void *key, const int key_len, ntfs_index_context *ictx) { VCN vcn, old_vcn; ntfs_inode *idx_ni = ictx->idx_ni; ntfs_volume *vol = idx_ni->vol; struct super_block *sb = vol->sb; ntfs_inode *base_ni = idx_ni->ext.base_ntfs_ino; MFT_RECORD *m; INDEX_ROOT *ir; INDEX_ENTRY *ie; INDEX_ALLOCATION *ia; u8 *index_end, *kaddr; ntfs_attr_search_ctx *actx; struct address_space *ia_mapping; struct page *page; int rc, err = 0; ntfs_debug("Entering."); BUG_ON(!NInoAttr(idx_ni)); BUG_ON(idx_ni->type != AT_INDEX_ALLOCATION); BUG_ON(idx_ni->nr_extents != -1); BUG_ON(!base_ni); BUG_ON(!key); BUG_ON(key_len <= 0); if (!ntfs_is_collation_rule_supported( idx_ni->itype.index.collation_rule)) { ntfs_error(sb, "Index uses unsupported collation rule 0x%x. " "Aborting lookup.", le32_to_cpu( idx_ni->itype.index.collation_rule)); return -EOPNOTSUPP; } /* Get hold of the mft record for the index inode. */ m = map_mft_record(base_ni); if (IS_ERR(m)) { ntfs_error(sb, "map_mft_record() failed with error code %ld.", -PTR_ERR(m)); return PTR_ERR(m); } actx = ntfs_attr_get_search_ctx(base_ni, m); if (unlikely(!actx)) { err = -ENOMEM; goto err_out; } /* Find the index root attribute in the mft record. */ err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, actx); if (unlikely(err)) { if (err == -ENOENT) { ntfs_error(sb, "Index root attribute missing in inode " "0x%lx.", idx_ni->mft_no); err = -EIO; } goto err_out; } /* Get to the index root value (it has been verified in read_inode). */ ir = (INDEX_ROOT*)((u8*)actx->attr + le16_to_cpu(actx->attr->data.resident.value_offset)); index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length); /* The first index entry. */ ie = (INDEX_ENTRY*)((u8*)&ir->index + le32_to_cpu(ir->index.entries_offset)); /* * Loop until we exceed valid memory (corruption case) or until we * reach the last entry. */ for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { /* Bounds checks. */ if ((u8*)ie < (u8*)actx->mrec || (u8*)ie + sizeof(INDEX_ENTRY_HEADER) > index_end || (u8*)ie + le16_to_cpu(ie->length) > index_end) goto idx_err_out; /* * The last entry cannot contain a key. It can however contain * a pointer to a child node in the B+tree so we just break out. */ if (ie->flags & INDEX_ENTRY_END) break; /* Further bounds checks. */ if ((u32)sizeof(INDEX_ENTRY_HEADER) + le16_to_cpu(ie->key_length) > le16_to_cpu(ie->data.vi.data_offset) || (u32)le16_to_cpu(ie->data.vi.data_offset) + le16_to_cpu(ie->data.vi.data_length) > le16_to_cpu(ie->length)) goto idx_err_out; /* If the keys match perfectly, we setup @ictx and return 0. */ if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key, &ie->key, key_len)) { ir_done: ictx->is_in_root = true; ictx->ir = ir; ictx->actx = actx; ictx->base_ni = base_ni; ictx->ia = NULL; ictx->page = NULL; done: ictx->entry = ie; ictx->data = (u8*)ie + le16_to_cpu(ie->data.vi.data_offset); ictx->data_len = le16_to_cpu(ie->data.vi.data_length); ntfs_debug("Done."); return err; } /* * Not a perfect match, need to do full blown collation so we * know which way in the B+tree we have to go. */ rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key, key_len, &ie->key, le16_to_cpu(ie->key_length)); /* * If @key collates before the key of the current entry, there * is definitely no such key in this index but we might need to * descend into the B+tree so we just break out of the loop. */ if (rc == -1) break; /* * A match should never happen as the memcmp() call should have * cought it, but we still treat it correctly. */ if (!rc) goto ir_done; /* The keys are not equal, continue the search. */ } /* * We have finished with this index without success. Check for the * presence of a child node and if not present setup @ictx and return * -ENOENT. */ if (!(ie->flags & INDEX_ENTRY_NODE)) { ntfs_debug("Entry not found."); err = -ENOENT; goto ir_done; } /* Child node present, descend into it. */ /* Consistency check: Verify that an index allocation exists. */ if (!NInoIndexAllocPresent(idx_ni)) { ntfs_error(sb, "No index allocation attribute but index entry " "requires one. Inode 0x%lx is corrupt or " "driver bug.", idx_ni->mft_no); goto err_out; } /* Get the starting vcn of the index_block holding the child node. */ vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); ia_mapping = VFS_I(idx_ni)->i_mapping; /* * We are done with the index root and the mft record. Release them, * otherwise we deadlock with ntfs_map_page(). */ ntfs_attr_put_search_ctx(actx); unmap_mft_record(base_ni); m = NULL; actx = NULL; descend_into_child_node: /* * Convert vcn to index into the index allocation attribute in units * of PAGE_SIZE and map the page cache page, reading it from * disk if necessary. */ page = ntfs_map_page(ia_mapping, vcn << idx_ni->itype.index.vcn_size_bits >> PAGE_SHIFT); if (IS_ERR(page)) { ntfs_error(sb, "Failed to map index page, error %ld.", -PTR_ERR(page)); err = PTR_ERR(page); goto err_out; } lock_page(page); kaddr = (u8*)page_address(page); fast_descend_into_child_node: /* Get to the index allocation block. */ ia = (INDEX_ALLOCATION*)(kaddr + ((vcn << idx_ni->itype.index.vcn_size_bits) & ~PAGE_MASK)); /* Bounds checks. */ if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) { ntfs_error(sb, "Out of bounds check failed. Corrupt inode " "0x%lx or driver bug.", idx_ni->mft_no); goto unm_err_out; } /* Catch multi sector transfer fixup errors. */ if (unlikely(!ntfs_is_indx_record(ia->magic))) { ntfs_error(sb, "Index record with vcn 0x%llx is corrupt. " "Corrupt inode 0x%lx. Run chkdsk.", (long long)vcn, idx_ni->mft_no); goto unm_err_out; } if (sle64_to_cpu(ia->index_block_vcn) != vcn) { ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is " "different from expected VCN (0x%llx). Inode " "0x%lx is corrupt or driver bug.", (unsigned long long) sle64_to_cpu(ia->index_block_vcn), (unsigned long long)vcn, idx_ni->mft_no); goto unm_err_out; } if (le32_to_cpu(ia->index.allocated_size) + 0x18 != idx_ni->itype.index.block_size) { ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx has " "a size (%u) differing from the index " "specified size (%u). Inode is corrupt or " "driver bug.", (unsigned long long)vcn, idx_ni->mft_no, le32_to_cpu(ia->index.allocated_size) + 0x18, idx_ni->itype.index.block_size); goto unm_err_out; } index_end = (u8*)ia + idx_ni->itype.index.block_size; if (index_end > kaddr + PAGE_SIZE) { ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx " "crosses page boundary. Impossible! Cannot " "access! This is probably a bug in the " "driver.", (unsigned long long)vcn, idx_ni->mft_no); goto unm_err_out; } index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length); if (index_end > (u8*)ia + idx_ni->itype.index.block_size) { ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of inode " "0x%lx exceeds maximum size.", (unsigned long long)vcn, idx_ni->mft_no); goto unm_err_out; } /* The first index entry. */ ie = (INDEX_ENTRY*)((u8*)&ia->index + le32_to_cpu(ia->index.entries_offset)); /* * Iterate similar to above big loop but applied to index buffer, thus * loop until we exceed valid memory (corruption case) or until we * reach the last entry. */ for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) { /* Bounds checks. */ if ((u8*)ie < (u8*)ia || (u8*)ie + sizeof(INDEX_ENTRY_HEADER) > index_end || (u8*)ie + le16_to_cpu(ie->length) > index_end) { ntfs_error(sb, "Index entry out of bounds in inode " "0x%lx.", idx_ni->mft_no); goto unm_err_out; } /* * The last entry cannot contain a key. It can however contain * a pointer to a child node in the B+tree so we just break out. */ if (ie->flags & INDEX_ENTRY_END) break; /* Further bounds checks. */ if ((u32)sizeof(INDEX_ENTRY_HEADER) + le16_to_cpu(ie->key_length) > le16_to_cpu(ie->data.vi.data_offset) || (u32)le16_to_cpu(ie->data.vi.data_offset) + le16_to_cpu(ie->data.vi.data_length) > le16_to_cpu(ie->length)) { ntfs_error(sb, "Index entry out of bounds in inode " "0x%lx.", idx_ni->mft_no); goto unm_err_out; } /* If the keys match perfectly, we setup @ictx and return 0. */ if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key, &ie->key, key_len)) { ia_done: ictx->is_in_root = false; ictx->actx = NULL; ictx->base_ni = NULL; ictx->ia = ia; ictx->page = page; goto done; } /* * Not a perfect match, need to do full blown collation so we * know which way in the B+tree we have to go. */ rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key, key_len, &ie->key, le16_to_cpu(ie->key_length)); /* * If @key collates before the key of the current entry, there * is definitely no such key in this index but we might need to * descend into the B+tree so we just break out of the loop. */ if (rc == -1) break; /* * A match should never happen as the memcmp() call should have * cought it, but we still treat it correctly. */ if (!rc) goto ia_done; /* The keys are not equal, continue the search. */ } /* * We have finished with this index buffer without success. Check for * the presence of a child node and if not present return -ENOENT. */ if (!(ie->flags & INDEX_ENTRY_NODE)) { ntfs_debug("Entry not found."); err = -ENOENT; goto ia_done; } if ((ia->index.flags & NODE_MASK) == LEAF_NODE) { ntfs_error(sb, "Index entry with child node found in a leaf " "node in inode 0x%lx.", idx_ni->mft_no); goto unm_err_out; } /* Child node present, descend into it. */ old_vcn = vcn; vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8)); if (vcn >= 0) { /* * If vcn is in the same page cache page as old_vcn we recycle * the mapped page. */ if (old_vcn << vol->cluster_size_bits >> PAGE_SHIFT == vcn << vol->cluster_size_bits >> PAGE_SHIFT) goto fast_descend_into_child_node; unlock_page(page); ntfs_unmap_page(page); goto descend_into_child_node; } ntfs_error(sb, "Negative child node vcn in inode 0x%lx.", idx_ni->mft_no); unm_err_out: unlock_page(page); ntfs_unmap_page(page); err_out: if (!err) err = -EIO; if (actx) ntfs_attr_put_search_ctx(actx); if (m) unmap_mft_record(base_ni); return err; idx_err_out: ntfs_error(sb, "Corrupt index. Aborting lookup."); goto err_out; }
10 1 9 1 5 10 1 9 14 12 1 2 1 1 14 14 17 21 21 11 9 17 14 6 3 2 3 8 4 2 20 10 12 2 17 3 16 13 7 9 4 4 15 1 11 3 11 3 10 4 9 5 8 6 9 5 7 7 8 8 104 187 1 1 1 1 187 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux NET3: IP/IP protocol decoder modified to support * virtual tunnel interface * * Authors: * Saurabh Mohan (saurabh.mohan@vyatta.com) 05/07/2012 */ /* This version of net/ipv4/ip_vti.c is cloned of net/ipv4/ipip.c For comments look at net/ipv4/ip_gre.c --ANK */ #include <linux/capability.h> #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/in.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/if_arp.h> #include <linux/init.h> #include <linux/netfilter_ipv4.h> #include <linux/if_ether.h> #include <linux/icmpv6.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> #include <net/ip_tunnels.h> #include <net/inet_ecn.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/netns/generic.h> static struct rtnl_link_ops vti_link_ops __read_mostly; static unsigned int vti_net_id __read_mostly; static int vti_tunnel_init(struct net_device *dev); static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type, bool update_skb_dev) { struct ip_tunnel *tunnel; const struct iphdr *iph = ip_hdr(skb); struct net *net = dev_net(skb->dev); struct ip_tunnel_net *itn = net_generic(net, vti_net_id); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->saddr, iph->daddr, 0); if (tunnel) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel; if (update_skb_dev) skb->dev = tunnel->dev; return xfrm_input(skb, nexthdr, spi, encap_type); } return -EINVAL; drop: kfree_skb(skb); return 0; } static int vti_input_proto(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) { return vti_input(skb, nexthdr, spi, encap_type, false); } static int vti_rcv(struct sk_buff *skb, __be32 spi, bool update_skb_dev) { XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); return vti_input(skb, ip_hdr(skb)->protocol, spi, 0, update_skb_dev); } static int vti_rcv_proto(struct sk_buff *skb) { return vti_rcv(skb, 0, false); } static int vti_rcv_cb(struct sk_buff *skb, int err) { unsigned short family; struct net_device *dev; struct xfrm_state *x; const struct xfrm_mode *inner_mode; struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4; u32 orig_mark = skb->mark; int ret; if (!tunnel) return 1; dev = tunnel->dev; if (err) { DEV_STATS_INC(dev, rx_errors); DEV_STATS_INC(dev, rx_dropped); return 0; } x = xfrm_input_state(skb); inner_mode = &x->inner_mode; if (x->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol); if (inner_mode == NULL) { XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMINSTATEMODEERROR); return -EINVAL; } } family = inner_mode->family; skb->mark = be32_to_cpu(tunnel->parms.i_key); ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family); skb->mark = orig_mark; if (!ret) return -EPERM; skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev))); skb->dev = dev; dev_sw_netstats_rx_add(dev, skb->len); return 0; } static bool vti_state_check(const struct xfrm_state *x, __be32 dst, __be32 src) { xfrm_address_t *daddr = (xfrm_address_t *)&dst; xfrm_address_t *saddr = (xfrm_address_t *)&src; /* if there is no transform then this tunnel is not functional. * Or if the xfrm is not mode tunnel. */ if (!x || x->props.mode != XFRM_MODE_TUNNEL || x->props.family != AF_INET) return false; if (!dst) return xfrm_addr_equal(saddr, &x->props.saddr, AF_INET); if (!xfrm_state_addr_check(x, daddr, saddr, AF_INET)) return false; return true; } static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_parm *parms = &tunnel->parms; struct dst_entry *dst = skb_dst(skb); struct net_device *tdev; /* Device to other host */ int pkt_len = skb->len; int err; int mtu; if (!dst) { switch (skb->protocol) { case htons(ETH_P_IP): { struct rtable *rt; fl->u.ip4.flowi4_oif = dev->ifindex; fl->u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; rt = __ip_route_output_key(dev_net(dev), &fl->u.ip4); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } dst = &rt->dst; skb_dst_set(skb, dst); break; } #if IS_ENABLED(CONFIG_IPV6) case htons(ETH_P_IPV6): fl->u.ip6.flowi6_oif = dev->ifindex; fl->u.ip6.flowi6_flags |= FLOWI_FLAG_ANYSRC; dst = ip6_route_output(dev_net(dev), NULL, &fl->u.ip6); if (dst->error) { dst_release(dst); dst = NULL; DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } skb_dst_set(skb, dst); break; #endif default: DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } } dst_hold(dst); dst = xfrm_lookup_route(tunnel->net, dst, fl, NULL, 0); if (IS_ERR(dst)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } if (dst->flags & DST_XFRM_QUEUE) goto xmit; if (!vti_state_check(dst->xfrm, parms->iph.daddr, parms->iph.saddr)) { DEV_STATS_INC(dev, tx_carrier_errors); dst_release(dst); goto tx_error_icmp; } tdev = dst->dev; if (tdev == dev) { dst_release(dst); DEV_STATS_INC(dev, collisions); goto tx_error; } mtu = dst_mtu(dst); if (skb->len > mtu) { skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->protocol == htons(ETH_P_IP)) { if (!(ip_hdr(skb)->frag_off & htons(IP_DF))) goto xmit; icmp_ndo_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); } else { if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); } dst_release(dst); goto tx_error; } xmit: skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); skb_dst_set(skb, dst); skb->dev = skb_dst(skb)->dev; err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) err = pkt_len; iptunnel_xmit_stats(dev, err); return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); tx_error: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } /* This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. */ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct flowi fl; if (!pskb_inet_may_pull(skb)) goto tx_err; memset(&fl, 0, sizeof(fl)); switch (skb->protocol) { case htons(ETH_P_IP): memset(IPCB(skb), 0, sizeof(*IPCB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET); break; case htons(ETH_P_IPV6): memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); xfrm_decode_session(dev_net(dev), skb, &fl, AF_INET6); break; default: goto tx_err; } /* override mark with tunnel output key */ fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key); return vti_xmit(skb, dev, &fl); tx_err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } static int vti4_err(struct sk_buff *skb, u32 info) { __be32 spi; __u32 mark; struct xfrm_state *x; struct ip_tunnel *tunnel; struct ip_esp_hdr *esph; struct ip_auth_hdr *ah ; struct ip_comp_hdr *ipch; struct net *net = dev_net(skb->dev); const struct iphdr *iph = (const struct iphdr *)skb->data; int protocol = iph->protocol; struct ip_tunnel_net *itn = net_generic(net, vti_net_id); tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY, iph->daddr, iph->saddr, 0); if (!tunnel) return -1; mark = be32_to_cpu(tunnel->parms.o_key); switch (protocol) { case IPPROTO_ESP: esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2)); spi = esph->spi; break; case IPPROTO_AH: ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); spi = ah->spi; break; case IPPROTO_COMP: ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); spi = htonl(ntohs(ipch->cpi)); break; default: return 0; } switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; break; case ICMP_REDIRECT: break; default: return 0; } x = xfrm_state_lookup(net, mark, (const xfrm_address_t *)&iph->daddr, spi, protocol, AF_INET); if (!x) return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, protocol); else ipv4_redirect(skb, net, 0, protocol); xfrm_state_put(x); return 0; } static int vti_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { int err = 0; if (cmd == SIOCADDTUNNEL || cmd == SIOCCHGTUNNEL) { if (p->iph.version != 4 || p->iph.protocol != IPPROTO_IPIP || p->iph.ihl != 5) return -EINVAL; } if (!(p->i_flags & GRE_KEY)) p->i_key = 0; if (!(p->o_flags & GRE_KEY)) p->o_key = 0; p->i_flags = VTI_ISVTI; err = ip_tunnel_ctl(dev, p, cmd); if (err) return err; if (cmd != SIOCDELTUNNEL) { p->i_flags |= GRE_KEY; p->o_flags |= GRE_KEY; } return 0; } static const struct net_device_ops vti_netdev_ops = { .ndo_init = vti_tunnel_init, .ndo_uninit = ip_tunnel_uninit, .ndo_start_xmit = vti_tunnel_xmit, .ndo_siocdevprivate = ip_tunnel_siocdevprivate, .ndo_change_mtu = ip_tunnel_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = vti_tunnel_ctl, }; static void vti_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &vti_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->type = ARPHRD_TUNNEL; ip_tunnel_setup(dev, vti_net_id); } static int vti_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; __dev_addr_set(dev, &iph->saddr, 4); memcpy(dev->broadcast, &iph->daddr, 4); dev->flags = IFF_NOARP; dev->addr_len = 4; dev->features |= NETIF_F_LLTX; netif_keep_dst(dev); return ip_tunnel_init(dev); } static void __net_init vti_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; iph->version = 4; iph->protocol = IPPROTO_IPIP; iph->ihl = 5; } static struct xfrm4_protocol vti_esp4_protocol __read_mostly = { .handler = vti_rcv_proto, .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm4_protocol vti_ah4_protocol __read_mostly = { .handler = vti_rcv_proto, .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; static struct xfrm4_protocol vti_ipcomp4_protocol __read_mostly = { .handler = vti_rcv_proto, .input_handler = vti_input_proto, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 100, }; #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) static int vti_rcv_tunnel(struct sk_buff *skb) { XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); return vti_input(skb, IPPROTO_IPIP, ip_hdr(skb)->saddr, 0, false); } static struct xfrm_tunnel vti_ipip_handler __read_mostly = { .handler = vti_rcv_tunnel, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 0, }; #if IS_ENABLED(CONFIG_IPV6) static struct xfrm_tunnel vti_ipip6_handler __read_mostly = { .handler = vti_rcv_tunnel, .cb_handler = vti_rcv_cb, .err_handler = vti4_err, .priority = 0, }; #endif #endif static int __net_init vti_init_net(struct net *net) { int err; struct ip_tunnel_net *itn; err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0"); if (err) return err; itn = net_generic(net, vti_net_id); if (itn->fb_tunnel_dev) vti_fb_tunnel_init(itn->fb_tunnel_dev); return 0; } static void __net_exit vti_exit_batch_net(struct list_head *list_net) { ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops); } static struct pernet_operations vti_net_ops = { .init = vti_init_net, .exit_batch = vti_exit_batch_net, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), }; static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { return 0; } static void vti_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm *parms, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); parms->iph.protocol = IPPROTO_IPIP; if (!data) return; parms->i_flags = VTI_ISVTI; if (data[IFLA_VTI_LINK]) parms->link = nla_get_u32(data[IFLA_VTI_LINK]); if (data[IFLA_VTI_IKEY]) parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]); if (data[IFLA_VTI_OKEY]) parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]); if (data[IFLA_VTI_LOCAL]) parms->iph.saddr = nla_get_in_addr(data[IFLA_VTI_LOCAL]); if (data[IFLA_VTI_REMOTE]) parms->iph.daddr = nla_get_in_addr(data[IFLA_VTI_REMOTE]); if (data[IFLA_VTI_FWMARK]) *fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } static int vti_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel_parm parms; __u32 fwmark = 0; vti_netlink_parms(data, &parms, &fwmark); return ip_tunnel_newlink(dev, tb, &parms, fwmark); } static int vti_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); __u32 fwmark = t->fwmark; struct ip_tunnel_parm p; vti_netlink_parms(data, &p, &fwmark); return ip_tunnel_changelink(dev, tb, &p, fwmark); } static size_t vti_get_size(const struct net_device *dev) { return /* IFLA_VTI_LINK */ nla_total_size(4) + /* IFLA_VTI_IKEY */ nla_total_size(4) + /* IFLA_VTI_OKEY */ nla_total_size(4) + /* IFLA_VTI_LOCAL */ nla_total_size(4) + /* IFLA_VTI_REMOTE */ nla_total_size(4) + /* IFLA_VTI_FWMARK */ nla_total_size(4) + 0; } static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm *p = &t->parms; if (nla_put_u32(skb, IFLA_VTI_LINK, p->link) || nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key) || nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key) || nla_put_in_addr(skb, IFLA_VTI_LOCAL, p->iph.saddr) || nla_put_in_addr(skb, IFLA_VTI_REMOTE, p->iph.daddr) || nla_put_u32(skb, IFLA_VTI_FWMARK, t->fwmark)) return -EMSGSIZE; return 0; } static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = { [IFLA_VTI_LINK] = { .type = NLA_U32 }, [IFLA_VTI_IKEY] = { .type = NLA_U32 }, [IFLA_VTI_OKEY] = { .type = NLA_U32 }, [IFLA_VTI_LOCAL] = { .len = sizeof_field(struct iphdr, saddr) }, [IFLA_VTI_REMOTE] = { .len = sizeof_field(struct iphdr, daddr) }, [IFLA_VTI_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops vti_link_ops __read_mostly = { .kind = "vti", .maxtype = IFLA_VTI_MAX, .policy = vti_policy, .priv_size = sizeof(struct ip_tunnel), .setup = vti_tunnel_setup, .validate = vti_tunnel_validate, .newlink = vti_newlink, .changelink = vti_changelink, .dellink = ip_tunnel_dellink, .get_size = vti_get_size, .fill_info = vti_fill_info, .get_link_net = ip_tunnel_get_link_net, }; static int __init vti_init(void) { const char *msg; int err; pr_info("IPv4 over IPsec tunneling driver\n"); msg = "tunnel device"; err = register_pernet_device(&vti_net_ops); if (err < 0) goto pernet_dev_failed; msg = "tunnel protocols"; err = xfrm4_protocol_register(&vti_esp4_protocol, IPPROTO_ESP); if (err < 0) goto xfrm_proto_esp_failed; err = xfrm4_protocol_register(&vti_ah4_protocol, IPPROTO_AH); if (err < 0) goto xfrm_proto_ah_failed; err = xfrm4_protocol_register(&vti_ipcomp4_protocol, IPPROTO_COMP); if (err < 0) goto xfrm_proto_comp_failed; #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) msg = "ipip tunnel"; err = xfrm4_tunnel_register(&vti_ipip_handler, AF_INET); if (err < 0) goto xfrm_tunnel_ipip_failed; #if IS_ENABLED(CONFIG_IPV6) err = xfrm4_tunnel_register(&vti_ipip6_handler, AF_INET6); if (err < 0) goto xfrm_tunnel_ipip6_failed; #endif #endif msg = "netlink interface"; err = rtnl_link_register(&vti_link_ops); if (err < 0) goto rtnl_link_failed; return err; rtnl_link_failed: #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) #if IS_ENABLED(CONFIG_IPV6) xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6); xfrm_tunnel_ipip6_failed: #endif xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET); xfrm_tunnel_ipip_failed: #endif xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm_proto_comp_failed: xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm_proto_ah_failed: xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); xfrm_proto_esp_failed: unregister_pernet_device(&vti_net_ops); pernet_dev_failed: pr_err("vti init: failed to register %s\n", msg); return err; } static void __exit vti_fini(void) { rtnl_link_unregister(&vti_link_ops); #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) #if IS_ENABLED(CONFIG_IPV6) xfrm4_tunnel_deregister(&vti_ipip6_handler, AF_INET6); #endif xfrm4_tunnel_deregister(&vti_ipip_handler, AF_INET); #endif xfrm4_protocol_deregister(&vti_ipcomp4_protocol, IPPROTO_COMP); xfrm4_protocol_deregister(&vti_ah4_protocol, IPPROTO_AH); xfrm4_protocol_deregister(&vti_esp4_protocol, IPPROTO_ESP); unregister_pernet_device(&vti_net_ops); } module_init(vti_init); module_exit(vti_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("vti"); MODULE_ALIAS_NETDEV("ip_vti0");
3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef DRIVERS_PCI_H #define DRIVERS_PCI_H #include <linux/pci.h> /* Number of possible devfns: 0.0 to 1f.7 inclusive */ #define MAX_NR_DEVFNS 256 #define PCI_FIND_CAP_TTL 48 #define PCI_VSEC_ID_INTEL_TBT 0x1234 /* Thunderbolt */ #define PCIE_LINK_RETRAIN_TIMEOUT_MS 1000 /* Power stable to PERST# inactive from PCIe card Electromechanical Spec */ #define PCIE_T_PVPERL_MS 100 /* * PCIe r6.0, sec 5.3.3.2.1 <PME Synchronization> * Recommends 1ms to 10ms timeout to check L2 ready. */ #define PCIE_PME_TO_L2_TIMEOUT_US 10000 extern const unsigned char pcie_link_speed[]; extern bool pci_early_dump; bool pcie_cap_has_lnkctl(const struct pci_dev *dev); bool pcie_cap_has_lnkctl2(const struct pci_dev *dev); bool pcie_cap_has_rtctl(const struct pci_dev *dev); /* Functions internal to the PCI core code */ int pci_create_sysfs_dev_files(struct pci_dev *pdev); void pci_remove_sysfs_dev_files(struct pci_dev *pdev); void pci_cleanup_rom(struct pci_dev *dev); #ifdef CONFIG_DMI extern const struct attribute_group pci_dev_smbios_attr_group; #endif enum pci_mmap_api { PCI_MMAP_SYSFS, /* mmap on /sys/bus/pci/devices/<BDF>/resource<N> */ PCI_MMAP_PROCFS /* mmap on /proc/bus/pci/<BDF> */ }; int pci_mmap_fits(struct pci_dev *pdev, int resno, struct vm_area_struct *vmai, enum pci_mmap_api mmap_api); bool pci_reset_supported(struct pci_dev *dev); void pci_init_reset_methods(struct pci_dev *dev); int pci_bridge_secondary_bus_reset(struct pci_dev *dev); int pci_bus_error_reset(struct pci_dev *dev); struct pci_cap_saved_data { u16 cap_nr; bool cap_extended; unsigned int size; u32 data[]; }; struct pci_cap_saved_state { struct hlist_node next; struct pci_cap_saved_data cap; }; void pci_allocate_cap_save_buffers(struct pci_dev *dev); void pci_free_cap_save_buffers(struct pci_dev *dev); int pci_add_cap_save_buffer(struct pci_dev *dev, char cap, unsigned int size); int pci_add_ext_cap_save_buffer(struct pci_dev *dev, u16 cap, unsigned int size); struct pci_cap_saved_state *pci_find_saved_cap(struct pci_dev *dev, char cap); struct pci_cap_saved_state *pci_find_saved_ext_cap(struct pci_dev *dev, u16 cap); #define PCI_PM_D2_DELAY 200 /* usec; see PCIe r4.0, sec 5.9.1 */ #define PCI_PM_D3HOT_WAIT 10 /* msec */ #define PCI_PM_D3COLD_WAIT 100 /* msec */ void pci_update_current_state(struct pci_dev *dev, pci_power_t state); void pci_refresh_power_state(struct pci_dev *dev); int pci_power_up(struct pci_dev *dev); void pci_disable_enabled_device(struct pci_dev *dev); int pci_finish_runtime_suspend(struct pci_dev *dev); void pcie_clear_device_status(struct pci_dev *dev); void pcie_clear_root_pme_status(struct pci_dev *dev); bool pci_check_pme_status(struct pci_dev *dev); void pci_pme_wakeup_bus(struct pci_bus *bus); int __pci_pme_wakeup(struct pci_dev *dev, void *ign); void pci_pme_restore(struct pci_dev *dev); bool pci_dev_need_resume(struct pci_dev *dev); void pci_dev_adjust_pme(struct pci_dev *dev); void pci_dev_complete_resume(struct pci_dev *pci_dev); void pci_config_pm_runtime_get(struct pci_dev *dev); void pci_config_pm_runtime_put(struct pci_dev *dev); void pci_pm_init(struct pci_dev *dev); void pci_ea_init(struct pci_dev *dev); void pci_msi_init(struct pci_dev *dev); void pci_msix_init(struct pci_dev *dev); bool pci_bridge_d3_possible(struct pci_dev *dev); void pci_bridge_d3_update(struct pci_dev *dev); void pci_bridge_reconfigure_ltr(struct pci_dev *dev); int pci_bridge_wait_for_secondary_bus(struct pci_dev *dev, char *reset_type); static inline void pci_wakeup_event(struct pci_dev *dev) { /* Wait 100 ms before the system can be put into a sleep state. */ pm_wakeup_event(&dev->dev, 100); } static inline bool pci_has_subordinate(struct pci_dev *pci_dev) { return !!(pci_dev->subordinate); } static inline bool pci_power_manageable(struct pci_dev *pci_dev) { /* * Currently we allow normal PCI devices and PCI bridges transition * into D3 if their bridge_d3 is set. */ return !pci_has_subordinate(pci_dev) || pci_dev->bridge_d3; } static inline bool pcie_downstream_port(const struct pci_dev *dev) { int type = pci_pcie_type(dev); return type == PCI_EXP_TYPE_ROOT_PORT || type == PCI_EXP_TYPE_DOWNSTREAM || type == PCI_EXP_TYPE_PCIE_BRIDGE; } void pci_vpd_init(struct pci_dev *dev); void pci_vpd_release(struct pci_dev *dev); extern const struct attribute_group pci_dev_vpd_attr_group; /* PCI Virtual Channel */ int pci_save_vc_state(struct pci_dev *dev); void pci_restore_vc_state(struct pci_dev *dev); void pci_allocate_vc_save_buffers(struct pci_dev *dev); /* PCI /proc functions */ #ifdef CONFIG_PROC_FS int pci_proc_attach_device(struct pci_dev *dev); int pci_proc_detach_device(struct pci_dev *dev); int pci_proc_detach_bus(struct pci_bus *bus); #else static inline int pci_proc_attach_device(struct pci_dev *dev) { return 0; } static inline int pci_proc_detach_device(struct pci_dev *dev) { return 0; } static inline int pci_proc_detach_bus(struct pci_bus *bus) { return 0; } #endif /* Functions for PCI Hotplug drivers to use */ int pci_hp_add_bridge(struct pci_dev *dev); #ifdef HAVE_PCI_LEGACY void pci_create_legacy_files(struct pci_bus *bus); void pci_remove_legacy_files(struct pci_bus *bus); #else static inline void pci_create_legacy_files(struct pci_bus *bus) { } static inline void pci_remove_legacy_files(struct pci_bus *bus) { } #endif /* Lock for read/write access to pci device and bus lists */ extern struct rw_semaphore pci_bus_sem; extern struct mutex pci_slot_mutex; extern raw_spinlock_t pci_lock; extern unsigned int pci_pm_d3hot_delay; #ifdef CONFIG_PCI_MSI void pci_no_msi(void); #else static inline void pci_no_msi(void) { } #endif void pci_realloc_get_opt(char *); static inline int pci_no_d1d2(struct pci_dev *dev) { unsigned int parent_dstates = 0; if (dev->bus->self) parent_dstates = dev->bus->self->no_d1d2; return (dev->no_d1d2 || parent_dstates); } extern const struct attribute_group *pci_dev_groups[]; extern const struct attribute_group *pcibus_groups[]; extern const struct device_type pci_dev_type; extern const struct attribute_group *pci_bus_groups[]; extern unsigned long pci_hotplug_io_size; extern unsigned long pci_hotplug_mmio_size; extern unsigned long pci_hotplug_mmio_pref_size; extern unsigned long pci_hotplug_bus_size; /** * pci_match_one_device - Tell if a PCI device structure has a matching * PCI device id structure * @id: single PCI device id structure to match * @dev: the PCI device structure to match against * * Returns the matching pci_device_id structure or %NULL if there is no match. */ static inline const struct pci_device_id * pci_match_one_device(const struct pci_device_id *id, const struct pci_dev *dev) { if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) && (id->device == PCI_ANY_ID || id->device == dev->device) && (id->subvendor == PCI_ANY_ID || id->subvendor == dev->subsystem_vendor) && (id->subdevice == PCI_ANY_ID || id->subdevice == dev->subsystem_device) && !((id->class ^ dev->class) & id->class_mask)) return id; return NULL; } /* PCI slot sysfs helper code */ #define to_pci_slot(s) container_of(s, struct pci_slot, kobj) extern struct kset *pci_slots_kset; struct pci_slot_attribute { struct attribute attr; ssize_t (*show)(struct pci_slot *, char *); ssize_t (*store)(struct pci_slot *, const char *, size_t); }; #define to_pci_slot_attr(s) container_of(s, struct pci_slot_attribute, attr) enum pci_bar_type { pci_bar_unknown, /* Standard PCI BAR probe */ pci_bar_io, /* An I/O port BAR */ pci_bar_mem32, /* A 32-bit memory BAR */ pci_bar_mem64, /* A 64-bit memory BAR */ }; struct device *pci_get_host_bridge_device(struct pci_dev *dev); void pci_put_host_bridge_device(struct device *dev); int pci_configure_extended_tags(struct pci_dev *dev, void *ign); bool pci_bus_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl, int crs_timeout); bool pci_bus_generic_read_dev_vendor_id(struct pci_bus *bus, int devfn, u32 *pl, int crs_timeout); int pci_idt_bus_quirk(struct pci_bus *bus, int devfn, u32 *pl, int crs_timeout); int pci_setup_device(struct pci_dev *dev); int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type, struct resource *res, unsigned int reg); void pci_configure_ari(struct pci_dev *dev); void __pci_bus_size_bridges(struct pci_bus *bus, struct list_head *realloc_head); void __pci_bus_assign_resources(const struct pci_bus *bus, struct list_head *realloc_head, struct list_head *fail_head); bool pci_bus_clip_resource(struct pci_dev *dev, int idx); const char *pci_resource_name(struct pci_dev *dev, unsigned int i); void pci_reassigndev_resource_alignment(struct pci_dev *dev); void pci_disable_bridge_window(struct pci_dev *dev); struct pci_bus *pci_bus_get(struct pci_bus *bus); void pci_bus_put(struct pci_bus *bus); /* PCIe link information from Link Capabilities 2 */ #define PCIE_LNKCAP2_SLS2SPEED(lnkcap2) \ ((lnkcap2) & PCI_EXP_LNKCAP2_SLS_64_0GB ? PCIE_SPEED_64_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_32_0GB ? PCIE_SPEED_32_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_16_0GB ? PCIE_SPEED_16_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_8_0GB ? PCIE_SPEED_8_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_5_0GB ? PCIE_SPEED_5_0GT : \ (lnkcap2) & PCI_EXP_LNKCAP2_SLS_2_5GB ? PCIE_SPEED_2_5GT : \ PCI_SPEED_UNKNOWN) /* PCIe speed to Mb/s reduced by encoding overhead */ #define PCIE_SPEED2MBS_ENC(speed) \ ((speed) == PCIE_SPEED_64_0GT ? 64000*1/1 : \ (speed) == PCIE_SPEED_32_0GT ? 32000*128/130 : \ (speed) == PCIE_SPEED_16_0GT ? 16000*128/130 : \ (speed) == PCIE_SPEED_8_0GT ? 8000*128/130 : \ (speed) == PCIE_SPEED_5_0GT ? 5000*8/10 : \ (speed) == PCIE_SPEED_2_5GT ? 2500*8/10 : \ 0) const char *pci_speed_string(enum pci_bus_speed speed); enum pci_bus_speed pcie_get_speed_cap(struct pci_dev *dev); enum pcie_link_width pcie_get_width_cap(struct pci_dev *dev); u32 pcie_bandwidth_capable(struct pci_dev *dev, enum pci_bus_speed *speed, enum pcie_link_width *width); void __pcie_print_link_status(struct pci_dev *dev, bool verbose); void pcie_report_downtraining(struct pci_dev *dev); void pcie_update_link_speed(struct pci_bus *bus, u16 link_status); /* Single Root I/O Virtualization */ struct pci_sriov { int pos; /* Capability position */ int nres; /* Number of resources */ u32 cap; /* SR-IOV Capabilities */ u16 ctrl; /* SR-IOV Control */ u16 total_VFs; /* Total VFs associated with the PF */ u16 initial_VFs; /* Initial VFs associated with the PF */ u16 num_VFs; /* Number of VFs available */ u16 offset; /* First VF Routing ID offset */ u16 stride; /* Following VF stride */ u16 vf_device; /* VF device ID */ u32 pgsz; /* Page size for BAR alignment */ u8 link; /* Function Dependency Link */ u8 max_VF_buses; /* Max buses consumed by VFs */ u16 driver_max_VFs; /* Max num VFs driver supports */ struct pci_dev *dev; /* Lowest numbered PF */ struct pci_dev *self; /* This PF */ u32 class; /* VF device */ u8 hdr_type; /* VF header type */ u16 subsystem_vendor; /* VF subsystem vendor */ u16 subsystem_device; /* VF subsystem device */ resource_size_t barsz[PCI_SRIOV_NUM_BARS]; /* VF BAR size */ bool drivers_autoprobe; /* Auto probing of VFs by driver */ }; #ifdef CONFIG_PCI_DOE void pci_doe_init(struct pci_dev *pdev); void pci_doe_destroy(struct pci_dev *pdev); void pci_doe_disconnected(struct pci_dev *pdev); #else static inline void pci_doe_init(struct pci_dev *pdev) { } static inline void pci_doe_destroy(struct pci_dev *pdev) { } static inline void pci_doe_disconnected(struct pci_dev *pdev) { } #endif /** * pci_dev_set_io_state - Set the new error state if possible. * * @dev: PCI device to set new error_state * @new: the state we want dev to be in * * If the device is experiencing perm_failure, it has to remain in that state. * Any other transition is allowed. * * Returns true if state has been changed to the requested state. */ static inline bool pci_dev_set_io_state(struct pci_dev *dev, pci_channel_state_t new) { pci_channel_state_t old; switch (new) { case pci_channel_io_perm_failure: xchg(&dev->error_state, pci_channel_io_perm_failure); return true; case pci_channel_io_frozen: old = cmpxchg(&dev->error_state, pci_channel_io_normal, pci_channel_io_frozen); return old != pci_channel_io_perm_failure; case pci_channel_io_normal: old = cmpxchg(&dev->error_state, pci_channel_io_frozen, pci_channel_io_normal); return old != pci_channel_io_perm_failure; default: return false; } } static inline int pci_dev_set_disconnected(struct pci_dev *dev, void *unused) { pci_dev_set_io_state(dev, pci_channel_io_perm_failure); pci_doe_disconnected(dev); return 0; } static inline bool pci_dev_is_disconnected(const struct pci_dev *dev) { return dev->error_state == pci_channel_io_perm_failure; } /* pci_dev priv_flags */ #define PCI_DEV_ADDED 0 #define PCI_DPC_RECOVERED 1 #define PCI_DPC_RECOVERING 2 static inline void pci_dev_assign_added(struct pci_dev *dev, bool added) { assign_bit(PCI_DEV_ADDED, &dev->priv_flags, added); } static inline bool pci_dev_is_added(const struct pci_dev *dev) { return test_bit(PCI_DEV_ADDED, &dev->priv_flags); } #ifdef CONFIG_PCIEAER #include <linux/aer.h> #define AER_MAX_MULTI_ERR_DEVICES 5 /* Not likely to have more */ struct aer_err_info { struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES]; int error_dev_num; unsigned int id:16; unsigned int severity:2; /* 0:NONFATAL | 1:FATAL | 2:COR */ unsigned int __pad1:5; unsigned int multi_error_valid:1; unsigned int first_error:5; unsigned int __pad2:2; unsigned int tlp_header_valid:1; unsigned int status; /* COR/UNCOR Error Status */ unsigned int mask; /* COR/UNCOR Error Mask */ struct aer_header_log_regs tlp; /* TLP Header */ }; int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info); void aer_print_error(struct pci_dev *dev, struct aer_err_info *info); #endif /* CONFIG_PCIEAER */ #ifdef CONFIG_PCIEPORTBUS /* Cached RCEC Endpoint Association */ struct rcec_ea { u8 nextbusn; u8 lastbusn; u32 bitmap; }; #endif #ifdef CONFIG_PCIE_DPC void pci_save_dpc_state(struct pci_dev *dev); void pci_restore_dpc_state(struct pci_dev *dev); void pci_dpc_init(struct pci_dev *pdev); void dpc_process_error(struct pci_dev *pdev); pci_ers_result_t dpc_reset_link(struct pci_dev *pdev); bool pci_dpc_recovered(struct pci_dev *pdev); #else static inline void pci_save_dpc_state(struct pci_dev *dev) { } static inline void pci_restore_dpc_state(struct pci_dev *dev) { } static inline void pci_dpc_init(struct pci_dev *pdev) { } static inline bool pci_dpc_recovered(struct pci_dev *pdev) { return false; } #endif #ifdef CONFIG_PCIEPORTBUS void pci_rcec_init(struct pci_dev *dev); void pci_rcec_exit(struct pci_dev *dev); void pcie_link_rcec(struct pci_dev *rcec); void pcie_walk_rcec(struct pci_dev *rcec, int (*cb)(struct pci_dev *, void *), void *userdata); #else static inline void pci_rcec_init(struct pci_dev *dev) { } static inline void pci_rcec_exit(struct pci_dev *dev) { } static inline void pcie_link_rcec(struct pci_dev *rcec) { } static inline void pcie_walk_rcec(struct pci_dev *rcec, int (*cb)(struct pci_dev *, void *), void *userdata) { } #endif #ifdef CONFIG_PCI_ATS /* Address Translation Service */ void pci_ats_init(struct pci_dev *dev); void pci_restore_ats_state(struct pci_dev *dev); #else static inline void pci_ats_init(struct pci_dev *d) { } static inline void pci_restore_ats_state(struct pci_dev *dev) { } #endif /* CONFIG_PCI_ATS */ #ifdef CONFIG_PCI_PRI void pci_pri_init(struct pci_dev *dev); void pci_restore_pri_state(struct pci_dev *pdev); #else static inline void pci_pri_init(struct pci_dev *dev) { } static inline void pci_restore_pri_state(struct pci_dev *pdev) { } #endif #ifdef CONFIG_PCI_PASID void pci_pasid_init(struct pci_dev *dev); void pci_restore_pasid_state(struct pci_dev *pdev); #else static inline void pci_pasid_init(struct pci_dev *dev) { } static inline void pci_restore_pasid_state(struct pci_dev *pdev) { } #endif #ifdef CONFIG_PCI_IOV int pci_iov_init(struct pci_dev *dev); void pci_iov_release(struct pci_dev *dev); void pci_iov_remove(struct pci_dev *dev); void pci_iov_update_resource(struct pci_dev *dev, int resno); resource_size_t pci_sriov_resource_alignment(struct pci_dev *dev, int resno); void pci_restore_iov_state(struct pci_dev *dev); int pci_iov_bus_range(struct pci_bus *bus); extern const struct attribute_group sriov_pf_dev_attr_group; extern const struct attribute_group sriov_vf_dev_attr_group; #else static inline int pci_iov_init(struct pci_dev *dev) { return -ENODEV; } static inline void pci_iov_release(struct pci_dev *dev) { } static inline void pci_iov_remove(struct pci_dev *dev) { } static inline void pci_restore_iov_state(struct pci_dev *dev) { } static inline int pci_iov_bus_range(struct pci_bus *bus) { return 0; } #endif /* CONFIG_PCI_IOV */ #ifdef CONFIG_PCIE_PTM void pci_ptm_init(struct pci_dev *dev); void pci_save_ptm_state(struct pci_dev *dev); void pci_restore_ptm_state(struct pci_dev *dev); void pci_suspend_ptm(struct pci_dev *dev); void pci_resume_ptm(struct pci_dev *dev); #else static inline void pci_ptm_init(struct pci_dev *dev) { } static inline void pci_save_ptm_state(struct pci_dev *dev) { } static inline void pci_restore_ptm_state(struct pci_dev *dev) { } static inline void pci_suspend_ptm(struct pci_dev *dev) { } static inline void pci_resume_ptm(struct pci_dev *dev) { } #endif unsigned long pci_cardbus_resource_alignment(struct resource *); static inline resource_size_t pci_resource_alignment(struct pci_dev *dev, struct resource *res) { #ifdef CONFIG_PCI_IOV int resno = res - dev->resource; if (resno >= PCI_IOV_RESOURCES && resno <= PCI_IOV_RESOURCE_END) return pci_sriov_resource_alignment(dev, resno); #endif if (dev->class >> 8 == PCI_CLASS_BRIDGE_CARDBUS) return pci_cardbus_resource_alignment(res); return resource_alignment(res); } void pci_acs_init(struct pci_dev *dev); #ifdef CONFIG_PCI_QUIRKS int pci_dev_specific_acs_enabled(struct pci_dev *dev, u16 acs_flags); int pci_dev_specific_enable_acs(struct pci_dev *dev); int pci_dev_specific_disable_acs_redir(struct pci_dev *dev); bool pcie_failed_link_retrain(struct pci_dev *dev); #else static inline int pci_dev_specific_acs_enabled(struct pci_dev *dev, u16 acs_flags) { return -ENOTTY; } static inline int pci_dev_specific_enable_acs(struct pci_dev *dev) { return -ENOTTY; } static inline int pci_dev_specific_disable_acs_redir(struct pci_dev *dev) { return -ENOTTY; } static inline bool pcie_failed_link_retrain(struct pci_dev *dev) { return false; } #endif /* PCI error reporting and recovery */ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev, pci_channel_state_t state, pci_ers_result_t (*reset_subordinates)(struct pci_dev *pdev)); bool pcie_wait_for_link(struct pci_dev *pdev, bool active); int pcie_retrain_link(struct pci_dev *pdev, bool use_lt); #ifdef CONFIG_PCIEASPM void pcie_aspm_init_link_state(struct pci_dev *pdev); void pcie_aspm_exit_link_state(struct pci_dev *pdev); void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked); void pcie_aspm_powersave_config_link(struct pci_dev *pdev); #else static inline void pcie_aspm_init_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_exit_link_state(struct pci_dev *pdev) { } static inline void pcie_aspm_pm_state_change(struct pci_dev *pdev, bool locked) { } static inline void pcie_aspm_powersave_config_link(struct pci_dev *pdev) { } #endif #ifdef CONFIG_PCIE_ECRC void pcie_set_ecrc_checking(struct pci_dev *dev); void pcie_ecrc_get_policy(char *str); #else static inline void pcie_set_ecrc_checking(struct pci_dev *dev) { } static inline void pcie_ecrc_get_policy(char *str) { } #endif struct pci_dev_reset_methods { u16 vendor; u16 device; int (*reset)(struct pci_dev *dev, bool probe); }; struct pci_reset_fn_method { int (*reset_fn)(struct pci_dev *pdev, bool probe); char *name; }; #ifdef CONFIG_PCI_QUIRKS int pci_dev_specific_reset(struct pci_dev *dev, bool probe); #else static inline int pci_dev_specific_reset(struct pci_dev *dev, bool probe) { return -ENOTTY; } #endif #if defined(CONFIG_PCI_QUIRKS) && defined(CONFIG_ARM64) int acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment, struct resource *res); #else static inline int acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment, struct resource *res) { return -ENODEV; } #endif int pci_rebar_get_current_size(struct pci_dev *pdev, int bar); int pci_rebar_set_size(struct pci_dev *pdev, int bar, int size); static inline u64 pci_rebar_size_to_bytes(int size) { return 1ULL << (size + 20); } struct device_node; #ifdef CONFIG_OF int of_pci_parse_bus_range(struct device_node *node, struct resource *res); int of_get_pci_domain_nr(struct device_node *node); int of_pci_get_max_link_speed(struct device_node *node); u32 of_pci_get_slot_power_limit(struct device_node *node, u8 *slot_power_limit_value, u8 *slot_power_limit_scale); int pci_set_of_node(struct pci_dev *dev); void pci_release_of_node(struct pci_dev *dev); void pci_set_bus_of_node(struct pci_bus *bus); void pci_release_bus_of_node(struct pci_bus *bus); int devm_of_pci_bridge_init(struct device *dev, struct pci_host_bridge *bridge); #else static inline int of_pci_parse_bus_range(struct device_node *node, struct resource *res) { return -EINVAL; } static inline int of_get_pci_domain_nr(struct device_node *node) { return -1; } static inline int of_pci_get_max_link_speed(struct device_node *node) { return -EINVAL; } static inline u32 of_pci_get_slot_power_limit(struct device_node *node, u8 *slot_power_limit_value, u8 *slot_power_limit_scale) { if (slot_power_limit_value) *slot_power_limit_value = 0; if (slot_power_limit_scale) *slot_power_limit_scale = 0; return 0; } static inline int pci_set_of_node(struct pci_dev *dev) { return 0; } static inline void pci_release_of_node(struct pci_dev *dev) { } static inline void pci_set_bus_of_node(struct pci_bus *bus) { } static inline void pci_release_bus_of_node(struct pci_bus *bus) { } static inline int devm_of_pci_bridge_init(struct device *dev, struct pci_host_bridge *bridge) { return 0; } #endif /* CONFIG_OF */ struct of_changeset; #ifdef CONFIG_PCI_DYNAMIC_OF_NODES void of_pci_make_dev_node(struct pci_dev *pdev); void of_pci_remove_node(struct pci_dev *pdev); int of_pci_add_properties(struct pci_dev *pdev, struct of_changeset *ocs, struct device_node *np); #else static inline void of_pci_make_dev_node(struct pci_dev *pdev) { } static inline void of_pci_remove_node(struct pci_dev *pdev) { } #endif #ifdef CONFIG_PCIEAER void pci_no_aer(void); void pci_aer_init(struct pci_dev *dev); void pci_aer_exit(struct pci_dev *dev); extern const struct attribute_group aer_stats_attr_group; void pci_aer_clear_fatal_status(struct pci_dev *dev); int pci_aer_clear_status(struct pci_dev *dev); int pci_aer_raw_clear_status(struct pci_dev *dev); void pci_save_aer_state(struct pci_dev *dev); void pci_restore_aer_state(struct pci_dev *dev); #else static inline void pci_no_aer(void) { } static inline void pci_aer_init(struct pci_dev *d) { } static inline void pci_aer_exit(struct pci_dev *d) { } static inline void pci_aer_clear_fatal_status(struct pci_dev *dev) { } static inline int pci_aer_clear_status(struct pci_dev *dev) { return -EINVAL; } static inline int pci_aer_raw_clear_status(struct pci_dev *dev) { return -EINVAL; } static inline void pci_save_aer_state(struct pci_dev *dev) { } static inline void pci_restore_aer_state(struct pci_dev *dev) { } #endif #ifdef CONFIG_ACPI int pci_acpi_program_hp_params(struct pci_dev *dev); extern const struct attribute_group pci_dev_acpi_attr_group; void pci_set_acpi_fwnode(struct pci_dev *dev); int pci_dev_acpi_reset(struct pci_dev *dev, bool probe); bool acpi_pci_power_manageable(struct pci_dev *dev); bool acpi_pci_bridge_d3(struct pci_dev *dev); int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state); pci_power_t acpi_pci_get_power_state(struct pci_dev *dev); void acpi_pci_refresh_power_state(struct pci_dev *dev); int acpi_pci_wakeup(struct pci_dev *dev, bool enable); bool acpi_pci_need_resume(struct pci_dev *dev); pci_power_t acpi_pci_choose_state(struct pci_dev *pdev); #else static inline int pci_dev_acpi_reset(struct pci_dev *dev, bool probe) { return -ENOTTY; } static inline void pci_set_acpi_fwnode(struct pci_dev *dev) { } static inline int pci_acpi_program_hp_params(struct pci_dev *dev) { return -ENODEV; } static inline bool acpi_pci_power_manageable(struct pci_dev *dev) { return false; } static inline bool acpi_pci_bridge_d3(struct pci_dev *dev) { return false; } static inline int acpi_pci_set_power_state(struct pci_dev *dev, pci_power_t state) { return -ENODEV; } static inline pci_power_t acpi_pci_get_power_state(struct pci_dev *dev) { return PCI_UNKNOWN; } static inline void acpi_pci_refresh_power_state(struct pci_dev *dev) { } static inline int acpi_pci_wakeup(struct pci_dev *dev, bool enable) { return -ENODEV; } static inline bool acpi_pci_need_resume(struct pci_dev *dev) { return false; } static inline pci_power_t acpi_pci_choose_state(struct pci_dev *pdev) { return PCI_POWER_ERROR; } #endif #ifdef CONFIG_PCIEASPM extern const struct attribute_group aspm_ctrl_attr_group; #endif extern const struct attribute_group pci_dev_reset_method_attr_group; #ifdef CONFIG_X86_INTEL_MID bool pci_use_mid_pm(void); int mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state); pci_power_t mid_pci_get_power_state(struct pci_dev *pdev); #else static inline bool pci_use_mid_pm(void) { return false; } static inline int mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state) { return -ENODEV; } static inline pci_power_t mid_pci_get_power_state(struct pci_dev *pdev) { return PCI_UNKNOWN; } #endif /* * Config Address for PCI Configuration Mechanism #1 * * See PCI Local Bus Specification, Revision 3.0, * Section 3.2.2.3.2, Figure 3-2, p. 50. */ #define PCI_CONF1_BUS_SHIFT 16 /* Bus number */ #define PCI_CONF1_DEV_SHIFT 11 /* Device number */ #define PCI_CONF1_FUNC_SHIFT 8 /* Function number */ #define PCI_CONF1_BUS_MASK 0xff #define PCI_CONF1_DEV_MASK 0x1f #define PCI_CONF1_FUNC_MASK 0x7 #define PCI_CONF1_REG_MASK 0xfc /* Limit aligned offset to a maximum of 256B */ #define PCI_CONF1_ENABLE BIT(31) #define PCI_CONF1_BUS(x) (((x) & PCI_CONF1_BUS_MASK) << PCI_CONF1_BUS_SHIFT) #define PCI_CONF1_DEV(x) (((x) & PCI_CONF1_DEV_MASK) << PCI_CONF1_DEV_SHIFT) #define PCI_CONF1_FUNC(x) (((x) & PCI_CONF1_FUNC_MASK) << PCI_CONF1_FUNC_SHIFT) #define PCI_CONF1_REG(x) ((x) & PCI_CONF1_REG_MASK) #define PCI_CONF1_ADDRESS(bus, dev, func, reg) \ (PCI_CONF1_ENABLE | \ PCI_CONF1_BUS(bus) | \ PCI_CONF1_DEV(dev) | \ PCI_CONF1_FUNC(func) | \ PCI_CONF1_REG(reg)) /* * Extension of PCI Config Address for accessing extended PCIe registers * * No standardized specification, but used on lot of non-ECAM-compliant ARM SoCs * or on AMD Barcelona and new CPUs. Reserved bits [27:24] of PCI Config Address * are used for specifying additional 4 high bits of PCI Express register. */ #define PCI_CONF1_EXT_REG_SHIFT 16 #define PCI_CONF1_EXT_REG_MASK 0xf00 #define PCI_CONF1_EXT_REG(x) (((x) & PCI_CONF1_EXT_REG_MASK) << PCI_CONF1_EXT_REG_SHIFT) #define PCI_CONF1_EXT_ADDRESS(bus, dev, func, reg) \ (PCI_CONF1_ADDRESS(bus, dev, func, reg) | \ PCI_CONF1_EXT_REG(reg)) #endif /* DRIVERS_PCI_H */
11 16 14 7 15 4 4 3 4 6 5 7 3 7 7 4 1 5 2 3 3 4 1 1 2 3 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/isofs/dir.c * * (C) 1992, 1993, 1994 Eric Youngdale Modified for ISO 9660 filesystem. * * (C) 1991 Linus Torvalds - minix filesystem * * Steve Beynon : Missing last directory entries fixed * (stephen@askone.demon.co.uk) : 21st June 1996 * * isofs directory handling functions */ #include <linux/gfp.h> #include "isofs.h" int isofs_name_translate(struct iso_directory_record *de, char *new, struct inode *inode) { char * old = de->name; int len = de->name_len[0]; int i; for (i = 0; i < len; i++) { unsigned char c = old[i]; if (!c) break; if (c >= 'A' && c <= 'Z') c |= 0x20; /* lower case */ /* Drop trailing '.;1' (ISO 9660:1988 7.5.1 requires period) */ if (c == '.' && i == len - 3 && old[i + 1] == ';' && old[i + 2] == '1') break; /* Drop trailing ';1' */ if (c == ';' && i == len - 2 && old[i + 1] == '1') break; /* Convert remaining ';' to '.' */ /* Also '/' to '.' (broken Acorn-generated ISO9660 images) */ if (c == ';' || c == '/') c = '.'; new[i] = c; } return i; } /* Acorn extensions written by Matthew Wilcox <willy@infradead.org> 1998 */ int get_acorn_filename(struct iso_directory_record *de, char *retname, struct inode *inode) { int std; unsigned char *chr; int retnamlen = isofs_name_translate(de, retname, inode); if (retnamlen == 0) return 0; std = sizeof(struct iso_directory_record) + de->name_len[0]; if (std & 1) std++; if (de->length[0] - std != 32) return retnamlen; chr = ((unsigned char *) de) + std; if (strncmp(chr, "ARCHIMEDES", 10)) return retnamlen; if ((*retname == '_') && ((chr[19] & 1) == 1)) *retname = '!'; if (((de->flags[0] & 2) == 0) && (chr[13] == 0xff) && ((chr[12] & 0xf0) == 0xf0)) { retname[retnamlen] = ','; sprintf(retname+retnamlen+1, "%3.3x", ((chr[12] & 0xf) << 8) | chr[11]); retnamlen += 4; } return retnamlen; } /* * This should _really_ be cleaned up some day.. */ static int do_isofs_readdir(struct inode *inode, struct file *file, struct dir_context *ctx, char *tmpname, struct iso_directory_record *tmpde) { unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); unsigned char bufbits = ISOFS_BUFFER_BITS(inode); unsigned long block, offset, block_saved, offset_saved; unsigned long inode_number = 0; /* Quiet GCC */ struct buffer_head *bh = NULL; int len; int map; int first_de = 1; char *p = NULL; /* Quiet GCC */ struct iso_directory_record *de; struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); offset = ctx->pos & (bufsize - 1); block = ctx->pos >> bufbits; while (ctx->pos < inode->i_size) { int de_len; if (!bh) { bh = isofs_bread(inode, block); if (!bh) return 0; } de = (struct iso_directory_record *) (bh->b_data + offset); de_len = *(unsigned char *)de; /* * If the length byte is zero, we should move on to the next * CDROM sector. If we are at the end of the directory, we * kick out of the while loop. */ if (de_len == 0) { brelse(bh); bh = NULL; ctx->pos = (ctx->pos + ISOFS_BLOCK_SIZE) & ~(ISOFS_BLOCK_SIZE - 1); block = ctx->pos >> bufbits; offset = 0; continue; } block_saved = block; offset_saved = offset; offset += de_len; /* Make sure we have a full directory entry */ if (offset >= bufsize) { int slop = bufsize - offset + de_len; memcpy(tmpde, de, slop); offset &= bufsize - 1; block++; brelse(bh); bh = NULL; if (offset) { bh = isofs_bread(inode, block); if (!bh) return 0; memcpy((void *) tmpde + slop, bh->b_data, offset); } de = tmpde; } /* Basic sanity check, whether name doesn't exceed dir entry */ if (de_len < de->name_len[0] + sizeof(struct iso_directory_record)) { printk(KERN_NOTICE "iso9660: Corrupted directory entry" " in block %lu of inode %lu\n", block, inode->i_ino); brelse(bh); return -EIO; } if (first_de) { isofs_normalize_block_and_offset(de, &block_saved, &offset_saved); inode_number = isofs_get_ino(block_saved, offset_saved, bufbits); } if (de->flags[-sbi->s_high_sierra] & 0x80) { first_de = 0; ctx->pos += de_len; continue; } first_de = 1; /* Handle the case of the '.' directory */ if (de->name_len[0] == 1 && de->name[0] == 0) { if (!dir_emit_dot(file, ctx)) break; ctx->pos += de_len; continue; } len = 0; /* Handle the case of the '..' directory */ if (de->name_len[0] == 1 && de->name[0] == 1) { if (!dir_emit_dotdot(file, ctx)) break; ctx->pos += de_len; continue; } /* Handle everything else. Do name translation if there is no Rock Ridge NM field. */ /* * Do not report hidden files if so instructed, or associated * files unless instructed to do so */ if ((sbi->s_hide && (de->flags[-sbi->s_high_sierra] & 1)) || (!sbi->s_showassoc && (de->flags[-sbi->s_high_sierra] & 4))) { ctx->pos += de_len; continue; } map = 1; if (sbi->s_rock) { len = get_rock_ridge_filename(de, tmpname, inode); if (len != 0) { /* may be -1 */ p = tmpname; map = 0; } } if (map) { #ifdef CONFIG_JOLIET if (sbi->s_joliet_level) { len = get_joliet_filename(de, tmpname, inode); p = tmpname; } else #endif if (sbi->s_mapping == 'a') { len = get_acorn_filename(de, tmpname, inode); p = tmpname; } else if (sbi->s_mapping == 'n') { len = isofs_name_translate(de, tmpname, inode); p = tmpname; } else { p = de->name; len = de->name_len[0]; } } if (len > 0) { if (!dir_emit(ctx, p, len, inode_number, DT_UNKNOWN)) break; } ctx->pos += de_len; } if (bh) brelse(bh); return 0; } /* * Handle allocation of temporary space for name translation and * handling split directory entries.. The real work is done by * "do_isofs_readdir()". */ static int isofs_readdir(struct file *file, struct dir_context *ctx) { int result; char *tmpname; struct iso_directory_record *tmpde; struct inode *inode = file_inode(file); tmpname = (char *)__get_free_page(GFP_KERNEL); if (tmpname == NULL) return -ENOMEM; tmpde = (struct iso_directory_record *) (tmpname+1024); result = do_isofs_readdir(inode, file, ctx, tmpname, tmpde); free_page((unsigned long) tmpname); return result; } const struct file_operations isofs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = isofs_readdir, }; /* * directories can handle most operations... */ const struct inode_operations isofs_dir_inode_operations = { .lookup = isofs_lookup, };
20 5 15 31 23 8 15 7 3 2 1 1 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 /* * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/in.h> #include <linux/ipv6.h> #include "rds.h" #include "loop.h" static char * const rds_trans_modules[] = { [RDS_TRANS_IB] = "rds_rdma", [RDS_TRANS_GAP] = NULL, [RDS_TRANS_TCP] = "rds_tcp", }; static struct rds_transport *transports[RDS_TRANS_COUNT]; static DECLARE_RWSEM(rds_trans_sem); void rds_trans_register(struct rds_transport *trans) { BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ); down_write(&rds_trans_sem); if (transports[trans->t_type]) printk(KERN_ERR "RDS Transport type %d already registered\n", trans->t_type); else { transports[trans->t_type] = trans; printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name); } up_write(&rds_trans_sem); } EXPORT_SYMBOL_GPL(rds_trans_register); void rds_trans_unregister(struct rds_transport *trans) { down_write(&rds_trans_sem); transports[trans->t_type] = NULL; printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name); up_write(&rds_trans_sem); } EXPORT_SYMBOL_GPL(rds_trans_unregister); void rds_trans_put(struct rds_transport *trans) { if (trans) module_put(trans->t_owner); } struct rds_transport *rds_trans_get_preferred(struct net *net, const struct in6_addr *addr, __u32 scope_id) { struct rds_transport *ret = NULL; struct rds_transport *trans; unsigned int i; if (ipv6_addr_v4mapped(addr)) { if (*(u_int8_t *)&addr->s6_addr32[3] == IN_LOOPBACKNET) return &rds_loop_transport; } else if (ipv6_addr_loopback(addr)) { return &rds_loop_transport; } down_read(&rds_trans_sem); for (i = 0; i < RDS_TRANS_COUNT; i++) { trans = transports[i]; if (trans && (trans->laddr_check(net, addr, scope_id) == 0) && (!trans->t_owner || try_module_get(trans->t_owner))) { ret = trans; break; } } up_read(&rds_trans_sem); return ret; } struct rds_transport *rds_trans_get(int t_type) { struct rds_transport *ret = NULL; struct rds_transport *trans; down_read(&rds_trans_sem); trans = transports[t_type]; if (!trans) { up_read(&rds_trans_sem); if (rds_trans_modules[t_type]) request_module(rds_trans_modules[t_type]); down_read(&rds_trans_sem); trans = transports[t_type]; } if (trans && trans->t_type == t_type && (!trans->t_owner || try_module_get(trans->t_owner))) ret = trans; up_read(&rds_trans_sem); return ret; } /* * This returns the number of stats entries in the snapshot and only * copies them using the iter if there is enough space for them. The * caller passes in the global stats so that we can size and copy while * holding the lock. */ unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail) { struct rds_transport *trans; unsigned int total = 0; unsigned int part; int i; rds_info_iter_unmap(iter); down_read(&rds_trans_sem); for (i = 0; i < RDS_TRANS_COUNT; i++) { trans = transports[i]; if (!trans || !trans->stats_info_copy) continue; part = trans->stats_info_copy(iter, avail); avail -= min(avail, part); total += part; } up_read(&rds_trans_sem); return total; }
38 38 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/isofs/util.c */ #include <linux/time.h> #include "isofs.h" /* * We have to convert from a MM/DD/YY format to the Unix ctime format. * We have to take into account leap years and all of that good stuff. * Unfortunately, the kernel does not have the information on hand to * take into account daylight savings time, but it shouldn't matter. * The time stored should be localtime (with or without DST in effect), * and the timezone offset should hold the offset required to get back * to GMT. Thus we should always be correct. */ int iso_date(u8 *p, int flag) { int year, month, day, hour, minute, second, tz; int crtime; year = p[0]; month = p[1]; day = p[2]; hour = p[3]; minute = p[4]; second = p[5]; if (flag == 0) tz = p[6]; /* High sierra has no time zone */ else tz = 0; if (year < 0) { crtime = 0; } else { crtime = mktime64(year+1900, month, day, hour, minute, second); /* sign extend */ if (tz & 0x80) tz |= (-1 << 8); /* * The timezone offset is unreliable on some disks, * so we make a sanity check. In no case is it ever * more than 13 hours from GMT, which is 52*15min. * The time is always stored in localtime with the * timezone offset being what get added to GMT to * get to localtime. Thus we need to subtract the offset * to get to true GMT, which is what we store the time * as internally. On the local system, the user may set * their timezone any way they wish, of course, so GMT * gets converted back to localtime on the receiving * system. * * NOTE: mkisofs in versions prior to mkisofs-1.10 had * the sign wrong on the timezone offset. This has now * been corrected there too, but if you are getting screwy * results this may be the explanation. If enough people * complain, a user configuration option could be added * to add the timezone offset in with the wrong sign * for 'compatibility' with older discs, but I cannot see how * it will matter that much. * * Thanks to kuhlmav@elec.canterbury.ac.nz (Volker Kuhlmann) * for pointing out the sign error. */ if (-52 <= tz && tz <= 52) crtime -= tz * 15 * 60; } return crtime; }
10 1 9 8 1 7 14 14 14 14 14 13 1 12 3 10 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 // SPDX-License-Identifier: GPL-2.0-or-later /* * IPVS An implementation of the IP virtual server support for the * LINUX operating system. IPVS is now implemented as a module * over the Netfilter framework. IPVS can be used to build a * high-performance and highly available server based on a * cluster of servers. * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * Peter Kese <peter.kese@ijs.si> * * Changes: */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/spinlock.h> #include <linux/interrupt.h> #include <asm/string.h> #include <linux/kmod.h> #include <linux/sysctl.h> #include <net/ip_vs.h> EXPORT_SYMBOL(ip_vs_scheduler_err); /* * IPVS scheduler list */ static LIST_HEAD(ip_vs_schedulers); /* semaphore for schedulers */ static DEFINE_MUTEX(ip_vs_sched_mutex); /* * Bind a service with a scheduler */ int ip_vs_bind_scheduler(struct ip_vs_service *svc, struct ip_vs_scheduler *scheduler) { int ret; if (scheduler->init_service) { ret = scheduler->init_service(svc); if (ret) { pr_err("%s(): init error\n", __func__); return ret; } } rcu_assign_pointer(svc->scheduler, scheduler); return 0; } /* * Unbind a service with its scheduler */ void ip_vs_unbind_scheduler(struct ip_vs_service *svc, struct ip_vs_scheduler *sched) { struct ip_vs_scheduler *cur_sched; cur_sched = rcu_dereference_protected(svc->scheduler, 1); /* This check proves that old 'sched' was installed */ if (!cur_sched) return; if (sched->done_service) sched->done_service(svc); /* svc->scheduler can be set to NULL only by caller */ } /* * Get scheduler in the scheduler list by name */ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) { struct ip_vs_scheduler *sched; IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name); mutex_lock(&ip_vs_sched_mutex); list_for_each_entry(sched, &ip_vs_schedulers, n_list) { /* * Test and get the modules atomically */ if (sched->module && !try_module_get(sched->module)) { /* * This scheduler is just deleted */ continue; } if (strcmp(sched_name, sched->name)==0) { /* HIT */ mutex_unlock(&ip_vs_sched_mutex); return sched; } module_put(sched->module); } mutex_unlock(&ip_vs_sched_mutex); return NULL; } /* * Lookup scheduler and try to load it if it doesn't exist */ struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) { struct ip_vs_scheduler *sched; /* * Search for the scheduler by sched_name */ sched = ip_vs_sched_getbyname(sched_name); /* * If scheduler not found, load the module and search again */ if (sched == NULL) { request_module("ip_vs_%s", sched_name); sched = ip_vs_sched_getbyname(sched_name); } return sched; } void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) { if (scheduler) module_put(scheduler->module); } /* * Common error output helper for schedulers */ void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg) { struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler); char *sched_name = sched ? sched->name : "none"; if (svc->fwmark) { IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n", sched_name, svc->fwmark, svc->fwmark, msg); #ifdef CONFIG_IP_VS_IPV6 } else if (svc->af == AF_INET6) { IP_VS_ERR_RL("%s: %s [%pI6c]:%d - %s\n", sched_name, ip_vs_proto_name(svc->protocol), &svc->addr.in6, ntohs(svc->port), msg); #endif } else { IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n", sched_name, ip_vs_proto_name(svc->protocol), &svc->addr.ip, ntohs(svc->port), msg); } } /* * Register a scheduler in the scheduler list */ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) { struct ip_vs_scheduler *sched; if (!scheduler) { pr_err("%s(): NULL arg\n", __func__); return -EINVAL; } if (!scheduler->name) { pr_err("%s(): NULL scheduler_name\n", __func__); return -EINVAL; } /* increase the module use count */ if (!ip_vs_use_count_inc()) return -ENOENT; mutex_lock(&ip_vs_sched_mutex); if (!list_empty(&scheduler->n_list)) { mutex_unlock(&ip_vs_sched_mutex); ip_vs_use_count_dec(); pr_err("%s(): [%s] scheduler already linked\n", __func__, scheduler->name); return -EINVAL; } /* * Make sure that the scheduler with this name doesn't exist * in the scheduler list. */ list_for_each_entry(sched, &ip_vs_schedulers, n_list) { if (strcmp(scheduler->name, sched->name) == 0) { mutex_unlock(&ip_vs_sched_mutex); ip_vs_use_count_dec(); pr_err("%s(): [%s] scheduler already existed " "in the system\n", __func__, scheduler->name); return -EINVAL; } } /* * Add it into the d-linked scheduler list */ list_add(&scheduler->n_list, &ip_vs_schedulers); mutex_unlock(&ip_vs_sched_mutex); pr_info("[%s] scheduler registered.\n", scheduler->name); return 0; } /* * Unregister a scheduler from the scheduler list */ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) { if (!scheduler) { pr_err("%s(): NULL arg\n", __func__); return -EINVAL; } mutex_lock(&ip_vs_sched_mutex); if (list_empty(&scheduler->n_list)) { mutex_unlock(&ip_vs_sched_mutex); pr_err("%s(): [%s] scheduler is not in the list. failed\n", __func__, scheduler->name); return -EINVAL; } /* * Remove it from the d-linked scheduler list */ list_del(&scheduler->n_list); mutex_unlock(&ip_vs_sched_mutex); /* decrease the module use count */ ip_vs_use_count_dec(); pr_info("[%s] scheduler unregistered.\n", scheduler->name); return 0; }
1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2012 Fusion-io All rights reserved. * Copyright (C) 2012 Intel Corp. All rights reserved. */ #include <linux/sched.h> #include <linux/bio.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/raid/pq.h> #include <linux/hash.h> #include <linux/list_sort.h> #include <linux/raid/xor.h> #include <linux/mm.h> #include "messages.h" #include "misc.h" #include "ctree.h" #include "disk-io.h" #include "volumes.h" #include "raid56.h" #include "async-thread.h" #include "file-item.h" #include "btrfs_inode.h" /* set when additional merges to this rbio are not allowed */ #define RBIO_RMW_LOCKED_BIT 1 /* * set when this rbio is sitting in the hash, but it is just a cache * of past RMW */ #define RBIO_CACHE_BIT 2 /* * set when it is safe to trust the stripe_pages for caching */ #define RBIO_CACHE_READY_BIT 3 #define RBIO_CACHE_SIZE 1024 #define BTRFS_STRIPE_HASH_TABLE_BITS 11 /* Used by the raid56 code to lock stripes for read/modify/write */ struct btrfs_stripe_hash { struct list_head hash_list; spinlock_t lock; }; /* Used by the raid56 code to lock stripes for read/modify/write */ struct btrfs_stripe_hash_table { struct list_head stripe_cache; spinlock_t cache_lock; int cache_size; struct btrfs_stripe_hash table[]; }; /* * A bvec like structure to present a sector inside a page. * * Unlike bvec we don't need bvlen, as it's fixed to sectorsize. */ struct sector_ptr { struct page *page; unsigned int pgoff:24; unsigned int uptodate:8; }; static void rmw_rbio_work(struct work_struct *work); static void rmw_rbio_work_locked(struct work_struct *work); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); static int finish_parity_scrub(struct btrfs_raid_bio *rbio); static void scrub_rbio_work_locked(struct work_struct *work); static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) { bitmap_free(rbio->error_bitmap); kfree(rbio->stripe_pages); kfree(rbio->bio_sectors); kfree(rbio->stripe_sectors); kfree(rbio->finish_pointers); } static void free_raid_bio(struct btrfs_raid_bio *rbio) { int i; if (!refcount_dec_and_test(&rbio->refs)) return; WARN_ON(!list_empty(&rbio->stripe_cache)); WARN_ON(!list_empty(&rbio->hash_list)); WARN_ON(!bio_list_empty(&rbio->bio_list)); for (i = 0; i < rbio->nr_pages; i++) { if (rbio->stripe_pages[i]) { __free_page(rbio->stripe_pages[i]); rbio->stripe_pages[i] = NULL; } } btrfs_put_bioc(rbio->bioc); free_raid_bio_pointers(rbio); kfree(rbio); } static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) { INIT_WORK(&rbio->work, work_func); queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); } /* * the stripe hash table is used for locking, and to collect * bios in hopes of making a full stripe */ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) { struct btrfs_stripe_hash_table *table; struct btrfs_stripe_hash_table *x; struct btrfs_stripe_hash *cur; struct btrfs_stripe_hash *h; int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; int i; if (info->stripe_hash_table) return 0; /* * The table is large, starting with order 4 and can go as high as * order 7 in case lock debugging is turned on. * * Try harder to allocate and fallback to vmalloc to lower the chance * of a failing mount. */ table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL); if (!table) return -ENOMEM; spin_lock_init(&table->cache_lock); INIT_LIST_HEAD(&table->stripe_cache); h = table->table; for (i = 0; i < num_entries; i++) { cur = h + i; INIT_LIST_HEAD(&cur->hash_list); spin_lock_init(&cur->lock); } x = cmpxchg(&info->stripe_hash_table, NULL, table); kvfree(x); return 0; } /* * caching an rbio means to copy anything from the * bio_sectors array into the stripe_pages array. We * use the page uptodate bit in the stripe cache array * to indicate if it has valid data * * once the caching is done, we set the cache ready * bit. */ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) { int i; int ret; ret = alloc_rbio_pages(rbio); if (ret) return; for (i = 0; i < rbio->nr_sectors; i++) { /* Some range not covered by bio (partial write), skip it */ if (!rbio->bio_sectors[i].page) { /* * Even if the sector is not covered by bio, if it is * a data sector it should still be uptodate as it is * read from disk. */ if (i < rbio->nr_data * rbio->stripe_nsectors) ASSERT(rbio->stripe_sectors[i].uptodate); continue; } ASSERT(rbio->stripe_sectors[i].page); memcpy_page(rbio->stripe_sectors[i].page, rbio->stripe_sectors[i].pgoff, rbio->bio_sectors[i].page, rbio->bio_sectors[i].pgoff, rbio->bioc->fs_info->sectorsize); rbio->stripe_sectors[i].uptodate = 1; } set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); } /* * we hash on the first logical address of the stripe */ static int rbio_bucket(struct btrfs_raid_bio *rbio) { u64 num = rbio->bioc->full_stripe_logical; /* * we shift down quite a bit. We're using byte * addressing, and most of the lower bits are zeros. * This tends to upset hash_64, and it consistently * returns just one or two different values. * * shifting off the lower bits fixes things. */ return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); } static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, unsigned int page_nr) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; const u32 sectors_per_page = PAGE_SIZE / sectorsize; int i; ASSERT(page_nr < rbio->nr_pages); for (i = sectors_per_page * page_nr; i < sectors_per_page * page_nr + sectors_per_page; i++) { if (!rbio->stripe_sectors[i].uptodate) return false; } return true; } /* * Update the stripe_sectors[] array to use correct page and pgoff * * Should be called every time any page pointer in stripes_pages[] got modified. */ static void index_stripe_sectors(struct btrfs_raid_bio *rbio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; u32 offset; int i; for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) { int page_index = offset >> PAGE_SHIFT; ASSERT(page_index < rbio->nr_pages); rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index]; rbio->stripe_sectors[i].pgoff = offset_in_page(offset); } } static void steal_rbio_page(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest, int page_nr) { const u32 sectorsize = src->bioc->fs_info->sectorsize; const u32 sectors_per_page = PAGE_SIZE / sectorsize; int i; if (dest->stripe_pages[page_nr]) __free_page(dest->stripe_pages[page_nr]); dest->stripe_pages[page_nr] = src->stripe_pages[page_nr]; src->stripe_pages[page_nr] = NULL; /* Also update the sector->uptodate bits. */ for (i = sectors_per_page * page_nr; i < sectors_per_page * page_nr + sectors_per_page; i++) dest->stripe_sectors[i].uptodate = true; } static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr) { const int sector_nr = (page_nr << PAGE_SHIFT) >> rbio->bioc->fs_info->sectorsize_bits; /* * We have ensured PAGE_SIZE is aligned with sectorsize, thus * we won't have a page which is half data half parity. * * Thus if the first sector of the page belongs to data stripes, then * the full page belongs to data stripes. */ return (sector_nr < rbio->nr_data * rbio->stripe_nsectors); } /* * Stealing an rbio means taking all the uptodate pages from the stripe array * in the source rbio and putting them into the destination rbio. * * This will also update the involved stripe_sectors[] which are referring to * the old pages. */ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) { int i; if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) return; for (i = 0; i < dest->nr_pages; i++) { struct page *p = src->stripe_pages[i]; /* * We don't need to steal P/Q pages as they will always be * regenerated for RMW or full write anyway. */ if (!is_data_stripe_page(src, i)) continue; /* * If @src already has RBIO_CACHE_READY_BIT, it should have * all data stripe pages present and uptodate. */ ASSERT(p); ASSERT(full_page_sectors_uptodate(src, i)); steal_rbio_page(src, dest, i); } index_stripe_sectors(dest); index_stripe_sectors(src); } /* * merging means we take the bio_list from the victim and * splice it into the destination. The victim should * be discarded afterwards. * * must be called with dest->rbio_list_lock held */ static void merge_rbio(struct btrfs_raid_bio *dest, struct btrfs_raid_bio *victim) { bio_list_merge(&dest->bio_list, &victim->bio_list); dest->bio_list_bytes += victim->bio_list_bytes; /* Also inherit the bitmaps from @victim. */ bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, dest->stripe_nsectors); bio_list_init(&victim->bio_list); } /* * used to prune items that are in the cache. The caller * must hold the hash table lock. */ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) { int bucket = rbio_bucket(rbio); struct btrfs_stripe_hash_table *table; struct btrfs_stripe_hash *h; int freeit = 0; /* * check the bit again under the hash table lock. */ if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) return; table = rbio->bioc->fs_info->stripe_hash_table; h = table->table + bucket; /* hold the lock for the bucket because we may be * removing it from the hash table */ spin_lock(&h->lock); /* * hold the lock for the bio list because we need * to make sure the bio list is empty */ spin_lock(&rbio->bio_list_lock); if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { list_del_init(&rbio->stripe_cache); table->cache_size -= 1; freeit = 1; /* if the bio list isn't empty, this rbio is * still involved in an IO. We take it out * of the cache list, and drop the ref that * was held for the list. * * If the bio_list was empty, we also remove * the rbio from the hash_table, and drop * the corresponding ref */ if (bio_list_empty(&rbio->bio_list)) { if (!list_empty(&rbio->hash_list)) { list_del_init(&rbio->hash_list); refcount_dec(&rbio->refs); BUG_ON(!list_empty(&rbio->plug_list)); } } } spin_unlock(&rbio->bio_list_lock); spin_unlock(&h->lock); if (freeit) free_raid_bio(rbio); } /* * prune a given rbio from the cache */ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) { struct btrfs_stripe_hash_table *table; if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) return; table = rbio->bioc->fs_info->stripe_hash_table; spin_lock(&table->cache_lock); __remove_rbio_from_cache(rbio); spin_unlock(&table->cache_lock); } /* * remove everything in the cache */ static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) { struct btrfs_stripe_hash_table *table; struct btrfs_raid_bio *rbio; table = info->stripe_hash_table; spin_lock(&table->cache_lock); while (!list_empty(&table->stripe_cache)) { rbio = list_entry(table->stripe_cache.next, struct btrfs_raid_bio, stripe_cache); __remove_rbio_from_cache(rbio); } spin_unlock(&table->cache_lock); } /* * remove all cached entries and free the hash table * used by unmount */ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) { if (!info->stripe_hash_table) return; btrfs_clear_rbio_cache(info); kvfree(info->stripe_hash_table); info->stripe_hash_table = NULL; } /* * insert an rbio into the stripe cache. It * must have already been prepared by calling * cache_rbio_pages * * If this rbio was already cached, it gets * moved to the front of the lru. * * If the size of the rbio cache is too big, we * prune an item. */ static void cache_rbio(struct btrfs_raid_bio *rbio) { struct btrfs_stripe_hash_table *table; if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) return; table = rbio->bioc->fs_info->stripe_hash_table; spin_lock(&table->cache_lock); spin_lock(&rbio->bio_list_lock); /* bump our ref if we were not in the list before */ if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) refcount_inc(&rbio->refs); if (!list_empty(&rbio->stripe_cache)){ list_move(&rbio->stripe_cache, &table->stripe_cache); } else { list_add(&rbio->stripe_cache, &table->stripe_cache); table->cache_size += 1; } spin_unlock(&rbio->bio_list_lock); if (table->cache_size > RBIO_CACHE_SIZE) { struct btrfs_raid_bio *found; found = list_entry(table->stripe_cache.prev, struct btrfs_raid_bio, stripe_cache); if (found != rbio) __remove_rbio_from_cache(found); } spin_unlock(&table->cache_lock); } /* * helper function to run the xor_blocks api. It is only * able to do MAX_XOR_BLOCKS at a time, so we need to * loop through. */ static void run_xor(void **pages, int src_cnt, ssize_t len) { int src_off = 0; int xor_src_cnt = 0; void *dest = pages[src_cnt]; while(src_cnt > 0) { xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); xor_blocks(xor_src_cnt, len, dest, pages + src_off); src_cnt -= xor_src_cnt; src_off += xor_src_cnt; } } /* * Returns true if the bio list inside this rbio covers an entire stripe (no * rmw required). */ static int rbio_is_full(struct btrfs_raid_bio *rbio) { unsigned long size = rbio->bio_list_bytes; int ret = 1; spin_lock(&rbio->bio_list_lock); if (size != rbio->nr_data * BTRFS_STRIPE_LEN) ret = 0; BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN); spin_unlock(&rbio->bio_list_lock); return ret; } /* * returns 1 if it is safe to merge two rbios together. * The merging is safe if the two rbios correspond to * the same stripe and if they are both going in the same * direction (read vs write), and if neither one is * locked for final IO * * The caller is responsible for locking such that * rmw_locked is safe to test */ static int rbio_can_merge(struct btrfs_raid_bio *last, struct btrfs_raid_bio *cur) { if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) return 0; /* * we can't merge with cached rbios, since the * idea is that when we merge the destination * rbio is going to run our IO for us. We can * steal from cached rbios though, other functions * handle that. */ if (test_bit(RBIO_CACHE_BIT, &last->flags) || test_bit(RBIO_CACHE_BIT, &cur->flags)) return 0; if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical) return 0; /* we can't merge with different operations */ if (last->operation != cur->operation) return 0; /* * We've need read the full stripe from the drive. * check and repair the parity and write the new results. * * We're not allowed to add any new bios to the * bio list here, anyone else that wants to * change this stripe needs to do their own rmw. */ if (last->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; if (last->operation == BTRFS_RBIO_READ_REBUILD) return 0; return 1; } static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, unsigned int stripe_nr, unsigned int sector_nr) { ASSERT(stripe_nr < rbio->real_stripes); ASSERT(sector_nr < rbio->stripe_nsectors); return stripe_nr * rbio->stripe_nsectors + sector_nr; } /* Return a sector from rbio->stripe_sectors, not from the bio list */ static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio, unsigned int stripe_nr, unsigned int sector_nr) { return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr, sector_nr)]; } /* Grab a sector inside P stripe */ static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio, unsigned int sector_nr) { return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr); } /* Grab a sector inside Q stripe, return NULL if not RAID6 */ static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio, unsigned int sector_nr) { if (rbio->nr_data + 1 == rbio->real_stripes) return NULL; return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr); } /* * The first stripe in the table for a logical address * has the lock. rbios are added in one of three ways: * * 1) Nobody has the stripe locked yet. The rbio is given * the lock and 0 is returned. The caller must start the IO * themselves. * * 2) Someone has the stripe locked, but we're able to merge * with the lock owner. The rbio is freed and the IO will * start automatically along with the existing rbio. 1 is returned. * * 3) Someone has the stripe locked, but we're not able to merge. * The rbio is added to the lock owner's plug list, or merged into * an rbio already on the plug list. When the lock owner unlocks, * the next rbio on the list is run and the IO is started automatically. * 1 is returned * * If we return 0, the caller still owns the rbio and must continue with * IO submission. If we return 1, the caller must assume the rbio has * already been freed. */ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) { struct btrfs_stripe_hash *h; struct btrfs_raid_bio *cur; struct btrfs_raid_bio *pending; struct btrfs_raid_bio *freeit = NULL; struct btrfs_raid_bio *cache_drop = NULL; int ret = 0; h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio); spin_lock(&h->lock); list_for_each_entry(cur, &h->hash_list, hash_list) { if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical) continue; spin_lock(&cur->bio_list_lock); /* Can we steal this cached rbio's pages? */ if (bio_list_empty(&cur->bio_list) && list_empty(&cur->plug_list) && test_bit(RBIO_CACHE_BIT, &cur->flags) && !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { list_del_init(&cur->hash_list); refcount_dec(&cur->refs); steal_rbio(cur, rbio); cache_drop = cur; spin_unlock(&cur->bio_list_lock); goto lockit; } /* Can we merge into the lock owner? */ if (rbio_can_merge(cur, rbio)) { merge_rbio(cur, rbio); spin_unlock(&cur->bio_list_lock); freeit = rbio; ret = 1; goto out; } /* * We couldn't merge with the running rbio, see if we can merge * with the pending ones. We don't have to check for rmw_locked * because there is no way they are inside finish_rmw right now */ list_for_each_entry(pending, &cur->plug_list, plug_list) { if (rbio_can_merge(pending, rbio)) { merge_rbio(pending, rbio); spin_unlock(&cur->bio_list_lock); freeit = rbio; ret = 1; goto out; } } /* * No merging, put us on the tail of the plug list, our rbio * will be started with the currently running rbio unlocks */ list_add_tail(&rbio->plug_list, &cur->plug_list); spin_unlock(&cur->bio_list_lock); ret = 1; goto out; } lockit: refcount_inc(&rbio->refs); list_add(&rbio->hash_list, &h->hash_list); out: spin_unlock(&h->lock); if (cache_drop) remove_rbio_from_cache(cache_drop); if (freeit) free_raid_bio(freeit); return ret; } static void recover_rbio_work_locked(struct work_struct *work); /* * called as rmw or parity rebuild is completed. If the plug list has more * rbios waiting for this stripe, the next one on the list will be started */ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) { int bucket; struct btrfs_stripe_hash *h; int keep_cache = 0; bucket = rbio_bucket(rbio); h = rbio->bioc->fs_info->stripe_hash_table->table + bucket; if (list_empty(&rbio->plug_list)) cache_rbio(rbio); spin_lock(&h->lock); spin_lock(&rbio->bio_list_lock); if (!list_empty(&rbio->hash_list)) { /* * if we're still cached and there is no other IO * to perform, just leave this rbio here for others * to steal from later */ if (list_empty(&rbio->plug_list) && test_bit(RBIO_CACHE_BIT, &rbio->flags)) { keep_cache = 1; clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); BUG_ON(!bio_list_empty(&rbio->bio_list)); goto done; } list_del_init(&rbio->hash_list); refcount_dec(&rbio->refs); /* * we use the plug list to hold all the rbios * waiting for the chance to lock this stripe. * hand the lock over to one of them. */ if (!list_empty(&rbio->plug_list)) { struct btrfs_raid_bio *next; struct list_head *head = rbio->plug_list.next; next = list_entry(head, struct btrfs_raid_bio, plug_list); list_del_init(&rbio->plug_list); list_add(&next->hash_list, &h->hash_list); refcount_inc(&next->refs); spin_unlock(&rbio->bio_list_lock); spin_unlock(&h->lock); if (next->operation == BTRFS_RBIO_READ_REBUILD) { start_async_work(next, recover_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); start_async_work(next, rmw_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { steal_rbio(rbio, next); start_async_work(next, scrub_rbio_work_locked); } goto done_nolock; } } done: spin_unlock(&rbio->bio_list_lock); spin_unlock(&h->lock); done_nolock: if (!keep_cache) remove_rbio_from_cache(rbio); } static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) { struct bio *next; while (cur) { next = cur->bi_next; cur->bi_next = NULL; cur->bi_status = err; bio_endio(cur); cur = next; } } /* * this frees the rbio and runs through all the bios in the * bio_list and calls end_io on them */ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *extra; kfree(rbio->csum_buf); bitmap_free(rbio->csum_bitmap); rbio->csum_buf = NULL; rbio->csum_bitmap = NULL; /* * Clear the data bitmap, as the rbio may be cached for later usage. * do this before before unlock_stripe() so there will be no new bio * for this bio. */ bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors); /* * At this moment, rbio->bio_list is empty, however since rbio does not * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the * hash list, rbio may be merged with others so that rbio->bio_list * becomes non-empty. * Once unlock_stripe() is done, rbio->bio_list will not be updated any * more and we can call bio_endio() on all queued bios. */ unlock_stripe(rbio); extra = bio_list_get(&rbio->bio_list); free_raid_bio(rbio); rbio_endio_bio_list(cur, err); if (extra) rbio_endio_bio_list(extra, err); } /* * Get a sector pointer specified by its @stripe_nr and @sector_nr. * * @rbio: The raid bio * @stripe_nr: Stripe number, valid range [0, real_stripe) * @sector_nr: Sector number inside the stripe, * valid range [0, stripe_nsectors) * @bio_list_only: Whether to use sectors inside the bio list only. * * The read/modify/write code wants to reuse the original bio page as much * as possible, and only use stripe_sectors as fallback. */ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, int stripe_nr, int sector_nr, bool bio_list_only) { struct sector_ptr *sector; int index; ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes); ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); index = stripe_nr * rbio->stripe_nsectors + sector_nr; ASSERT(index >= 0 && index < rbio->nr_sectors); spin_lock(&rbio->bio_list_lock); sector = &rbio->bio_sectors[index]; if (sector->page || bio_list_only) { /* Don't return sector without a valid page pointer */ if (!sector->page) sector = NULL; spin_unlock(&rbio->bio_list_lock); return sector; } spin_unlock(&rbio->bio_list_lock); return &rbio->stripe_sectors[index]; } /* * allocation and initial setup for the btrfs_raid_bio. Not * this does not allocate any pages for rbio->pages. */ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, struct btrfs_io_context *bioc) { const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes; const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; const unsigned int num_pages = stripe_npages * real_stripes; const unsigned int stripe_nsectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; const unsigned int num_sectors = stripe_nsectors * real_stripes; struct btrfs_raid_bio *rbio; /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); /* * Our current stripe len should be fixed to 64k thus stripe_nsectors * (at most 16) should be no larger than BITS_PER_LONG. */ ASSERT(stripe_nsectors <= BITS_PER_LONG); rbio = kzalloc(sizeof(*rbio), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), GFP_NOFS); rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), GFP_NOFS); rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors || !rbio->finish_pointers || !rbio->error_bitmap) { free_raid_bio_pointers(rbio); kfree(rbio); return ERR_PTR(-ENOMEM); } bio_list_init(&rbio->bio_list); init_waitqueue_head(&rbio->io_wait); INIT_LIST_HEAD(&rbio->plug_list); spin_lock_init(&rbio->bio_list_lock); INIT_LIST_HEAD(&rbio->stripe_cache); INIT_LIST_HEAD(&rbio->hash_list); btrfs_get_bioc(bioc); rbio->bioc = bioc; rbio->nr_pages = num_pages; rbio->nr_sectors = num_sectors; rbio->real_stripes = real_stripes; rbio->stripe_npages = stripe_npages; rbio->stripe_nsectors = stripe_nsectors; refcount_set(&rbio->refs, 1); atomic_set(&rbio->stripes_pending, 0); ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); return rbio; } /* allocate pages for all the stripes in the bio, including parity */ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) { int ret; ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, 0); if (ret < 0) return ret; /* Mapping all sectors */ index_stripe_sectors(rbio); return 0; } /* only allocate pages for p/q stripes */ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) { const int data_pages = rbio->nr_data * rbio->stripe_npages; int ret; ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages, rbio->stripe_pages + data_pages, 0); if (ret < 0) return ret; index_stripe_sectors(rbio); return 0; } /* * Return the total number of errors found in the vertical stripe of @sector_nr. * * @faila and @failb will also be updated to the first and second stripe * number of the errors. */ static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, int *faila, int *failb) { int stripe_nr; int found_errors = 0; if (faila || failb) { /* * Both @faila and @failb should be valid pointers if any of * them is specified. */ ASSERT(faila && failb); *faila = -1; *failb = -1; } for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr; if (test_bit(total_sector_nr, rbio->error_bitmap)) { found_errors++; if (faila) { /* Update faila and failb. */ if (*faila < 0) *faila = stripe_nr; else if (*failb < 0) *failb = stripe_nr; } } } return found_errors; } /* * Add a single sector @sector into our list of bios for IO. * * Return 0 if everything went well. * Return <0 for error. */ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, struct bio_list *bio_list, struct sector_ptr *sector, unsigned int stripe_nr, unsigned int sector_nr, enum req_op op) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct bio *last = bio_list->tail; int ret; struct bio *bio; struct btrfs_io_stripe *stripe; u64 disk_start; /* * Note: here stripe_nr has taken device replace into consideration, * thus it can be larger than rbio->real_stripe. * So here we check against bioc->num_stripes, not rbio->real_stripes. */ ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes); ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); ASSERT(sector->page); stripe = &rbio->bioc->stripes[stripe_nr]; disk_start = stripe->physical + sector_nr * sectorsize; /* if the device is missing, just fail this stripe */ if (!stripe->dev->bdev) { int found_errors; set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr, rbio->error_bitmap); /* Check if we have reached tolerance early. */ found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); if (found_errors > rbio->bioc->max_errors) return -EIO; return 0; } /* see if we can add this page onto our existing bio */ if (last) { u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT; last_end += last->bi_iter.bi_size; /* * we can't merge these if they are from different * devices or if they are not contiguous */ if (last_end == disk_start && !last->bi_status && last->bi_bdev == stripe->dev->bdev) { ret = bio_add_page(last, sector->page, sectorsize, sector->pgoff); if (ret == sectorsize) return 0; } } /* put a new bio on the list */ bio = bio_alloc(stripe->dev->bdev, max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1), op, GFP_NOFS); bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; bio->bi_private = rbio; __bio_add_page(bio, sector->page, sectorsize, sector->pgoff); bio_list_add(bio_list, bio); return 0; } static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct bio_vec bvec; struct bvec_iter iter; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - rbio->bioc->full_stripe_logical; bio_for_each_segment(bvec, bio, iter) { u32 bvec_offset; for (bvec_offset = 0; bvec_offset < bvec.bv_len; bvec_offset += sectorsize, offset += sectorsize) { int index = offset / sectorsize; struct sector_ptr *sector = &rbio->bio_sectors[index]; sector->page = bvec.bv_page; sector->pgoff = bvec.bv_offset + bvec_offset; ASSERT(sector->pgoff < PAGE_SIZE); } } } /* * helper function to walk our bio list and populate the bio_pages array with * the result. This seems expensive, but it is faster than constantly * searching through the bio list as we setup the IO in finish_rmw or stripe * reconstruction. * * This must be called before you trust the answers from page_in_rbio */ static void index_rbio_pages(struct btrfs_raid_bio *rbio) { struct bio *bio; spin_lock(&rbio->bio_list_lock); bio_list_for_each(bio, &rbio->bio_list) index_one_bio(rbio, bio); spin_unlock(&rbio->bio_list_lock); } static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, struct raid56_bio_trace_info *trace_info) { const struct btrfs_io_context *bioc = rbio->bioc; int i; ASSERT(bioc); /* We rely on bio->bi_bdev to find the stripe number. */ if (!bio->bi_bdev) goto not_found; for (i = 0; i < bioc->num_stripes; i++) { if (bio->bi_bdev != bioc->stripes[i].dev->bdev) continue; trace_info->stripe_nr = i; trace_info->devid = bioc->stripes[i].dev->devid; trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - bioc->stripes[i].physical; return; } not_found: trace_info->devid = -1; trace_info->offset = -1; trace_info->stripe_nr = -1; } static inline void bio_list_put(struct bio_list *bio_list) { struct bio *bio; while ((bio = bio_list_pop(bio_list))) bio_put(bio); } /* Generate PQ for one vertical stripe. */ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { void **pointers = rbio->finish_pointers; const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct sector_ptr *sector; int stripe; const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6; /* First collect one sector from each data stripe */ for (stripe = 0; stripe < rbio->nr_data; stripe++) { sector = sector_in_rbio(rbio, stripe, sectornr, 0); pointers[stripe] = kmap_local_page(sector->page) + sector->pgoff; } /* Then add the parity stripe */ sector = rbio_pstripe_sector(rbio, sectornr); sector->uptodate = 1; pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; if (has_qstripe) { /* * RAID6, add the qstripe and call the library function * to fill in our p/q */ sector = rbio_qstripe_sector(rbio, sectornr); sector->uptodate = 1; pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); } else { /* raid5 */ memcpy(pointers[rbio->nr_data], pointers[0], sectorsize); run_xor(pointers + 1, rbio->nr_data - 1, sectorsize); } for (stripe = stripe - 1; stripe >= 0; stripe--) kunmap_local(pointers[stripe]); } static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { /* The total sector number inside the full stripe. */ int total_sector_nr; int sectornr; int stripe; int ret; ASSERT(bio_list_size(bio_list) == 0); /* We should have at least one data sector. */ ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); /* * Reset errors, as we may have errors inherited from from degraded * write. */ bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* * Start assembly. Make bios for everything from the higher layers (the * bio_list in our rbio) and our P/Q. Ignore everything else. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { struct sector_ptr *sector; stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; /* This vertical stripe has no data, skip it. */ if (!test_bit(sectornr, &rbio->dbitmap)) continue; if (stripe < rbio->nr_data) { sector = sector_in_rbio(rbio, stripe, sectornr, 1); if (!sector) continue; } else { sector = rbio_stripe_sector(rbio, stripe, sectornr); } ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_WRITE); if (ret) goto error; } if (likely(!rbio->bioc->replace_nr_stripes)) return 0; /* * Make a copy for the replace target device. * * Thus the source stripe number (in replace_stripe_src) should be valid. */ ASSERT(rbio->bioc->replace_stripe_src >= 0); for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { struct sector_ptr *sector; stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; /* * For RAID56, there is only one device that can be replaced, * and replace_stripe_src[0] indicates the stripe number we * need to copy from. */ if (stripe != rbio->bioc->replace_stripe_src) { /* * We can skip the whole stripe completely, note * total_sector_nr will be increased by one anyway. */ ASSERT(sectornr == 0); total_sector_nr += rbio->stripe_nsectors - 1; continue; } /* This vertical stripe has no data, skip it. */ if (!test_bit(sectornr, &rbio->dbitmap)) continue; if (stripe < rbio->nr_data) { sector = sector_in_rbio(rbio, stripe, sectornr, 1); if (!sector) continue; } else { sector = rbio_stripe_sector(rbio, stripe, sectornr); } ret = rbio_add_io_sector(rbio, bio_list, sector, rbio->real_stripes, sectornr, REQ_OP_WRITE); if (ret) goto error; } return 0; error: bio_list_put(bio_list); return -EIO; } static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - rbio->bioc->full_stripe_logical; int total_nr_sector = offset >> fs_info->sectorsize_bits; ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); bitmap_set(rbio->error_bitmap, total_nr_sector, bio->bi_iter.bi_size >> fs_info->sectorsize_bits); /* * Special handling for raid56_alloc_missing_rbio() used by * scrub/replace. Unlike call path in raid56_parity_recover(), they * pass an empty bio here. Thus we have to find out the missing device * and mark the stripe error instead. */ if (bio->bi_iter.bi_size == 0) { bool found_missing = false; int stripe_nr; for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { if (!rbio->bioc->stripes[stripe_nr].dev->bdev) { found_missing = true; bitmap_set(rbio->error_bitmap, stripe_nr * rbio->stripe_nsectors, rbio->stripe_nsectors); } } ASSERT(found_missing); } } /* * For subpage case, we can no longer set page Up-to-date directly for * stripe_pages[], thus we need to locate the sector. */ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, struct page *page, unsigned int pgoff) { int i; for (i = 0; i < rbio->nr_sectors; i++) { struct sector_ptr *sector = &rbio->stripe_sectors[i]; if (sector->page == page && sector->pgoff == pgoff) return sector; } return NULL; } /* * this sets each page in the bio uptodate. It should only be used on private * rbio pages, nothing that comes in from the higher layers */ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct bio_vec *bvec; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { struct sector_ptr *sector; int pgoff; for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; pgoff += sectorsize) { sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); ASSERT(sector); if (sector) sector->uptodate = 1; } } } static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) { struct bio_vec *bv = bio_first_bvec_all(bio); int i; for (i = 0; i < rbio->nr_sectors; i++) { struct sector_ptr *sector; sector = &rbio->stripe_sectors[i]; if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) break; sector = &rbio->bio_sectors[i]; if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) break; } ASSERT(i < rbio->nr_sectors); return i; } static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio) { int total_sector_nr = get_bio_sector_nr(rbio, bio); u32 bio_size = 0; struct bio_vec *bvec; int i; bio_for_each_bvec_all(bvec, bio, i) bio_size += bvec->bv_len; /* * Since we can have multiple bios touching the error_bitmap, we cannot * call bitmap_set() without protection. * * Instead use set_bit() for each bit, as set_bit() itself is atomic. */ for (i = total_sector_nr; i < total_sector_nr + (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++) set_bit(i, rbio->error_bitmap); } /* Verify the data sectors at read time. */ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, struct bio *bio) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; int total_sector_nr = get_bio_sector_nr(rbio, bio); struct bio_vec *bvec; struct bvec_iter_all iter_all; /* No data csum for the whole stripe, no need to verify. */ if (!rbio->csum_bitmap || !rbio->csum_buf) return; /* P/Q stripes, they have no data csum to verify against. */ if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) return; bio_for_each_segment_all(bvec, bio, iter_all) { int bv_offset; for (bv_offset = bvec->bv_offset; bv_offset < bvec->bv_offset + bvec->bv_len; bv_offset += fs_info->sectorsize, total_sector_nr++) { u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size; int ret; /* No csum for this sector, skip to the next sector. */ if (!test_bit(total_sector_nr, rbio->csum_bitmap)) continue; ret = btrfs_check_sector_csum(fs_info, bvec->bv_page, bv_offset, csum_buf, expected_csum); if (ret < 0) set_bit(total_sector_nr, rbio->error_bitmap); } } } static void raid_wait_read_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; if (bio->bi_status) { rbio_update_error_bitmap(rbio, bio); } else { set_bio_pages_uptodate(rbio, bio); verify_bio_data_sectors(rbio, bio); } bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) wake_up(&rbio->io_wait); } static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { struct bio *bio; atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); while ((bio = bio_list_pop(bio_list))) { bio->bi_end_io = raid_wait_read_end_io; if (trace_raid56_read_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; bio_get_trace_info(rbio, bio, &trace_info); trace_raid56_read(rbio, bio, &trace_info); } submit_bio(bio); } wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); } static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) { const int data_pages = rbio->nr_data * rbio->stripe_npages; int ret; ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, 0); if (ret < 0) return ret; index_stripe_sectors(rbio); return 0; } /* * We use plugging call backs to collect full stripes. * Any time we get a partial stripe write while plugged * we collect it into a list. When the unplug comes down, * we sort the list by logical block number and merge * everything we can into the same rbios */ struct btrfs_plug_cb { struct blk_plug_cb cb; struct btrfs_fs_info *info; struct list_head rbio_list; }; /* * rbios on the plug list are sorted for easier merging. */ static int plug_cmp(void *priv, const struct list_head *a, const struct list_head *b) { const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, plug_list); const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, plug_list); u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; if (a_sector < b_sector) return -1; if (a_sector > b_sector) return 1; return 0; } static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule) { struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb); struct btrfs_raid_bio *cur; struct btrfs_raid_bio *last = NULL; list_sort(NULL, &plug->rbio_list, plug_cmp); while (!list_empty(&plug->rbio_list)) { cur = list_entry(plug->rbio_list.next, struct btrfs_raid_bio, plug_list); list_del_init(&cur->plug_list); if (rbio_is_full(cur)) { /* We have a full stripe, queue it down. */ start_async_work(cur, rmw_rbio_work); continue; } if (last) { if (rbio_can_merge(last, cur)) { merge_rbio(last, cur); free_raid_bio(cur); continue; } start_async_work(last, rmw_rbio_work); } last = cur; } if (last) start_async_work(last, rmw_rbio_work); kfree(plug); } /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) { const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; const u64 full_stripe_start = rbio->bioc->full_stripe_logical; const u32 orig_len = orig_bio->bi_iter.bi_size; const u32 sectorsize = fs_info->sectorsize; u64 cur_logical; ASSERT(orig_logical >= full_stripe_start && orig_logical + orig_len <= full_stripe_start + rbio->nr_data * BTRFS_STRIPE_LEN); bio_list_add(&rbio->bio_list, orig_bio); rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; /* Update the dbitmap. */ for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; cur_logical += sectorsize) { int bit = ((u32)(cur_logical - full_stripe_start) >> fs_info->sectorsize_bits) % rbio->stripe_nsectors; set_bit(bit, &rbio->dbitmap); } } /* * our main entry point for writes from the rest of the FS. */ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); bio_endio(bio); return; } rbio->operation = BTRFS_RBIO_WRITE; rbio_add_bio(rbio, bio); /* * Don't plug on full rbios, just get them out the door * as quickly as we can */ if (!rbio_is_full(rbio)) { cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); if (cb) { plug = container_of(cb, struct btrfs_plug_cb, cb); if (!plug->info) { plug->info = fs_info; INIT_LIST_HEAD(&plug->rbio_list); } list_add_tail(&rbio->plug_list, &plug->rbio_list); return; } } /* * Either we don't have any existing plug, or we're doing a full stripe, * queue the rmw work now. */ start_async_work(rbio, rmw_rbio_work); } static int verify_one_sector(struct btrfs_raid_bio *rbio, int stripe_nr, int sector_nr) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; struct sector_ptr *sector; u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *csum_expected; int ret; if (!rbio->csum_bitmap || !rbio->csum_buf) return 0; /* No way to verify P/Q as they are not covered by data csum. */ if (stripe_nr >= rbio->nr_data) return 0; /* * If we're rebuilding a read, we have to use pages from the * bio list if possible. */ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); } ASSERT(sector->page); csum_expected = rbio->csum_buf + (stripe_nr * rbio->stripe_nsectors + sector_nr) * fs_info->csum_size; ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff, csum_buf, csum_expected); return ret; } /* * Recover a vertical stripe specified by @sector_nr. * @*pointers are the pre-allocated pointers by the caller, so we don't * need to allocate/free the pointers again and again. */ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, void **pointers, void **unmap_array) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; struct sector_ptr *sector; const u32 sectorsize = fs_info->sectorsize; int found_errors; int faila; int failb; int stripe_nr; int ret = 0; /* * Now we just use bitmap to mark the horizontal stripes in * which we have data when doing parity scrub. */ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && !test_bit(sector_nr, &rbio->dbitmap)) return 0; found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, &failb); /* * No errors in the vertical stripe, skip it. Can happen for recovery * which only part of a stripe failed csum check. */ if (!found_errors) return 0; if (found_errors > rbio->bioc->max_errors) return -EIO; /* * Setup our array of pointers with sectors from each stripe * * NOTE: store a duplicate array of pointers to preserve the * pointer order. */ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { /* * If we're rebuilding a read, we have to use pages from the * bio list if possible. */ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); } ASSERT(sector->page); pointers[stripe_nr] = kmap_local_page(sector->page) + sector->pgoff; unmap_array[stripe_nr] = pointers[stripe_nr]; } /* All raid6 handling here */ if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { /* Single failure, rebuild from parity raid5 style */ if (failb < 0) { if (faila == rbio->nr_data) /* * Just the P stripe has failed, without * a bad data or Q stripe. * We have nothing to do, just skip the * recovery for this stripe. */ goto cleanup; /* * a single failure in raid6 is rebuilt * in the pstripe code below */ goto pstripe; } /* * If the q stripe is failed, do a pstripe reconstruction from * the xors. * If both the q stripe and the P stripe are failed, we're * here due to a crc mismatch and we can't give them the * data they want. */ if (failb == rbio->real_stripes - 1) { if (faila == rbio->real_stripes - 2) /* * Only P and Q are corrupted. * We only care about data stripes recovery, * can skip this vertical stripe. */ goto cleanup; /* * Otherwise we have one bad data stripe and * a good P stripe. raid5! */ goto pstripe; } if (failb == rbio->real_stripes - 2) { raid6_datap_recov(rbio->real_stripes, sectorsize, faila, pointers); } else { raid6_2data_recov(rbio->real_stripes, sectorsize, faila, failb, pointers); } } else { void *p; /* Rebuild from P stripe here (raid5 or raid6). */ ASSERT(failb == -1); pstripe: /* Copy parity block into failed block to start with */ memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); /* Rearrange the pointer array */ p = pointers[faila]; for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1; stripe_nr++) pointers[stripe_nr] = pointers[stripe_nr + 1]; pointers[rbio->nr_data - 1] = p; /* Xor in the rest */ run_xor(pointers, rbio->nr_data - 1, sectorsize); } /* * No matter if this is a RMW or recovery, we should have all * failed sectors repaired in the vertical stripe, thus they are now * uptodate. * Especially if we determine to cache the rbio, we need to * have at least all data sectors uptodate. * * If possible, also check if the repaired sector matches its data * checksum. */ if (faila >= 0) { ret = verify_one_sector(rbio, faila, sector_nr); if (ret < 0) goto cleanup; sector = rbio_stripe_sector(rbio, faila, sector_nr); sector->uptodate = 1; } if (failb >= 0) { ret = verify_one_sector(rbio, failb, sector_nr); if (ret < 0) goto cleanup; sector = rbio_stripe_sector(rbio, failb, sector_nr); sector->uptodate = 1; } cleanup: for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) kunmap_local(unmap_array[stripe_nr]); return ret; } static int recover_sectors(struct btrfs_raid_bio *rbio) { void **pointers = NULL; void **unmap_array = NULL; int sectornr; int ret = 0; /* * @pointers array stores the pointer for each sector. * * @unmap_array stores copy of pointers that does not get reordered * during reconstruction so that kunmap_local works. */ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!pointers || !unmap_array) { ret = -ENOMEM; goto out; } if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { spin_lock(&rbio->bio_list_lock); set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); spin_unlock(&rbio->bio_list_lock); } index_rbio_pages(rbio); for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { ret = recover_vertical(rbio, sectornr, pointers, unmap_array); if (ret < 0) break; } out: kfree(pointers); kfree(unmap_array); return ret; } static void recover_rbio(struct btrfs_raid_bio *rbio) { struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0; /* * Either we're doing recover for a read failure or degraded write, * caller should have set error bitmap correctly. */ ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); /* For recovery, we need to read all sectors including P/Q. */ ret = alloc_rbio_pages(rbio); if (ret < 0) goto out; index_rbio_pages(rbio); /* * Read everything that hasn't failed. However this time we will * not trust any cached sector. * As we may read out some stale data but higher layer is not reading * that stale part. * * So here we always re-read everything in recovery path. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { int stripe = total_sector_nr / rbio->stripe_nsectors; int sectornr = total_sector_nr % rbio->stripe_nsectors; struct sector_ptr *sector; /* * Skip the range which has error. It can be a range which is * marked error (for csum mismatch), or it can be a missing * device. */ if (!rbio->bioc->stripes[stripe].dev->bdev || test_bit(total_sector_nr, rbio->error_bitmap)) { /* * Also set the error bit for missing device, which * may not yet have its error bit set. */ set_bit(total_sector_nr, rbio->error_bitmap); continue; } sector = rbio_stripe_sector(rbio, stripe, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, sectornr, REQ_OP_READ); if (ret < 0) { bio_list_put(&bio_list); goto out; } } submit_read_wait_bio_list(rbio, &bio_list); ret = recover_sectors(rbio); out: rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void recover_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; rbio = container_of(work, struct btrfs_raid_bio, work); if (!lock_stripe_add(rbio)) recover_rbio(rbio); } static void recover_rbio_work_locked(struct work_struct *work) { recover_rbio(container_of(work, struct btrfs_raid_bio, work)); } static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) { bool found = false; int sector_nr; /* * This is for RAID6 extra recovery tries, thus mirror number should * be large than 2. * Mirror 1 means read from data stripes. Mirror 2 means rebuild using * RAID5 methods. */ ASSERT(mirror_num > 2); for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { int found_errors; int faila; int failb; found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, &failb); /* This vertical stripe doesn't have errors. */ if (!found_errors) continue; /* * If we found errors, there should be only one error marked * by previous set_rbio_range_error(). */ ASSERT(found_errors == 1); found = true; /* Now select another stripe to mark as error. */ failb = rbio->real_stripes - (mirror_num - 1); if (failb <= faila) failb--; /* Set the extra bit in error bitmap. */ if (failb >= 0) set_bit(failb * rbio->stripe_nsectors + sector_nr, rbio->error_bitmap); } /* We should found at least one vertical stripe with error.*/ ASSERT(found); } /* * the main entry point for reads from the higher layers. This * is really only called when the normal read path had a failure, * so we assume the bio they send down corresponds to a failed part * of the drive. */ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, int mirror_num) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); bio_endio(bio); return; } rbio->operation = BTRFS_RBIO_READ_REBUILD; rbio_add_bio(rbio, bio); set_rbio_range_error(rbio, bio); /* * Loop retry: * for 'mirror == 2', reconstruct from all other stripes. * for 'mirror_num > 2', select a stripe to fail on every retry. */ if (mirror_num > 2) set_rbio_raid6_extra_error(rbio, mirror_num); start_async_work(rbio, recover_rbio_work); } static void fill_data_csums(struct btrfs_raid_bio *rbio) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; struct btrfs_root *csum_root = btrfs_csum_root(fs_info, rbio->bioc->full_stripe_logical); const u64 start = rbio->bioc->full_stripe_logical; const u32 len = (rbio->nr_data * rbio->stripe_nsectors) << fs_info->sectorsize_bits; int ret; /* The rbio should not have its csum buffer initialized. */ ASSERT(!rbio->csum_buf && !rbio->csum_bitmap); /* * Skip the csum search if: * * - The rbio doesn't belong to data block groups * Then we are doing IO for tree blocks, no need to search csums. * * - The rbio belongs to mixed block groups * This is to avoid deadlock, as we're already holding the full * stripe lock, if we trigger a metadata read, and it needs to do * raid56 recovery, we will deadlock. */ if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) || rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA) return; rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors * fs_info->csum_size, GFP_NOFS); rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors, GFP_NOFS); if (!rbio->csum_buf || !rbio->csum_bitmap) { ret = -ENOMEM; goto error; } ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1, rbio->csum_buf, rbio->csum_bitmap); if (ret < 0) goto error; if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) goto no_csum; return; error: /* * We failed to allocate memory or grab the csum, but it's not fatal, * we can still continue. But better to warn users that RMW is no * longer safe for this particular sub-stripe write. */ btrfs_warn_rl(fs_info, "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d", rbio->bioc->full_stripe_logical, ret); no_csum: kfree(rbio->csum_buf); bitmap_free(rbio->csum_bitmap); rbio->csum_buf = NULL; rbio->csum_bitmap = NULL; } static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) { struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0; /* * Fill the data csums we need for data verification. We need to fill * the csum_bitmap/csum_buf first, as our endio function will try to * verify the data sectors. */ fill_data_csums(rbio); /* * Build a list of bios to read all sectors (including data and P/Q). * * This behavior is to compensate the later csum verification and recovery. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { struct sector_ptr *sector; int stripe = total_sector_nr / rbio->stripe_nsectors; int sectornr = total_sector_nr % rbio->stripe_nsectors; sector = rbio_stripe_sector(rbio, stripe, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, sectornr, REQ_OP_READ); if (ret) { bio_list_put(&bio_list); return ret; } } /* * We may or may not have any corrupted sectors (including missing dev * and csum mismatch), just let recover_sectors() to handle them all. */ submit_read_wait_bio_list(rbio, &bio_list); return recover_sectors(rbio); } static void raid_wait_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; blk_status_t err = bio->bi_status; if (err) rbio_update_error_bitmap(rbio, bio); bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) wake_up(&rbio->io_wait); } static void submit_write_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { struct bio *bio; atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); while ((bio = bio_list_pop(bio_list))) { bio->bi_end_io = raid_wait_write_end_io; if (trace_raid56_write_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; bio_get_trace_info(rbio, bio, &trace_info); trace_raid56_write(rbio, bio, &trace_info); } submit_bio(bio); } } /* * To determine if we need to read any sector from the disk. * Should only be utilized in RMW path, to skip cached rbio. */ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) { int i; for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) { struct sector_ptr *sector = &rbio->stripe_sectors[i]; /* * We have a sector which doesn't have page nor uptodate, * thus this rbio can not be cached one, as cached one must * have all its data sectors present and uptodate. */ if (!sector->page || !sector->uptodate) return true; } return false; } static void rmw_rbio(struct btrfs_raid_bio *rbio) { struct bio_list bio_list; int sectornr; int ret = 0; /* * Allocate the pages for parity first, as P/Q pages will always be * needed for both full-stripe and sub-stripe writes. */ ret = alloc_rbio_parity_pages(rbio); if (ret < 0) goto out; /* * Either full stripe write, or we have every data sector already * cached, can go to write path immediately. */ if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { /* * Now we're doing sub-stripe write, also need all data stripes * to do the full RMW. */ ret = alloc_rbio_data_pages(rbio); if (ret < 0) goto out; index_rbio_pages(rbio); ret = rmw_read_wait_recover(rbio); if (ret < 0) goto out; } /* * At this stage we're not allowed to add any new bios to the * bio list any more, anyone else that wants to change this stripe * needs to do their own rmw. */ spin_lock(&rbio->bio_list_lock); set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); spin_unlock(&rbio->bio_list_lock); bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); index_rbio_pages(rbio); /* * We don't cache full rbios because we're assuming * the higher layers are unlikely to use this area of * the disk again soon. If they do use it again, * hopefully they will send another full bio. */ if (!rbio_is_full(rbio)) cache_rbio_pages(rbio); else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) generate_pq_vertical(rbio, sectornr); bio_list_init(&bio_list); ret = rmw_assemble_write_bios(rbio, &bio_list); if (ret < 0) goto out; /* We should have at least one bio assembled. */ ASSERT(bio_list_size(&bio_list)); submit_write_bios(rbio, &bio_list); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); /* We may have more errors than our tolerance during the read. */ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { int found_errors; found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL); if (found_errors > rbio->bioc->max_errors) { ret = -EIO; break; } } out: rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void rmw_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; rbio = container_of(work, struct btrfs_raid_bio, work); if (lock_stripe_add(rbio) == 0) rmw_rbio(rbio); } static void rmw_rbio_work_locked(struct work_struct *work) { rmw_rbio(container_of(work, struct btrfs_raid_bio, work)); } /* * The following code is used to scrub/replace the parity stripe * * Caller must have already increased bio_counter for getting @bioc. * * Note: We need make sure all the pages that add into the scrub/replace * raid bio are correct and not be changed during the scrub/replace. That * is those pages just hold metadata or file data with checksum. */ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, struct btrfs_io_context *bioc, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; int i; rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) return NULL; bio_list_add(&rbio->bio_list, bio); /* * This is a special bio which is used to hold the completion handler * and make the scrub rbio is similar to the other types */ ASSERT(!bio->bi_iter.bi_size); rbio->operation = BTRFS_RBIO_PARITY_SCRUB; /* * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted * to the end position, so this search can start from the first parity * stripe. */ for (i = rbio->nr_data; i < rbio->real_stripes; i++) { if (bioc->stripes[i].dev == scrub_dev) { rbio->scrubp = i; break; } } ASSERT(i < rbio->real_stripes); bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors); return rbio; } /* * We just scrub the parity that we have correct data on the same horizontal, * so we needn't allocate all pages for all the stripes. */ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; int total_sector_nr; for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { struct page *page; int sectornr = total_sector_nr % rbio->stripe_nsectors; int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT; if (!test_bit(sectornr, &rbio->dbitmap)) continue; if (rbio->stripe_pages[index]) continue; page = alloc_page(GFP_NOFS); if (!page) return -ENOMEM; rbio->stripe_pages[index] = page; } index_stripe_sectors(rbio); return 0; } static int finish_parity_scrub(struct btrfs_raid_bio *rbio) { struct btrfs_io_context *bioc = rbio->bioc; const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; unsigned long *pbitmap = &rbio->finish_pbitmap; int nr_data = rbio->nr_data; int stripe; int sectornr; bool has_qstripe; struct sector_ptr p_sector = { 0 }; struct sector_ptr q_sector = { 0 }; struct bio_list bio_list; int is_replace = 0; int ret; bio_list_init(&bio_list); if (rbio->real_stripes - rbio->nr_data == 1) has_qstripe = false; else if (rbio->real_stripes - rbio->nr_data == 2) has_qstripe = true; else BUG(); /* * Replace is running and our P/Q stripe is being replaced, then we * need to duplicate the final write to replace target. */ if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) { is_replace = 1; bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors); } /* * Because the higher layers(scrubber) are unlikely to * use this area of the disk again soon, so don't cache * it. */ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); p_sector.page = alloc_page(GFP_NOFS); if (!p_sector.page) return -ENOMEM; p_sector.pgoff = 0; p_sector.uptodate = 1; if (has_qstripe) { /* RAID6, allocate and map temp space for the Q stripe */ q_sector.page = alloc_page(GFP_NOFS); if (!q_sector.page) { __free_page(p_sector.page); p_sector.page = NULL; return -ENOMEM; } q_sector.pgoff = 0; q_sector.uptodate = 1; pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); } bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* Map the parity stripe just once */ pointers[nr_data] = kmap_local_page(p_sector.page); for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; void *parity; /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { sector = sector_in_rbio(rbio, stripe, sectornr, 0); pointers[stripe] = kmap_local_page(sector->page) + sector->pgoff; } if (has_qstripe) { /* RAID6, call the library function to fill in our P/Q */ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); } else { /* raid5 */ memcpy(pointers[nr_data], pointers[0], sectorsize); run_xor(pointers + 1, nr_data - 1, sectorsize); } /* Check scrubbing parity and repair it */ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); parity = kmap_local_page(sector->page) + sector->pgoff; if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) memcpy(parity, pointers[rbio->scrubp], sectorsize); else /* Parity is right, needn't writeback */ bitmap_clear(&rbio->dbitmap, sectornr, 1); kunmap_local(parity); for (stripe = nr_data - 1; stripe >= 0; stripe--) kunmap_local(pointers[stripe]); } kunmap_local(pointers[nr_data]); __free_page(p_sector.page); p_sector.page = NULL; if (q_sector.page) { kunmap_local(pointers[rbio->real_stripes - 1]); __free_page(q_sector.page); q_sector.page = NULL; } /* * time to start writing. Make bios for everything from the * higher layers (the bio_list in our rbio) and our p/q. Ignore * everything else. */ for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp, sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } if (!is_replace) goto submit_write; /* * Replace is running and our parity stripe needs to be duplicated to * the target device. Check we have a valid source stripe number. */ ASSERT(rbio->bioc->replace_stripe_src >= 0); for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->real_stripes, sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } submit_write: submit_write_bios(rbio, &bio_list); return 0; cleanup: bio_list_put(&bio_list); return ret; } static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) { if (stripe >= 0 && stripe < rbio->nr_data) return 1; return 0; } static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) { void **pointers = NULL; void **unmap_array = NULL; int sector_nr; int ret = 0; /* * @pointers array stores the pointer for each sector. * * @unmap_array stores copy of pointers that does not get reordered * during reconstruction so that kunmap_local works. */ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!pointers || !unmap_array) { ret = -ENOMEM; goto out; } for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { int dfail = 0, failp = -1; int faila; int failb; int found_errors; found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, &failb); if (found_errors > rbio->bioc->max_errors) { ret = -EIO; goto out; } if (found_errors == 0) continue; /* We should have at least one error here. */ ASSERT(faila >= 0 || failb >= 0); if (is_data_stripe(rbio, faila)) dfail++; else if (is_parity_stripe(faila)) failp = faila; if (is_data_stripe(rbio, failb)) dfail++; else if (is_parity_stripe(failb)) failp = failb; /* * Because we can not use a scrubbing parity to repair the * data, so the capability of the repair is declined. (In the * case of RAID5, we can not repair anything.) */ if (dfail > rbio->bioc->max_errors - 1) { ret = -EIO; goto out; } /* * If all data is good, only parity is correctly, just repair * the parity, no need to recover data stripes. */ if (dfail == 0) continue; /* * Here means we got one corrupted data stripe and one * corrupted parity on RAID6, if the corrupted parity is * scrubbing parity, luckily, use the other one to repair the * data, or we can not repair the data stripe. */ if (failp != rbio->scrubp) { ret = -EIO; goto out; } ret = recover_vertical(rbio, sector_nr, pointers, unmap_array); if (ret < 0) goto out; } out: kfree(pointers); kfree(unmap_array); return ret; } static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) { struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0; /* Build a list of bios to read all the missing parts. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { int sectornr = total_sector_nr % rbio->stripe_nsectors; int stripe = total_sector_nr / rbio->stripe_nsectors; struct sector_ptr *sector; /* No data in the vertical stripe, no need to read. */ if (!test_bit(sectornr, &rbio->dbitmap)) continue; /* * We want to find all the sectors missing from the rbio and * read them from the disk. If sector_in_rbio() finds a sector * in the bio list we don't need to read it off the stripe. */ sector = sector_in_rbio(rbio, stripe, sectornr, 1); if (sector) continue; sector = rbio_stripe_sector(rbio, stripe, sectornr); /* * The bio cache may have handed us an uptodate sector. If so, * use it. */ if (sector->uptodate) continue; ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, sectornr, REQ_OP_READ); if (ret) { bio_list_put(&bio_list); return ret; } } submit_read_wait_bio_list(rbio, &bio_list); return 0; } static void scrub_rbio(struct btrfs_raid_bio *rbio) { int sector_nr; int ret; ret = alloc_rbio_essential_pages(rbio); if (ret) goto out; bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); ret = scrub_assemble_read_bios(rbio); if (ret < 0) goto out; /* We may have some failures, recover the failed sectors first. */ ret = recover_scrub_rbio(rbio); if (ret < 0) goto out; /* * We have every sector properly prepared. Can finish the scrub * and writeback the good content. */ ret = finish_parity_scrub(rbio); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { int found_errors; found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); if (found_errors > rbio->bioc->max_errors) { ret = -EIO; break; } } out: rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void scrub_rbio_work_locked(struct work_struct *work) { scrub_rbio(container_of(work, struct btrfs_raid_bio, work)); } void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) { if (!lock_stripe_add(rbio)) start_async_work(rbio, scrub_rbio_work_locked); } /* * This is for scrub call sites where we already have correct data contents. * This allows us to avoid reading data stripes again. * * Unfortunately here we have to do page copy, other than reusing the pages. * This is due to the fact rbio has its own page management for its cache. */ void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, struct page **data_pages, u64 data_logical) { const u64 offset_in_full_stripe = data_logical - rbio->bioc->full_stripe_logical; const int page_index = offset_in_full_stripe >> PAGE_SHIFT; const u32 sectorsize = rbio->bioc->fs_info->sectorsize; const u32 sectors_per_page = PAGE_SIZE / sectorsize; int ret; /* * If we hit ENOMEM temporarily, but later at * raid56_parity_submit_scrub_rbio() time it succeeded, we just do * the extra read, not a big deal. * * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time, * the bio would got proper error number set. */ ret = alloc_rbio_data_pages(rbio); if (ret < 0) return; /* data_logical must be at stripe boundary and inside the full stripe. */ ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN)); ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT)); for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) { struct page *dst = rbio->stripe_pages[page_nr + page_index]; struct page *src = data_pages[page_nr]; memcpy_page(dst, 0, src, 0, PAGE_SIZE); for (int sector_nr = sectors_per_page * page_index; sector_nr < sectors_per_page * (page_index + 1); sector_nr++) rbio->stripe_sectors[sector_nr].uptodate = true; } }
3 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 // SPDX-License-Identifier: GPL-2.0 #include <linux/errno.h> #include <linux/ip.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/types.h> #include <net/checksum.h> #include <net/dst_cache.h> #include <net/ip.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> #include <net/lwtunnel.h> #include <net/protocol.h> #include <uapi/linux/ila.h> #include "ila.h" struct ila_lwt { struct ila_params p; struct dst_cache dst_cache; u32 connected : 1; u32 lwt_output : 1; }; static inline struct ila_lwt *ila_lwt_lwtunnel( struct lwtunnel_state *lwt) { return (struct ila_lwt *)lwt->data; } static inline struct ila_params *ila_params_lwtunnel( struct lwtunnel_state *lwt) { return &ila_lwt_lwtunnel(lwt)->p; } static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *orig_dst = skb_dst(skb); struct rt6_info *rt = (struct rt6_info *)orig_dst; struct ila_lwt *ilwt = ila_lwt_lwtunnel(orig_dst->lwtstate); struct dst_entry *dst; int err = -EINVAL; if (skb->protocol != htons(ETH_P_IPV6)) goto drop; if (ilwt->lwt_output) ila_update_ipv6_locator(skb, ila_params_lwtunnel(orig_dst->lwtstate), true); if (rt->rt6i_flags & (RTF_GATEWAY | RTF_CACHE)) { /* Already have a next hop address in route, no need for * dest cache route. */ return orig_dst->lwtstate->orig_output(net, sk, skb); } dst = dst_cache_get(&ilwt->dst_cache); if (unlikely(!dst)) { struct ipv6hdr *ip6h = ipv6_hdr(skb); struct flowi6 fl6; /* Lookup a route for the new destination. Take into * account that the base route may already have a gateway. */ memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = orig_dst->dev->ifindex; fl6.flowi6_iif = LOOPBACK_IFINDEX; fl6.daddr = *rt6_nexthop((struct rt6_info *)orig_dst, &ip6h->daddr); dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { err = -EHOSTUNREACH; dst_release(dst); goto drop; } dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto drop; } if (ilwt->connected) dst_cache_set_ip6(&ilwt->dst_cache, dst, &fl6.saddr); } skb_dst_set(skb, dst); return dst_output(net, sk, skb); drop: kfree_skb(skb); return err; } static int ila_input(struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); struct ila_lwt *ilwt = ila_lwt_lwtunnel(dst->lwtstate); if (skb->protocol != htons(ETH_P_IPV6)) goto drop; if (!ilwt->lwt_output) ila_update_ipv6_locator(skb, ila_params_lwtunnel(dst->lwtstate), false); return dst->lwtstate->orig_input(skb); drop: kfree_skb(skb); return -EINVAL; } static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, [ILA_ATTR_HOOK_TYPE] = { .type = NLA_U8, }, }; static int ila_build_state(struct net *net, struct nlattr *nla, unsigned int family, const void *cfg, struct lwtunnel_state **ts, struct netlink_ext_ack *extack) { struct ila_lwt *ilwt; struct ila_params *p; struct nlattr *tb[ILA_ATTR_MAX + 1]; struct lwtunnel_state *newts; const struct fib6_config *cfg6 = cfg; struct ila_addr *iaddr; u8 ident_type = ILA_ATYPE_USE_FORMAT; u8 hook_type = ILA_HOOK_ROUTE_OUTPUT; u8 csum_mode = ILA_CSUM_NO_ACTION; bool lwt_output = true; u8 eff_ident_type; int ret; if (family != AF_INET6) return -EINVAL; ret = nla_parse_nested_deprecated(tb, ILA_ATTR_MAX, nla, ila_nl_policy, extack); if (ret < 0) return ret; if (!tb[ILA_ATTR_LOCATOR]) return -EINVAL; iaddr = (struct ila_addr *)&cfg6->fc_dst; if (tb[ILA_ATTR_IDENT_TYPE]) ident_type = nla_get_u8(tb[ILA_ATTR_IDENT_TYPE]); if (ident_type == ILA_ATYPE_USE_FORMAT) { /* Infer identifier type from type field in formatted * identifier. */ if (cfg6->fc_dst_len < 8 * sizeof(struct ila_locator) + 3) { /* Need to have full locator and at least type field * included in destination */ return -EINVAL; } eff_ident_type = iaddr->ident.type; } else { eff_ident_type = ident_type; } switch (eff_ident_type) { case ILA_ATYPE_IID: /* Don't allow ILA for IID type */ return -EINVAL; case ILA_ATYPE_LUID: break; case ILA_ATYPE_VIRT_V4: case ILA_ATYPE_VIRT_UNI_V6: case ILA_ATYPE_VIRT_MULTI_V6: case ILA_ATYPE_NONLOCAL_ADDR: /* These ILA formats are not supported yet. */ default: return -EINVAL; } if (tb[ILA_ATTR_HOOK_TYPE]) hook_type = nla_get_u8(tb[ILA_ATTR_HOOK_TYPE]); switch (hook_type) { case ILA_HOOK_ROUTE_OUTPUT: lwt_output = true; break; case ILA_HOOK_ROUTE_INPUT: lwt_output = false; break; default: return -EINVAL; } if (tb[ILA_ATTR_CSUM_MODE]) csum_mode = nla_get_u8(tb[ILA_ATTR_CSUM_MODE]); if (csum_mode == ILA_CSUM_NEUTRAL_MAP && ila_csum_neutral_set(iaddr->ident)) { /* Don't allow translation if checksum neutral bit is * configured and it's set in the SIR address. */ return -EINVAL; } newts = lwtunnel_state_alloc(sizeof(*ilwt)); if (!newts) return -ENOMEM; ilwt = ila_lwt_lwtunnel(newts); ret = dst_cache_init(&ilwt->dst_cache, GFP_ATOMIC); if (ret) { kfree(newts); return ret; } ilwt->lwt_output = !!lwt_output; p = ila_params_lwtunnel(newts); p->csum_mode = csum_mode; p->ident_type = ident_type; p->locator.v64 = (__force __be64)nla_get_u64(tb[ILA_ATTR_LOCATOR]); /* Precompute checksum difference for translation since we * know both the old locator and the new one. */ p->locator_match = iaddr->loc; ila_init_saved_csum(p); newts->type = LWTUNNEL_ENCAP_ILA; newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT | LWTUNNEL_STATE_INPUT_REDIRECT; if (cfg6->fc_dst_len == 8 * sizeof(struct in6_addr)) ilwt->connected = 1; *ts = newts; return 0; } static void ila_destroy_state(struct lwtunnel_state *lwt) { dst_cache_destroy(&ila_lwt_lwtunnel(lwt)->dst_cache); } static int ila_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { struct ila_params *p = ila_params_lwtunnel(lwtstate); struct ila_lwt *ilwt = ila_lwt_lwtunnel(lwtstate); if (nla_put_u64_64bit(skb, ILA_ATTR_LOCATOR, (__force u64)p->locator.v64, ILA_ATTR_PAD)) goto nla_put_failure; if (nla_put_u8(skb, ILA_ATTR_CSUM_MODE, (__force u8)p->csum_mode)) goto nla_put_failure; if (nla_put_u8(skb, ILA_ATTR_IDENT_TYPE, (__force u8)p->ident_type)) goto nla_put_failure; if (nla_put_u8(skb, ILA_ATTR_HOOK_TYPE, ilwt->lwt_output ? ILA_HOOK_ROUTE_OUTPUT : ILA_HOOK_ROUTE_INPUT)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static int ila_encap_nlsize(struct lwtunnel_state *lwtstate) { return nla_total_size_64bit(sizeof(u64)) + /* ILA_ATTR_LOCATOR */ nla_total_size(sizeof(u8)) + /* ILA_ATTR_CSUM_MODE */ nla_total_size(sizeof(u8)) + /* ILA_ATTR_IDENT_TYPE */ nla_total_size(sizeof(u8)) + /* ILA_ATTR_HOOK_TYPE */ 0; } static int ila_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) { struct ila_params *a_p = ila_params_lwtunnel(a); struct ila_params *b_p = ila_params_lwtunnel(b); return (a_p->locator.v64 != b_p->locator.v64); } static const struct lwtunnel_encap_ops ila_encap_ops = { .build_state = ila_build_state, .destroy_state = ila_destroy_state, .output = ila_output, .input = ila_input, .fill_encap = ila_fill_encap_info, .get_encap_size = ila_encap_nlsize, .cmp_encap = ila_encap_cmp, .owner = THIS_MODULE, }; int ila_lwt_init(void) { return lwtunnel_encap_add_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA); } void ila_lwt_fini(void) { lwtunnel_encap_del_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA); }
16 1932 1930 1930 1932 470 21 20 20 21 1 12 4 4 19 2 6 2 13 122 5 8 6 7 7 3 3 2 2 115 115 2 117 1 50 35 2 7 1 1 3 1 1 20 2 21 2 9 1 5 12 12 31 2 9 25 11 4 4 1 1 116 4 2 120 2 115 125 125 125 124 119 48 22 49 118 1 126 1 125 10 9 3 8 3 7 6 6 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 /* * This file implement the Wireless Extensions core API. * * Authors : Jean Tourrilhes - HPL - <jt@hpl.hp.com> * Copyright (c) 1997-2007 Jean Tourrilhes, All Rights Reserved. * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * * (As all part of the Linux kernel, this file is GPL) */ #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/wireless.h> #include <linux/uaccess.h> #include <linux/export.h> #include <net/cfg80211.h> #include <net/iw_handler.h> #include <net/netlink.h> #include <net/wext.h> #include <net/net_namespace.h> typedef int (*wext_ioctl_func)(struct net_device *, struct iwreq *, unsigned int, struct iw_request_info *, iw_handler); /* * Meta-data about all the standard Wireless Extension request we * know about. */ static const struct iw_ioctl_description standard_ioctl[] = { [IW_IOCTL_IDX(SIOCSIWCOMMIT)] = { .header_type = IW_HEADER_TYPE_NULL, }, [IW_IOCTL_IDX(SIOCGIWNAME)] = { .header_type = IW_HEADER_TYPE_CHAR, .flags = IW_DESCR_FLAG_DUMP, }, [IW_IOCTL_IDX(SIOCSIWNWID)] = { .header_type = IW_HEADER_TYPE_PARAM, .flags = IW_DESCR_FLAG_EVENT, }, [IW_IOCTL_IDX(SIOCGIWNWID)] = { .header_type = IW_HEADER_TYPE_PARAM, .flags = IW_DESCR_FLAG_DUMP, }, [IW_IOCTL_IDX(SIOCSIWFREQ)] = { .header_type = IW_HEADER_TYPE_FREQ, .flags = IW_DESCR_FLAG_EVENT, }, [IW_IOCTL_IDX(SIOCGIWFREQ)] = { .header_type = IW_HEADER_TYPE_FREQ, .flags = IW_DESCR_FLAG_DUMP, }, [IW_IOCTL_IDX(SIOCSIWMODE)] = { .header_type = IW_HEADER_TYPE_UINT, .flags = IW_DESCR_FLAG_EVENT, }, [IW_IOCTL_IDX(SIOCGIWMODE)] = { .header_type = IW_HEADER_TYPE_UINT, .flags = IW_DESCR_FLAG_DUMP, }, [IW_IOCTL_IDX(SIOCSIWSENS)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCGIWSENS)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCSIWRANGE)] = { .header_type = IW_HEADER_TYPE_NULL, }, [IW_IOCTL_IDX(SIOCGIWRANGE)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = sizeof(struct iw_range), .flags = IW_DESCR_FLAG_DUMP, }, [IW_IOCTL_IDX(SIOCSIWPRIV)] = { .header_type = IW_HEADER_TYPE_NULL, }, [IW_IOCTL_IDX(SIOCGIWPRIV)] = { /* (handled directly by us) */ .header_type = IW_HEADER_TYPE_POINT, .token_size = sizeof(struct iw_priv_args), .max_tokens = 16, .flags = IW_DESCR_FLAG_NOMAX, }, [IW_IOCTL_IDX(SIOCSIWSTATS)] = { .header_type = IW_HEADER_TYPE_NULL, }, [IW_IOCTL_IDX(SIOCGIWSTATS)] = { /* (handled directly by us) */ .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = sizeof(struct iw_statistics), .flags = IW_DESCR_FLAG_DUMP, }, [IW_IOCTL_IDX(SIOCSIWSPY)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = sizeof(struct sockaddr), .max_tokens = IW_MAX_SPY, }, [IW_IOCTL_IDX(SIOCGIWSPY)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = sizeof(struct sockaddr) + sizeof(struct iw_quality), .max_tokens = IW_MAX_SPY, }, [IW_IOCTL_IDX(SIOCSIWTHRSPY)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = sizeof(struct iw_thrspy), .min_tokens = 1, .max_tokens = 1, }, [IW_IOCTL_IDX(SIOCGIWTHRSPY)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = sizeof(struct iw_thrspy), .min_tokens = 1, .max_tokens = 1, }, [IW_IOCTL_IDX(SIOCSIWAP)] = { .header_type = IW_HEADER_TYPE_ADDR, }, [IW_IOCTL_IDX(SIOCGIWAP)] = { .header_type = IW_HEADER_TYPE_ADDR, .flags = IW_DESCR_FLAG_DUMP, }, [IW_IOCTL_IDX(SIOCSIWMLME)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .min_tokens = sizeof(struct iw_mlme), .max_tokens = sizeof(struct iw_mlme), }, [IW_IOCTL_IDX(SIOCGIWAPLIST)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = sizeof(struct sockaddr) + sizeof(struct iw_quality), .max_tokens = IW_MAX_AP, .flags = IW_DESCR_FLAG_NOMAX, }, [IW_IOCTL_IDX(SIOCSIWSCAN)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .min_tokens = 0, .max_tokens = sizeof(struct iw_scan_req), }, [IW_IOCTL_IDX(SIOCGIWSCAN)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_SCAN_MAX_DATA, .flags = IW_DESCR_FLAG_NOMAX, }, [IW_IOCTL_IDX(SIOCSIWESSID)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_ESSID_MAX_SIZE, .flags = IW_DESCR_FLAG_EVENT, }, [IW_IOCTL_IDX(SIOCGIWESSID)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_ESSID_MAX_SIZE, .flags = IW_DESCR_FLAG_DUMP, }, [IW_IOCTL_IDX(SIOCSIWNICKN)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_ESSID_MAX_SIZE, }, [IW_IOCTL_IDX(SIOCGIWNICKN)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_ESSID_MAX_SIZE, }, [IW_IOCTL_IDX(SIOCSIWRATE)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCGIWRATE)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCSIWRTS)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCGIWRTS)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCSIWFRAG)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCGIWFRAG)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCSIWTXPOW)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCGIWTXPOW)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCSIWRETRY)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCGIWRETRY)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCSIWENCODE)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_ENCODING_TOKEN_MAX, .flags = IW_DESCR_FLAG_EVENT | IW_DESCR_FLAG_RESTRICT, }, [IW_IOCTL_IDX(SIOCGIWENCODE)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_ENCODING_TOKEN_MAX, .flags = IW_DESCR_FLAG_DUMP | IW_DESCR_FLAG_RESTRICT, }, [IW_IOCTL_IDX(SIOCSIWPOWER)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCGIWPOWER)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCSIWGENIE)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_GENERIC_IE_MAX, }, [IW_IOCTL_IDX(SIOCGIWGENIE)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_GENERIC_IE_MAX, }, [IW_IOCTL_IDX(SIOCSIWAUTH)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCGIWAUTH)] = { .header_type = IW_HEADER_TYPE_PARAM, }, [IW_IOCTL_IDX(SIOCSIWENCODEEXT)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .min_tokens = sizeof(struct iw_encode_ext), .max_tokens = sizeof(struct iw_encode_ext) + IW_ENCODING_TOKEN_MAX, }, [IW_IOCTL_IDX(SIOCGIWENCODEEXT)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .min_tokens = sizeof(struct iw_encode_ext), .max_tokens = sizeof(struct iw_encode_ext) + IW_ENCODING_TOKEN_MAX, }, [IW_IOCTL_IDX(SIOCSIWPMKSA)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .min_tokens = sizeof(struct iw_pmksa), .max_tokens = sizeof(struct iw_pmksa), }, }; static const unsigned int standard_ioctl_num = ARRAY_SIZE(standard_ioctl); /* * Meta-data about all the additional standard Wireless Extension events * we know about. */ static const struct iw_ioctl_description standard_event[] = { [IW_EVENT_IDX(IWEVTXDROP)] = { .header_type = IW_HEADER_TYPE_ADDR, }, [IW_EVENT_IDX(IWEVQUAL)] = { .header_type = IW_HEADER_TYPE_QUAL, }, [IW_EVENT_IDX(IWEVCUSTOM)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_CUSTOM_MAX, }, [IW_EVENT_IDX(IWEVREGISTERED)] = { .header_type = IW_HEADER_TYPE_ADDR, }, [IW_EVENT_IDX(IWEVEXPIRED)] = { .header_type = IW_HEADER_TYPE_ADDR, }, [IW_EVENT_IDX(IWEVGENIE)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_GENERIC_IE_MAX, }, [IW_EVENT_IDX(IWEVMICHAELMICFAILURE)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = sizeof(struct iw_michaelmicfailure), }, [IW_EVENT_IDX(IWEVASSOCREQIE)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_GENERIC_IE_MAX, }, [IW_EVENT_IDX(IWEVASSOCRESPIE)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = IW_GENERIC_IE_MAX, }, [IW_EVENT_IDX(IWEVPMKIDCAND)] = { .header_type = IW_HEADER_TYPE_POINT, .token_size = 1, .max_tokens = sizeof(struct iw_pmkid_cand), }, }; static const unsigned int standard_event_num = ARRAY_SIZE(standard_event); /* Size (in bytes) of various events */ static const int event_type_size[] = { IW_EV_LCP_LEN, /* IW_HEADER_TYPE_NULL */ 0, IW_EV_CHAR_LEN, /* IW_HEADER_TYPE_CHAR */ 0, IW_EV_UINT_LEN, /* IW_HEADER_TYPE_UINT */ IW_EV_FREQ_LEN, /* IW_HEADER_TYPE_FREQ */ IW_EV_ADDR_LEN, /* IW_HEADER_TYPE_ADDR */ 0, IW_EV_POINT_LEN, /* Without variable payload */ IW_EV_PARAM_LEN, /* IW_HEADER_TYPE_PARAM */ IW_EV_QUAL_LEN, /* IW_HEADER_TYPE_QUAL */ }; #ifdef CONFIG_COMPAT static const int compat_event_type_size[] = { IW_EV_COMPAT_LCP_LEN, /* IW_HEADER_TYPE_NULL */ 0, IW_EV_COMPAT_CHAR_LEN, /* IW_HEADER_TYPE_CHAR */ 0, IW_EV_COMPAT_UINT_LEN, /* IW_HEADER_TYPE_UINT */ IW_EV_COMPAT_FREQ_LEN, /* IW_HEADER_TYPE_FREQ */ IW_EV_COMPAT_ADDR_LEN, /* IW_HEADER_TYPE_ADDR */ 0, IW_EV_COMPAT_POINT_LEN, /* Without variable payload */ IW_EV_COMPAT_PARAM_LEN, /* IW_HEADER_TYPE_PARAM */ IW_EV_COMPAT_QUAL_LEN, /* IW_HEADER_TYPE_QUAL */ }; #endif /* IW event code */ void wireless_nlevent_flush(void) { struct sk_buff *skb; struct net *net; down_read(&net_rwsem); for_each_net(net) { while ((skb = skb_dequeue(&net->wext_nlevents))) rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); } up_read(&net_rwsem); } EXPORT_SYMBOL_GPL(wireless_nlevent_flush); static int wext_netdev_notifier_call(struct notifier_block *nb, unsigned long state, void *ptr) { /* * When a netdev changes state in any way, flush all pending messages * to avoid them going out in a strange order, e.g. RTM_NEWLINK after * RTM_DELLINK, or with IFF_UP after without IFF_UP during dev_close() * or similar - all of which could otherwise happen due to delays from * schedule_work(). */ wireless_nlevent_flush(); return NOTIFY_OK; } static struct notifier_block wext_netdev_notifier = { .notifier_call = wext_netdev_notifier_call, }; static int __net_init wext_pernet_init(struct net *net) { skb_queue_head_init(&net->wext_nlevents); return 0; } static void __net_exit wext_pernet_exit(struct net *net) { skb_queue_purge(&net->wext_nlevents); } static struct pernet_operations wext_pernet_ops = { .init = wext_pernet_init, .exit = wext_pernet_exit, }; static int __init wireless_nlevent_init(void) { int err = register_pernet_subsys(&wext_pernet_ops); if (err) return err; err = register_netdevice_notifier(&wext_netdev_notifier); if (err) unregister_pernet_subsys(&wext_pernet_ops); return err; } subsys_initcall(wireless_nlevent_init); /* Process events generated by the wireless layer or the driver. */ static void wireless_nlevent_process(struct work_struct *work) { wireless_nlevent_flush(); } static DECLARE_WORK(wireless_nlevent_work, wireless_nlevent_process); static struct nlmsghdr *rtnetlink_ifinfo_prep(struct net_device *dev, struct sk_buff *skb) { struct ifinfomsg *r; struct nlmsghdr *nlh; nlh = nlmsg_put(skb, 0, 0, RTM_NEWLINK, sizeof(*r), 0); if (!nlh) return NULL; r = nlmsg_data(nlh); r->ifi_family = AF_UNSPEC; r->__ifi_pad = 0; r->ifi_type = dev->type; r->ifi_index = dev->ifindex; r->ifi_flags = dev_get_flags(dev); r->ifi_change = 0; /* Wireless changes don't affect those flags */ if (nla_put_string(skb, IFLA_IFNAME, dev->name)) goto nla_put_failure; return nlh; nla_put_failure: nlmsg_cancel(skb, nlh); return NULL; } /* * Main event dispatcher. Called from other parts and drivers. * Send the event on the appropriate channels. * May be called from interrupt context. */ void wireless_send_event(struct net_device * dev, unsigned int cmd, union iwreq_data * wrqu, const char * extra) { const struct iw_ioctl_description * descr = NULL; int extra_len = 0; struct iw_event *event; /* Mallocated whole event */ int event_len; /* Its size */ int hdr_len; /* Size of the event header */ int wrqu_off = 0; /* Offset in wrqu */ /* Don't "optimise" the following variable, it will crash */ unsigned int cmd_index; /* *MUST* be unsigned */ struct sk_buff *skb; struct nlmsghdr *nlh; struct nlattr *nla; #ifdef CONFIG_COMPAT struct __compat_iw_event *compat_event; struct compat_iw_point compat_wrqu; struct sk_buff *compskb; int ptr_len; #endif /* * Nothing in the kernel sends scan events with data, be safe. * This is necessary because we cannot fix up scan event data * for compat, due to being contained in 'extra', but normally * applications are required to retrieve the scan data anyway * and no data is included in the event, this codifies that * practice. */ if (WARN_ON(cmd == SIOCGIWSCAN && extra)) extra = NULL; /* Get the description of the Event */ if (cmd <= SIOCIWLAST) { cmd_index = IW_IOCTL_IDX(cmd); if (cmd_index < standard_ioctl_num) descr = &(standard_ioctl[cmd_index]); } else { cmd_index = IW_EVENT_IDX(cmd); if (cmd_index < standard_event_num) descr = &(standard_event[cmd_index]); } /* Don't accept unknown events */ if (descr == NULL) { /* Note : we don't return an error to the driver, because * the driver would not know what to do about it. It can't * return an error to the user, because the event is not * initiated by a user request. * The best the driver could do is to log an error message. * We will do it ourselves instead... */ netdev_err(dev, "(WE) : Invalid/Unknown Wireless Event (0x%04X)\n", cmd); return; } /* Check extra parameters and set extra_len */ if (descr->header_type == IW_HEADER_TYPE_POINT) { /* Check if number of token fits within bounds */ if (wrqu->data.length > descr->max_tokens) { netdev_err(dev, "(WE) : Wireless Event (cmd=0x%04X) too big (%d)\n", cmd, wrqu->data.length); return; } if (wrqu->data.length < descr->min_tokens) { netdev_err(dev, "(WE) : Wireless Event (cmd=0x%04X) too small (%d)\n", cmd, wrqu->data.length); return; } /* Calculate extra_len - extra is NULL for restricted events */ if (extra != NULL) extra_len = wrqu->data.length * descr->token_size; /* Always at an offset in wrqu */ wrqu_off = IW_EV_POINT_OFF; } /* Total length of the event */ hdr_len = event_type_size[descr->header_type]; event_len = hdr_len + extra_len; /* * The problem for 64/32 bit. * * On 64-bit, a regular event is laid out as follows: * | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | * | event.len | event.cmd | p a d d i n g | * | wrqu data ... (with the correct size) | * * This padding exists because we manipulate event->u, * and 'event' is not packed. * * An iw_point event is laid out like this instead: * | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | * | event.len | event.cmd | p a d d i n g | * | iwpnt.len | iwpnt.flg | p a d d i n g | * | extra data ... * * The second padding exists because struct iw_point is extended, * but this depends on the platform... * * On 32-bit, all the padding shouldn't be there. */ skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return; /* Send via the RtNetlink event channel */ nlh = rtnetlink_ifinfo_prep(dev, skb); if (WARN_ON(!nlh)) { kfree_skb(skb); return; } /* Add the wireless events in the netlink packet */ nla = nla_reserve(skb, IFLA_WIRELESS, event_len); if (!nla) { kfree_skb(skb); return; } event = nla_data(nla); /* Fill event - first clear to avoid data leaking */ memset(event, 0, hdr_len); event->len = event_len; event->cmd = cmd; memcpy(&event->u, ((char *) wrqu) + wrqu_off, hdr_len - IW_EV_LCP_LEN); if (extra_len) memcpy(((char *) event) + hdr_len, extra, extra_len); nlmsg_end(skb, nlh); #ifdef CONFIG_COMPAT hdr_len = compat_event_type_size[descr->header_type]; /* ptr_len is remaining size in event header apart from LCP */ ptr_len = hdr_len - IW_EV_COMPAT_LCP_LEN; event_len = hdr_len + extra_len; compskb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!compskb) { kfree_skb(skb); return; } /* Send via the RtNetlink event channel */ nlh = rtnetlink_ifinfo_prep(dev, compskb); if (WARN_ON(!nlh)) { kfree_skb(skb); kfree_skb(compskb); return; } /* Add the wireless events in the netlink packet */ nla = nla_reserve(compskb, IFLA_WIRELESS, event_len); if (!nla) { kfree_skb(skb); kfree_skb(compskb); return; } compat_event = nla_data(nla); compat_event->len = event_len; compat_event->cmd = cmd; if (descr->header_type == IW_HEADER_TYPE_POINT) { compat_wrqu.length = wrqu->data.length; compat_wrqu.flags = wrqu->data.flags; memcpy(compat_event->ptr_bytes, ((char *)&compat_wrqu) + IW_EV_COMPAT_POINT_OFF, ptr_len); if (extra_len) memcpy(&compat_event->ptr_bytes[ptr_len], extra, extra_len); } else { /* extra_len must be zero, so no if (extra) needed */ memcpy(compat_event->ptr_bytes, wrqu, ptr_len); } nlmsg_end(compskb, nlh); skb_shinfo(skb)->frag_list = compskb; #endif skb_queue_tail(&dev_net(dev)->wext_nlevents, skb); schedule_work(&wireless_nlevent_work); } EXPORT_SYMBOL(wireless_send_event); #ifdef CONFIG_CFG80211_WEXT static void wireless_warn_cfg80211_wext(void) { char name[sizeof(current->comm)]; pr_warn_once("warning: `%s' uses wireless extensions which will stop working for Wi-Fi 7 hardware; use nl80211\n", get_task_comm(name, current)); } #endif /* IW handlers */ struct iw_statistics *get_wireless_stats(struct net_device *dev) { #ifdef CONFIG_WIRELESS_EXT if ((dev->wireless_handlers != NULL) && (dev->wireless_handlers->get_wireless_stats != NULL)) return dev->wireless_handlers->get_wireless_stats(dev); #endif #ifdef CONFIG_CFG80211_WEXT if (dev->ieee80211_ptr && dev->ieee80211_ptr->wiphy && dev->ieee80211_ptr->wiphy->wext && dev->ieee80211_ptr->wiphy->wext->get_wireless_stats) { wireless_warn_cfg80211_wext(); if (dev->ieee80211_ptr->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO) return NULL; return dev->ieee80211_ptr->wiphy->wext->get_wireless_stats(dev); } #endif /* not found */ return NULL; } /* noinline to avoid a bogus warning with -O3 */ static noinline int iw_handler_get_iwstats(struct net_device * dev, struct iw_request_info * info, union iwreq_data * wrqu, char * extra) { /* Get stats from the driver */ struct iw_statistics *stats; stats = get_wireless_stats(dev); if (stats) { /* Copy statistics to extra */ memcpy(extra, stats, sizeof(struct iw_statistics)); wrqu->data.length = sizeof(struct iw_statistics); /* Check if we need to clear the updated flag */ if (wrqu->data.flags != 0) stats->qual.updated &= ~IW_QUAL_ALL_UPDATED; return 0; } else return -EOPNOTSUPP; } static iw_handler get_handler(struct net_device *dev, unsigned int cmd) { /* Don't "optimise" the following variable, it will crash */ unsigned int index; /* *MUST* be unsigned */ const struct iw_handler_def *handlers = NULL; #ifdef CONFIG_CFG80211_WEXT if (dev->ieee80211_ptr && dev->ieee80211_ptr->wiphy) { wireless_warn_cfg80211_wext(); if (dev->ieee80211_ptr->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO) return NULL; handlers = dev->ieee80211_ptr->wiphy->wext; } #endif #ifdef CONFIG_WIRELESS_EXT if (dev->wireless_handlers) handlers = dev->wireless_handlers; #endif if (!handlers) return NULL; /* Try as a standard command */ index = IW_IOCTL_IDX(cmd); if (index < handlers->num_standard) return handlers->standard[index]; #ifdef CONFIG_WEXT_PRIV /* Try as a private command */ index = cmd - SIOCIWFIRSTPRIV; if (index < handlers->num_private) return handlers->private[index]; #endif /* Not found */ return NULL; } static int ioctl_standard_iw_point(struct iw_point *iwp, unsigned int cmd, const struct iw_ioctl_description *descr, iw_handler handler, struct net_device *dev, struct iw_request_info *info) { int err, extra_size, user_length = 0, essid_compat = 0; char *extra; /* Calculate space needed by arguments. Always allocate * for max space. */ extra_size = descr->max_tokens * descr->token_size; /* Check need for ESSID compatibility for WE < 21 */ switch (cmd) { case SIOCSIWESSID: case SIOCGIWESSID: case SIOCSIWNICKN: case SIOCGIWNICKN: if (iwp->length == descr->max_tokens + 1) essid_compat = 1; else if (IW_IS_SET(cmd) && (iwp->length != 0)) { char essid[IW_ESSID_MAX_SIZE + 1]; unsigned int len; len = iwp->length * descr->token_size; if (len > IW_ESSID_MAX_SIZE) return -EFAULT; err = copy_from_user(essid, iwp->pointer, len); if (err) return -EFAULT; if (essid[iwp->length - 1] == '\0') essid_compat = 1; } break; default: break; } iwp->length -= essid_compat; /* Check what user space is giving us */ if (IW_IS_SET(cmd)) { /* Check NULL pointer */ if (!iwp->pointer && iwp->length != 0) return -EFAULT; /* Check if number of token fits within bounds */ if (iwp->length > descr->max_tokens) return -E2BIG; if (iwp->length < descr->min_tokens) return -EINVAL; } else { /* Check NULL pointer */ if (!iwp->pointer) return -EFAULT; /* Save user space buffer size for checking */ user_length = iwp->length; /* Don't check if user_length > max to allow forward * compatibility. The test user_length < min is * implied by the test at the end. */ /* Support for very large requests */ if ((descr->flags & IW_DESCR_FLAG_NOMAX) && (user_length > descr->max_tokens)) { /* Allow userspace to GET more than max so * we can support any size GET requests. * There is still a limit : -ENOMEM. */ extra_size = user_length * descr->token_size; /* Note : user_length is originally a __u16, * and token_size is controlled by us, * so extra_size won't get negative and * won't overflow... */ } } /* Sanity-check to ensure we never end up _allocating_ zero * bytes of data for extra. */ if (extra_size <= 0) return -EFAULT; /* kzalloc() ensures NULL-termination for essid_compat. */ extra = kzalloc(extra_size, GFP_KERNEL); if (!extra) return -ENOMEM; /* If it is a SET, get all the extra data in here */ if (IW_IS_SET(cmd) && (iwp->length != 0)) { if (copy_from_user(extra, iwp->pointer, iwp->length * descr->token_size)) { err = -EFAULT; goto out; } if (cmd == SIOCSIWENCODEEXT) { struct iw_encode_ext *ee = (void *) extra; if (iwp->length < sizeof(*ee) + ee->key_len) { err = -EFAULT; goto out; } } } if (IW_IS_GET(cmd) && !(descr->flags & IW_DESCR_FLAG_NOMAX)) { /* * If this is a GET, but not NOMAX, it means that the extra * data is not bounded by userspace, but by max_tokens. Thus * set the length to max_tokens. This matches the extra data * allocation. * The driver should fill it with the number of tokens it * provided, and it may check iwp->length rather than having * knowledge of max_tokens. If the driver doesn't change the * iwp->length, this ioctl just copies back max_token tokens * filled with zeroes. Hopefully the driver isn't claiming * them to be valid data. */ iwp->length = descr->max_tokens; } err = handler(dev, info, (union iwreq_data *) iwp, extra); iwp->length += essid_compat; /* If we have something to return to the user */ if (!err && IW_IS_GET(cmd)) { /* Check if there is enough buffer up there */ if (user_length < iwp->length) { err = -E2BIG; goto out; } if (copy_to_user(iwp->pointer, extra, iwp->length * descr->token_size)) { err = -EFAULT; goto out; } } /* Generate an event to notify listeners of the change */ if ((descr->flags & IW_DESCR_FLAG_EVENT) && ((err == 0) || (err == -EIWCOMMIT))) { union iwreq_data *data = (union iwreq_data *) iwp; if (descr->flags & IW_DESCR_FLAG_RESTRICT) /* If the event is restricted, don't * export the payload. */ wireless_send_event(dev, cmd, data, NULL); else wireless_send_event(dev, cmd, data, extra); } out: kfree(extra); return err; } /* * Call the commit handler in the driver * (if exist and if conditions are right) * * Note : our current commit strategy is currently pretty dumb, * but we will be able to improve on that... * The goal is to try to agreagate as many changes as possible * before doing the commit. Drivers that will define a commit handler * are usually those that need a reset after changing parameters, so * we want to minimise the number of reset. * A cool idea is to use a timer : at each "set" command, we re-set the * timer, when the timer eventually fires, we call the driver. * Hopefully, more on that later. * * Also, I'm waiting to see how many people will complain about the * netif_running(dev) test. I'm open on that one... * Hopefully, the driver will remember to do a commit in "open()" ;-) */ int call_commit_handler(struct net_device *dev) { #ifdef CONFIG_WIRELESS_EXT if (netif_running(dev) && dev->wireless_handlers && dev->wireless_handlers->standard[0]) /* Call the commit handler on the driver */ return dev->wireless_handlers->standard[0](dev, NULL, NULL, NULL); else return 0; /* Command completed successfully */ #else /* cfg80211 has no commit */ return 0; #endif } /* * Main IOCTl dispatcher. * Check the type of IOCTL and call the appropriate wrapper... */ static int wireless_process_ioctl(struct net *net, struct iwreq *iwr, unsigned int cmd, struct iw_request_info *info, wext_ioctl_func standard, wext_ioctl_func private) { struct net_device *dev; iw_handler handler; /* Permissions are already checked in dev_ioctl() before calling us. * The copy_to/from_user() of ifr is also dealt with in there */ /* Make sure the device exist */ if ((dev = __dev_get_by_name(net, iwr->ifr_name)) == NULL) return -ENODEV; /* A bunch of special cases, then the generic case... * Note that 'cmd' is already filtered in dev_ioctl() with * (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) */ if (cmd == SIOCGIWSTATS) return standard(dev, iwr, cmd, info, &iw_handler_get_iwstats); #ifdef CONFIG_WEXT_PRIV if (cmd == SIOCGIWPRIV && dev->wireless_handlers) return standard(dev, iwr, cmd, info, iw_handler_get_private); #endif /* Basic check */ if (!netif_device_present(dev)) return -ENODEV; /* New driver API : try to find the handler */ handler = get_handler(dev, cmd); if (handler) { /* Standard and private are not the same */ if (cmd < SIOCIWFIRSTPRIV) return standard(dev, iwr, cmd, info, handler); else if (private) return private(dev, iwr, cmd, info, handler); } return -EOPNOTSUPP; } /* If command is `set a parameter', or `get the encoding parameters', * check if the user has the right to do it. */ static int wext_permission_check(unsigned int cmd) { if ((IW_IS_SET(cmd) || cmd == SIOCGIWENCODE || cmd == SIOCGIWENCODEEXT) && !capable(CAP_NET_ADMIN)) return -EPERM; return 0; } /* entry point from dev ioctl */ static int wext_ioctl_dispatch(struct net *net, struct iwreq *iwr, unsigned int cmd, struct iw_request_info *info, wext_ioctl_func standard, wext_ioctl_func private) { int ret = wext_permission_check(cmd); if (ret) return ret; dev_load(net, iwr->ifr_name); rtnl_lock(); ret = wireless_process_ioctl(net, iwr, cmd, info, standard, private); rtnl_unlock(); return ret; } /* * Wrapper to call a standard Wireless Extension handler. * We do various checks and also take care of moving data between * user space and kernel space. */ static int ioctl_standard_call(struct net_device * dev, struct iwreq *iwr, unsigned int cmd, struct iw_request_info *info, iw_handler handler) { const struct iw_ioctl_description * descr; int ret = -EINVAL; /* Get the description of the IOCTL */ if (IW_IOCTL_IDX(cmd) >= standard_ioctl_num) return -EOPNOTSUPP; descr = &(standard_ioctl[IW_IOCTL_IDX(cmd)]); /* Check if we have a pointer to user space data or not */ if (descr->header_type != IW_HEADER_TYPE_POINT) { /* No extra arguments. Trivial to handle */ ret = handler(dev, info, &(iwr->u), NULL); /* Generate an event to notify listeners of the change */ if ((descr->flags & IW_DESCR_FLAG_EVENT) && ((ret == 0) || (ret == -EIWCOMMIT))) wireless_send_event(dev, cmd, &(iwr->u), NULL); } else { ret = ioctl_standard_iw_point(&iwr->u.data, cmd, descr, handler, dev, info); } /* Call commit handler if needed and defined */ if (ret == -EIWCOMMIT) ret = call_commit_handler(dev); /* Here, we will generate the appropriate event if needed */ return ret; } int wext_handle_ioctl(struct net *net, unsigned int cmd, void __user *arg) { struct iw_request_info info = { .cmd = cmd, .flags = 0 }; struct iwreq iwr; int ret; if (copy_from_user(&iwr, arg, sizeof(iwr))) return -EFAULT; iwr.ifr_name[sizeof(iwr.ifr_name) - 1] = 0; ret = wext_ioctl_dispatch(net, &iwr, cmd, &info, ioctl_standard_call, ioctl_private_call); if (ret >= 0 && IW_IS_GET(cmd) && copy_to_user(arg, &iwr, sizeof(struct iwreq))) return -EFAULT; return ret; } #ifdef CONFIG_COMPAT static int compat_standard_call(struct net_device *dev, struct iwreq *iwr, unsigned int cmd, struct iw_request_info *info, iw_handler handler) { const struct iw_ioctl_description *descr; struct compat_iw_point *iwp_compat; struct iw_point iwp; int err; descr = standard_ioctl + IW_IOCTL_IDX(cmd); if (descr->header_type != IW_HEADER_TYPE_POINT) return ioctl_standard_call(dev, iwr, cmd, info, handler); iwp_compat = (struct compat_iw_point *) &iwr->u.data; iwp.pointer = compat_ptr(iwp_compat->pointer); iwp.length = iwp_compat->length; iwp.flags = iwp_compat->flags; err = ioctl_standard_iw_point(&iwp, cmd, descr, handler, dev, info); iwp_compat->pointer = ptr_to_compat(iwp.pointer); iwp_compat->length = iwp.length; iwp_compat->flags = iwp.flags; return err; } int compat_wext_handle_ioctl(struct net *net, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct iw_request_info info; struct iwreq iwr; char *colon; int ret; if (copy_from_user(&iwr, argp, sizeof(struct iwreq))) return -EFAULT; iwr.ifr_name[IFNAMSIZ-1] = 0; colon = strchr(iwr.ifr_name, ':'); if (colon) *colon = 0; info.cmd = cmd; info.flags = IW_REQUEST_FLAG_COMPAT; ret = wext_ioctl_dispatch(net, &iwr, cmd, &info, compat_standard_call, compat_private_call); if (ret >= 0 && IW_IS_GET(cmd) && copy_to_user(argp, &iwr, sizeof(struct iwreq))) return -EFAULT; return ret; } #endif char *iwe_stream_add_event(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, int event_len) { int lcp_len = iwe_stream_lcp_len(info); event_len = iwe_stream_event_len_adjust(info, event_len); /* Check if it's possible */ if (likely((stream + event_len) < ends)) { iwe->len = event_len; /* Beware of alignement issues on 64 bits */ memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN); memcpy(stream + lcp_len, &iwe->u, event_len - lcp_len); stream += event_len; } return stream; } EXPORT_SYMBOL(iwe_stream_add_event); char *iwe_stream_add_point(struct iw_request_info *info, char *stream, char *ends, struct iw_event *iwe, char *extra) { int event_len = iwe_stream_point_len(info) + iwe->u.data.length; int point_len = iwe_stream_point_len(info); int lcp_len = iwe_stream_lcp_len(info); /* Check if it's possible */ if (likely((stream + event_len) < ends)) { iwe->len = event_len; memcpy(stream, (char *) iwe, IW_EV_LCP_PK_LEN); memcpy(stream + lcp_len, ((char *) &iwe->u) + IW_EV_POINT_OFF, IW_EV_POINT_PK_LEN - IW_EV_LCP_PK_LEN); if (iwe->u.data.length && extra) memcpy(stream + point_len, extra, iwe->u.data.length); stream += event_len; } return stream; } EXPORT_SYMBOL(iwe_stream_add_point); char *iwe_stream_add_value(struct iw_request_info *info, char *event, char *value, char *ends, struct iw_event *iwe, int event_len) { int lcp_len = iwe_stream_lcp_len(info); /* Don't duplicate LCP */ event_len -= IW_EV_LCP_LEN; /* Check if it's possible */ if (likely((value + event_len) < ends)) { /* Add new value */ memcpy(value, &iwe->u, event_len); value += event_len; /* Patch LCP */ iwe->len = value - event; memcpy(event, (char *) iwe, lcp_len); } return value; } EXPORT_SYMBOL(iwe_stream_add_value);
15 10 7332 3311 3313 10 287 277 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 // SPDX-License-Identifier: GPL-2.0 #include <linux/err.h> #include <linux/bug.h> #include <linux/atomic.h> #include <linux/errseq.h> #include <linux/log2.h> /* * An errseq_t is a way of recording errors in one place, and allowing any * number of "subscribers" to tell whether it has changed since a previous * point where it was sampled. * * It's implemented as an unsigned 32-bit value. The low order bits are * designated to hold an error code (between 0 and -MAX_ERRNO). The upper bits * are used as a counter. This is done with atomics instead of locking so that * these functions can be called from any context. * * The general idea is for consumers to sample an errseq_t value. That value * can later be used to tell whether any new errors have occurred since that * sampling was done. * * Note that there is a risk of collisions if new errors are being recorded * frequently, since we have so few bits to use as a counter. * * To mitigate this, one bit is used as a flag to tell whether the value has * been sampled since a new value was recorded. That allows us to avoid bumping * the counter if no one has sampled it since the last time an error was * recorded. * * A new errseq_t should always be zeroed out. A errseq_t value of all zeroes * is the special (but common) case where there has never been an error. An all * zero value thus serves as the "epoch" if one wishes to know whether there * has ever been an error set since it was first initialized. */ /* The low bits are designated for error code (max of MAX_ERRNO) */ #define ERRSEQ_SHIFT ilog2(MAX_ERRNO + 1) /* This bit is used as a flag to indicate whether the value has been seen */ #define ERRSEQ_SEEN (1 << ERRSEQ_SHIFT) /* The lowest bit of the counter */ #define ERRSEQ_CTR_INC (1 << (ERRSEQ_SHIFT + 1)) /** * errseq_set - set a errseq_t for later reporting * @eseq: errseq_t field that should be set * @err: error to set (must be between -1 and -MAX_ERRNO) * * This function sets the error in @eseq, and increments the sequence counter * if the last sequence was sampled at some point in the past. * * Any error set will always overwrite an existing error. * * Return: The previous value, primarily for debugging purposes. The * return value should not be used as a previously sampled value in later * calls as it will not have the SEEN flag set. */ errseq_t errseq_set(errseq_t *eseq, int err) { errseq_t cur, old; /* MAX_ERRNO must be able to serve as a mask */ BUILD_BUG_ON_NOT_POWER_OF_2(MAX_ERRNO + 1); /* * Ensure the error code actually fits where we want it to go. If it * doesn't then just throw a warning and don't record anything. We * also don't accept zero here as that would effectively clear a * previous error. */ old = READ_ONCE(*eseq); if (WARN(unlikely(err == 0 || (unsigned int)-err > MAX_ERRNO), "err = %d\n", err)) return old; for (;;) { errseq_t new; /* Clear out error bits and set new error */ new = (old & ~(MAX_ERRNO|ERRSEQ_SEEN)) | -err; /* Only increment if someone has looked at it */ if (old & ERRSEQ_SEEN) new += ERRSEQ_CTR_INC; /* If there would be no change, then call it done */ if (new == old) { cur = new; break; } /* Try to swap the new value into place */ cur = cmpxchg(eseq, old, new); /* * Call it success if we did the swap or someone else beat us * to it for the same value. */ if (likely(cur == old || cur == new)) break; /* Raced with an update, try again */ old = cur; } return cur; } EXPORT_SYMBOL(errseq_set); /** * errseq_sample() - Grab current errseq_t value. * @eseq: Pointer to errseq_t to be sampled. * * This function allows callers to initialise their errseq_t variable. * If the error has been "seen", new callers will not see an old error. * If there is an unseen error in @eseq, the caller of this function will * see it the next time it checks for an error. * * Context: Any context. * Return: The current errseq value. */ errseq_t errseq_sample(errseq_t *eseq) { errseq_t old = READ_ONCE(*eseq); /* If nobody has seen this error yet, then we can be the first. */ if (!(old & ERRSEQ_SEEN)) old = 0; return old; } EXPORT_SYMBOL(errseq_sample); /** * errseq_check() - Has an error occurred since a particular sample point? * @eseq: Pointer to errseq_t value to be checked. * @since: Previously-sampled errseq_t from which to check. * * Grab the value that eseq points to, and see if it has changed @since * the given value was sampled. The @since value is not advanced, so there * is no need to mark the value as seen. * * Return: The latest error set in the errseq_t or 0 if it hasn't changed. */ int errseq_check(errseq_t *eseq, errseq_t since) { errseq_t cur = READ_ONCE(*eseq); if (likely(cur == since)) return 0; return -(cur & MAX_ERRNO); } EXPORT_SYMBOL(errseq_check); /** * errseq_check_and_advance() - Check an errseq_t and advance to current value. * @eseq: Pointer to value being checked and reported. * @since: Pointer to previously-sampled errseq_t to check against and advance. * * Grab the eseq value, and see whether it matches the value that @since * points to. If it does, then just return 0. * * If it doesn't, then the value has changed. Set the "seen" flag, and try to * swap it into place as the new eseq value. Then, set that value as the new * "since" value, and return whatever the error portion is set to. * * Note that no locking is provided here for concurrent updates to the "since" * value. The caller must provide that if necessary. Because of this, callers * may want to do a lockless errseq_check before taking the lock and calling * this. * * Return: Negative errno if one has been stored, or 0 if no new error has * occurred. */ int errseq_check_and_advance(errseq_t *eseq, errseq_t *since) { int err = 0; errseq_t old, new; /* * Most callers will want to use the inline wrapper to check this, * so that the common case of no error is handled without needing * to take the lock that protects the "since" value. */ old = READ_ONCE(*eseq); if (old != *since) { /* * Set the flag and try to swap it into place if it has * changed. * * We don't care about the outcome of the swap here. If the * swap doesn't occur, then it has either been updated by a * writer who is altering the value in some way (updating * counter or resetting the error), or another reader who is * just setting the "seen" flag. Either outcome is OK, and we * can advance "since" and return an error based on what we * have. */ new = old | ERRSEQ_SEEN; if (new != old) cmpxchg(eseq, old, new); *since = new; err = -(new & MAX_ERRNO); } return err; } EXPORT_SYMBOL(errseq_check_and_advance);
2 2 1 1